Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Sylvain Jeaugey <sjeaugey@nvidia.com> 2020-04-10 03:14:07 +0300
committer Sylvain Jeaugey <sjeaugey@nvidia.com> 2020-04-10 03:14:07 +0300
commit b5b6c6acdd40b816e79fcffb251346ca73dd7bcd (patch)
tree 525c768c9169d4c0d3a76a59b3a20571d513e8c0
parent 533e3702cf713a9ab9a634fbb8b4c380ecf381e6 (diff)
Fix bug #307 : wrong NIC selection on the reduction tree.
The reduction tree (tree up) was inverting the NICs to use, causing performance issues in cases where we are using different NICs on a given channel.
-rw-r--r-- src/graph/search.cc 7
-rw-r--r-- src/include/graph.h 2
-rw-r--r-- src/transport/coll_net.cc 4
-rw-r--r-- src/transport/net.cc 4
4 files changed, 10 insertions, 7 deletions
diff --git a/src/graph/search.cc b/src/graph/search.cc
index b4c3e35..1bbb7d3 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -804,7 +804,10 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
return ncclSuccess;
}
-ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* dev) {
- *dev = graph->inter[(channelId%graph->nChannels)*2+dir];
+ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int rank, int channelId, int* dev) {
+ int channel = channelId%graph->nChannels;
+ int ngpus = system->nodes[GPU].count;
+ int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
+ *dev = graph->inter[channel*2+index];
return ncclSuccess;
}
diff --git a/src/include/graph.h b/src/include/graph.h
index b27ea35..1814440 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -27,7 +27,7 @@ void ncclTopoFree(struct ncclTopoSystem* system);
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
// Query topology
-ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* net);
+ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int rank, int channelId, int* net);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
index 73e9fdd..435c88d 100644
--- a/src/transport/coll_net.cc
+++ b/src/transport/coll_net.cc
@@ -84,7 +84,7 @@ ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
NCCLCHECK(ncclCalloc(&sendResources, 1));
send->transportResources = sendResources;
- NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &sendResources->netDev));
+ NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &sendResources->netDev));
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, sendResources->netDev, 1, &sendResources->useGdr));
int sendSize = sizeof(struct ncclSendMem);
@@ -110,7 +110,7 @@ ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph*
NCCLCHECK(ncclCalloc(&recvResources, 1));
recv->transportResources = recvResources;
- NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &recvResources->netDev));
+ NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &recvResources->netDev));
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, recvResources->netDev, 0, &recvResources->useGdr));
int sendSize = sizeof(struct ncclSendMem);
diff --git a/src/transport/net.cc b/src/transport/net.cc
index db82a40..288ad92 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -60,7 +60,7 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
- NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev));
+ NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
int sendSize = sizeof(struct ncclSendMem);
@@ -83,7 +83,7 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
- NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev));
+ NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
int sendSize = sizeof(struct ncclSendMem);