github.com/marian-nmt/nccl.git
Diffstat (limited to 'src/graph/paths.cc')
-rw-r--r--  src/graph/paths.cc | 285
1 file changed, 169 insertions(+), 116 deletions(-)
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index eba1964..0872ae7 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -42,7 +42,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
basePath->count = 0;
basePath->width = LOC_WIDTH;
- basePath->type = LINK_LOC;
+ basePath->type = PATH_LOC;
while (nodeList.count) {
nextNodeList.count = 0;
@@ -58,7 +58,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
}
struct ncclTopoLinkList* remPath;
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
- int width = std::min(path->width, link->width);
+ float width = std::min(path->width, link->width);
if (remPath->width < width) {
// Find reverse link
for (int l=0; l<remNode->nlinks; l++) {
@@ -68,8 +68,8 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
}
}
if (remPath->list[0] == NULL) {
- WARN("Failed to find reverse path from remNode id %d type %d nlinks %d to node id %d type %d",
- remNode->id, remNode->type, remNode->nlinks, node->id, node->type);
+ WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
+ remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
return ncclInternalError;
}
// Copy the rest of the path
@@ -77,9 +77,17 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
remPath->count = path->count + 1;
remPath->width = width;
- // Consider the path is QPI when going through the CPU
- // Also don't consider LINK_NET as we only care about the NIC->GPU path.
- int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type;
+ // Start with path type = link type. PATH and LINK types are supposed to match.
+ // Don't consider LINK_NET as we only care about the NIC->GPU path.
+ int type = link->type == LINK_NET ? 0 : link->type;
+ // Differentiate between one and multiple PCI switches
+ if (type == PATH_PIX && (node->type == PCI || link->remNode->type == PCI) && remPath->count > 3) type = PATH_PXB;
+ // Consider a path going through the CPU as PATH_PHB
+ if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
+ // Ignore Power CPU in an NVLink path
+ if (path->type == PATH_NVL && type == PATH_SYS && link->remNode->type == CPU &&
+ link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) type = 0;
+
remPath->type = std::max(path->type, type);
// Add to the list for the next iteration if not already in the list
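The new logic above starts from the link type, upgrades it to PATH_PXB when more than one PCI switch is traversed and to PATH_PHB when the hop crosses a CPU, and then takes the maximum with the type accumulated so far, so the final path type reflects the worst hop along the route. Below is a minimal standalone sketch of that accumulation, assuming the PATH_* constants are ordered from most to least local as in topo.h (PATH_LOC < PATH_NVL < PATH_PIX < PATH_PXB < PATH_PHB < PATH_SYS); the values and helper are illustrative only.

#include <algorithm>
#include <cstdio>

// Hypothetical copy of the ordering assumed by std::max(path->type, type);
// the real constants live in src/graph/topo.h.
enum { PATH_LOC = 0, PATH_NVL, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS };

// Accumulate a path type one hop at a time: the path is only as good as its worst hop.
static int accumulate(int pathTypeSoFar, int hopType) {
  return std::max(pathTypeSoFar, hopType);
}

int main() {
  // Example route: GPU -> PCI switch (PIX) -> CPU root complex (PHB) -> PCI switch (PIX) -> NIC.
  int hops[] = { PATH_PIX, PATH_PHB, PATH_PIX };
  int type = PATH_LOC;
  for (int h : hops) type = accumulate(type, h);
  printf("final path type = %d (expected PATH_PHB = %d)\n", type, PATH_PHB);
  return 0;
}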
@@ -117,9 +125,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
offset = strlen(line);
}
- INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width);
+ INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].width);
#else
- sprintf(line+offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type);
+ sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, topoPathTypeStr[node->paths[t][n].type]);
offset = strlen(line);
#endif
}
@@ -171,7 +179,7 @@ static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int
// Update path characteristics
srcNode->paths[t2][i2].count = l;
- srcNode->paths[t2][i2].type = LINK_QPI;
+ srcNode->paths[t2][i2].type = std::max(srcNode->paths[CPU][c].type, cpuNode->paths[t2][i2].type);
srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width);
return ncclSuccess;
}
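With this change, a path forced through a CPU inherits the worst type and the narrowest width of its two segments (src->CPU and CPU->dst) instead of being hard-coded to LINK_QPI. A small sketch of that combination rule, with an illustrative struct rather than NCCL's real types:

#include <algorithm>

// Illustrative only: combine two path segments (src->CPU and CPU->dst)
// the way the updated addCpuStep does.
struct PathInfo { int count; int type; float width; };

static PathInfo throughCpu(const PathInfo& srcToCpu, const PathInfo& cpuToDst) {
  PathInfo p;
  p.count = srcToCpu.count + cpuToDst.count;          // concatenated hop count
  p.type  = std::max(srcToCpu.type, cpuToDst.type);   // worst segment dominates
  p.width = std::min(srcToCpu.width, cpuToDst.width); // narrowest segment limits bandwidth
  return p;
}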
@@ -194,6 +202,127 @@ static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType)
}
}
+static const int levelsOldToNew[] = { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS };
+ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelEnv) {
+ if (*level == -1) {
+ int l = -1;
+ if (disableEnv) {
+ char* str = getenv(disableEnv);
+ if (str) {
+ int disable = strtol(str, NULL, 0);
+ if (disable == 1) l = 0;
+ }
+ }
+ if (l == -1) {
+ char* str = getenv(levelEnv);
+ if (str) {
+ for (int i=0; i<PATH_NET; i++) {
+ if (strcmp(str, topoPathTypeStr[i]) == 0) {
+ l = i;
+ break;
+ }
+ }
+ // Old style numbering
+ if (l == -1 && str[0] >= '0' && str[0] <= '9') {
+ int oldLevel = strtol(str, NULL, 0);
+ const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1;
+ if (oldLevel > maxOldLevel) oldLevel = maxOldLevel;
+ l = levelsOldToNew[oldLevel];
+ }
+ }
+ }
+ if (l >= 0) INFO(NCCL_GRAPH, "%s set from environment to %s", levelEnv, topoPathTypeStr[l]);
+ *level = l >= 0 ? l : -2;
+ }
+ return ncclSuccess;
+}
+
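ncclGetLevel resolves a user setting once and caches it: a "disable" variable set to 1 maps to level 0, a named level (LOC, NVL, PIX, PXB, PHB, SYS) is looked up in topoPathTypeStr, and a bare digit is treated as the pre-existing numeric scale and translated through levelsOldToNew. The following standalone sketch reproduces just the lookup, assuming the string table matches the PATH_* ordering above; the names here are illustrative, not NCCL's.

#include <cstdlib>
#include <cstring>

// Assumed to mirror topoPathTypeStr[] up to (but not including) "NET".
static const char* pathNames[] = { "LOC", "NVL", "PIX", "PXB", "PHB", "SYS" };
// Assumed equivalent of levelsOldToNew[]: old numeric levels 0..5 -> new PATH_* values.
static const int oldToNew[] = { 0 /*LOC*/, 2 /*PIX*/, 3 /*PXB*/, 4 /*PHB*/, 5 /*SYS*/, 5 /*SYS*/ };

// Returns the resolved level, or -2 if the string is unset/unrecognized.
static int parseLevel(const char* str) {
  if (str == NULL) return -2;
  for (int i = 0; i < 6; i++)
    if (strcmp(str, pathNames[i]) == 0) return i;        // new style: by name
  if (str[0] >= '0' && str[0] <= '9') {                   // old style: by number
    long old = strtol(str, NULL, 0);
    if (old > 5) old = 5;
    return oldToNew[old];
  }
  return -2;
}

// e.g. NCCL_P2P_LEVEL=PXB and NCCL_P2P_LEVEL=2 both resolve to the PXB level here.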
+int ncclTopoUserP2pLevel = -1;
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p) {
+ *p2p = 0;
+
+ // Get GPUs from topology
+ int g1, g2;
+ NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1));
+ struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1;
+ if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) {
+ // GPU not found, we can't use p2p.
+ return ncclSuccess;
+ }
+ struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
+
+ // In general, use P2P whenever we can.
+ int p2pLevel = PATH_SYS;
+
+ // Don't use P2P through ARM CPUs
+ int arch, vendor, model;
+ NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
+ if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
+ if (arch == NCCL_TOPO_CPU_ARCH_X86 &&
+ vendor == NCCL_TOPO_CPU_VENDOR_INTEL &&
+ model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
+
+ // User override
+ NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL"));
+ if (ncclTopoUserP2pLevel != -2) p2pLevel = ncclTopoUserP2pLevel;
+
+ // Compute the PCI distance and compare with the p2pLevel.
+ if (path->type <= p2pLevel) *p2p = 1;
+
+ return ncclSuccess;
+}
+
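The effective rule in ncclTopoCheckP2p is: P2P is allowed whenever the GPU<->GPU path type is at most p2pLevel, which defaults to PATH_SYS, is lowered to PATH_PXB on ARM CPUs and on Intel Broadwell, and can be overridden through NCCL_P2P_DISABLE / NCCL_P2P_LEVEL. A condensed sketch of just that comparison; the constants and helper are illustrative, not the real API:

// Assumed ordering, mirroring topo.h.
enum { PATH_LOC, PATH_NVL, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS };

// Illustrative decision helper; cpuIsArm / cpuIsIntelBdw would come from ncclTopoCpuType.
static bool wouldUseP2p(int gpuPathType, bool cpuIsArm, bool cpuIsIntelBdw,
                        int userLevel /* -2 = unset */) {
  int p2pLevel = PATH_SYS;                              // default: allow P2P across the whole system
  if (cpuIsArm || cpuIsIntelBdw) p2pLevel = PATH_PXB;   // be conservative on these platforms
  if (userLevel != -2) p2pLevel = userLevel;            // NCCL_P2P_LEVEL / NCCL_P2P_DISABLE override
  return gpuPathType <= p2pLevel;                       // closer (smaller) path types enable P2P
}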
+NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
+int ncclTopoUserGdrLevel = -1;
+
+ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) {
+ *useGdr = 0;
+
+ // Get GPU and NET
+ int n, g;
+ NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n));
+ struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+ NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+
+ // Check that both the NIC and GPUs support it
+ if (net->net.gdrSupport == 0) return ncclSuccess;
+ if (gpu->gpu.gdrSupport == 0) return ncclSuccess;
+
+ if (read) { // For reads (sends) only enable under certain conditions
+ int gdrReadParam = ncclParamNetGdrRead();
+ if (gdrReadParam == 0) return ncclSuccess;
+ if (gdrReadParam < 0) {
+ int nvlink = 0;
+ // Since we don't know whether there are other communicators,
+ // it's better to keep things local if we have a single GPU.
+ if (system->nodes[GPU].count == 1) nvlink = 1;
+ for (int i=0; i<system->nodes[GPU].count; i++) {
+ if (i == g) continue;
+ if (gpu->paths[GPU][i].type == PATH_NVL) {
+ nvlink = 1;
+ break;
+ }
+ }
+ if (!nvlink) return ncclSuccess;
+ }
+ }
+
+ // Check if we are close enough that it makes sense to enable GDR
+ int netGdrLevel = PATH_PXB;
+ NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL"));
+ if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel;
+ int distance = gpu->paths[NET][n].type;
+ if (distance > netGdrLevel) {
+ INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
+ return ncclSuccess;
+ }
+
+ *useGdr = 1;
+ INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
+ return ncclSuccess;
+}
+
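Summarized, ncclTopoCheckGdr enables GPU Direct RDMA only if both the NIC and the GPU advertise support, the GPU<->NIC path type is at most netGdrLevel (PATH_PXB unless NCCL_NET_GDR_LEVEL overrides it), and, for the read/send direction with NCCL_NET_GDR_READ left at its default, the GPU has an NVLink peer (or is the only GPU in the system). A condensed, illustrative restatement of that gate, with hypothetical parameter names:

// Assumed ordering, mirroring topo.h.
enum { PATH_LOC, PATH_NVL, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS };

// Illustrative only: the gating conditions of ncclTopoCheckGdr, flattened into one predicate.
// gpuHasNvlinkPeer stands in for the "nvlink" flag (set for single-GPU systems as well).
static bool wouldUseGdr(bool nicSupport, bool gpuSupport, int gpuToNicPathType,
                        bool isRead, int gdrReadParam /* -2 = default */,
                        bool gpuHasNvlinkPeer, int userLevel /* -2 = unset */) {
  if (!nicSupport || !gpuSupport) return false;             // both ends must support GPU Direct RDMA
  if (isRead) {
    if (gdrReadParam == 0) return false;                    // explicitly disabled for reads
    if (gdrReadParam < 0 && !gpuHasNvlinkPeer) return false;// default: only enable reads with NVLink
  }
  int netGdrLevel = (userLevel != -2) ? userLevel : PATH_PXB;
  return gpuToNicPathType <= netGdrLevel;                   // GPU must be close enough to the NIC
}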
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
// Precompute paths between GPUs/NICs.
@@ -210,26 +339,29 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
// Compute paths to GPU g
NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system));
- if (peerInfos == NULL) continue;
- // Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM
- struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank;
+ // Update path when we don't want to / can't use GPU Direct P2P
for (int p=0; p<system->nodes[GPU].count; p++) {
- if (p == g) continue;
- struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank;
int p2p;
- NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
+ NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p));
if (p2p == 0) {
- int shm;
- NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
- if (shm == 1) {
- // We cannot use GPU Direct, so we need all traffic to go through a CPU
- int cpu;
- NCCLCHECK(getLocalCpu(system, g, &cpu));
- NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
- } else {
- // We cannot communicate with that peer.
- system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
- }
+ // Divert all traffic through the CPU
+ int cpu;
+ NCCLCHECK(getLocalCpu(system, g, &cpu));
+ NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
+ }
+ }
+
+ if (peerInfos == NULL) continue;
+ // Remove GPUs we can't talk to because of containers.
+ struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank;
+ for (int p=0; p<system->nodes[GPU].count; p++) {
+ if (p == g) continue;
+ struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank;
+ int shm;
+ NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
+ if (shm == 0) {
+ // Mark this peer as inaccessible. We'll trim it later.
+ system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
}
}
}
@@ -239,11 +371,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;
NCCLCHECK(ncclTopoSetPaths(netNode, system));
- if (peerInfos == NULL) continue;
for (int g=0; g<system->nodes[GPU].count; g++) {
- if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) == 0) {
- // We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths
- // to go through a CPU
+ // Update path when we don't want to / can't use GPU Direct RDMA.
+ int gdr;
+ NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
+ if (gdr == 0) {
+ // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
NCCLCHECK(getLocalCpu(system, g, &localCpu));
NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g));
@@ -251,7 +384,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
}
}
}
-
return ncclSuccess;
}
@@ -270,7 +402,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
domains[g] = std::min(domains[g], domains[p]);
}
}
- if (gpu->rank == comm->rank) myDomain = domains[g];
+ if (gpu->gpu.rank == comm->rank) myDomain = domains[g];
}
int ngpus = system->nodes[GPU].count;
@@ -288,98 +420,19 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
free(ids);
return ncclInternalError;
}
-
- // Remove GPUs I can't access (even indirectly) from my view of the node
- for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
- for (int n=0; n<system->nodes[t].count; n++) {
- struct ncclTopoNode* node = system->nodes[t].nodes+n;
- if (node == gpu) continue;
- for (int l=0; l<node->nlinks; l++) {
- while (l<node->nlinks && node->links[l].remNode == gpu) {
- if (l<node->nlinks-1)
- memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
- node->nlinks--;
- }
- if (l<node->nlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) {
- node->links[l].remNode--;
- }
- }
- }
- }
- if (g != system->nodes[GPU].count-1)
- memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode));
- system->nodes[GPU].count--;
+ NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
}
comm->localRanks = system->nodes[GPU].count;
if (system->nodes[GPU].count == comm->nRanks) {
- // Trim network
- ncclTopoRemovePathType(system, NET);
- system->nodes[NET].count = 0;
- for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
- for (int n=0; n<system->nodes[t].count; n++) {
- struct ncclTopoNode* node = system->nodes[t].nodes+n;
- for (int l=0; l<node->nlinks; l++) {
- struct ncclTopoLink* link = &(node->links[l]);
- if (link->remNode->type == NET) {
- // Remove the link
- for (int i=l; i<(node->nlinks-1); i++) {
- memcpy(&(node->links[i]), &(node->links[i+1]), sizeof(ncclTopoLink));
- }
- node->nlinks--;
- l--; // revisit the same value of "l" for the next iteration, since we edited the list in the middle of the loop
- }
- }
- }
- }
+ for (int n=system->nodes[NET].count-1; n>=0; n--)
+ NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
}
free(domains);
free(ids);
return ncclSuccess;
}
-static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) {
- int nvlSpeed = 0;
- int nvlPeers = 0;
- int pciSpeed = 0;
- for (int l=0; l<node->nlinks; l++) {
- if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width;
- if (node->links[l].remNode->type == GPU) nvlPeers++; else nvlPeers = 2;
- if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width;
- }
- *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed));
- return ncclSuccess;
-}
-
-ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) {
- // Compute max speed to try to accelerate the search.
- system->maxSpeed = LOC_WIDTH;
-
- for (int g=0; g<system->nodes[GPU].count; g++) {
- NCCLCHECK(getGpuSpeed(system->nodes[GPU].nodes+g, &system->maxSpeed));
- }
- if (system->nodes[NET].count) {
- // Try to assign one NIC per GPU
- int netMaxSpeed = 0;
- int netMaxSpeedCount = 0;
- for (int n=0; n<system->nodes[NET].count; n++) {
- int maxSpeed = 0;
- struct ncclTopoNode* net = system->nodes[NET].nodes+n;
- for (int g=0; g<system->nodes[GPU].count; g++) {
- maxSpeed = std::max(maxSpeed, net->paths[GPU][g].width);
- }
- if (maxSpeed > netMaxSpeed) {
- netMaxSpeed = maxSpeed;
- netMaxSpeedCount = 1;
- } else if (maxSpeed == netMaxSpeed) {
- netMaxSpeedCount++;
- }
- }
- system->maxSpeed = std::min(system->maxSpeed, netMaxSpeedCount*NET_WIDTH);
- }
- return ncclSuccess;
-}
-
void ncclTopoFree(struct ncclTopoSystem* system) {
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
free(system);