diff options
Diffstat (limited to 'src/graph/paths.cc')
-rw-r--r-- | src/graph/paths.cc | 285 |
1 files changed, 169 insertions, 116 deletions
diff --git a/src/graph/paths.cc b/src/graph/paths.cc index eba1964..0872ae7 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -42,7 +42,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath)); basePath->count = 0; basePath->width = LOC_WIDTH; - basePath->type = LINK_LOC; + basePath->type = PATH_LOC; while (nodeList.count) { nextNodeList.count = 0; @@ -58,7 +58,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT } struct ncclTopoLinkList* remPath; NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath)); - int width = std::min(path->width, link->width); + float width = std::min(path->width, link->width); if (remPath->width < width) { // Find reverse link for (int l=0; l<remNode->nlinks; l++) { @@ -68,8 +68,8 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT } } if (remPath->list[0] == NULL) { - WARN("Failed to find reverse path from remNode id %d type %d nlinks %d to node id %d type %d", - remNode->id, remNode->type, remNode->nlinks, node->id, node->type); + WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx", + remNode->type, remNode->id, remNode->nlinks, node->type, node->id); return ncclInternalError; } // Copy the rest of the path @@ -77,9 +77,17 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT remPath->count = path->count + 1; remPath->width = width; - // Consider the path is QPI when going through the CPU - // Also don't consider LINK_NET as we only care about the NIC->GPU path. - int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type; + // Start with path type = link type. PATH and LINK types are supposed to match. + // Don't consider LINK_NET as we only care about the NIC->GPU path. + int type = link->type == LINK_NET ? 0 : link->type; + // Differentiate between one and multiple PCI switches + if (type == PATH_PIX && (node->type == PCI || link->remNode->type == PCI) && remPath->count > 3) type = PATH_PXB; + // Consider a path going through the CPU as PATH_PHB + if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB; + // Ignore Power CPU in an NVLink path + if (path->type == PATH_NVL && type == PATH_SYS && link->remNode->type == CPU && + link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) type = 0; + remPath->type = std::max(path->type, type); // Add to the list for the next iteration if not already in the list @@ -117,9 +125,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id); offset = strlen(line); } - INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width); + INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].width); #else - sprintf(line+offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type); + sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, topoPathTypeStr[node->paths[t][n].type]); offset = strlen(line); #endif } @@ -171,7 +179,7 @@ static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int // Update path characteristics srcNode->paths[t2][i2].count = l; - srcNode->paths[t2][i2].type = LINK_QPI; + srcNode->paths[t2][i2].type = std::max(srcNode->paths[CPU][c].type, cpuNode->paths[t2][i2].type); srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width); return ncclSuccess; } @@ -194,6 +202,127 @@ static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) } } +static const int levelsOldToNew[] = { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS }; +ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelEnv) { + if (*level == -1) { + int l = -1; + if (disableEnv) { + char* str = getenv(disableEnv); + if (str) { + int disable = strtol(str, NULL, 0); + if (disable == 1) l = 0; + } + } + if (l == -1) { + char* str = getenv(levelEnv); + if (str) { + for (int i=0; i<PATH_NET; i++) { + if (strcmp(str, topoPathTypeStr[i]) == 0) { + l = i; + break; + } + } + // Old style numbering + if (l == -1 && str[0] >= '0' && str[0] <= '9') { + int oldLevel = strtol(str, NULL, 0); + const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1; + if (oldLevel > maxOldLevel) oldLevel = maxOldLevel; + l = levelsOldToNew[oldLevel]; + } + } + } + if (l >= 0) INFO(NCCL_GRAPH, "%s set from environment to %s", levelEnv, topoPathTypeStr[l]); + *level = l >= 0 ? l : -2; + } + return ncclSuccess; +} + +int ncclTopoUserP2pLevel = -1; +ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p) { + *p2p = 0; + + // Get GPUs from topology + int g1, g2; + NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1)); + struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1; + if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) { + // GPU not found, we can't use p2p. + return ncclSuccess; + } + struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2; + + // In general, use P2P whenever we can. + int p2pLevel = PATH_SYS; + + // Don't use P2P through ARM CPUs + int arch, vendor, model; + NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); + if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB; + if (arch == NCCL_TOPO_CPU_ARCH_X86 && + vendor == NCCL_TOPO_CPU_VENDOR_INTEL && + model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB; + + // User override + NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL")); + if (ncclTopoUserP2pLevel != -2) p2pLevel = ncclTopoUserP2pLevel; + + // Compute the PCI distance and compare with the p2pLevel. + if (path->type <= p2pLevel) *p2p = 1; + + return ncclSuccess; +} + +NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); +int ncclTopoUserGdrLevel = -1; + +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) { + *useGdr = 0; + + // Get GPU and NET + int n, g; + NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g)); + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + + // Check that both the NIC and GPUs support it + if (net->net.gdrSupport == 0) return ncclSuccess; + if (gpu->gpu.gdrSupport == 0) return ncclSuccess; + + if (read) { // For reads (sends) only enable under certain conditions + int gdrReadParam = ncclParamNetGdrRead(); + if (gdrReadParam == 0) return ncclSuccess; + if (gdrReadParam < 0) { + int nvlink = 0; + // Since we don't know whether there are other communicators, + // it's better to keep things local if we have a single GPU. + if (system->nodes[GPU].count == 1) nvlink = 1; + for (int i=0; i<system->nodes[GPU].count; i++) { + if (i == g) continue; + if (gpu->paths[GPU][i].type == PATH_NVL) { + nvlink = 1; + break; + } + } + if (!nvlink) return ncclSuccess; + } + } + + // Check if we are close enough that it makes sense to enable GDR + int netGdrLevel = PATH_PXB; + NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL")); + if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel; + int distance = gpu->paths[NET][n].type; + if (distance > netGdrLevel) { + INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel); + return ncclSuccess; + } + + *useGdr = 1; + INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read); + return ncclSuccess; +} + ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) { // Precompute paths between GPUs/NICs. @@ -210,26 +339,29 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer // Compute paths to GPU g NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system)); - if (peerInfos == NULL) continue; - // Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM - struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank; + // Update path when we don't want to / can't use GPU Direct P2P for (int p=0; p<system->nodes[GPU].count; p++) { - if (p == g) continue; - struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank; int p2p; - NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo)); + NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p)); if (p2p == 0) { - int shm; - NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo)); - if (shm == 1) { - // We cannot use GPU Direct, so we need all traffic to go through a CPU - int cpu; - NCCLCHECK(getLocalCpu(system, g, &cpu)); - NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g)); - } else { - // We cannot communicate with that peer. - system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; - } + // Divert all traffic through the CPU + int cpu; + NCCLCHECK(getLocalCpu(system, g, &cpu)); + NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g)); + } + } + + if (peerInfos == NULL) continue; + // Remove GPUs we can't talk to because of containers. + struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank; + for (int p=0; p<system->nodes[GPU].count; p++) { + if (p == g) continue; + struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank; + int shm; + NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo)); + if (shm == 0) { + // Mark this peer as inaccessible. We'll trim it later. + system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; } } } @@ -239,11 +371,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer struct ncclTopoNode* netNode = system->nodes[NET].nodes+n; NCCLCHECK(ncclTopoSetPaths(netNode, system)); - if (peerInfos == NULL) continue; for (int g=0; g<system->nodes[GPU].count; g++) { - if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) == 0) { - // We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths - // to go through a CPU + // Update path when we dont want to / can't use GPU Direct RDMA. + int gdr; + NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr)); + if (gdr == 0) { + // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU int localCpu; NCCLCHECK(getLocalCpu(system, g, &localCpu)); NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g)); @@ -251,7 +384,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer } } } - return ncclSuccess; } @@ -270,7 +402,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* domains[g] = std::min(domains[g], domains[p]); } } - if (gpu->rank == comm->rank) myDomain = domains[g]; + if (gpu->gpu.rank == comm->rank) myDomain = domains[g]; } int ngpus = system->nodes[GPU].count; @@ -288,98 +420,19 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* free(ids); return ncclInternalError; } - - // Remove GPUs I can't access (even indirectly) from my view of the node - for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) { - for (int n=0; n<system->nodes[t].count; n++) { - struct ncclTopoNode* node = system->nodes[t].nodes+n; - if (node == gpu) continue; - for (int l=0; l<node->nlinks; l++) { - while (l<node->nlinks && node->links[l].remNode == gpu) { - if (l<node->nlinks-1) - memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink)); - node->nlinks--; - } - if (l<node->nlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) { - node->links[l].remNode--; - } - } - } - } - if (g != system->nodes[GPU].count-1) - memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode)); - system->nodes[GPU].count--; + NCCLCHECK(ncclTopoRemoveNode(system, GPU, g)); } comm->localRanks = system->nodes[GPU].count; if (system->nodes[GPU].count == comm->nRanks) { - // Trim network - ncclTopoRemovePathType(system, NET); - system->nodes[NET].count = 0; - for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) { - for (int n=0; n<system->nodes[t].count; n++) { - struct ncclTopoNode* node = system->nodes[t].nodes+n; - for (int l=0; l<node->nlinks; l++) { - struct ncclTopoLink* link = &(node->links[l]); - if (link->remNode->type == NET) { - // Remove the link - for (int i=l; i<(node->nlinks-1); i++) { - memcpy(&(node->links[i]), &(node->links[i+1]), sizeof(ncclTopoLink)); - } - node->nlinks--; - l--; // revisit the same value of "l" for the next iteration, since we edited the list in the middle of the loop - } - } - } - } + for (int n=system->nodes[NET].count-1; n>=0; n--) + NCCLCHECK(ncclTopoRemoveNode(system, NET, n)); } free(domains); free(ids); return ncclSuccess; } -static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) { - int nvlSpeed = 0; - int nvlPeers = 0; - int pciSpeed = 0; - for (int l=0; l<node->nlinks; l++) { - if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width; - if (node->links[l].remNode->type == GPU) nvlPeers++; else nvlPeers = 2; - if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width; - } - *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed)); - return ncclSuccess; -} - -ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) { - // Compute max speed to try to accelerate the search. - system->maxSpeed = LOC_WIDTH; - - for (int g=0; g<system->nodes[GPU].count; g++) { - NCCLCHECK(getGpuSpeed(system->nodes[GPU].nodes+g, &system->maxSpeed)); - } - if (system->nodes[NET].count) { - // Try to assign one NIC per GPU - int netMaxSpeed = 0; - int netMaxSpeedCount = 0; - for (int n=0; n<system->nodes[NET].count; n++) { - int maxSpeed = 0; - struct ncclTopoNode* net = system->nodes[NET].nodes+n; - for (int g=0; g<system->nodes[GPU].count; g++) { - maxSpeed = std::max(maxSpeed, net->paths[GPU][g].width); - } - if (maxSpeed > netMaxSpeed) { - netMaxSpeed = maxSpeed; - netMaxSpeedCount = 1; - } else if (maxSpeed == netMaxSpeed) { - netMaxSpeedCount++; - } - } - system->maxSpeed = std::min(system->maxSpeed, netMaxSpeedCount*NET_WIDTH); - } - return ncclSuccess; -} - void ncclTopoFree(struct ncclTopoSystem* system) { for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t); free(system); |