Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'src/graph/paths.cc')
-rw-r--r--src/graph/paths.cc363
1 files changed, 363 insertions, 0 deletions
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
new file mode 100644
index 0000000..ce1772c
--- /dev/null
+++ b/src/graph/paths.cc
@@ -0,0 +1,363 @@
+/*************************************************************************
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "graph.h"
+#include "topo.h"
+#include "comm.h"
+#include "net.h"
+
+// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
+
+struct ncclTopoNodeList {
+ struct ncclTopoNode* list[NCCL_TOPO_MAX_NODES];
+ int count;
+};
+
+static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* node, int t, int64_t id, struct ncclTopoLinkList** path) {
+ for (int i=0; i<system->nodes[t].count; i++) {
+ if (system->nodes[t].nodes[i].id == id) {
+ *path = node->paths[t]+i;
+ return ncclSuccess;
+ }
+ }
+ WARN("Could not find node of type %d id %lx\n", t, id);
+ return ncclInternalError;
+}
+
+static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
+ if (baseNode->paths[baseNode->type] == NULL) {
+ NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
+ }
+
+ // breadth-first search to set all paths to that node in the system
+ struct ncclTopoNodeList nodeList;
+ struct ncclTopoNodeList nextNodeList;
+ nodeList.count = 1; nodeList.list[0] = baseNode;
+ nextNodeList.count = 0;
+ struct ncclTopoLinkList* basePath;
+ NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
+ basePath->count = 0;
+ basePath->width = LOC_WIDTH;
+ basePath->type = LINK_LOC;
+
+ while (nodeList.count) {
+ nextNodeList.count = 0;
+ for (int n=0; n<nodeList.count; n++) {
+ struct ncclTopoNode* node = nodeList.list[n];
+ struct ncclTopoLinkList* path;
+ NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
+ for (int l=0; l<node->nlinks; l++) {
+ struct ncclTopoLink* link = node->links+l;
+ struct ncclTopoNode* remNode = link->remNode;
+ if (remNode->paths[baseNode->type] == NULL) {
+ NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
+ }
+ struct ncclTopoLinkList* remPath;
+ NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
+ int width = std::min(path->width, link->width);
+ if (remPath->width < width) {
+ // Find reverse link
+ for (int l=0; l<remNode->nlinks; l++) {
+ if (remNode->links[l].remNode == node) {
+ remPath->list[0] = remNode->links+l;
+ break;
+ }
+ }
+ if (remPath->list[0] == NULL) {
+ WARN("Failed to find reverse path from remNode id %d type %d nlinks %d to node id %d type %d",
+ remNode->id, remNode->type, remNode->nlinks, node->id, node->type);
+ return ncclInternalError;
+ }
+ // Copy the rest of the path
+ for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
+ remPath->count = path->count + 1;
+ remPath->width = width;
+
+ // Consider the path is QPI when going through the CPU
+ // Also don't consider LINK_NET as we only care about the NIC->GPU path.
+ int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type;
+ remPath->type = std::max(path->type, type);
+
+ // Add to the list for the next iteration if not already in the list
+ // Disallow GPUs as intermediate steps for now
+ if (remNode->type != GPU) {
+ int i;
+ for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
+ if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
+ }
+ }
+ }
+ }
+ memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
+ }
+ return ncclSuccess;
+}
+
+static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* node) {
+ char line[1024];
+#ifdef ENABLE_TRACE
+ INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id);
+#else
+ sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
+ int offset = strlen(line);
+#endif
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+ if (node->paths[t] == NULL) continue;
+ for (int n = 0; n<system->nodes[t].count; n++) {
+#ifdef ENABLE_TRACE
+ line[0] = 0;
+ int offset = 0;
+ for (int i=0; i<node->paths[t][n].count; i++) {
+ struct ncclTopoLink* link = node->paths[t][n].list[i];
+ struct ncclTopoNode* remNode = link->remNode;
+ sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
+ offset = strlen(line);
+ }
+ INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width);
+#else
+ sprintf(line+offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type);
+ offset = strlen(line);
+#endif
+ }
+ }
+#ifndef ENABLE_TRACE
+ INFO(NCCL_GRAPH, "%s", line);
+#endif
+}
+
+ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) {
+ for (int i=0; i<system->nodes[GPU].count; i++) {
+ printNodePaths(system, system->nodes[GPU].nodes+i);
+ }
+ for (int i=0; i<system->nodes[NET].count; i++) {
+ printNodePaths(system, system->nodes[NET].nodes+i);
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
+ // Find the closest CPU to a GPU
+ int minHops = 0;
+ int localCpu = -1;
+ struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU];
+ for (int c=0; c<system->nodes[CPU].count; c++) {
+ int hops = paths[c].count;
+ if (minHops == 0 || hops < minHops) {
+ localCpu = c;
+ minHops = hops;
+ }
+ }
+ if (localCpu == -1) {
+ WARN("Error : could not find CPU close to GPU %d", gpu);
+ return ncclInternalError;
+ }
+ *retCpu = localCpu;
+ return ncclSuccess;
+}
+
+static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int i1, int t2, int i2) {
+ struct ncclTopoNode* cpuNode = system->nodes[CPU].nodes+c;
+ struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1;
+
+ int l=0;
+ // Node 1 -> CPU
+ for (int i=0; i<srcNode->paths[CPU][c].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[CPU][c].list[i];
+ // CPU -> Node 2
+ for (int i=0; i<cpuNode->paths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i];
+
+ // Update path characteristics
+ srcNode->paths[t2][i2].count = l;
+ srcNode->paths[t2][i2].type = LINK_QPI;
+ srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width);
+ return ncclSuccess;
+}
+
+// Remove/free paths for a given type
+static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+ for (int n=0; n<system->nodes[t].count; n++) {
+ struct ncclTopoNode* node = system->nodes[t].nodes+n;
+ free(node->paths[nodeType]);
+ node->paths[nodeType] = NULL;
+ }
+ }
+}
+
+ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
+ // Precompute paths between GPUs/NICs.
+
+ // Remove everything in case we're re-computing
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
+
+ // Set direct paths from/to CPUs. We need them in many cases.
+ for (int c=0; c<system->nodes[CPU].count; c++) {
+ NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
+ }
+
+ // Set direct paths from/to GPUs.
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ // Compute paths to GPU g
+ NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system));
+
+ if (peerInfos == NULL) continue;
+ // Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM
+ struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank;
+ for (int p=0; p<system->nodes[GPU].count; p++) {
+ if (p == g) continue;
+ struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank;
+ int p2p;
+ NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
+ if (p2p == 0) {
+ int shm;
+ NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
+ if (shm == 1) {
+ // We cannot use GPU Direct, so we need all traffic to go through a CPU
+ int cpu;
+ NCCLCHECK(getLocalCpu(system, g, &cpu));
+ NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
+ } else {
+ // We cannot communicate with that peer.
+ system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
+ }
+ }
+ }
+ }
+
+ // Set direct paths from/to NICs.
+ for (int n=0; n<system->nodes[NET].count; n++) {
+ struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;
+ NCCLCHECK(ncclTopoSetPaths(netNode, system));
+
+ if (peerInfos == NULL) continue;
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) == 0) {
+ // We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths
+ // to go through a CPU
+ int localCpu;
+ NCCLCHECK(getLocalCpu(system, g, &localCpu));
+ NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g));
+ NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n));
+ }
+ }
+ }
+
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) {
+ int *domains;
+ int64_t *ids;
+ NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
+ NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count));
+ int myDomain = 0;
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ domains[g] = g;
+ ids[g] = gpu->id;
+ for (int p=0; p<g; p++) {
+ if (gpu->paths[GPU][p].count > 0) {
+ domains[g] = std::min(domains[g], domains[p]);
+ }
+ }
+ if (gpu->rank == comm->rank) myDomain = domains[g];
+ }
+
+ int ngpus = system->nodes[GPU].count;
+ for (int i=0; i<ngpus; i++) {
+ if (domains[i] == myDomain) continue;
+ struct ncclTopoNode* gpu = NULL;
+ int g;
+ for (g=0; g<system->nodes[GPU].count /* This one varies over the loops */; g++) {
+ gpu = system->nodes[GPU].nodes+g;
+ if (gpu->id == ids[i]) break; else gpu=NULL;
+ }
+ if (gpu == NULL) {
+ WARN("Could not find id %lx", ids[i]);
+ free(domains);
+ free(ids);
+ return ncclInternalError;
+ }
+
+ // Remove GPUs I can't access (even indirectly) from my view of the node
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+ for (int n=0; n<system->nodes[t].count; n++) {
+ struct ncclTopoNode* node = system->nodes[t].nodes+n;
+ if (node == gpu) continue;
+ for (int l=0; l<node->nlinks; l++) {
+ while (l<node->nlinks && node->links[l].remNode == gpu) {
+ if (l<node->nlinks-1)
+ memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
+ node->nlinks--;
+ }
+ if (l<node->nlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) {
+ node->links[l].remNode--;
+ }
+ }
+ }
+ }
+ if (g != system->nodes[GPU].count-1)
+ memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode));
+ system->nodes[GPU].count--;
+ }
+
+ comm->localRanks = system->nodes[GPU].count;
+ if (system->nodes[GPU].count == comm->nRanks) {
+ // Trim network
+ ncclTopoRemovePathType(system, NET);
+ system->nodes[NET].count = 0;
+ }
+ free(domains);
+ free(ids);
+ return ncclSuccess;
+}
+
+static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) {
+ int nvlSpeed = 0;
+ int nvlPeers = 0;
+ int pciSpeed = 0;
+ for (int l=0; l<node->nlinks; l++) {
+ if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width;
+ if (node->links[l].remNode->type == GPU) nvlPeers++; else nvlPeers = 2;
+ if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width;
+ }
+ *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) {
+ // Compute max speed to try to accelerate the search.
+ system->maxSpeed = LOC_WIDTH;
+
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ NCCLCHECK(getGpuSpeed(system->nodes[GPU].nodes+g, &system->maxSpeed));
+ }
+ if (system->nodes[NET].count) {
+ // Try to assign one NIC per GPU
+ int netMaxSpeed = 0;
+ int netMaxSpeedCount = 0;
+ for (int n=0; n<system->nodes[NET].count; n++) {
+ int maxSpeed = 0;
+ struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ maxSpeed = std::max(maxSpeed, net->paths[GPU][g].width);
+ }
+ if (maxSpeed > netMaxSpeed) {
+ netMaxSpeed = maxSpeed;
+ netMaxSpeedCount = 1;
+ } else if (maxSpeed == netMaxSpeed) {
+ netMaxSpeedCount++;
+ }
+ }
+ system->maxSpeed = std::min(system->maxSpeed, netMaxSpeedCount*NET_WIDTH);
+ }
+ return ncclSuccess;
+}
+
+void ncclTopoFree(struct ncclTopoSystem* system) {
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
+ free(system);
+}