github.com/marian-nmt/nccl.git
Diffstat (limited to 'src/graph/topo.cc')
-rw-r--r--  src/graph/topo.cc | 641
1 file changed, 641 insertions(+), 0 deletions(-)
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
new file mode 100644
index 0000000..a1b3209
--- /dev/null
+++ b/src/graph/topo.cc
@@ -0,0 +1,641 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "graph.h"
+#include "topo.h"
+#include "comm.h"
+#include "nvmlwrap.h"
+#include "net.h"
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define BUSID_SIZE (sizeof("0000:00:00.0"))
+#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
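+// Note (illustrative): sizeof() counts the trailing '\0', so BUSID_SIZE is 13
+// and BUSID_REDUCED_SIZE is 8; the copies below use SIZE-1 to skip it.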
+
+const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
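+// (Rough meaning, following nvidia-smi topo terminology: PIX = through a
+// single PCI switch, PXB = through multiple PCI switches, PHB = through a
+// PCI host bridge, NODE = across host bridges within a NUMA node, SYS =
+// across the inter-CPU interconnect.)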
+
+const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" };
+
+/******************************************************************/
+/******************* Graph Creation Functions *********************/
+/******************************************************************/
+static int getNumaId(char *path) {
+ char npath[PATH_MAX];
+ snprintf(npath, PATH_MAX, "%s/numa_node", path);
+ npath[PATH_MAX-1] = '\0';
+
+ int numaId = -1;
+ FILE *file = fopen(npath, "r");
+ if (file == NULL) return -1;
+ if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
+ fclose(file);
+
+ return numaId;
+}
+
+static ncclResult_t getPciPath(char* busId, char** path) {
+ for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
+ char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
+ memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
+ memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
+ *path = realpath(busPath, NULL);
+ if (*path == NULL) {
+ WARN("Could not find real path of %s", busPath);
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+// Get an int64 from a PCI path. For example, /sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
+ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
+ char* str = path+offset;
+ // Remove trailing "/"
+ if (*str == '/') str--;
+ // Find next /
+ while (*str != '/') str--;
+ str++;
+ NCCLCHECK(busIdToInt64(str, id));
+ return ncclSuccess;
+}
+
+static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) {
+ *index = -1;
+ for (int i=0; i<system->nodes[GPU].count; i++) {
+ if (system->nodes[GPU].nodes[i].id == id) {
+ *index = i;
+ }
+ }
+ return ncclSuccess;
+}
+
+
+static ncclResult_t getPath(int64_t id, char** path) {
+ char busId[] = "0000:00:00.0";
+ NCCLCHECK(int64ToBusId(id, busId));
+ NCCLCHECK(getPciPath(busId, path));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) {
+ char busId[BUSID_SIZE];
+ CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
+ NCCLCHECK(getPciPath(busId, path));
+ return ncclSuccess;
+}
+
+
+static int interCpuWidth = 0;
+static int cpuPciWidth = 0;
+
+static ncclResult_t getCpuWidths() {
+ // Check if already detected
+ if (interCpuWidth + cpuPciWidth) return ncclSuccess;
+
+ // Defaults
+ char cpu[256];
+ sprintf(cpu, "Generic");
+ cpuPciWidth = interCpuWidth = PCI_WIDTH;
+
+#ifdef __PPC__
+ sprintf(cpu, "ppc64");
+ interCpuWidth = P9_WIDTH;
+#endif
+#ifdef __x86_64__
+ sprintf(cpu, "x86_64");
+ union {
+ struct {
+ // CPUID 0 String register order
+ uint32_t ebx;
+ uint32_t edx;
+ uint32_t ecx;
+ };
+ char vendor[12];
+ } cpuid0;
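+ // ("GenuineIntel" is returned as EBX="Genu", EDX="ineI", ECX="ntel",
+ // hence the ebx/edx/ecx field order above.)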
+
+ asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0));
+ if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel");
+
+ if (strcmp(cpu, "Intel") == 0) {
+ union {
+ struct {
+ // CPUID 1 EAX layout (Intel SDM)
+ unsigned steppingId:4;
+ unsigned model:4;
+ unsigned familyId:4;
+ unsigned processorType:2;
+ unsigned resv0:2;
+ unsigned extModelId:4;
+ unsigned extFamilyId:8;
+ unsigned resv1:4;
+ };
+ uint32_t val;
+ } cpuid1;
+ asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1));
+ // The display model is (extended model << 4) + model; bits 27:20 hold the
+ // extended family, so comparing them against 0x55 would never match.
+ int modelId = (cpuid1.extModelId << 4) + cpuid1.model;
+ if (cpuid1.familyId == 6 && modelId >= 0x55) { // Skylake-SP or later
+ sprintf(cpu, "Intel/Skylake (or later)");
+ interCpuWidth = SKL_QPI_WIDTH;
+ } else {
+ interCpuWidth = QPI_WIDTH;
+ }
+ }
+#endif
+ INFO(NCCL_GRAPH, "%s CPU (PCI %d, InterCpu %d)", cpu, cpuPciWidth, interCpuWidth);
+ return ncclSuccess;
+}
+
+static ncclResult_t ncclTopoGetInterCpuWidth(int* width) {
+ NCCLCHECK(getCpuWidths());
+ *width = interCpuWidth;
+ return ncclSuccess;
+}
+static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) {
+ NCCLCHECK(getCpuWidths());
+ *width = cpuPciWidth;
+ return ncclSuccess;
+}
+static ncclResult_t ncclTopoGetPciWidth(int* width) {
+ *width = PCI_WIDTH;
+ return ncclSuccess;
+}
+static ncclResult_t ncclTopoGetNetWidth(int* width) {
+ *width = NET_WIDTH;
+ return ncclSuccess;
+}
+
+enum ncclNvLinkDeviceType {
+ ncclNvLinkDeviceUnknown,
+ ncclNvLinkDeviceGpu,
+ ncclNvLinkDeviceSwitch,
+ ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
+};
+
+static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
+ char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class";
+ memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
+ char* rPath = realpath(classPath, NULL);
+ if (rPath == NULL) {
+ TRACE(NCCL_INIT, "Could not get real path of %s : %s", classPath, strerror(errno));
+ return ncclSystemError;
+ }
+ int fd = open(rPath, O_RDONLY);
+ if (fd == -1) {
+ // Could not open the device class file. We might be in a VM and unable
+ // to see the whole machine; handle this silently and don't print an
+ // INFO error.
+ TRACE(NCCL_INIT, "Open of %s failed : %s", rPath, strerror(errno));
+ free(rPath);
+ return ncclSystemError;
+ }
+ free(rPath);
+ char pciClass[9];
+ strncpy(pciClass, "0x000000", 9);
+ int len;
+ SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
+ SYSCHECK(close(fd), "close");
+ if (strcmp(pciClass, "0x068000") == 0) {
+ // PCI device is of type "Bridge / Other Bridge Device" (NVSwitch)
+ *type = ncclNvLinkDeviceSwitch;
+ } else if (strcmp(pciClass, "0x068001") == 0) {
+ // PCI device is of type "Bridge: IBM Device 04ea"
+ *type = ncclNvLinkDeviceBridge;
+ } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
+ || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
+ *type = ncclNvLinkDeviceGpu;
+ } else {
+ *type = ncclNvLinkDeviceUnknown;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) {
+ struct ncclTopoNode* cpuNode = NULL;
+ for (int c=0; c<system->nodes[CPU].count; c++) {
+ if (system->nodes[CPU].nodes[c].id == numaId) cpuNode = system->nodes[CPU].nodes+c;
+ }
+ if (cpuNode == NULL) { // Create CPU
+ NCCLCHECK(ncclTopoCreateNode(system, &cpuNode, CPU, numaId));
+ }
+ NCCLCHECK(ncclTopoConnectNodes(node, cpuNode, linkType, linkWidth));
+ NCCLCHECK(ncclTopoConnectNodes(cpuNode, node, linkType, linkWidth));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) {
+ struct ncclTopoNode* nvsNode = NULL;
+
+ int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH;
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ int cudaMajor, cudaMinor;
+ NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor));
+ int maxNvLinks, width;
+ if (cudaMajor < 6) {
+ maxNvLinks = 0;
+ width = 0;
+ } else if (cudaMajor == 6) {
+ maxNvLinks = 4;
+ width = PASCAL_NVLINK_WIDTH;
+ } else {
+ maxNvLinks = 6;
+ width = VOLTA_NVLINK_WIDTH;
+ }
+
+ int nvlinks = 0;
+ for (int l=0; l<maxNvLinks; ++l) {
+ // Check whether we can use this NVLink for P2P
+ unsigned canP2P;
+ if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDevs[g], l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
+
+ // Make sure the NVLink is up. The previous call should have trained the link.
+ nvmlEnableState_t isActive;
+ if ((wrapNvmlDeviceGetNvLinkState(nvmlDevs[g], l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+
+ // Try to figure out what's on the other side of the NVLink
+ nvmlPciInfo_t remoteProc;
+ if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevs[g], l, &remoteProc) != ncclSuccess) continue;
+
+ // Make a lower-case copy of the bus ID for calling ncclDeviceType,
+ // since the PCI system path is lower case.
+ char* p = remoteProc.busId;
+ char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+ lowerId[c] = tolower(p[c]);
+ if (p[c] == 0) break;
+ }
+
+ enum ncclNvLinkDeviceType type;
+ NCCLCHECK(ncclDeviceType(lowerId, &type));
+ if (type == ncclNvLinkDeviceGpu) {
+ int64_t remoteId;
+ NCCLCHECK(busIdToInt64(lowerId, &remoteId));
+ int peer;
+ NCCLCHECK(idToIndex(system, remoteId, &peer));
+ if (peer != -1) {
+ NCCLCHECK(ncclTopoConnectNodes(gpu, system->nodes[GPU].nodes+peer, LINK_NVL, width));
+ nvlinks++;
+ }
+ } else if (type == ncclNvLinkDeviceBridge) {
+ // NVLink between GPU and CPU (IBM POWER)
+ // Since the remote bridge does not have a valid numa_node, assume we
+ // are connected to the closest CPU.
+ char* path;
+ NCCLCHECK(getPath(gpu->id, &path));
+ int numaId = getNumaId(path);
+ free(path);
+ NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width));
+ nvlinks++;
+ } else { // NVSwitch
+ if (type == ncclNvLinkDeviceUnknown) {
+ // The NVLink is up but we couldn't find the PCI device on the other
+ // side; assume we're in a VM and it's an NVSwitch that isn't exposed
+ // to us.
+ if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId);
+ }
+ if (nvsNode == NULL) { // Create nvswitch
+ NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0));
+ }
+ NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH));
+ NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH));
+ nvlinks++;
+ }
+ }
+ minNvlinks = std::min(minNvlinks, nvlinks);
+ minWidth = std::min(minWidth, width);
+ }
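+ // (Illustration: if every GPU has at least 4 NVLinks of width W, the
+ // system gets maxSpeed 4*W and maxWidth W; with no common NVLink
+ // connectivity, both fall back to the PCI width.)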
+ int pciWidth;
+ NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
+ system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth;
+ system->maxWidth = minNvlinks ? minWidth : pciWidth;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) {
+ struct ncclTopoNode* lastNode = endNode;
+ int pciWidth;
+ NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
+ // Find intermediate PCI switches
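+ // (Illustration, assuming a sysfs path like
+ // /sys/devices/pci0000:00/0000:00:02.0/0000:02:00.0/0000:03:00.0: the
+ // first loop stops at the 4th '/', just past the PCI root complex; a PCI
+ // switch appears as an upstream/downstream bridge pair, so one node is
+ // created for every second path component while walking back up.)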
+ int slashCount = 0;
+ int offsetRC = 0;
+ while (offsetRC < strlen(path)) {
+ if (path[offsetRC] == '/') slashCount++;
+ if (slashCount == 4) break;
+ offsetRC++;
+ }
+ int offset = strlen(path);
+ slashCount = 0;
+ while (--offset > offsetRC) {
+ if (path[offset] == '/') {
+ slashCount++;
+ // Find if already existing
+ if ((slashCount%2) == 0) {
+ int64_t pciId;
+ NCCLCHECK(pciPathToInt64(path, offset, offsetRC, &pciId));
+ for (int p=0; p<system->nodes[PCI].count; p++) {
+ if (system->nodes[PCI].nodes[p].id == pciId) {
+ // Found our PCI switch. Attach and stop since the rest should already
+ // be connected
+ NCCLCHECK(ncclTopoConnectNodes(system->nodes[PCI].nodes+p, lastNode, LINK_PCI, pciWidth));
+ NCCLCHECK(ncclTopoConnectNodes(lastNode, system->nodes[PCI].nodes+p, LINK_PCI, pciWidth));
+ return ncclSuccess;
+ }
+ }
+ struct ncclTopoNode* pciNode;
+ NCCLCHECK(ncclTopoCreateNode(system, &pciNode, PCI, pciId));
+ NCCLCHECK(ncclTopoConnectNodes(pciNode, lastNode, LINK_PCI, pciWidth));
+ NCCLCHECK(ncclTopoConnectNodes(lastNode, pciNode, LINK_PCI, pciWidth));
+ lastNode = pciNode;
+ }
+ }
+ }
+ // Then attach to a CPU node
+ int numaId = getNumaId(path);
+ int width;
+ NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
+ NCCLCHECK(ncclTopoConnectCpu(system, numaId, lastNode, LINK_PCI, width));
+ return ncclSuccess;
+}
+
+// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports.
+#include <glob.h>
+#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid"
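+// (Example, hypothetical values: a sys_image_guid file containing
+// "0002:c903:00f1:3efa" yields guid 0x0002c90300f13efa; both ports of a
+// dual-port NIC report the same value.)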
+uint64_t getIbGuid(char* path) {
+ uint64_t guid = 0ULL;
+ char guidPath[PATH_MAX];
+ snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path);
+ // PATH has a wildcard in it so use glob()
+ glob_t globbuf;
+ glob(guidPath, 0, NULL, &globbuf);
+ if (globbuf.gl_pathc > 0)
+ strncpy(guidPath, globbuf.gl_pathv[0], PATH_MAX);
+ globfree(&globbuf);
+ guidPath[PATH_MAX-1] = '\0';
+ FILE *file = fopen(guidPath, "r");
+ if (file != NULL) {
+ uint64_t a, b, c, d;
+ if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) == 4) {
+ guid = (a << 48) + (b << 32) + (c<<16) + d;
+ TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid);
+ }
+ fclose(file);
+ }
+ return guid;
+}
+
+struct netInfo {
+ char* path;
+ int64_t nic;
+ uint64_t asic;
+ int port;
+ int net;
+};
+
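+// Worked example (hypothetical values): two ports of a dual-port NIC whose
+// PCI paths differ only in the last (subdevice) digit collapse to the same
+// nic id once that digit is zeroed, becoming ports 0 and 1 of one NIC, and
+// devices reporting the same IB GUID share the same asic and, per matching
+// port, the same net.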
+ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) {
+ for (int n=0; n<ndev; n++) {
+ struct netInfo* info = netInfos+n;
+ uint64_t ibGuid;
+ info->nic = n;
+ info->asic = n;
+ info->port = 0;
+ info->net = n;
+ if (info->path && (ibGuid = getIbGuid(info->path)) != 0) {
+ info->asic = ibGuid;
+
+ // Ignore PCI subdevice when computing the ID to merge multi-port cards
+ // and make them use the same PCI link.
+ char* path = strdup(info->path);
+ path[strlen(path)-1]='0';
+ NCCLCHECK(pciPathToInt64(path, strlen(path), 0, &info->nic));
+ free(path);
+
+ // Same PCI path -> different ports of the same NIC
+ for (int i=0; i<n; i++) if (netInfos[i].nic == info->nic) info->port++;
+
+ // Same GUID -> same network links as the other NIC
+ for (int i=0; i<n; i++) if (netInfos[i].asic == info->asic && netInfos[i].port == info->port) info->net = netInfos[i].net;
+ }
+ INFO(NCCL_GRAPH, "%s -> %lx/%lx/%d/%d", info->path, info->nic, info->asic, info->port, info->net);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ char* path;
+ NCCLCHECK(getPath(gpu->id, &path));
+ NCCLCHECK(ncclTopoCreatePciPath(system, gpu, path));
+ free(path);
+ }
+
+ // Connect the NICs
+ int netDevCount;
+ NCCLCHECK(ncclNetDevices(&netDevCount));
+ int netWidth;
+ NCCLCHECK(ncclTopoGetNetWidth(&netWidth));
+
+ struct netInfo* netInfos;
+ NCCLCHECK(ncclCalloc(&netInfos, netDevCount));
+
+ for (int n=0; n<netDevCount; n++) {
+ ncclResult_t res = ncclNetPciPath(n, &netInfos[n].path);
+ if (res != ncclSuccess) netInfos[n].path = NULL;
+ }
+
+ NCCLCHECK(ncclTopoComputeNetInfo(netInfos, netDevCount));
+
+ for (int n=0; n<netDevCount; n++) {
+ struct netInfo* info = netInfos+n;
+ // Create NIC and attach it to the PCI tree
+ struct ncclTopoNode* nicNode = NULL;
+ for (int i=0; i<system->nodes[NIC].count; i++) {
+ if (system->nodes[NIC].nodes[i].id == info->nic) {
+ nicNode = system->nodes[NIC].nodes+i;
+ break;
+ }
+ }
+ if (!nicNode) {
+ NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic));
+ if (info->path) {
+ // Create the PCI path
+ NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path));
+ } else {
+ // This is probably a virtual NIC. Just attach it directly to CPU 0
+ int width;
+ NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
+ NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width));
+ }
+ }
+ free(info->path);
+
+ // Create the network side
+ struct ncclTopoNode* netNode;
+ NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, n));
+
+ // Use rank to store the net information
+ netNode->rank = info->net;
+
+ NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth));
+ NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth));
+ }
+ free(netInfos);
+
+ // And connect all CPU nodes together
+ for (int n=0; n<system->nodes[CPU].count; n++) {
+ for (int p=0; p<system->nodes[CPU].count; p++) {
+ if (n == p) continue;
+ int width;
+ NCCLCHECK(ncclTopoGetInterCpuWidth(&width));
+ NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width));
+ }
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
+ if (node->type == GPU) {
+ sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank);
+ } else {
+ sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
+ }
+ INFO(NCCL_GRAPH, "%s", line);
+ for (int i=0; i<offset; i++) line[i] = ' ';
+
+ for (int l=0; l<node->nlinks; l++) {
+ struct ncclTopoLink* link = node->links+l;
+ if (link->type == LINK_LOC) continue;
+ if (link->remNode != prevNode) {
+ sprintf(line+offset, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width);
+ int nextOffset = strlen(line);
+ if (link->type == LINK_PCI) {
+ NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
+ } else {
+ if (link->remNode->type == NET) {
+ sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->rank);
+ } else {
+ sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
+ }
+ INFO(NCCL_GRAPH, "%s", line);
+ }
+ }
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
+ INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed);
+ char line[1024];
+ for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
+ INFO(NCCL_GRAPH, "==========================================");
+ NCCLCHECK(ncclTopoPrintPaths(s));
+ return ncclSuccess;
+}
+
+static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) {
+ // Shift all links to have upLink as last link
+ if (upNode) {
+ int l=0;
+ while (node->links[l].remNode != upNode) l++;
+ struct ncclTopoLink upLink;
+ memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink));
+ while (node->links[l+1].remNode) {
+ memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink));
+ l++;
+ }
+ memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink));
+ }
+
+ // Recursively sort the PCI tree
+ for (int l=0; l<node->nlinks; l++) {
+ struct ncclTopoLink* link = node->links+l;
+ if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node));
+ }
+ return ncclSuccess;
+}
+
+// We want the graph to be organized to ease/accelerate traversal:
+// 1. NVLinks (already the case)
+// 2. PCI down
+// 3. PCI up
+// 4. QPI (already the case)
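+// (e.g., after sorting, a PCI switch's link list reads: PCI links to its
+// children first, then the PCI link back to its parent, which ncclTopoSort
+// moves to the last slot.)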
+ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
+ for (int n=0; n<system->nodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
+ struct ncclTopoSystem* s;
+ NCCLCHECK(ncclCalloc(&s, 1));
+ nvmlDevice_t* nvmlDevs;
+ int g = 0;
+ NCCLCHECK(ncclCalloc(&nvmlDevs, comm->nRanks));
+ for (int r=0; r<comm->nRanks; r++) {
+ if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
+ // Consider the GPU as outside of our node if we can't see it through NVML.
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
+ if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+g) != ncclSuccess) continue;
+ g++;
+ struct ncclTopoNode* gpuNode;
+ NCCLCHECK(ncclTopoCreateNode(s, &gpuNode, GPU, comm->peerInfo[r].busId));
+ gpuNode->rank = r;
+ }
+ }
+
+ NCCLCHECK(ncclTopoConnectNVLink(nvmlDevs, s));
+ NCCLCHECK(ncclTopoConnectPCI(s));
+
+ free(nvmlDevs);
+ NCCLCHECK(ncclTopoSortSystem(s));
+ *system = s;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink) {
+ int g1, g2;
+ NCCLCHECK(idToIndex(system, busId1, &g1));
+ NCCLCHECK(idToIndex(system, busId2, &g2));
+ *nvlink = g1 != -1 && g2 != -1 && system->nodes[GPU].nodes[g1].paths[GPU][g2].type == LINK_NVL;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) {
+ int g;
+ NCCLCHECK(idToIndex(system, busId, &g));
+ for (int i=0; i<system->nodes[GPU].count; i++) {
+ if (i == g) continue;
+ if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) {
+ *nvlink = 1;
+ return ncclSuccess;
+ }
+ }
+ *nvlink = 0;
+ return ncclSuccess;
+}
+
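+// (Illustration: two GPUs under the same PCI switch (short path, no CPU on
+// it) map to PATH_PIX; a longer PCI-only path maps to PATH_PXB; crossing
+// one CPU gives PATH_PHB and crossing two CPUs gives PATH_SYS.)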
+static int pathDistance(struct ncclTopoLinkList* links) {
+ int distance = PATH_PIX;
+ if (links->count > 2) distance = PATH_PXB;
+ for (int l=0; l<links->count; l++) {
+ // PHB if we go through 1 CPU, SYS if we go through 2 CPUs
+ if (links->list[l]->remNode->type == CPU) distance = (distance == PATH_PHB) ? PATH_SYS : PATH_PHB;
+ }
+ return distance;
+}
+
+ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance) {
+ int g1, g2;
+ NCCLCHECK(idToIndex(system, busId1, &g1));
+ NCCLCHECK(idToIndex(system, busId2, &g2));
+ *distance = pathDistance(system->nodes[GPU].nodes[g1].paths[GPU]+g2);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) {
+ int g;
+ NCCLCHECK(idToIndex(system, busId, &g));
+ *distance = pathDistance(system->nodes[GPU].nodes[g].paths[NET]+netDev);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count) {
+ *count = system->nodes[CPU].count;
+ return ncclSuccess;
+}