diff options
Diffstat (limited to 'src/graph/topo.cc')
-rw-r--r-- | src/graph/topo.cc | 641 |
1 files changed, 641 insertions, 0 deletions
diff --git a/src/graph/topo.cc b/src/graph/topo.cc new file mode 100644 index 0000000..a1b3209 --- /dev/null +++ b/src/graph/topo.cc @@ -0,0 +1,641 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "graph.h" +#include "topo.h" +#include "comm.h" +#include "nvmlwrap.h" +#include "net.h" +#include <sys/stat.h> +#include <fcntl.h> + +#define BUSID_SIZE (sizeof("0000:00:00.0")) +#define BUSID_REDUCED_SIZE (sizeof("0000:00")) + +const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" }; + +const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; +const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" }; + +/******************************************************************/ +/******************* Graph Creation Functions *********************/ +/******************************************************************/ +static int getNumaId(char *path) { + char npath[PATH_MAX]; + snprintf(npath, PATH_MAX, "%s/numa_node", path); + npath[PATH_MAX-1] = '\0'; + + int numaId = -1; + FILE *file = fopen(npath, "r"); + if (file == NULL) return -1; + if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; } + fclose(file); + + return numaId; +} + +static ncclResult_t getPciPath(char* busId, char** path) { + for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]); + char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0"; + memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1); + memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1); + *path = realpath(busPath, NULL); + if (*path == NULL) { + WARN("Could not find real path of %s", busPath); + return ncclSystemError; + } + return ncclSuccess; +} + +// Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000. +ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) { + char* str = path+offset; + // Remove trailing "/" + if (*str == '/') str--; + // Find next / + while (*str != '/') str--; + str++; + NCCLCHECK(busIdToInt64(str, id)); + return ncclSuccess; +} + +static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) { + *index = -1; + for (int i=0; i<system->nodes[GPU].count; i++) { + if (system->nodes[GPU].nodes[i].id == id) { + *index = i; + } + } + return ncclSuccess; +} + + +static ncclResult_t getPath(int64_t id, char** path) { + char busId[] = "0000:00:00.0"; + NCCLCHECK(int64ToBusId(id, busId)); + NCCLCHECK(getPciPath(busId, path)); + return ncclSuccess; +} + +ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) { + char busId[BUSID_SIZE]; + CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev)); + NCCLCHECK(getPciPath(busId, path)); + return ncclSuccess; +} + + +int interCpuWidth = 0; +int cpuPciWidth = 0; + +static ncclResult_t getCpuWidths() { + // Check if already detected + if (interCpuWidth + cpuPciWidth) return ncclSuccess; + + // Defaults + char cpu[256]; + sprintf(cpu, "Generic"); + cpuPciWidth = interCpuWidth = PCI_WIDTH; + +#ifdef __PPC__ + sprintf(cpu, "ppc64"); + interCpuWidth = P9_WIDTH; +#endif +#ifdef __x86_64__ + sprintf(cpu, "x86_64"); + union { + struct { + // CPUID 0 String register order + uint32_t ebx; + uint32_t edx; + uint32_t ecx; + }; + char vendor[12]; + } cpuid0; + + asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0)); + if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel"); + + if (strcmp(cpu, "Intel") == 0) { + union { + struct { + int steppingId:4; + int model:4; + int familyId:4; + int processorType:2; + int resv0:2; + int extModelId:4; + int modelId:8; + int resv1:4; + }; + uint32_t val; + } cpuid1; + asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1)); + if (cpuid1.familyId == 6 && cpuid1.modelId >= 0x55) { // Skylake + sprintf(cpu, "Intel/Skylake (or later)"); + interCpuWidth = SKL_QPI_WIDTH; + } else { + interCpuWidth = QPI_WIDTH; + } + } +#endif + INFO(NCCL_GRAPH, "%s CPU (PCI %d, InterCpu %d)", cpu, cpuPciWidth, interCpuWidth); + return ncclSuccess; +} + +static ncclResult_t ncclTopoGetInterCpuWidth(int* width) { + NCCLCHECK(getCpuWidths()); + *width = interCpuWidth; + return ncclSuccess; +} +static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) { + NCCLCHECK(getCpuWidths()); + *width = cpuPciWidth; + return ncclSuccess; +} +static ncclResult_t ncclTopoGetPciWidth(int* width) { + *width = PCI_WIDTH; + return ncclSuccess; +} +static ncclResult_t ncclTopoGetNetWidth(int* width) { + *width = NET_WIDTH; + return ncclSuccess; +} + +enum ncclNvLinkDeviceType { + ncclNvLinkDeviceUnknown, + ncclNvLinkDeviceGpu, + ncclNvLinkDeviceSwitch, + ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea) +}; + +static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) { + char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class"; + memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1); + char* rPath = realpath(classPath, NULL); + int fd; + if ((fd = open(rPath, O_RDONLY)) == -1) { + // Could not find device. It might be because we're in a VM and + // we don't see the whole machine. This is handled silently so + // we don't want to print an INFO error. + TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno)); + return ncclSystemError; + } + free(rPath); + char pciClass[9]; + strncpy(pciClass, "0x000000", 9); + int len; + SYSCHECKVAL(read(fd, pciClass, 8), "read", len); + SYSCHECK(close(fd), "close"); + if (strcmp(pciClass, "0x068000") == 0) { + // PCI device is of type "Bridge / Other Bridge Device" (NVswitch) + *type = ncclNvLinkDeviceSwitch; + } else if (strcmp(pciClass, "0x068001") == 0) { + // PCI device is of type "Bridge: IBM Device 04ea" + *type = ncclNvLinkDeviceBridge; + } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla) + || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce) + *type = ncclNvLinkDeviceGpu; + } else { + *type = ncclNvLinkDeviceUnknown; + } + return ncclSuccess; +} + +ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) { + struct ncclTopoNode* cpuNode = NULL; + for (int c=0; c<system->nodes[CPU].count; c++) { + if (system->nodes[CPU].nodes[c].id == numaId) cpuNode = system->nodes[CPU].nodes+c; + } + if (cpuNode == NULL) { // Create CPU + NCCLCHECK(ncclTopoCreateNode(system, &cpuNode, CPU, numaId)); + } + NCCLCHECK(ncclTopoConnectNodes(node, cpuNode, linkType, linkWidth)); + NCCLCHECK(ncclTopoConnectNodes(cpuNode, node, linkType, linkWidth)); + return ncclSuccess; +} + +ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) { + struct ncclTopoNode* nvsNode = NULL; + + int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH; + for (int g=0; g<system->nodes[GPU].count; g++) { + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + int cudaMajor, cudaMinor; + NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor)); + int maxNvLinks, width; + if (cudaMajor < 6) { + maxNvLinks = 0; + width = 0; + } else if (cudaMajor == 6) { + maxNvLinks = 4; + width = PASCAL_NVLINK_WIDTH; + } else { + maxNvLinks = 6; + width = VOLTA_NVLINK_WIDTH; + } + + int nvlinks = 0; + for (int l=0; l<maxNvLinks; ++l) { + // Check whether we can use this NVLink for P2P + unsigned canP2P; + if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDevs[g], l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue; + + // Make sure the Nvlink is up. The previous call should have trained the link. + nvmlEnableState_t isActive; + if ((wrapNvmlDeviceGetNvLinkState(nvmlDevs[g], l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue; + + // Try to figure out what's on the other side of the NVLink + nvmlPciInfo_t remoteProc; + if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevs[g], l, &remoteProc) != ncclSuccess) continue; + + // Make a lower case copy of the bus ID for calling ncclDeviceType + // PCI system path is in lower case + char* p = remoteProc.busId; + char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) { + lowerId[c] = tolower(p[c]); + if (p[c] == 0) break; + } + + enum ncclNvLinkDeviceType type; + NCCLCHECK(ncclDeviceType(lowerId, &type)); + if (type == ncclNvLinkDeviceGpu) { + int64_t remoteId; + NCCLCHECK(busIdToInt64(lowerId, &remoteId)); + int peer; + NCCLCHECK(idToIndex(system, remoteId, &peer)); + if (peer != -1) { + NCCLCHECK(ncclTopoConnectNodes(gpu, system->nodes[GPU].nodes+peer, LINK_NVL, width)); + nvlinks++; + } + } else if (type == ncclNvLinkDeviceBridge) { + // Nvlink between GPU and CPU (PPC) + // Since the remote bridge does not have a valid numa_node, assume we + // are connected to the closest CPU. + char* path; + NCCLCHECK(getPath(gpu->id, &path)); + int numaId = getNumaId(path); + free(path); + NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width)); + nvlinks++; + } else { // Nvswitch + if (type == ncclNvLinkDeviceUnknown) { + // The NVLink is up but we couldn't find the PCI device on the other + // side. Assume it's an NVswitch outside a VM. + if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId); + } + if (nvsNode == NULL) { // Create nvswitch + NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0)); + } + NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH)); + NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH)); + nvlinks++; + } + } + minNvlinks = std::min(minNvlinks, nvlinks); + minWidth = std::min(minWidth, width); + } + int pciWidth; + NCCLCHECK(ncclTopoGetPciWidth(&pciWidth)); + system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth; + system->maxWidth = minNvlinks ? minWidth : pciWidth; + return ncclSuccess; +} + +ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) { + struct ncclTopoNode* lastNode = endNode; + int pciWidth; + NCCLCHECK(ncclTopoGetPciWidth(&pciWidth)); + // Find intermediate PCI switches + int slashCount = 0; + int offsetRC = 0; + while (offsetRC < strlen(path)) { + if (path[offsetRC] == '/') slashCount++; + if (slashCount == 4) break; + offsetRC++; + } + int offset = strlen(path); + slashCount = 0; + while (--offset > offsetRC) { + if (path[offset] == '/') { + slashCount++; + // Find if already existing + if ((slashCount%2) == 0) { + int64_t pciId; + NCCLCHECK(pciPathToInt64(path, offset, offsetRC, &pciId)); + for (int p=0; p<system->nodes[PCI].count; p++) { + if (system->nodes[PCI].nodes[p].id == pciId) { + // Found our PCI switch. Attach and stop since the rest should already + // be connected + NCCLCHECK(ncclTopoConnectNodes(system->nodes[PCI].nodes+p, lastNode, LINK_PCI, pciWidth)); + NCCLCHECK(ncclTopoConnectNodes(lastNode, system->nodes[PCI].nodes+p, LINK_PCI, pciWidth)); + return ncclSuccess; + } + } + struct ncclTopoNode* pciNode; + NCCLCHECK(ncclTopoCreateNode(system, &pciNode, PCI, pciId)); + NCCLCHECK(ncclTopoConnectNodes(pciNode, lastNode, LINK_PCI, pciWidth)); + NCCLCHECK(ncclTopoConnectNodes(lastNode, pciNode, LINK_PCI, pciWidth)); + lastNode = pciNode; + } + } + } + // Then attach to a CPU node + int numaId = getNumaId(path); + int width; + NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width)); + NCCLCHECK(ncclTopoConnectCpu(system, numaId, lastNode, LINK_PCI, width)); + return ncclSuccess; +} + +// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports. +#include <glob.h> +#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid" +uint64_t getIbGuid(char* path) { + uint64_t guid = 0ULL; + char guidPath[PATH_MAX]; + snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path); + // PATH has a wildcard in it so use glob() + glob_t globbuf; + glob(guidPath, 0, NULL, &globbuf); + if (globbuf.gl_pathc > 0) + strncpy(guidPath, globbuf.gl_pathv[0], PATH_MAX); + globfree(&globbuf); + guidPath[PATH_MAX-1] = '\0'; + FILE *file = fopen(guidPath, "r"); + if (file != NULL) { + uint64_t a, b, c, d; + if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) != EOF) { + guid = (a << 48) + (b << 32) + (c<<16) + d; + TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid); + } + fclose(file); + } + return guid; +} + +struct netInfo { + char* path; + int64_t nic; + uint64_t asic; + int port; + int net; +}; + +ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) { + for (int n=0; n<ndev; n++) { + struct netInfo* info = netInfos+n; + uint64_t ibGuid; + info->nic = n; + info->asic = n; + info->port = 0; + info->net = n; + if (info->path && (ibGuid = getIbGuid(info->path)) != 0) { + info->asic = ibGuid; + + // Ignore PCI subdevice when computing the ID to merge multi-port cards + // and make them use the same PCI link. + char* path = strdup(info->path); + path[strlen(path)-1]='0'; + NCCLCHECK(pciPathToInt64(path, strlen(path), 0, &info->nic)); + free(path); + + // Same PCI path -> different ports of the same NIC + for (int i=0; i<n; i++) if (netInfos[i].nic == info->nic) info->port++; + + // Same GUID -> same network links as the other NIC + for (int i=0; i<n; i++) if (netInfos[i].asic == info->asic && netInfos[i].port == info->port) info->net = netInfos[i].net; + } + INFO(NCCL_GRAPH, "%s -> %x/%lx/%d/%d", info->path, info->nic, info->asic, info->port, info->net); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) { + for (int g=0; g<system->nodes[GPU].count; g++) { + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + char* path; + NCCLCHECK(getPath(gpu->id, &path)); + NCCLCHECK(ncclTopoCreatePciPath(system, gpu, path)); + free(path); + } + + // Connect the NICs + int netDevCount; + NCCLCHECK(ncclNetDevices(&netDevCount)); + int netWidth; + NCCLCHECK(ncclTopoGetNetWidth(&netWidth)); + + struct netInfo* netInfos; + NCCLCHECK(ncclCalloc(&netInfos, netDevCount)); + + for (int n=0; n<netDevCount; n++) { + ncclResult_t res = ncclNetPciPath(n, &netInfos[n].path); + if (res != ncclSuccess) netInfos[n].path = NULL; + } + + NCCLCHECK(ncclTopoComputeNetInfo(netInfos, netDevCount)); + + for (int n=0; n<netDevCount; n++) { + struct netInfo* info = netInfos+n; + // Create NIC and attach it to the PCI tree + struct ncclTopoNode* nicNode = NULL; + for (int i=0; i<system->nodes[NIC].count; i++) { + if (system->nodes[NIC].nodes[i].id == info->nic) { + nicNode = system->nodes[NIC].nodes+i; + break; + } + } + if (!nicNode) { + NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic)); + if (info->path) { + // Create the PCI path + NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path)); + } else { + // This is probably a virtual NIC. Just attach it directly to CPU 0 + int width; + NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width)); + NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width)); + } + } + free(info->path); + + // Create the network side + struct ncclTopoNode* netNode; + NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, n)); + + // Use rank to store the net information + netNode->rank = info->net; + + NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth)); + NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth)); + } + free(netInfos); + + // And connect all CPU nodes together + for (int n=0; n<system->nodes[CPU].count; n++) { + for (int p=0; p<system->nodes[CPU].count; p++) { + if (n == p) continue; + int width; + NCCLCHECK(ncclTopoGetInterCpuWidth(&width)); + NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width)); + } + } + return ncclSuccess; +} + +static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) { + if (node->type == GPU) { + sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank); + } else { + sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id); + } + INFO(NCCL_GRAPH, "%s", line); + for (int i=0; i<offset; i++) line[i] = ' '; + + for (int l=0; l<node->nlinks; l++) { + struct ncclTopoLink* link = node->links+l; + if (link->type == LINK_LOC) continue; + if (link->remNode != prevNode) { + sprintf(line+offset, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width); + int nextOffset = strlen(line); + if (link->type == LINK_PCI) { + NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset)); + } else { + if (link->remNode->type == NET) { + sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->rank); + } else { + sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id); + } + INFO(NCCL_GRAPH, "%s", line); + } + } + } + return ncclSuccess; +} + +ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) { + INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed); + char line[1024]; + for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0)); + INFO(NCCL_GRAPH, "=========================================="); + NCCLCHECK(ncclTopoPrintPaths(s)); + return ncclSuccess; +} + +static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) { + // Shift all links to have upLink as last link + if (upNode) { + int l=0; + while (node->links[l].remNode != upNode) l++; + struct ncclTopoLink upLink; + memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink)); + while (node->links[l+1].remNode) { + memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink)); + l++; + } + memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink)); + } + + // Recursively sort the PCI tree + for (int l=0; l<node->nlinks; l++) { + struct ncclTopoLink* link = node->links+l; + if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node)); + } + return ncclSuccess; +} + +// We want the graph to be organized to ease/accelerate traversal : +// 1. NVLinks (already the case) +// 2. PCI down +// 3. PCI up +// 4. QPI (already the case) +ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) { + for (int n=0; n<system->nodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL)); + return ncclSuccess; +} + +ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { + struct ncclTopoSystem* s; + NCCLCHECK(ncclCalloc(&s, 1)); + nvmlDevice_t* nvmlDevs; + int g = 0; + NCCLCHECK(ncclCalloc(&nvmlDevs, comm->nRanks)); + for (int r=0; r<comm->nRanks; r++) { + if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) { + // Consider the GPU as outside of our node if we can't see it through NVML. + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId)); + if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+g) != ncclSuccess) continue; + g++; + struct ncclTopoNode* gpuNode; + NCCLCHECK(ncclTopoCreateNode(s, &gpuNode, GPU, comm->peerInfo[r].busId)); + gpuNode->rank = r; + } + } + + NCCLCHECK(ncclTopoConnectNVLink(nvmlDevs, s)); + NCCLCHECK(ncclTopoConnectPCI(s)); + + free(nvmlDevs); + NCCLCHECK(ncclTopoSortSystem(s)); + *system = s; + return ncclSuccess; +} + +ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink) { + int g1, g2; + NCCLCHECK(idToIndex(system, busId1, &g1)); + NCCLCHECK(idToIndex(system, busId2, &g2)); + *nvlink = g1 != -1 && g2 != -1 && system->nodes[GPU].nodes[g1].paths[GPU][g2].type == LINK_NVL; + return ncclSuccess; +} + +ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) { + int g; + NCCLCHECK(idToIndex(system, busId, &g)); + for (int i=0; i<system->nodes[GPU].count; i++) { + if (i == g) continue; + if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) { + *nvlink = 1; + return ncclSuccess; + } + } + *nvlink = 0; + return ncclSuccess; +} + +static int pathDistance(struct ncclTopoLinkList* links) { + int distance = PATH_PIX; + if (links->count > 2) distance = PATH_PXB; + for (int l=0; l<links->count; l++) { + // PHB if we go through 1 CPU, SYS if we go through 2 CPUs + if (links->list[l]->remNode->type == CPU) distance = (distance == PATH_PHB) ? PATH_SYS : PATH_PHB; + } + return distance; +} + +ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance) { + int g1, g2; + NCCLCHECK(idToIndex(system, busId1, &g1)); + NCCLCHECK(idToIndex(system, busId2, &g2)); + *distance = pathDistance(system->nodes[GPU].nodes[g1].paths[GPU]+g2); + return ncclSuccess; +} + +ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) { + int g; + NCCLCHECK(idToIndex(system, busId, &g)); + *distance = pathDistance(system->nodes[GPU].nodes[g].paths[NET]+netDev); + return ncclSuccess; +} + +ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count) { + *count = system->nodes[CPU].count; + return ncclSuccess; +} |