Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'src/graph/topo.cc')
-rw-r--r--src/graph/topo.cc941
1 file changed, 474 insertions, 467 deletions
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index a1b3209..5cd8d4e 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,46 +10,22 @@
#include "comm.h"
#include "nvmlwrap.h"
#include "net.h"
+#include "coll_net.h"
#include <sys/stat.h>
#include <fcntl.h>
+#include "xml.h"
+#include "cpuset.h"
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
-const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
-
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
-const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" };
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "", "", "SYS", "NET" };
+const char* topoPathTypeStr[] = { "LOC", "NVL", "PIX", "PXB", "PHB", "SYS", "NET" };
/******************************************************************/
/******************* Graph Creation Functions *********************/
/******************************************************************/
-static int getNumaId(char *path) {
- char npath[PATH_MAX];
- snprintf(npath, PATH_MAX, "%s/numa_node", path);
- npath[PATH_MAX-1] = '\0';
-
- int numaId = -1;
- FILE *file = fopen(npath, "r");
- if (file == NULL) return -1;
- if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
- fclose(file);
-
- return numaId;
-}
-
-static ncclResult_t getPciPath(char* busId, char** path) {
- for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
- char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
- memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
- memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
- *path = realpath(busPath, NULL);
- if (*path == NULL) {
- WARN("Could not find real path of %s", busPath);
- return ncclSystemError;
- }
- return ncclSuccess;
-}
// Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
@@ -59,110 +35,43 @@ ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id)
// Find next /
while (*str != '/') str--;
str++;
- NCCLCHECK(busIdToInt64(str, id));
+ int64_t numid;
+ NCCLCHECK(busIdToInt64(str, &numid));
+ // Ignore subdevice because those should use the same PCI link so we want to merge nodes.
+ numid -= numid & 0xf;
+ *id = numid;
return ncclSuccess;
}
-static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) {
- *index = -1;
- for (int i=0; i<system->nodes[GPU].count; i++) {
- if (system->nodes[GPU].nodes[i].id == id) {
- *index = i;
- }
+static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) {
+ *cpu = NULL;
+ if (node->type == CPU) {
+ *cpu = node;
+ return ncclSuccess;
+ }
+ for (int l=0; l<node->nlinks; l++) {
+ if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
+ if (*cpu != NULL) return ncclSuccess;
}
return ncclSuccess;
}
-
-static ncclResult_t getPath(int64_t id, char** path) {
- char busId[] = "0000:00:00.0";
- NCCLCHECK(int64ToBusId(id, busId));
- NCCLCHECK(getPciPath(busId, path));
- return ncclSuccess;
-}
-
-ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) {
- char busId[BUSID_SIZE];
- CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
- NCCLCHECK(getPciPath(busId, path));
- return ncclSuccess;
-}
-
-
int interCpuWidth = 0;
int cpuPciWidth = 0;
-static ncclResult_t getCpuWidths() {
- // Check if already detected
- if (interCpuWidth + cpuPciWidth) return ncclSuccess;
-
- // Defaults
- char cpu[256];
- sprintf(cpu, "Generic");
- cpuPciWidth = interCpuWidth = PCI_WIDTH;
-
-#ifdef __PPC__
- sprintf(cpu, "ppc64");
- interCpuWidth = P9_WIDTH;
-#endif
-#ifdef __x86_64__
- sprintf(cpu, "x86_64");
- union {
- struct {
- // CPUID 0 String register order
- uint32_t ebx;
- uint32_t edx;
- uint32_t ecx;
- };
- char vendor[12];
- } cpuid0;
-
- asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0));
- if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel");
-
- if (strcmp(cpu, "Intel") == 0) {
- union {
- struct {
- int steppingId:4;
- int model:4;
- int familyId:4;
- int processorType:2;
- int resv0:2;
- int extModelId:4;
- int modelId:8;
- int resv1:4;
- };
- uint32_t val;
- } cpuid1;
- asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1));
- if (cpuid1.familyId == 6 && cpuid1.modelId >= 0x55) { // Skylake
- sprintf(cpu, "Intel/Skylake (or later)");
- interCpuWidth = SKL_QPI_WIDTH;
- } else {
- interCpuWidth = QPI_WIDTH;
- }
+static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* width) {
+ *width = LOC_WIDTH;
+ if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) {
+ *width = P9_WIDTH;
+ return ncclSuccess;
+ }
+ if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_ARM) {
+ *width = ARM_WIDTH;
+ return ncclSuccess;
+ }
+ if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+ *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH;
}
-#endif
- INFO(NCCL_GRAPH, "%s CPU (PCI %d, InterCpu %d)", cpu, cpuPciWidth, interCpuWidth);
- return ncclSuccess;
-}
-
-static ncclResult_t ncclTopoGetInterCpuWidth(int* width) {
- NCCLCHECK(getCpuWidths());
- *width = interCpuWidth;
- return ncclSuccess;
-}
-static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) {
- NCCLCHECK(getCpuWidths());
- *width = cpuPciWidth;
- return ncclSuccess;
-}
-static ncclResult_t ncclTopoGetPciWidth(int* width) {
- *width = PCI_WIDTH;
- return ncclSuccess;
-}
-static ncclResult_t ncclTopoGetNetWidth(int* width) {
- *width = NET_WIDTH;
return ncclSuccess;
}
@@ -173,317 +82,101 @@ enum ncclNvLinkDeviceType {
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
-static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
- char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class";
- memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
- char* rPath = realpath(classPath, NULL);
- int fd;
- if ((fd = open(rPath, O_RDONLY)) == -1) {
- // Could not find device. It might be because we're in a VM and
- // we don't see the whole machine. This is handled silently so
- // we don't want to print an INFO error.
- TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
- return ncclSystemError;
- }
- free(rPath);
- char pciClass[9];
- strncpy(pciClass, "0x000000", 9);
- int len;
- SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
- SYSCHECK(close(fd), "close");
- if (strcmp(pciClass, "0x068000") == 0) {
- // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
- *type = ncclNvLinkDeviceSwitch;
- } else if (strcmp(pciClass, "0x068001") == 0) {
- // PCI device is of type "Bridge: IBM Device 04ea"
- *type = ncclNvLinkDeviceBridge;
- } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
- || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
- *type = ncclNvLinkDeviceGpu;
- } else {
- *type = ncclNvLinkDeviceUnknown;
+ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
+ for (int i=0; i<system->nodes[type].count; i++) {
+ if (system->nodes[type].nodes[i].id == id) {
+ *node = system->nodes[type].nodes+i;
+ return ncclSuccess;
+ }
}
return ncclSuccess;
}
-ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) {
- struct ncclTopoNode* cpuNode = NULL;
- for (int c=0; c<system->nodes[CPU].count; c++) {
- if (system->nodes[CPU].nodes[c].id == numaId) cpuNode = system->nodes[CPU].nodes+c;
+ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
+ if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
+ WARN("Error : tried to create too many nodes of type %d\n", type);
+ return ncclInternalError;
}
- if (cpuNode == NULL) { // Create CPU
- NCCLCHECK(ncclTopoCreateNode(system, &cpuNode, CPU, numaId));
+ struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
+ system->nodes[type].count++;
+ n->type = type;
+ n->id = id;
+ if (type == GPU) {
+ // Create link to itself (used in some corner cases)
+ n->nlinks=1;
+ n->links[0].type = LINK_LOC;
+ n->links[0].remNode = n;
+ n->links[0].width = LOC_WIDTH;
+ n->gpu.dev = NCCL_TOPO_UNDEF;
+ n->gpu.rank = NCCL_TOPO_UNDEF;
+ n->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
+ } else if (type == CPU) {
+ n->cpu.arch = NCCL_TOPO_UNDEF;
+ n->cpu.vendor = NCCL_TOPO_UNDEF;
+ n->cpu.model = NCCL_TOPO_UNDEF;
+ } else if (type == NET) {
+ n->net.asic = 0ULL;
+ n->net.port = NCCL_TOPO_UNDEF;
+ n->net.width = 0.0;
}
- NCCLCHECK(ncclTopoConnectNodes(node, cpuNode, linkType, linkWidth));
- NCCLCHECK(ncclTopoConnectNodes(cpuNode, node, linkType, linkWidth));
+ *node = n;
return ncclSuccess;
}
-ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) {
- struct ncclTopoNode* nvsNode = NULL;
-
- int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH;
- for (int g=0; g<system->nodes[GPU].count; g++) {
- struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
- int cudaMajor, cudaMinor;
- NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor));
- int maxNvLinks, width;
- if (cudaMajor < 6) {
- maxNvLinks = 0;
- width = 0;
- } else if (cudaMajor == 6) {
- maxNvLinks = 4;
- width = PASCAL_NVLINK_WIDTH;
- } else {
- maxNvLinks = 6;
- width = VOLTA_NVLINK_WIDTH;
- }
-
- int nvlinks = 0;
- for (int l=0; l<maxNvLinks; ++l) {
- // Check whether we can use this NVLink for P2P
- unsigned canP2P;
- if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDevs[g], l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
-
- // Make sure the Nvlink is up. The previous call should have trained the link.
- nvmlEnableState_t isActive;
- if ((wrapNvmlDeviceGetNvLinkState(nvmlDevs[g], l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
-
- // Try to figure out what's on the other side of the NVLink
- nvmlPciInfo_t remoteProc;
- if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevs[g], l, &remoteProc) != ncclSuccess) continue;
-
- // Make a lower case copy of the bus ID for calling ncclDeviceType
- // PCI system path is in lower case
- char* p = remoteProc.busId;
- char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
- lowerId[c] = tolower(p[c]);
- if (p[c] == 0) break;
- }
-
- enum ncclNvLinkDeviceType type;
- NCCLCHECK(ncclDeviceType(lowerId, &type));
- if (type == ncclNvLinkDeviceGpu) {
- int64_t remoteId;
- NCCLCHECK(busIdToInt64(lowerId, &remoteId));
- int peer;
- NCCLCHECK(idToIndex(system, remoteId, &peer));
- if (peer != -1) {
- NCCLCHECK(ncclTopoConnectNodes(gpu, system->nodes[GPU].nodes+peer, LINK_NVL, width));
- nvlinks++;
- }
- } else if (type == ncclNvLinkDeviceBridge) {
- // Nvlink between GPU and CPU (PPC)
- // Since the remote bridge does not have a valid numa_node, assume we
- // are connected to the closest CPU.
- char* path;
- NCCLCHECK(getPath(gpu->id, &path));
- int numaId = getNumaId(path);
- free(path);
- NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width));
- nvlinks++;
- } else { // Nvswitch
- if (type == ncclNvLinkDeviceUnknown) {
- // The NVLink is up but we couldn't find the PCI device on the other
- // side. Assume it's an NVswitch outside a VM.
- if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId);
+ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int index) {
+ struct ncclTopoNode* delNode = system->nodes[type].nodes+index;
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+ free(delNode->paths[t]);
+ for (int n=0; n<system->nodes[t].count; n++) {
+ struct ncclTopoNode* node = system->nodes[t].nodes+n;
+ if (node == delNode) continue;
+ for (int l=0; l<node->nlinks; l++) {
+ while (l<node->nlinks && node->links[l].remNode == delNode) {
+ memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
+ node->nlinks--;
}
- if (nvsNode == NULL) { // Create nvswitch
- NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0));
+ if (l<node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) {
+ node->links[l].remNode--;
}
- NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH));
- NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH));
- nvlinks++;
}
}
- minNvlinks = std::min(minNvlinks, nvlinks);
- minWidth = std::min(minWidth, width);
}
- int pciWidth;
- NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
- system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth;
- system->maxWidth = minNvlinks ? minWidth : pciWidth;
+ memmove(delNode, delNode+1, (system->nodes[type].count-index-1)*sizeof(struct ncclTopoNode));
+ system->nodes[type].count--;
return ncclSuccess;
}
-ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) {
- struct ncclTopoNode* lastNode = endNode;
- int pciWidth;
- NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
- // Find intermediate PCI switches
- int slashCount = 0;
- int offsetRC = 0;
- while (offsetRC < strlen(path)) {
- if (path[offsetRC] == '/') slashCount++;
- if (slashCount == 4) break;
- offsetRC++;
- }
- int offset = strlen(path);
- slashCount = 0;
- while (--offset > offsetRC) {
- if (path[offset] == '/') {
- slashCount++;
- // Find if already existing
- if ((slashCount%2) == 0) {
- int64_t pciId;
- NCCLCHECK(pciPathToInt64(path, offset, offsetRC, &pciId));
- for (int p=0; p<system->nodes[PCI].count; p++) {
- if (system->nodes[PCI].nodes[p].id == pciId) {
- // Found our PCI switch. Attach and stop since the rest should already
- // be connected
- NCCLCHECK(ncclTopoConnectNodes(system->nodes[PCI].nodes+p, lastNode, LINK_PCI, pciWidth));
- NCCLCHECK(ncclTopoConnectNodes(lastNode, system->nodes[PCI].nodes+p, LINK_PCI, pciWidth));
- return ncclSuccess;
- }
- }
- struct ncclTopoNode* pciNode;
- NCCLCHECK(ncclTopoCreateNode(system, &pciNode, PCI, pciId));
- NCCLCHECK(ncclTopoConnectNodes(pciNode, lastNode, LINK_PCI, pciWidth));
- NCCLCHECK(ncclTopoConnectNodes(lastNode, pciNode, LINK_PCI, pciWidth));
- lastNode = pciNode;
- }
- }
+ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) {
+ // Aggregate links into higher width for NVLink
+ struct ncclTopoLink* link;
+ for (link = node->links; link->remNode; link++) {
+ if (link->remNode == remNode && link->type == type) break;
}
- // Then attach to a CPU node
- int numaId = getNumaId(path);
- int width;
- NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
- NCCLCHECK(ncclTopoConnectCpu(system, numaId, lastNode, LINK_PCI, width));
- return ncclSuccess;
-}
-
-// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports.
-#include <glob.h>
-#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid"
-uint64_t getIbGuid(char* path) {
- uint64_t guid = 0ULL;
- char guidPath[PATH_MAX];
- snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path);
- // PATH has a wildcard in it so use glob()
- glob_t globbuf;
- glob(guidPath, 0, NULL, &globbuf);
- if (globbuf.gl_pathc > 0)
- strncpy(guidPath, globbuf.gl_pathv[0], PATH_MAX);
- globfree(&globbuf);
- guidPath[PATH_MAX-1] = '\0';
- FILE *file = fopen(guidPath, "r");
- if (file != NULL) {
- uint64_t a, b, c, d;
- if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) != EOF) {
- guid = (a << 48) + (b << 32) + (c<<16) + d;
- TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid);
- }
- fclose(file);
- }
- return guid;
-}
-
-struct netInfo {
- char* path;
- int64_t nic;
- uint64_t asic;
- int port;
- int net;
-};
-
-ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) {
- for (int n=0; n<ndev; n++) {
- struct netInfo* info = netInfos+n;
- uint64_t ibGuid;
- info->nic = n;
- info->asic = n;
- info->port = 0;
- info->net = n;
- if (info->path && (ibGuid = getIbGuid(info->path)) != 0) {
- info->asic = ibGuid;
-
- // Ignore PCI subdevice when computing the ID to merge multi-port cards
- // and make them use the same PCI link.
- char* path = strdup(info->path);
- path[strlen(path)-1]='0';
- NCCLCHECK(pciPathToInt64(path, strlen(path), 0, &info->nic));
- free(path);
-
- // Same PCI path -> different ports of the same NIC
- for (int i=0; i<n; i++) if (netInfos[i].nic == info->nic) info->port++;
-
- // Same GUID -> same network links as the other NIC
- for (int i=0; i<n; i++) if (netInfos[i].asic == info->asic && netInfos[i].port == info->port) info->net = netInfos[i].net;
- }
- INFO(NCCL_GRAPH, "%s -> %x/%lx/%d/%d", info->path, info->nic, info->asic, info->port, info->net);
+ if (link->remNode == NULL) node->nlinks++;
+ link->type = type;
+ link->remNode = remNode;
+ link->width += width;
+
+ // Sort links in BW descending order
+ struct ncclTopoLink linkSave;
+ memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
+ while (link != node->links) {
+ if ((link-1)->width >= linkSave.width) break;
+ memcpy(link, link-1, sizeof(struct ncclTopoLink));
+ link--;
}
+ memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
return ncclSuccess;
}
-ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
- for (int g=0; g<system->nodes[GPU].count; g++) {
- struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
- char* path;
- NCCLCHECK(getPath(gpu->id, &path));
- NCCLCHECK(ncclTopoCreatePciPath(system, gpu, path));
- free(path);
- }
-
- // Connect the NICs
- int netDevCount;
- NCCLCHECK(ncclNetDevices(&netDevCount));
- int netWidth;
- NCCLCHECK(ncclTopoGetNetWidth(&netWidth));
-
- struct netInfo* netInfos;
- NCCLCHECK(ncclCalloc(&netInfos, netDevCount));
-
- for (int n=0; n<netDevCount; n++) {
- ncclResult_t res = ncclNetPciPath(n, &netInfos[n].path);
- if (res != ncclSuccess) netInfos[n].path = NULL;
- }
-
- NCCLCHECK(ncclTopoComputeNetInfo(netInfos, netDevCount));
-
- for (int n=0; n<netDevCount; n++) {
- struct netInfo* info = netInfos+n;
- // Create NIC and attach it to the PCI tree
- struct ncclTopoNode* nicNode = NULL;
- for (int i=0; i<system->nodes[NIC].count; i++) {
- if (system->nodes[NIC].nodes[i].id == info->nic) {
- nicNode = system->nodes[NIC].nodes+i;
- break;
- }
- }
- if (!nicNode) {
- NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic));
- if (info->path) {
- // Create the PCI path
- NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path));
- } else {
- // This is probably a virtual NIC. Just attach it directly to CPU 0
- int width;
- NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
- NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width));
- }
- }
- free(info->path);
-
- // Create the network side
- struct ncclTopoNode* netNode;
- NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, n));
-
- // Use rank to store the net information
- netNode->rank = info->net;
-
- NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth));
- NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth));
- }
- free(netInfos);
-
+ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
// And connect all CPU nodes together
for (int n=0; n<system->nodes[CPU].count; n++) {
for (int p=0; p<system->nodes[CPU].count; p++) {
if (n == p) continue;
- int width;
- NCCLCHECK(ncclTopoGetInterCpuWidth(&width));
- NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width));
+ float width;
+ NCCLCHECK(ncclTopoGetInterCpuWidth(system->nodes[CPU].nodes+n, &width));
+ NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, width));
}
}
return ncclSuccess;
@@ -491,7 +184,9 @@ ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
if (node->type == GPU) {
- sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank);
+ sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
+ } else if (node->type == CPU) {
+ sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
} else {
sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
}
@@ -501,14 +196,14 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
for (int l=0; l<node->nlinks; l++) {
struct ncclTopoLink* link = node->links+l;
if (link->type == LINK_LOC) continue;
- if (link->remNode != prevNode) {
- sprintf(line+offset, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width);
+ if (link->type != LINK_PCI || link->remNode != prevNode) {
+ sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->width);
int nextOffset = strlen(line);
if (link->type == LINK_PCI) {
NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
} else {
if (link->remNode->type == NET) {
- sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->rank);
+ sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.width);
} else {
sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
}
@@ -520,7 +215,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
}
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
- INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed);
+ INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f ===", s->maxWidth);
char line[1024];
for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
INFO(NCCL_GRAPH, "==========================================");
@@ -554,88 +249,400 @@ static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode*
// 1. NVLinks (already the case)
// 2. PCI down
// 3. PCI up
-// 4. QPI (already the case)
+// 4. SYS (already the case)
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
for (int n=0; n<system->nodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL));
return ncclSuccess;
}
-ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
- struct ncclTopoSystem* s;
- NCCLCHECK(ncclCalloc(&s, 1));
- nvmlDevice_t* nvmlDevs;
- int g = 0;
- NCCLCHECK(ncclCalloc(&nvmlDevs, comm->nRanks));
- for (int r=0; r<comm->nRanks; r++) {
- if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
- // Consider the GPU as outside of our node if we can't see it through NVML.
- char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
- if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+g) != ncclSuccess) continue;
- g++;
- struct ncclTopoNode* gpuNode;
- NCCLCHECK(ncclTopoCreateNode(s, &gpuNode, GPU, comm->peerInfo[r].busId));
- gpuNode->rank = r;
+ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
+ int dev;
+ NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));
+
+ struct ncclTopoNode* net;
+ NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev));
+ const char* str;
+ NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
+ if (str) sscanf(str, "0x%lx", &net->net.asic);
+ else net->net.asic = dev;
+
+ ncclDebugNoWarn = NCCL_GRAPH;
+ int mbps;
+ if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0;
+ if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
+ net->net.width = mbps / 8000.0;
+ if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0;
+ if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
+ if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
+ if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
+ ncclDebugNoWarn = 0;
+
+ NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
+ NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
+ for (int s=0; s<xmlNic->nSubs; s++) {
+ struct ncclXmlNode* xmlNet = xmlNic->subs[s];
+ if (strcmp(xmlNet->name, "net") != 0) continue;
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
+ if (index == -1) continue;
+ NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic));
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
+ NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));
+ NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
+ NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
+ NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
+ // Do not go any further, nvlinks will be added in a second pass
+ return ncclSuccess;
+}
+
+struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, 0 } };
+struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { NULL, 0 } }; // x100 Mbps per lane
+ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
+ const char* str;
+
+ int type;
+ NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
+ NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));
+
+ int64_t busId;
+ NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
+ NCCLCHECK(busIdToInt64(str, &busId));
+
+ struct ncclTopoNode* node = NULL;
+ if (type == GPU) {
+ struct ncclXmlNode* xmlGpu;
+ NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
+ if (xmlGpu == NULL) return ncclSuccess;
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
+ if (index == -1) return ncclSuccess;
+ NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
+ NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node));
+ }
+ if (type == NIC) {
+ struct ncclXmlNode* xmlNic;
+ NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
+ if (xmlNic == NULL) return ncclSuccess;
+
+ // Ignore sub device ID and merge multi-port NICs into one PCI device.
+ busId &= 0xfffffffffffffff0;
+ struct ncclTopoNode* nicNode = NULL;
+ NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId));
+ if (nicNode == NULL) {
+ NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
+ node = nicNode; // Connect it to parent later on
+ }
+ NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
+ } else if (type == PCI) {
+ NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
+ for (int s=0; s<xmlPci->nSubs; s++) {
+ struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
+ NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
}
}
- NCCLCHECK(ncclTopoConnectNVLink(nvmlDevs, s));
- NCCLCHECK(ncclTopoConnectPCI(s));
+ if (node) {
+ int width, speed;
+ NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
+ NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));
+
+ // Manage cases where speed was not indicated in /sys
+ if (width == 0) width = 16;
+ if (strlen(str) == 0 || strcasecmp(str, "Unknown speed") == 0) str = "8 GT/s";
- free(nvmlDevs);
- NCCLCHECK(ncclTopoSortSystem(s));
- *system = s;
+ NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)
+
+ NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0));
+ NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0));
+ }
return ncclSuccess;
}
-ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink) {
- int g1, g2;
- NCCLCHECK(idToIndex(system, busId1, &g1));
- NCCLCHECK(idToIndex(system, busId2, &g2));
- *nvlink = g1 != -1 && g2 != -1 && system->nodes[GPU].nodes[g1].paths[GPU][g2].type == LINK_NVL;
+struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
+struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { NULL, 0 } };
+
+ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
+ int numaId;
+ NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
+ struct ncclTopoNode* cpu;
+ NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
+ const char* str;
+ NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
+ if (str != NULL) {
+ NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity));
+ }
+
+ NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str));
+ NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch));
+ if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) {
+ NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str));
+ NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor));
+ if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+ int familyId, modelId;
+ NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
+ NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
+ cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
+ }
+ }
+ for (int s=0; s<xmlCpu->nSubs; s++) {
+ struct ncclXmlNode* node = xmlCpu->subs[s];
+ if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu));
+ if (strcmp(node->name, "nic") == 0) {
+ struct ncclTopoNode* nic = NULL;
+ NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
+ if (nic == NULL) {
+ NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0));
+ NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH));
+ NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH));
+ }
+ NCCLCHECK(ncclTopoAddNic(node, system, nic));
+ }
+ }
return ncclSuccess;
}
-ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) {
- int g;
- NCCLCHECK(idToIndex(system, busId, &g));
- for (int i=0; i<system->nodes[GPU].count; i++) {
- if (i == g) continue;
- if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) {
- *nvlink = 1;
- return ncclSuccess;
+ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
+ if (strcmp(node->name, "nvlink") == 0) {
+ struct ncclTopoNode* gpu = NULL;
+ int64_t pBusId;
+ NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
+ NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
+ if (gpu == NULL) {
+ WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
+ return ncclInternalError;
+ }
+ int count;
+ NCCLCHECK(xmlGetAttrInt(node, "count", &count));
+ const char* targetClass;
+ NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
+ int targetType;
+ NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));
+ struct ncclTopoNode* remote = NULL;
+ if (targetType == GPU) {
+ // NVL P2P connection to another GPU
+ const char* target;
+ NCCLCHECK(xmlGetAttrStr(node, "target", &target));
+ int64_t busId;
+ NCCLCHECK(busIdToInt64(target, &busId));
+ NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
+ } else if (targetType == CPU) {
+ // NVL connection to the local CPU
+ NCCLCHECK(findLocalCpu(gpu, &remote));
+ } else {
+ if (system->nodes[NVS].count == 0) {
+ NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
+ } else {
+ remote = system->nodes[NVS].nodes;
+ }
+ }
+ if (remote) {
+ int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
+ NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
+ if (remote->type != GPU) {
+ NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
+ }
+ }
+ } else {
+ const char* busId;
+ NCCLCHECK(xmlGetAttr(node, "busid", &busId));
+ for (int s=0; s<node->nSubs; s++) {
+ NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId));
}
}
- *nvlink = 0;
return ncclSuccess;
}
-static int pathDistance(struct ncclTopoLinkList* links) {
- int distance = PATH_PIX;
- if (links->count > 2) distance = PATH_PXB;
- for (int l=0; l<links->count; l++) {
- // PHB if we go through 1 CPU, SYS if we go through 2 CPUs
- if (links->list[l]->remNode->type == CPU) distance = (distance == PATH_PHB) ? PATH_SYS : PATH_PHB;
+ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
+ NCCLCHECK(ncclCalloc(topoSystem, 1));
+ struct ncclXmlNode* topNode;
+ NCCLCHECK(xmlFindTag(xml, "system", &topNode));
+ for (int s=0; s<topNode->nSubs; s++) {
+ struct ncclXmlNode* node = topNode->subs[s];
+ if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
+ }
+ NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
+
+ NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
+ NCCLCHECK(ncclTopoSortSystem(*topoSystem));
+
+ return ncclSuccess;
+}
+
+NCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0);
+
+// Only set values if not already set
+static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
}
- return distance;
+ return ncclSuccess;
+}
+static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attrName, const uint64_t value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%lx", value);
+ }
+ return ncclSuccess;
}
// Build the topology system for this communicator. Starts from an optional
// user-provided XML (NCCL_TOPO_FILE), auto-detects local GPUs and NICs into
// the XML, optionally dumps it (NCCL_TOPO_DUMP_FILE), and converts the XML
// into a ncclTopoSystem. The caller owns *system.
// NOTE(review): xml is leaked if any NCCLCHECK below fails before the final
// free(xml) — presumably acceptable since init errors are fatal; confirm.
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
  struct ncclXml* xml;
  NCCLCHECK(ncclCalloc(&xml, 1));
  // Seed the XML from a user-supplied topology file when provided.
  char* xmlTopoFile = getenv("NCCL_TOPO_FILE");
  if (xmlTopoFile) {
    NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml));
  }
  if (xml->maxIndex == 0) {
    // Create top tag
    struct ncclXmlNode* top;
    NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
    NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
  }

  // Auto-detect GPUs if needed
  // Only ranks on the same host (matching hostHash) are local GPUs.
  for (int r=0; r<comm->nRanks; r++) {
    if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
      NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
      struct ncclXmlNode* node;
      NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
      NCCLCHECK(xmlSetAttrInt(node, "rank", r));
      // xmlInitAttrInt keeps any value already present in the XML file.
      NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
    }
  }
  // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
  // so we start with collnet so that it has precedence.
  int netDevCount = 0;
  if (ncclCollNet) {
    NCCLCHECK(collNetDevices(&netDevCount));
    for (int n=0; n<netDevCount; n++) {
      ncclNetProperties_t props;
      NCCLCHECK(collNetGetProperties(n, &props));
      struct ncclXmlNode* netNode;
      NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
      NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
      NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
      NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
      NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
      NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
      // gdr=1 when the plugin accepts CUDA pointers directly.
      NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
      // Mark collnet-capable devices so the graph search can use them.
      NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
    }
  }
  // Fall back to the regular network plugin when collnet found no devices.
  if (netDevCount == 0) {
    NCCLCHECK(ncclNetDevices(&netDevCount));
  }
  for (int n=0; n<netDevCount; n++) {
    ncclNetProperties_t props;
    NCCLCHECK(ncclNetGetProperties(n, &props));
    struct ncclXmlNode* netNode;
    NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
    NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
    NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
    NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
    NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
    NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
    NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
  }

  // Optionally dump the merged topology XML; only the selected rank writes.
  xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
  if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
    NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
  }

  // Convert the XML description into the in-memory topology graph.
  NCCLCHECK(ncclTopoGetSystemFromXml(xml, system));
  free(xml);
  return ncclSuccess;
}
-ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) {
- int g;
- NCCLCHECK(idToIndex(system, busId, &g));
- *distance = pathDistance(system->nodes[GPU].nodes[g].paths[NET]+netDev);
+/****************************/
+/* External query functions */
+/****************************/
+
+ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model) {
+ *arch = system->nodes[CPU].nodes[0].cpu.arch;
+ *vendor = system->nodes[CPU].nodes[0].cpu.vendor;
+ *model = system->nodes[CPU].nodes[0].cpu.model;
return ncclSuccess;
}
NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);

// Pin the calling thread's CPU affinity to the CPU socket closest to the GPU
// used by 'rank'. By default the result is intersected with the affinity set
// the process was launched with; NCCL_IGNORE_CPU_AFFINITY=1 uses the GPU's
// local CPU set unconditionally. Returns ncclInternalError if the rank's GPU
// (or a CPU for it) cannot be found in the topology.
ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) {
  struct ncclTopoNode* cpu = NULL, *gpu = NULL;
  // Locate the GPU for this rank, then the CPU with the fewest path hops.
  // (No break on match: if several GPUs carried the same rank, the last
  // one found would win — ranks are unique per system in practice.)
  for (int g=0; g<system->nodes[GPU].count; g++) {
    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
      gpu = system->nodes[GPU].nodes+g;
      // Find closer CPU
      int cpuIndex = -1, minHops = 0;
      for (int c=0; c<system->nodes[CPU].count; c++) {
        int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
        if (cpuIndex == -1 || nHops < minHops) {
          cpuIndex = c;
          minHops = nHops;
        }
      }
      cpu = system->nodes[CPU].nodes+cpuIndex;
    }
  }
  if (cpu == NULL) {
    WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
    return ncclInternalError;
  }

  // Query the CPU affinity set we were provided
  cpu_set_t mask;
  SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");

#ifdef ENABLE_TRACE
  {
    // NOTE(review): affinityStr sized 2 hex chars per byte — presumably
    // ncclCpusetToStr accounts for the NUL terminator; confirm in cpuset.h.
    char affinityStr[sizeof(cpu_set_t)*2];
    NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
    TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
  }
#endif

  // Get the affinity of the CPU close to our GPU.
  cpu_set_t cpuMask = cpu->cpu.affinity;

#ifdef ENABLE_TRACE
  {
    char affinityStr[sizeof(cpu_set_t)*2];
    NCCLCHECK(ncclCpusetToStr(&cpuMask, affinityStr));
    TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
  }
#endif

  cpu_set_t finalMask;
  if (ncclParamIgnoreCpuAffinity())
    // Ignore the CPU affinity set and use the GPU one instead
    finalMask = cpuMask;
  else
    // Use a subset of the GPU affinity set
    CPU_AND(&finalMask, &mask, &cpuMask);

  // If there is a non empty set, use it to set affinity
  // (an empty intersection leaves the current affinity untouched).
  if (CPU_COUNT(&finalMask)) {
    char affinityStr[sizeof(cpu_set_t)*2];
    NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
    INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
    SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
  }
  return ncclSuccess;
}