1 files changed, 474 insertions, 467 deletions
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index a1b3209..5cd8d4e 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,46 +10,22 @@
 #include "comm.h"
 #include "nvmlwrap.h"
 #include "net.h"
+#include "coll_net.h"
 #include <sys/stat.h>
 #include <fcntl.h>
+#include "xml.h"
+#include "cpuset.h"
 
 #define BUSID_SIZE (sizeof("0000:00:00.0"))
 #define BUSID_REDUCED_SIZE (sizeof("0000:00"))
 
-const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
-
 const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
-const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" };
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "",    "",    "SYS", "NET" }; // Printable name per link type; slots 3 and 4 are empty, presumably so SYS/NET keep the same index as in topoPathTypeStr - confirm
+const char* topoPathTypeStr[] = { "LOC", "NVL", "PIX", "PXB", "PHB", "SYS", "NET" }; // Printable name per path type (presumably indexed by the path type value - confirm)
 
 /******************************************************************/
 /******************* Graph Creation Functions *********************/
 /******************************************************************/
-static int getNumaId(char *path) {
-  char npath[PATH_MAX];
-  snprintf(npath, PATH_MAX, "%s/numa_node", path);
-  npath[PATH_MAX-1] = '\0';
-
-  int numaId = -1;
-  FILE *file = fopen(npath, "r");
-  if (file == NULL) return -1;
-  if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
-  fclose(file);
-
-  return numaId;
-}
-
-static ncclResult_t getPciPath(char* busId, char** path) {
-  for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
-  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
-  memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
-  memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
-  *path = realpath(busPath, NULL);
-  if (*path == NULL) {
-    WARN("Could not find real path of %s", busPath);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
 
 // Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
 ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
@@ -59,110 +35,43 @@ ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id)
   // Find next /
   while (*str != '/') str--;
   str++;
-  NCCLCHECK(busIdToInt64(str, id));
+  int64_t numid;
+  NCCLCHECK(busIdToInt64(str, &numid));
+  // Ignore subdevice because those should use the same PCI link so we want to merge nodes.
+  numid -= numid & 0xf;
+  *id = numid;
   return ncclSuccess;
 }
 
-static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) {
-  *index = -1;
-  for (int i=0; i<system->nodes[GPU].count; i++) {
-    if (system->nodes[GPU].nodes[i].id == id) {
-      *index = i;
-    }
+static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) { // Depth-first search along LINK_PCI links for a CPU node; *cpu stays NULL if none is reached
+  *cpu = NULL;
+  if (node->type == CPU) { // The starting node is itself the CPU
+    *cpu = node;
+    return ncclSuccess;
+  }
+  for (int l=0; l<node->nlinks; l++) {
+    if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); // NOTE(review): no visited tracking; PCI links are created in both directions elsewhere in this file, so confirm a link leading back to the caller cannot recurse without bound
+    if (*cpu != NULL) return ncclSuccess; // Stop at the first CPU found
   }
   return ncclSuccess;
 }
 
-
-static ncclResult_t getPath(int64_t id, char** path) {
-  char busId[] = "0000:00:00.0";
-  NCCLCHECK(int64ToBusId(id, busId));
-  NCCLCHECK(getPciPath(busId, path));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) {
-  char busId[BUSID_SIZE];
-  CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
-  NCCLCHECK(getPciPath(busId, path));
-  return ncclSuccess;
-}
-
-
 int interCpuWidth = 0;
 int cpuPciWidth = 0;
 
-static ncclResult_t getCpuWidths() {
-  // Check if already detected
-  if (interCpuWidth + cpuPciWidth) return ncclSuccess;
-
-  // Defaults
-  char cpu[256];
-  sprintf(cpu, "Generic");
-  cpuPciWidth = interCpuWidth = PCI_WIDTH;
-
-#ifdef __PPC__
-  sprintf(cpu, "ppc64");
-  interCpuWidth = P9_WIDTH;
-#endif
-#ifdef __x86_64__
-  sprintf(cpu, "x86_64");
-  union {
-    struct {
-      // CPUID 0 String register order
-      uint32_t ebx;
-      uint32_t edx;
-      uint32_t ecx;
-    };
-    char vendor[12];
-  } cpuid0;
-
-  asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0));
-  if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel");
-
-  if (strcmp(cpu, "Intel") == 0) {
-    union {
-      struct {
-        int steppingId:4;
-        int model:4;
-        int familyId:4;
-        int processorType:2;
-        int resv0:2;
-        int extModelId:4;
-        int modelId:8;
-        int resv1:4;
-      };
-      uint32_t val;
-    } cpuid1;
-    asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1));
-    if (cpuid1.familyId == 6 && cpuid1.modelId >= 0x55) { // Skylake
-      sprintf(cpu, "Intel/Skylake (or later)");
-      interCpuWidth = SKL_QPI_WIDTH;
-    } else {
-      interCpuWidth = QPI_WIDTH;
-    }
+static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* width) { // Writes to *width the link width to use between this CPU node and other CPU nodes, chosen from its arch/vendor/model
+  *width = LOC_WIDTH; // Default when none of the cases below matches (e.g. x86 with a non-Intel vendor)
+  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) {
+    *width = P9_WIDTH;
+    return ncclSuccess;
+  }
+  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_ARM) {
+    *width = ARM_WIDTH;
+    return ncclSuccess;
+  }
+  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+    *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH; // Intel: SKL model gets SKL_QPI_WIDTH, any other model QPI_WIDTH
   }
-#endif
-  INFO(NCCL_GRAPH, "%s CPU (PCI %d, InterCpu %d)", cpu, cpuPciWidth, interCpuWidth);
-  return ncclSuccess;
-}
-
-static ncclResult_t ncclTopoGetInterCpuWidth(int* width) {
-  NCCLCHECK(getCpuWidths());
-  *width = interCpuWidth;
-  return ncclSuccess;
-}
-static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) {
-  NCCLCHECK(getCpuWidths());
-  *width = cpuPciWidth;
-  return ncclSuccess;
-}
-static ncclResult_t ncclTopoGetPciWidth(int* width) {
-  *width = PCI_WIDTH;
-  return ncclSuccess;
-}
-static ncclResult_t ncclTopoGetNetWidth(int* width) {
-  *width = NET_WIDTH;
   return ncclSuccess;
 }
 
@@ -173,317 +82,101 @@ enum ncclNvLinkDeviceType {
   ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
 };
 
-static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
-  char classPath[] =  "/sys/bus/pci/devices/0000:00:00.0/class";
-  memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
-  char* rPath = realpath(classPath, NULL);
-  int fd;
-  if ((fd = open(rPath, O_RDONLY)) == -1) {
-    // Could not find device. It might be because we're in a VM and
-    // we don't see the whole machine. This is handled silently so
-    // we don't want to print an INFO error.
-    TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
-    return ncclSystemError;
-  }
-  free(rPath);
-  char pciClass[9];
-  strncpy(pciClass, "0x000000", 9);
-  int len;
-  SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
-  SYSCHECK(close(fd), "close");
-  if (strcmp(pciClass, "0x068000") == 0) {
-    // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
-    *type = ncclNvLinkDeviceSwitch;
-  } else if (strcmp(pciClass, "0x068001") == 0) {
-    // PCI device is of type "Bridge: IBM Device 04ea"
-    *type = ncclNvLinkDeviceBridge;
-  } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
-      || strcmp(pciClass, "0x030000") == 0) {  // "VGA Controller" (GeForce)
-    *type = ncclNvLinkDeviceGpu;
-  } else {
-    *type = ncclNvLinkDeviceUnknown;
+ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) { // Linear search of system->nodes[type] for the node whose id matches
+  for (int i=0; i<system->nodes[type].count; i++) {
+    if (system->nodes[type].nodes[i].id == id) {
+      *node = system->nodes[type].nodes+i; // Pointer into the per-type node array
+      return ncclSuccess;
+    } // On a miss *node is never written, so callers must pre-set it (e.g. to NULL) to detect "not found"
   }
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) {
-  struct ncclTopoNode* cpuNode = NULL;
-  for (int c=0; c<system->nodes[CPU].count; c++) {
-    if (system->nodes[CPU].nodes[c].id == numaId) cpuNode = system->nodes[CPU].nodes+c;
+ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) { // Appends a node of the given type and id to system and returns it through *node
+  if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) { // The per-type node array is full
+    WARN("Error : tried to create too many nodes of type %d\n", type);
+    return ncclInternalError;
   }
-  if (cpuNode == NULL) { // Create CPU
-    NCCLCHECK(ncclTopoCreateNode(system, &cpuNode, CPU, numaId));
+  struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count; // First unused slot of the per-type array
+  system->nodes[type].count++;
+  n->type = type;
+  n->id = id; // NOTE(review): fields not assigned in this function (e.g. nlinks for non-GPU types) keep whatever the slot held; presumably the system is zero-allocated - confirm
+  if (type == GPU) {
+    // Create link to itself (used in some corner cases)
+    n->nlinks=1;
+    n->links[0].type = LINK_LOC;
+    n->links[0].remNode = n;
+    n->links[0].width = LOC_WIDTH;
+    n->gpu.dev = NCCL_TOPO_UNDEF; // GPU attributes start undefined; ncclTopoAddGpu sets them from XML
+    n->gpu.rank = NCCL_TOPO_UNDEF;
+    n->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
+  } else if (type == CPU) {
+    n->cpu.arch = NCCL_TOPO_UNDEF; // CPU attributes start undefined; ncclTopoAddCpu sets them from XML
+    n->cpu.vendor = NCCL_TOPO_UNDEF;
+    n->cpu.model = NCCL_TOPO_UNDEF;
+  } else if (type == NET) {
+    n->net.asic = 0ULL; // NET attributes get neutral defaults; ncclTopoAddNet sets them from XML
+    n->net.port = NCCL_TOPO_UNDEF;
+    n->net.width = 0.0;
   }
-  NCCLCHECK(ncclTopoConnectNodes(node, cpuNode, linkType, linkWidth));
-  NCCLCHECK(ncclTopoConnectNodes(cpuNode, node, linkType, linkWidth));
+  *node = n;
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) {
-  struct ncclTopoNode* nvsNode = NULL;
-
-  int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH;
-  for (int g=0; g<system->nodes[GPU].count; g++) {
-    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
-    int cudaMajor, cudaMinor;
-    NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor));
-    int maxNvLinks, width;
-    if (cudaMajor < 6) {
-      maxNvLinks = 0;
-      width = 0;
-    } else if (cudaMajor == 6) {
-      maxNvLinks = 4;
-      width = PASCAL_NVLINK_WIDTH;
-    } else {
-      maxNvLinks = 6;
-      width = VOLTA_NVLINK_WIDTH;
-    }
-
-    int nvlinks = 0;
-    for (int l=0; l<maxNvLinks; ++l) {
-      // Check whether we can use this NVLink for P2P
-      unsigned canP2P;
-      if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDevs[g], l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
-
-      // Make sure the Nvlink is up. The previous call should have trained the link.
-      nvmlEnableState_t isActive;
-      if ((wrapNvmlDeviceGetNvLinkState(nvmlDevs[g], l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
-
-      // Try to figure out what's on the other side of the NVLink
-      nvmlPciInfo_t remoteProc;
-      if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevs[g], l, &remoteProc) != ncclSuccess) continue;
-
-      // Make a lower case copy of the bus ID for calling ncclDeviceType
-      // PCI system path is in lower case
-      char* p = remoteProc.busId;
-      char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-      for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
-        lowerId[c] = tolower(p[c]);
-        if (p[c] == 0) break;
-      }
-
-      enum ncclNvLinkDeviceType type;
-      NCCLCHECK(ncclDeviceType(lowerId, &type));
-      if (type == ncclNvLinkDeviceGpu) {
-        int64_t remoteId;
-        NCCLCHECK(busIdToInt64(lowerId, &remoteId));
-        int peer;
-        NCCLCHECK(idToIndex(system, remoteId, &peer));
-        if (peer != -1) {
-          NCCLCHECK(ncclTopoConnectNodes(gpu, system->nodes[GPU].nodes+peer, LINK_NVL, width));
-          nvlinks++;
-        }
-      } else if (type == ncclNvLinkDeviceBridge) {
-        // Nvlink between GPU and CPU (PPC)
-        // Since the remote bridge does not have a valid numa_node, assume we
-        // are connected to the closest CPU.
-        char* path;
-        NCCLCHECK(getPath(gpu->id, &path));
-        int numaId = getNumaId(path);
-        free(path);
-        NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width));
-        nvlinks++;
-      } else { // Nvswitch
-        if (type == ncclNvLinkDeviceUnknown) {
-          // The NVLink is up but we couldn't find the PCI device on the other
-          // side. Assume it's an NVswitch outside a VM.
-          if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId);
+ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int index) { // Removes node `index` of `type` from system and fixes up every link that pointed at it or at a node stored after it
+  struct ncclTopoNode* delNode = system->nodes[type].nodes+index;
+  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+    free(delNode->paths[t]); // Release the deleted node's paths[t] buffer
+    for (int n=0; n<system->nodes[t].count; n++) {
+      struct ncclTopoNode* node = system->nodes[t].nodes+n;
+      if (node == delNode) continue; // The deleted node's own links need no fix-up
+      for (int l=0; l<node->nlinks; l++) {
+        while (l<node->nlinks && node->links[l].remNode == delNode) { // Drop links to the deleted node; loop because the entry shifted into slot l may target it too
+          memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
+          node->nlinks--;
         }
-        if (nvsNode == NULL) { // Create nvswitch
-          NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0));
+        if (l<node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) { // Target is a same-type node stored after delNode: it moves down one slot in the memmove below
+          node->links[l].remNode--;
         }
-        NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH));
-        NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH));
-        nvlinks++;
       }
     }
-    minNvlinks = std::min(minNvlinks, nvlinks);
-    minWidth = std::min(minWidth, width);
   }
-  int pciWidth;
-  NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
-  system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth;
-  system->maxWidth = minNvlinks ? minWidth : pciWidth;
+  memmove(delNode, delNode+1, (system->nodes[type].count-index-1)*sizeof(struct ncclTopoNode)); // Close the gap in the per-type node array
+  system->nodes[type].count--; // NOTE(review): the vacated last slot is not cleared; confirm a later ncclTopoCreateNode cannot pick up stale links from it
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) {
-  struct ncclTopoNode* lastNode = endNode;
-  int pciWidth;
-  NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
-  // Find intermediate PCI switches
-  int slashCount = 0;
-  int offsetRC = 0;
-  while (offsetRC < strlen(path)) {
-    if (path[offsetRC] == '/') slashCount++;
-    if (slashCount == 4) break;
-    offsetRC++;
-  }
-  int offset = strlen(path);
-  slashCount = 0;
-  while (--offset > offsetRC) {
-    if (path[offset] == '/') {
-      slashCount++;
-      // Find if already existing
-      if ((slashCount%2) == 0) {
-        int64_t pciId;
-        NCCLCHECK(pciPathToInt64(path, offset, offsetRC, &pciId));
-        for (int p=0; p<system->nodes[PCI].count; p++) {
-          if (system->nodes[PCI].nodes[p].id == pciId) {
-            // Found our PCI switch. Attach and stop since the rest should already
-            // be connected
-            NCCLCHECK(ncclTopoConnectNodes(system->nodes[PCI].nodes+p, lastNode, LINK_PCI, pciWidth));
-            NCCLCHECK(ncclTopoConnectNodes(lastNode, system->nodes[PCI].nodes+p, LINK_PCI, pciWidth));
-            return ncclSuccess;
-          }
-        }
-        struct ncclTopoNode* pciNode;
-        NCCLCHECK(ncclTopoCreateNode(system, &pciNode, PCI, pciId));
-        NCCLCHECK(ncclTopoConnectNodes(pciNode, lastNode, LINK_PCI, pciWidth));
-        NCCLCHECK(ncclTopoConnectNodes(lastNode, pciNode, LINK_PCI, pciWidth));
-        lastNode = pciNode;
-      }
-    }
+ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) { // Adds a one-way link node -> remNode, or widens the existing link of the same type, keeping node->links sorted by width
+  // Aggregate repeated links to the same remote node and type into one wider link (e.g. several NVLinks)
+  struct ncclTopoLink* link;
+  for (link = node->links; link->remNode; link++) { // NOTE(review): scan stops only at an entry with a NULL remNode; there is no bound against the capacity of node->links - confirm
+    if (link->remNode == remNode && link->type == type) break;
   }
-  // Then attach to a CPU node
-  int numaId = getNumaId(path);
-  int width;
-  NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
-  NCCLCHECK(ncclTopoConnectCpu(system, numaId, lastNode, LINK_PCI, width));
-  return ncclSuccess;
-}
-
-// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports.
-#include <glob.h>
-#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid"
-uint64_t getIbGuid(char* path) {
-  uint64_t guid = 0ULL;
-  char guidPath[PATH_MAX];
-  snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path);
-  // PATH has a wildcard in it so use glob()
-  glob_t globbuf;
-  glob(guidPath, 0, NULL, &globbuf);
-  if (globbuf.gl_pathc > 0)
-    strncpy(guidPath, globbuf.gl_pathv[0], PATH_MAX);
-  globfree(&globbuf);
-  guidPath[PATH_MAX-1] = '\0';
-  FILE *file = fopen(guidPath, "r");
-  if (file != NULL) {
-    uint64_t a, b, c, d;
-    if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) != EOF) {
-      guid = (a << 48) + (b << 32) + (c<<16) + d;
-      TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid);
-    }
-    fclose(file);
-  }
-  return guid;
-}
-
-struct netInfo {
-  char* path;
-  int64_t nic;
-  uint64_t asic;
-  int port;
-  int net;
-};
-
-ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) {
-  for (int n=0; n<ndev; n++) {
-    struct netInfo* info = netInfos+n;
-    uint64_t ibGuid;
-    info->nic = n;
-    info->asic = n;
-    info->port = 0;
-    info->net = n;
-    if (info->path && (ibGuid = getIbGuid(info->path)) != 0) {
-      info->asic = ibGuid;
-
-      // Ignore PCI subdevice when computing the ID to merge multi-port cards
-      // and make them use the same PCI link.
-      char* path = strdup(info->path);
-      path[strlen(path)-1]='0';
-      NCCLCHECK(pciPathToInt64(path, strlen(path), 0, &info->nic));
-      free(path);
-
-      // Same PCI path -> different ports of the same NIC
-      for (int i=0; i<n; i++) if (netInfos[i].nic == info->nic) info->port++;
-
-      // Same GUID -> same network links as the other NIC
-      for (int i=0; i<n; i++) if (netInfos[i].asic == info->asic && netInfos[i].port == info->port) info->net = netInfos[i].net;
-    }
-    INFO(NCCL_GRAPH, "%s -> %x/%lx/%d/%d", info->path, info->nic, info->asic, info->port, info->net);
+  if (link->remNode == NULL) node->nlinks++; // No matching link: the scan stopped on the first unused entry, which becomes a new link
+  link->type = type;
+  link->remNode = remNode;
+  link->width += width; // Accumulates; for a new entry this presumes its width starts at 0 - confirm
+
+  // Sort links in BW descending order
+  struct ncclTopoLink linkSave;
+  memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
+  while (link != node->links) { // Insertion step: shift narrower predecessors one slot toward the end
+    if ((link-1)->width >= linkSave.width) break; // Equal widths are not swapped, so earlier links stay first
+    memcpy(link, link-1, sizeof(struct ncclTopoLink));
+    link--;
   }
+  memcpy(link, &linkSave, sizeof(struct ncclTopoLink)); // Drop the updated link into its sorted position
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
-  for (int g=0; g<system->nodes[GPU].count; g++) {
-    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
-    char* path;
-    NCCLCHECK(getPath(gpu->id, &path));
-    NCCLCHECK(ncclTopoCreatePciPath(system, gpu, path));
-    free(path);
-  }
-
-  // Connect the NICs
-  int netDevCount;
-  NCCLCHECK(ncclNetDevices(&netDevCount));
-  int netWidth;
-  NCCLCHECK(ncclTopoGetNetWidth(&netWidth));
-
-  struct netInfo* netInfos;
-  NCCLCHECK(ncclCalloc(&netInfos, netDevCount));
-
-  for (int n=0; n<netDevCount; n++) {
-    ncclResult_t res = ncclNetPciPath(n, &netInfos[n].path);
-    if (res != ncclSuccess) netInfos[n].path = NULL;
-  }
-
-  NCCLCHECK(ncclTopoComputeNetInfo(netInfos, netDevCount));
-
-  for (int n=0; n<netDevCount; n++) {
-    struct netInfo* info = netInfos+n;
-    // Create NIC and attach it to the PCI tree
-    struct ncclTopoNode* nicNode = NULL;
-    for (int i=0; i<system->nodes[NIC].count; i++) {
-      if (system->nodes[NIC].nodes[i].id == info->nic) {
-        nicNode = system->nodes[NIC].nodes+i;
-        break;
-      }
-    }
-    if (!nicNode) {
-      NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic));
-      if (info->path) {
-        // Create the PCI path
-        NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path));
-      } else {
-        // This is probably a virtual NIC. Just attach it directly to CPU 0
-        int width;
-        NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
-        NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width));
-      }
-    }
-    free(info->path);
-
-    // Create the network side
-    struct ncclTopoNode* netNode;
-    NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, n));
-
-    // Use rank to store the net information
-    netNode->rank = info->net;
-
-    NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth));
-    NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth));
-  }
-  free(netInfos);
-
+ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
   // And connect all CPU nodes together
   for (int n=0; n<system->nodes[CPU].count; n++) {
     for (int p=0; p<system->nodes[CPU].count; p++) {
       if (n == p) continue;
-      int width;
-      NCCLCHECK(ncclTopoGetInterCpuWidth(&width));
-      NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width));
+      float width;
+      NCCLCHECK(ncclTopoGetInterCpuWidth(system->nodes[CPU].nodes+n, &width));
+      NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, width));
     }
   }
   return ncclSuccess;
@@ -491,7 +184,9 @@ ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
 
 static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
   if (node->type == GPU) {
-    sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank);
+    sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
+  } else if (node->type == CPU) {
+    sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
   } else {
     sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
   }
@@ -501,14 +196,14 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
   for (int l=0; l<node->nlinks; l++) {
     struct ncclTopoLink* link = node->links+l;
     if (link->type == LINK_LOC) continue;
-    if (link->remNode != prevNode) {
-      sprintf(line+offset, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width);
+    if (link->type != LINK_PCI || link->remNode != prevNode) {
+      sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->width);
       int nextOffset = strlen(line);
       if (link->type == LINK_PCI) {
         NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
       } else {
         if (link->remNode->type == NET) {
-          sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->rank);
+          sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.width);
         } else {
           sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
         }
@@ -520,7 +215,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
 }
 
 ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
-  INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed);
+  INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f ===", s->maxWidth);
   char line[1024];
   for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
   INFO(NCCL_GRAPH, "==========================================");
@@ -554,88 +249,400 @@ static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode*
 // 1. NVLinks (already the case)
 // 2. PCI down
 // 3. PCI up
-// 4. QPI (already the case)
+// 4. SYS (already the case)
 ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
   for (int n=0; n<system->nodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL));
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
-  struct ncclTopoSystem* s;
-  NCCLCHECK(ncclCalloc(&s, 1));
-  nvmlDevice_t* nvmlDevs;
-  int g = 0;
-  NCCLCHECK(ncclCalloc(&nvmlDevs, comm->nRanks));
-  for (int r=0; r<comm->nRanks; r++) {
-    if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
-      // Consider the GPU as outside of our node if we can't see it through NVML.
-      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-      NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
-      if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+g) != ncclSuccess) continue;
-      g++;
-      struct ncclTopoNode* gpuNode;
-      NCCLCHECK(ncclTopoCreateNode(s, &gpuNode, GPU, comm->peerInfo[r].busId));
-      gpuNode->rank = r;
+ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) { // Creates a NET node from an XML "net" element and links it both ways with its NIC node
+  int dev;
+  NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));
+
+  struct ncclTopoNode* net;
+  NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev)); // The NET node id is the "dev" attribute
+  const char* str;
+  NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
+  if (str) sscanf(str, "0x%lx", &net->net.asic); // sscanf result is not checked; on a parse failure asic keeps the 0 set by ncclTopoCreateNode
+  else net->net.asic = dev; // No guid attribute: fall back to the device index
+
+  ncclDebugNoWarn = NCCL_GRAPH; // Presumably mutes warnings while the optional attributes below are probed - confirm
+  int mbps;
+  if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0;
+  if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
+  net->net.width = mbps / 8000.0; // Mbit/s -> GByte/s
+  if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0;
+  if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
+  if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
+  if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
+  ncclDebugNoWarn = 0;
+
+  NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width)); // Links are one-way, so connect NIC -> NET and NET -> NIC
+  NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) { // Adds a NET node under `nic` for each usable "net" child of the XML "nic" element
+  for (int s=0; s<xmlNic->nSubs; s++) {
+    struct ncclXmlNode* xmlNet = xmlNic->subs[s];
+    if (strcmp(xmlNet->name, "net") != 0) continue; // Only "net" children are considered
+    int index;
+    NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
+    if (index == -1) continue; // Presumably -1 means the "dev" attribute is absent; such entries are skipped - confirm
+    NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic));
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { // Fills the GPU node attributes from the XML "gpu" element; `system` is not used here
+  NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap)); // Any xmlGetAttrInt failure aborts the function through NCCLCHECK
+  NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
+  NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
+  NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
+  // Do not go any further, nvlinks will be added in a second pass
+  return ncclSuccess;
+}
+
+struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, 0 } }; // PCI class string -> topology node type; short keys like "0x03" presumably match by prefix in kvConvertToInt - confirm
+struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { NULL, 0 } }; // x100 Mbps per lane
+ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) { // Turns one XML "pci" element (recursively for PCI-type nodes) into topology nodes linked to `parent`
+  const char* str;
+
+  int type;
+  NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
+  NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass)); // PCI class string -> node type
+
+  int64_t busId;
+  NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
+  NCCLCHECK(busIdToInt64(str, &busId));
+
+  struct ncclTopoNode* node = NULL; // Stays NULL unless a new GPU, NIC or PCI node is created below
+  if (type == GPU) {
+    struct ncclXmlNode* xmlGpu;
+    NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
+    if (xmlGpu == NULL) return ncclSuccess; // GPU-class device without a "gpu" sub-element: ignored
+    int index;
+    NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
+    if (index == -1) return ncclSuccess; // Presumably -1 means no "rank" attribute; such GPUs are ignored - confirm
+    NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
+    NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node));
+  }
+  if (type == NIC) {
+    struct ncclXmlNode* xmlNic;
+    NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
+    if (xmlNic == NULL) return ncclSuccess; // NIC-class device without a "nic" sub-element: ignored
+
+    // Ignore sub device ID and merge multi-port NICs into one PCI device.
+    busId &= 0xfffffffffffffff0;
+    struct ncclTopoNode* nicNode = NULL;
+    NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId));
+    if (nicNode == NULL) { // First time this merged bus id is seen
+      NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
+      node = nicNode; // Connect it to parent later on
+    }
+    NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode)); // Runs for new and existing NIC nodes alike
+  } else if (type == PCI) {
+    NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
+    for (int s=0; s<xmlPci->nSubs; s++) {
+      struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
+      NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node)); // NOTE(review): every sub-element is handled as a pci element, with no name check - confirm
     }
   }
 
-  NCCLCHECK(ncclTopoConnectNVLink(nvmlDevs, s));
-  NCCLCHECK(ncclTopoConnectPCI(s));
+  if (node) {
+    int width, speed;
+    NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
+    NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));
+
+    // Manage cases where speed was not indicated in /sys
+    if (width == 0) width = 16;
+    if (strlen(str) == 0 || strcasecmp(str, "Unknown speed") == 0) str = "8 GT/s";
 
-  free(nvmlDevs);
-  NCCLCHECK(ncclTopoSortSystem(s));
-  *system = s;
+    NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)
+
+    NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0)); // lanes * (100 Mbps units per lane) / 80 = GB/s; one link per direction
+    NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0));
+  }
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink) {
-  int g1, g2;
-  NCCLCHECK(idToIndex(system, busId1, &g1));
-  NCCLCHECK(idToIndex(system, busId2, &g2));
-  *nvlink = g1 != -1 && g2 != -1 && system->nodes[GPU].nodes[g1].paths[GPU][g2].type == LINK_NVL;
+struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } }; // XML "arch" string -> CPU architecture constant
+struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { NULL, 0 } }; // XML "vendor" string -> CPU vendor constant
+
+ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) { // Creates a CPU node from an XML cpu element, then adds the "pci" and "nic" sub-elements listed under it
+  int numaId;
+  NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
+  struct ncclTopoNode* cpu;
+  NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId)); // The CPU node id is the NUMA id
+  const char* str;
+  NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
+  if (str != NULL) { // "affinity" is optional
+    NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity));
+  }
+
+  NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str));
+  NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch));
+  if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) { // vendor is read for x86 only and model for Intel only; otherwise they keep the value set by ncclTopoCreateNode
+    NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str));
+    NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor));
+    if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+      int familyId, modelId;
+      NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
+      NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
+      cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW; // Family 6 with model >= 0x55 is classed as SKL, everything else as BDW
+    }
+  }
+  for (int s=0; s<xmlCpu->nSubs; s++) {
+    struct ncclXmlNode* node = xmlCpu->subs[s];
+    if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu));
+    if (strcmp(node->name, "nic") == 0) { // NIC listed directly under the CPU, with no PCI parent
+      struct ncclTopoNode* nic = NULL;
+      NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0)); // Such NICs share a single NIC node with id 0, looked up across the whole system
+      if (nic == NULL) {
+        NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0));
+        NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH)); // Only the CPU that creates the node gets linked to it
+        NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH));
+      }
+      NCCLCHECK(ncclTopoAddNic(node, system, nic));
+    }
+  }
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) {
-  int g;
-  NCCLCHECK(idToIndex(system, busId, &g));
-  for (int i=0; i<system->nodes[GPU].count; i++) {
-    if (i == g) continue;
-    if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) {
-      *nvlink = 1;
-      return ncclSuccess;
+// Recursively walks the XML tree rooted at `node` and adds an NVLink
+// connection to the topology for every "nvlink" tag it meets.
+//   node        : XML node to process. An "nvlink" tag is turned into a link;
+//                 for any other tag the children are visited.
+//   system      : topology being built. The GPU owning a link must already be
+//                 present, otherwise ncclInternalError is returned.
+//   parentBusId : "busid" attribute of the closest enclosing tag that has one
+//                 (identifies the GPU owning the link); NULL when no such tag
+//                 has been seen yet.
+// Returns ncclSuccess, ncclInternalError (no parent bus id / unknown GPU), or
+// whatever error a helper reports.
+ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
+  if (strcmp(node->name, "nvlink") == 0) {
+    // The walk starts with parentBusId == NULL, so an nvlink tag that is not
+    // nested under a tag with a busid would pass NULL to busIdToInt64.
+    if (parentBusId == NULL) {
+      WARN("Add NVLink error : nvlink tag has no parent bus id");
+      return ncclInternalError;
+    }
+    struct ncclTopoNode* gpu = NULL;
+    int64_t pBusId;
+    NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
+    NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
+    if (gpu == NULL) {
+      WARN("Add NVLink error : could not find GPU %lx", pBusId);
+      return ncclInternalError;
+    }
+    int count;
+    NCCLCHECK(xmlGetAttrInt(node, "count", &count));
+    const char* targetClass;
+    NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
+    int targetType;
+    NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));
+    struct ncclTopoNode* remote = NULL;
+    if (targetType == GPU) {
+      // NVL P2P connection to another GPU
+      const char* target;
+      NCCLCHECK(xmlGetAttrStr(node, "target", &target));
+      int64_t busId;
+      NCCLCHECK(busIdToInt64(target, &busId));
+      NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
+    } else if (targetType == CPU) {
+      // NVL connection to the local CPU
+      NCCLCHECK(findLocalCpu(gpu, &remote));
+    } else {
+      // Any other target class goes to the single NVS node (id 0), which is
+      // created the first time it is needed.
+      if (system->nodes[NVS].count == 0) {
+        NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
+      } else {
+        remote = system->nodes[NVS].nodes;
+      }
+    }
+    // remote stays NULL when the lookup above found nothing; the link is then
+    // skipped without an error.
+    if (remote) {
+      // Per-link width depends on the compute capability of the owning GPU.
+      int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
+      NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
+      // The reverse link is only added here for non-GPU remotes.
+      // NOTE(review): presumably the peer GPU's own nvlink tag adds the
+      // GPU->GPU reverse direction -- confirm.
+      if (remote->type != GPU) {
+        NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
+      }
+    }
+  } else {
+    // Not an nvlink tag: descend, handing down this tag's busid when it has
+    // one and the inherited one otherwise.
+    const char* busId;
+    NCCLCHECK(xmlGetAttr(node, "busid", &busId));
+    for (int s=0; s<node->nSubs; s++) {
+      NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId));
     }
   }
-  *nvlink = 0;
   return ncclSuccess;
 }
 
-static int pathDistance(struct ncclTopoLinkList* links) {
-  int distance = PATH_PIX;
-  if (links->count > 2) distance = PATH_PXB;
-  for (int l=0; l<links->count; l++) {
-    // PHB if we go through 1 CPU, SYS if we go through 2 CPUs
-    if (links->list[l]->remNode->type == CPU) distance = (distance == PATH_PHB) ? PATH_SYS : PATH_PHB;
+// Builds a topology graph from a parsed XML description.
+//   xml        : XML tree expected to contain a "system" tag.
+//   topoSystem : receives a newly allocated ncclTopoSystem.
+// Steps, in order: add every "cpu" child of the system tag (with its devices),
+// add the NVLinks found anywhere under the system tag, then run
+// ncclTopoConnectCpus and ncclTopoSortSystem on the result.
+// Returns ncclInternalError when no "system" tag is found; helper errors are
+// returned unchanged.
+ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
+  // Look the tag up before allocating so that a missing tag leaks nothing.
+  // NOTE(review): xmlFindTag is defined elsewhere; the NULL check covers the
+  // case where it reports "not found" by leaving the node pointer unset.
+  struct ncclXmlNode* topNode = NULL;
+  NCCLCHECK(xmlFindTag(xml, "system", &topNode));
+  if (topNode == NULL) {
+    WARN("Could not find system tag in topology XML");
+    return ncclInternalError;
+  }
+  NCCLCHECK(ncclCalloc(topoSystem, 1));
+  for (int s=0; s<topNode->nSubs; s++) {
+    struct ncclXmlNode* node = topNode->subs[s];
+    if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
+  }
+  // No bus id is known at the root; it is picked up while descending.
+  NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
+
+  NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
+  NCCLCHECK(ncclTopoSortSystem(*topoSystem));
+
+  return ncclSuccess;
+}
+
+// Parameter read through ncclParamTopoDumpFileRank(): ncclTopoGetSystem() only
+// writes the NCCL_TOPO_DUMP_FILE dump on the rank equal to this value.
+// NOTE(review): NCCL_PARAM is defined elsewhere; the trailing 0 is presumably
+// the default value -- confirm.
+NCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0);
+
+// Adds the attribute `attrName` with the decimal text of `value` to `node`,
+// but only when the node has no attribute of that name yet; an existing value
+// is left untouched.
+// Returns ncclSuccess, or the error reported by xmlGetAttrIndex.
+// NOTE(review): nothing here checks that node->attrs still has a free slot
+// before nAttrs is incremented -- confirm the capacity against xml.h.
+static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) {
+  int index;
+  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+  if (index == -1) {
+    index = node->nAttrs++;
+    strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+    // strncpy does not terminate the key when attrName fills all MAX_STR_LEN bytes.
+    node->attrs[index].key[MAX_STR_LEN-1] = '\0';
+    snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
   }
-  return distance;
+  return ncclSuccess;
+}
+// Adds the attribute `attrName` with `value` written as "0x" followed by
+// lowercase hex digits to `node`, but only when the node has no attribute of
+// that name yet; an existing value is left untouched.
+// Returns ncclSuccess, or the error reported by xmlGetAttrIndex.
+// NOTE(review): nothing here checks that node->attrs still has a free slot
+// before nAttrs is incremented -- confirm the capacity against xml.h.
+static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attrName, const uint64_t value) {
+  int index;
+  NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+  if (index == -1) {
+    index = node->nAttrs++;
+    strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+    // strncpy does not terminate the key when attrName fills all MAX_STR_LEN bytes.
+    node->attrs[index].key[MAX_STR_LEN-1] = '\0';
+    // Cast so the conversion matches the argument type on every platform,
+    // whatever uint64_t is a typedef of; the printed text is unchanged.
+    snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%llx", (unsigned long long)value);
+  }
+  return ncclSuccess;
 }
 
-ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance) {
-  int g1, g2;
-  NCCLCHECK(idToIndex(system, busId1, &g1));
-  NCCLCHECK(idToIndex(system, busId2, &g2));
-  *distance = pathDistance(system->nodes[GPU].nodes[g1].paths[GPU]+g2);
+
+// Fills `xml` with the topology description for this communicator and turns
+// it into a topology graph stored in *system. The caller owns `xml`.
+//  1. If NCCL_TOPO_FILE is set, the XML is first loaded from that file.
+//  2. If the XML is still empty, a "system" top tag with a "version"
+//     attribute is created.
+//  3. Every rank sharing our hostHash gets a GPU entry ("rank" is always
+//     set, "gdr" only when not already present).
+//  4. Network devices are added, collnet first so that it has precedence.
+//  5. If NCCL_TOPO_DUMP_FILE is set, the rank selected by TopoDumpFileRank
+//     writes the XML to that file.
+static ncclResult_t topoGetSystemWithXml(struct ncclComm* comm, struct ncclXml* xml, struct ncclTopoSystem** system) {
+  char* xmlTopoFile = getenv("NCCL_TOPO_FILE");
+  if (xmlTopoFile) {
+    NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml));
+  }
+  if (xml->maxIndex == 0) {
+    // Create top tag
+    struct ncclXmlNode* top;
+    NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
+    NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
+  }
+
+  // Auto-detect GPUs if needed
+  for (int r=0; r<comm->nRanks; r++) {
+    if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
+      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+      NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
+      struct ncclXmlNode* node;
+      NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
+      NCCLCHECK(xmlSetAttrInt(node, "rank", r));
+      NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
+    }
+  }
+  // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
+  // so we start with collnet so that it has precedence.
+  int netDevCount = 0;
+  if (ncclCollNet) {
+    NCCLCHECK(collNetDevices(&netDevCount));
+    for (int n=0; n<netDevCount; n++) {
+      ncclNetProperties_t props;
+      NCCLCHECK(collNetGetProperties(n, &props));
+      struct ncclXmlNode* netNode;
+      NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
+      NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
+      NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
+      NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
+      NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
+      NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
+      NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
+      NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
+    }
+  }
+  // The regular net device count is only queried when collnet reported none.
+  if (netDevCount == 0) {
+    NCCLCHECK(ncclNetDevices(&netDevCount));
+  }
+  for (int n=0; n<netDevCount; n++) {
+    ncclNetProperties_t props;
+    NCCLCHECK(ncclNetGetProperties(n, &props));
+    struct ncclXmlNode* netNode;
+    NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
+    NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
+    NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
+    NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
+    NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
+    NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
+    NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
+  }
+
+  xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
+  if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
+    NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
+  }
+
+  NCCLCHECK(ncclTopoGetSystemFromXml(xml, system));
+  return ncclSuccess;
+}
+
+// Detects (or loads from NCCL_TOPO_FILE) the topology of the local node for
+// `comm` and returns it as a topology graph in *system.
+//   comm   : communicator; nRanks, rank and peerInfo[] are read.
+//   system : receives the topology built by ncclTopoGetSystemFromXml.
+// The temporary XML buffer is released on success and on failure alike.
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
+  struct ncclXml* xml;
+  NCCLCHECK(ncclCalloc(&xml, 1));
+  // Run the fallible steps in a helper so that an early error return there
+  // cannot skip the free below.
+  ncclResult_t ret = topoGetSystemWithXml(comm, xml, system);
+  free(xml);
+  if (ret != ncclSuccess) return ret;
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) {
-  int g;
-  NCCLCHECK(idToIndex(system, busId, &g));
-  *distance = pathDistance(system->nodes[GPU].nodes[g].paths[NET]+netDev);
+/****************************/
+/* External query functions */
+/****************************/
+
+// Copies the arch, vendor and model stored on the first CPU node (index 0)
+// of `system` into *arch, *vendor and *model. Always returns ncclSuccess.
+// NOTE(review): nodes[CPU].count is not consulted here -- confirm callers
+// only use this once at least one CPU node has been added.
+ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model) {
+  struct ncclTopoNode* firstCpu = system->nodes[CPU].nodes;
+  *arch = firstCpu->cpu.arch;
+  *vendor = firstCpu->cpu.vendor;
+  *model = firstCpu->cpu.model;
   return ncclSuccess;
 }
 
-ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count) {
-  *count = system->nodes[CPU].count;
+// Parameter read through ncclParamIgnoreCpuAffinity(): when non-zero,
+// ncclTopoSetAffinity() applies the CPU node's affinity mask as-is instead of
+// intersecting it with the mask the process currently has.
+// NOTE(review): NCCL_PARAM is defined elsewhere; the trailing 0 is presumably
+// the default value -- confirm.
+NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
+
+// Sets the CPU affinity of the caller (sched_setaffinity with pid 0) to the
+// cores of the CPU node closest to the GPU used by `rank`.
+//   system : topology whose GPU and CPU nodes (and GPU->CPU paths) are read.
+//   rank   : rank whose GPU is looked up through gpu.rank.
+// The closest CPU is the one whose path from the GPU has the fewest hops
+// (first one wins on ties). Unless IgnoreCpuAffinity is set, its mask is
+// intersected with the current affinity; an empty result leaves the affinity
+// unchanged.
+// Returns ncclInternalError when no GPU has that rank or the topology has no
+// CPU node; syscall and helper failures go through SYSCHECK / NCCLCHECK.
+ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) {
+  struct ncclTopoNode* cpu = NULL, *gpu = NULL;
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
+      gpu = system->nodes[GPU].nodes+g;
+      // Find closer CPU
+      int cpuIndex = -1, minHops = 0;
+      for (int c=0; c<system->nodes[CPU].count; c++) {
+        int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
+        if (cpuIndex == -1 || nHops < minHops) {
+          cpuIndex = c;
+          minHops = nHops;
+        }
+      }
+      // cpuIndex is still -1 when there is no CPU node at all; keep cpu NULL
+      // in that case so the check below reports it instead of forming a
+      // pointer one element before the array.
+      if (cpuIndex != -1) {
+        cpu = system->nodes[CPU].nodes+cpuIndex;
+      }
+    }
+  }
+  if (cpu == NULL) {
+    WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
+    return ncclInternalError;
+  }
+
+  // Query the CPU affinity set we were provided
+  cpu_set_t mask;
+  SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
+
+#ifdef ENABLE_TRACE
+  {
+    char affinityStr[sizeof(cpu_set_t)*2];
+    NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
+    TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
+  }
+#endif
+
+  // Get the affinity of the CPU close to our GPU.
+  cpu_set_t cpuMask = cpu->cpu.affinity;
+
+#ifdef ENABLE_TRACE
+  {
+    char affinityStr[sizeof(cpu_set_t)*2];
+    NCCLCHECK(ncclCpusetToStr(&cpuMask, affinityStr));
+    TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
+  }
+#endif
+
+  cpu_set_t finalMask;
+  if (ncclParamIgnoreCpuAffinity()) {
+    // Ignore the CPU affinity set and use the GPU one instead
+    finalMask = cpuMask;
+  } else {
+    // Use a subset of the GPU affinity set
+    CPU_AND(&finalMask, &mask, &cpuMask);
+  }
+
+  // If there is a non empty set, use it to set affinity
+  if (CPU_COUNT(&finalMask)) {
+    char affinityStr[sizeof(cpu_set_t)*2];
+    NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
+    INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
+    SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
+  }
   return ncclSuccess;
 }