github.com/marian-nmt/nccl.git
author     Sylvain Jeaugey <sjeaugey@nvidia.com>  2020-01-17 03:02:42 +0300
committer  Sylvain Jeaugey <sjeaugey@nvidia.com>  2020-03-21 00:58:36 +0300
commit     b221128ecacf4ce1b3054172b9f30163307042c5
tree       43aa7da7992fea7ce30b8cc3e6220bc56f93dd16
parent     c38f174bd436031dbc79dce19ff969f377976a8a
2.6.4-1

Add support for network collectives.
Add support for XML topology dump/injection.
Add text values for GDR and P2P Levels, including "NVL".
Add speed detection for PCI, Infiniband and Ethernet cards.
Add CPU detection for ARM and AMD CPUs.
Add support for adaptive routing on Infiniband.
Change NET plugin API to v3: merge PCI path and GPU pointer capability
into a single structure and add other properties.
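The NET v3 API change is the one plugin-facing break in this release: the separate PCI-path and pointer-capability queries of v2 are folded into a single properties call. Below is a rough sketch of the merged structure; the field set is inferred from the commit message and diffstat and may not match src/include/nccl_net.h exactly.

    #include <stdint.h>

    /* Sketch of the merged v3 properties structure (assumed layout). */
    typedef struct {
      char*    name;        /* device name, mostly for logging */
      char*    pciPath;     /* path to the PCI device in /sys (separate query in v2) */
      uint64_t guid;        /* unique NIC identifier, for multi-function cards */
      int      ptrSupport;  /* NCCL_PTR_HOST and/or NCCL_PTR_CUDA (separate query in v2) */
      int      speed;       /* port speed in Mbps, now auto-detected */
      int      port;        /* port number */
      int      maxComms;    /* maximum number of comms supported */
    } ncclNetProperties_v3_t;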
-rw-r--r--  makefiles/common.mk                      |    2
-rw-r--r--  makefiles/version.mk                     |    4
-rw-r--r--  src/Makefile                             |    6
-rw-r--r--  src/channel.cc                           |   24
-rw-r--r--  src/collectives/device/all_gather.h      |    9
-rw-r--r--  src/collectives/device/all_reduce.h      |  118
-rw-r--r--  src/collectives/device/broadcast.h       |    9
-rw-r--r--  src/collectives/device/common.h          |    3
-rw-r--r--  src/collectives/device/functions.cu      |    3
-rw-r--r--  src/collectives/device/primitives.h      |    4
-rw-r--r--  src/collectives/device/reduce.h          |    9
-rw-r--r--  src/collectives/device/reduce_scatter.h  |    9
-rw-r--r--  src/debug.cc                             |    6
-rw-r--r--  src/enqueue.cc                           |   67
-rw-r--r--  src/graph/connect.cc                     |   53
-rw-r--r--  src/graph/paths.cc                       |  285
-rw-r--r--  src/graph/search.cc                      |  662
-rw-r--r--  src/graph/topo.cc                        |  941
-rw-r--r--  src/graph/topo.h                         |  146
-rw-r--r--  src/graph/tuning.cc                      |  101
-rw-r--r--  src/graph/xml.cc                         |  780
-rw-r--r--  src/graph/xml.h                          |  237
-rw-r--r--  src/include/align.h                      |   19
-rw-r--r--  src/include/alloc.h                      |   15
-rw-r--r--  src/include/checks.h                     |    4
-rw-r--r--  src/include/coll_net.h                   |   34
-rw-r--r--  src/include/collectives.h                |    3
-rw-r--r--  src/include/comm.h                       |   12
-rw-r--r--  src/include/core.h                       |    3
-rw-r--r--  src/include/cpuset.h                     |    4
-rw-r--r--  src/include/debug.h                      |    5
-rw-r--r--  src/include/devcomm.h                    |   14
-rw-r--r--  src/include/graph.h                      |   57
-rw-r--r--  src/include/info.h                       |    4
-rw-r--r--  src/include/nccl_net.h                   |  101
-rw-r--r--  src/include/net.h                        |   53
-rw-r--r--  src/include/socket.h                     |    2
-rw-r--r--  src/include/transport.h                  |    4
-rw-r--r--  src/include/utils.h                      |    4
-rw-r--r--  src/init.cc                              |  396
-rw-r--r--  src/misc/utils.cc                        |    4
-rw-r--r--  src/transport.cc                         |   13
-rw-r--r--  src/transport/coll_net.cc                |  430
-rw-r--r--  src/transport/net.cc                     |   44
-rw-r--r--  src/transport/net_ib.cc                  |  119
-rw-r--r--  src/transport/net_socket.cc              |   75
-rw-r--r--  src/transport/p2p.cc                     |   80
-rw-r--r--  src/transport/shm.cc                     |    4
48 files changed, 3602 insertions(+), 1379 deletions(-)
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 2e44826..ece18c7 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/makefiles/version.mk b/makefiles/version.mk
index 05abbc7..883e625 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
-NCCL_MINOR := 5
-NCCL_PATCH := 7
+NCCL_MINOR := 6
+NCCL_PATCH := 4
NCCL_SUFFIX :=
PKG_REVISION := 1
diff --git a/src/Makefile b/src/Makefile
index b11de5e..db1698a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -11,9 +11,9 @@ include ../makefiles/version.mk
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
- transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
+ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
- graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc
+ graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
##### lib files
LIBNAME := libnccl.so
diff --git a/src/channel.cc b/src/channel.cc
index b053e5b..0a43e17 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -6,24 +6,32 @@
#include "channel.h"
#include "param.h"
+#include "graph.h"
-NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
+#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
+#define DEFAULT_BUFFER_SIZE_BYTES_ARM (1LL << 20) /* 1MiB */
+
+NCCL_PARAM(Buffsize, "BUFFSIZE", -2);
ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
struct ncclChannel* channel = comm->channels+channelid;
channel->id = channelid;
// Setup intermediate buffering
- channel->buffSize = ncclParamBuffsize();
+ int buffSize = ncclParamBuffsize();
+ int cpuArch, cpuVendor, cpuModel;
+ NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
+ channel->buffSize = buffSize != -2 ? buffSize :
+ cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES;
// Ring index to user rank table.
NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
// Communication structures with peers.
- NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
- NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
- for (size_t i=0; i<comm->nRanks; ++i) {
+ NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network)
+ NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1));
+ for (size_t i=0; i<comm->nRanks+1; ++i) {
channel->peers[i].send.comm = comm;
channel->peers[i].recv.comm = comm;
}
@@ -42,9 +50,13 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
CUDACHECK(cudaFree(channel->ring.devUserRanks));
// Free transport proxy resources
- for (int r=0; r<nRanks; r++) {
+ // Note: free all send resources first due to CollNet arrangement
+ for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
+ }
+ for (int r=0; r<nRanks+1; r++) {
+ struct ncclPeer* peer = channel->peers+r;
if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
}
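The nRanks+1 sizing above is the key CollNet change in this file: the peer tables gain one extra slot, indexed by nRanks, that stands in for the network itself (the CollNet root). A minimal self-contained sketch of the idea, with a toy struct in place of the NCCL types:

    #include <stdlib.h>

    /* Toy model of the nRanks+1 allocation above: slot nRanks addresses the
     * CollNet root (the network) as if it were one more peer. */
    struct Peer { int connected; };

    static struct Peer* allocPeers(int nRanks) {
      return (struct Peer*)calloc(nRanks + 1, sizeof(struct Peer));
    }

    int main(void) {
      int nRanks = 8;
      struct Peer* peers = allocPeers(nRanks);
      peers[nRanks].connected = 1;  /* connect the network "rank" like any peer */
      free(peers);
      return 0;
    }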
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index 0ad5ba9..059092c 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -69,6 +69,9 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { }
+
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
@@ -130,6 +133,9 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { }
+
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
@@ -193,3 +199,6 @@ __device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllGatherCollNetLL128Kernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index 2449c2b..173b5fa 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -106,7 +106,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
do {
struct ncclTree* tree = &channel->treeUp;
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
- ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL/2, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
@@ -124,7 +124,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
do {
struct ncclTree* tree = &channel->treeDn;
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
- ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL/2, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
@@ -140,6 +140,62 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
} while(0);
}
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads-WARP_SIZE;
+ const int bid = args->bid;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ const ssize_t size = args->N;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ int chunkSize = args->lastChunkSize;
+ const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nChannels*chunkSize;
+
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ }
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
+ struct ncclTree* tree = &channel->collTreeUp;
+ ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.send(thisInput+offset, nelem);
+ } else {
+ prims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ }
+
+ if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
+ struct ncclTree* tree = &channel->collTreeDn;
+ ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.send(thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.recv(thisOutput+offset, nelem);
+ } else {
+ prims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
+ }
+}
+
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
@@ -271,6 +327,61 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
} while(0);
}
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->bid;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ const ssize_t size = args->N;
+ ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nChannels*chunkSize;
+
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ }
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
+ struct ncclTree* tree = &channel->collTreeUp;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ LLprims.send(thisInput+offset, nelem);
+ } else {
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ }
+
+ if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
+ struct ncclTree* tree = &channel->collTreeDn;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ LLprims.send(thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ LLprims.recv(thisOutput+offset, nelem);
+ } else {
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
+ }
+}
+
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
@@ -408,3 +519,6 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
}
}
}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
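Both CollNet kernels above use the same grid split: the kernel runs on twice args->nChannels blocks, the first half walking collTreeUp (reduce) and the second half collTreeDn (broadcast), with both halves addressing the same data slice. A toy model of that block-to-role mapping (plain C sketch, not the CUDA kernel):

    #include <stdio.h>

    /* Toy model of the CollNet grid split: block i works on data slice
     * i % nChannels, matching bid = bid % nChannels set in enqueue.cc. */
    int main(void) {
      int nChannels = 2;
      for (int block = 0; block < 2*nChannels; block++) {
        const char* role = block < nChannels ? "reduce (collTreeUp)"
                                             : "broadcast (collTreeDn)";
        printf("block %d -> data slice %d, %s\n", block, block % nChannels, role);
      }
      return 0;
    }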
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index de8b989..5146682 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -54,6 +54,9 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
template<int UNROLL, class FUNC, typename T>
__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) { }
+
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
@@ -101,6 +104,9 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) { }
+
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
@@ -148,3 +154,6 @@ __device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclBroadcastCollNetLL128Kernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index 46eb9f5..6e06369 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -102,7 +102,8 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
- IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING)
+ IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) \
+ IMPL_COLL4(coll##CollNet, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_COLLNET)
#if NCCL_TYPE == 0
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index 034fe96..d10f11e 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -17,7 +17,8 @@ __device__ volatile uint64_t* ncclShmem;
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Tree, op, dtype), \
- NCCL_FUNC5(coll##Ring, op, dtype)
+ NCCL_FUNC5(coll##Ring, op, dtype), \
+ NCCL_FUNC5(coll##CollNet, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index b624359..c1067bf 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -227,7 +227,7 @@ class ncclPrimitives {
recvStep[i] = conn->step;
recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
recvDirectBuff[i] = NULL;
- if (directBuff && conn->direct) {
+ if (directBuff && (conn->direct & NCCL_DIRECT_GPU)) {
recvDirectBuff[i] = directBuff;
if (tid == 0) *conn->ptrExchange = directBuff;
}
@@ -254,7 +254,7 @@ class ncclPrimitives {
sendStep[i] = conn->step;
sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
sendDirectBuff[i] = NULL;
- if (directBuff && conn->direct) {
+ if (directBuff && (conn->direct & NCCL_DIRECT_GPU)) {
void* volatile* ptr = conn->ptrExchange;
while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
barrier();
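Note that conn->direct is no longer a plain boolean: it becomes a bit field so that GPU-to-GPU direct access and NIC direct access can be flagged independently, and the primitives now test the specific NCCL_DIRECT_GPU bit. A self-contained sketch of the flag test; the 0x01/0x10 values are illustrative, the real definitions live in src/include/devcomm.h:

    #include <stdio.h>

    /* Illustrative flag values (assumed). */
    #define NCCL_DIRECT_GPU 0x01  /* direct buffer access between GPUs (P2P) */
    #define NCCL_DIRECT_NIC 0x10  /* direct buffer access from the NIC (GDR) */

    int main(void) {
      int direct = NCCL_DIRECT_NIC;  /* e.g. a GDR-only connection */
      /* The old test `if (direct)` would wrongly enable the GPU direct path here. */
      printf("GPU direct: %s\n", (direct & NCCL_DIRECT_GPU) ? "yes" : "no");
      printf("NIC direct: %s\n", (direct & NCCL_DIRECT_NIC) ? "yes" : "no");
      return 0;
    }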
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index 0680abe..e36613f 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -50,6 +50,9 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclReduceCollNetKernel(struct CollectiveArgs* args) { }
+
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
@@ -94,6 +97,9 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceCollNetLLKernel(struct CollectiveArgs* args) { }
+
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
@@ -138,3 +144,6 @@ __device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index 1985148..0b0ae81 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -64,6 +64,9 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclReduceScatterCollNetKernel(struct CollectiveArgs* args) { }
+
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
@@ -122,6 +125,9 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceScatterCollNetLLKernel(struct CollectiveArgs* args) { }
+
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
@@ -182,3 +188,6 @@ __device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceScatterCollNetLL128Kernel(struct CollectiveArgs* args) { }
diff --git a/src/debug.cc b/src/debug.cc
index 03a77ae..b2fc03c 100644
--- a/src/debug.cc
+++ b/src/debug.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -107,7 +107,6 @@ void ncclDebugInit() {
if (debugFn[0] != '\0') {
FILE *file = fopen(debugFn, "w");
if (file != NULL) {
- INFO(NCCL_ALL,"DEBUG file is '%s'", debugFn);
ncclDebugFile = file;
}
}
@@ -125,7 +124,7 @@ void ncclDebugInit() {
*/
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
if (ncclDebugLevel == -1) ncclDebugInit();
- if (ncclDebugNoWarn == 1 && level == NCCL_LOG_WARN) level = NCCL_LOG_INFO;
+ if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
char hostname[1024];
getHostName(hostname, 1024, '.');
@@ -135,7 +134,6 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
char buffer[1024];
size_t len = 0;
pthread_mutex_lock(&ncclDebugLock);
- if (ncclDebugNoWarn && ncclDebugLevel == NCCL_LOG_WARN) printf("WARN -> INFO\n");
if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
len = snprintf(buffer, sizeof(buffer),
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 2239865..92f3467 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -1,11 +1,12 @@
/*************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "argcheck.h"
+#include "coll_net.h"
// Only generate inline kernels for LL
#define NCCL_FUNC5(coll, op, dtype) \
@@ -15,7 +16,8 @@
#define NCCL_FUNC4(coll, op, dtype) \
(void*)NCCL_FUNC5(coll##Tree, op, dtype), \
- (void*)NCCL_FUNC5(coll##Ring, op, dtype)
+ (void*)NCCL_FUNC5(coll##Ring, op, dtype), \
+ (void*)NCCL_FUNC5(coll##CollNet, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
@@ -227,28 +229,23 @@ ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
/* Enqueueing system : computation of kernel and proxy operations parameters */
/*****************************************************************************/
-// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
-// factor is not ideal but works quite well. Powers of two, 64 B to 1 GB.
-static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
- { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .5, .5, .6, .7, .8, .9, .9, 1.0, 1.0, 1.0 },
- { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .8, .7, .7, .7, .6, .6, .7, .7, .8, .8, .9, .9, 1.0 },
- { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .5, .5, .6, .6, .7, .8, .9 }
-};
-
static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
struct ncclComm* comm = info->comm;
- float minTime = 3600000.0; // Hopefully no operation will take an hour to complete.
+ float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
// Find algorithm / protocol.
info->algorithm = -1;
info->protocol = -1;
- for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ int nAlgos = NCCL_NUM_ALGORITHMS;
+ // Check collNet support
+ int collNetTypeSupport = 0;
+ if (info->comm->collNetSupport)
+ NCCLCHECK(collNetReduceSupport(info->datatype, info->op, &collNetTypeSupport));
+ if (collNetTypeSupport != 1) nAlgos--;
+ for (int a=0; a<nAlgos; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- float bw = comm->bandwidths[info->coll][a][p];
- if (bw == 0) continue;
- int logSize = log2i(info->nBytes>>6);
- if (a == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[p][logSize];
- float time = comm->latencies[info->coll][a][p] + (info->nBytes) / (1000 * bw);
- if (time < minTime) {
+ float time;
+ NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, &time));
+ if (time >= 0 && time < minTime) {
info->algorithm = a;
info->protocol = p;
minTime = time;
@@ -259,14 +256,14 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
WARN("Error : no algorithm/protocol available");
return ncclInternalError;
}
- //if (comm->rank == 0) INFO(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %d", info->nBytes, info->algorithm, info->protocol, minTime);
+ //if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
- int nc = comm->nChannels;
- int nt = comm->maxThreads[info->protocol];
+ int nc = (info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
+ int nt = comm->maxThreads[info->algorithm][info->protocol];
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
while (info->nBytes < nc*nt*threadThreshold) {
- if (nc >= 2) nc--;
+ if (info->algorithm != NCCL_ALGO_COLLNET && nc >= 2) nc--;
else if ((nt % 128) == 0) nt/=2;
else break;
}
@@ -286,7 +283,7 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
case ncclCollAllGather:
info->pattern = ncclPatternRing; break;
case ncclCollAllReduce:
- info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
+ info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
default:
WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
return ncclInternalError;
@@ -301,6 +298,8 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
case ncclPatternTreeUpDown:
case ncclPatternPipelineFrom:
case ncclPatternPipelineTo:
+ case ncclPatternCollTreeUp:
+ case ncclPatternCollTreeDown:
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
case ncclPatternRing:
info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
@@ -345,6 +344,13 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
}
// Use lastChunkSize as chunkSize
coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+ } else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
+ // Optimize chunkSize / nSteps
+ while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*16 && chunkSize > 131072) chunkSize /= 2;
+ while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
+ while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth && chunkSize > 32768) chunkSize /= 2;
+ // Use lastChunkSize as chunkSize
+ coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->protocol == NCCL_PROTO_LL) {
int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
@@ -369,6 +375,8 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
proxyArgs->chunkSteps = chunkSteps;
proxyArgs->protocol = info->protocol;
proxyArgs->opCount = info->comm->opCount;
+ proxyArgs->dtype = info->datatype;
+ proxyArgs->redOp = info->op;
TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
nLoops, proxyArgs->nsteps, info->comm);
@@ -395,8 +403,11 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
WARN("Error : mixing different streams within a group call is not supported.");
return ncclInvalidUsage;
}
- for (int bid=0; bid<coll.args.nChannels; bid++) {
- struct ncclChannel* channel = info->comm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels);
+
+ int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
+ for (int bid=0; bid<coll.args.nChannels*nSubChannels; bid++) {
+ int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
+ struct ncclChannel* channel = info->comm->channels+channelId;
if (channel->collCount == NCCL_MAX_OPS) {
WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
@@ -405,6 +416,10 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
// Proxy
proxyArgs.channel = channel;
+ // Adjust pattern for CollNet based on channel index
+ if (nSubChannels == 2) {
+ info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown;
+ }
NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
info->comm->myParams->gridDim.x++;
@@ -416,7 +431,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
memcpy(c, &coll, sizeof(struct ncclColl));
- c->args.bid = bid;
+ c->args.bid = bid % coll.args.nChannels;
c->active = 1;
opIndex = (opIndex+1)%NCCL_MAX_OPS;
c->nextIndex = opIndex;
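Algorithm/protocol selection is now fully model-driven: the hard-coded tree correction table is gone, ncclTopoGetAlgoTime (in graph/tuning.cc) supplies a predicted time per (algorithm, protocol) pair, and CollNet is simply skipped when collNetReduceSupport rejects the datatype/op. A self-contained sketch of the selection loop, with a toy latency-plus-bandwidth cost model standing in for ncclTopoGetAlgoTime:

    #include <float.h>
    #include <stddef.h>
    #include <stdio.h>

    #define NUM_ALGOS 3   /* tree, ring, collnet -- collnet assumed last */
    #define NUM_PROTOS 3  /* LL, LL128, simple */

    /* Toy stand-in for ncclTopoGetAlgoTime: latency plus bytes over
     * bandwidth, returning -1 when the combination is unavailable. */
    static float algoTime(float lat[NUM_ALGOS][NUM_PROTOS],
                          float bw[NUM_ALGOS][NUM_PROTOS],
                          int a, int p, size_t nBytes) {
      if (bw[a][p] == 0) return -1.0f;
      return lat[a][p] + nBytes / (1000.0f * bw[a][p]);
    }

    int main(void) {
      float lat[NUM_ALGOS][NUM_PROTOS] = {{4,5,6},{5,6,8},{6,7,10}};
      float bw [NUM_ALGOS][NUM_PROTOS] = {{8,20,40},{10,24,48},{0,0,24}};
      int collNetTypeSupport = 0;  /* as if collNetReduceSupport said no */
      int nAlgos = NUM_ALGOS; if (collNetTypeSupport != 1) nAlgos--;
      size_t nBytes = 1 << 20;
      int algo = -1, proto = -1; float minTime = FLT_MAX;
      for (int a = 0; a < nAlgos; a++)
        for (int p = 0; p < NUM_PROTOS; p++) {
          float t = algoTime(lat, bw, a, p, nBytes);
          if (t >= 0 && t < minTime) { minTime = t; algo = a; proto = p; }
        }
      printf("chose algo %d proto %d, predicted %.1f us\n", algo, proto, minTime);
      return 0;
    }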
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index af481d2..dd9f9f0 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -14,7 +14,7 @@
/******************************************************************/
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
- struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
+ struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoRanks* topoRanks) {
int rank = comm->rank;
int localRanks = comm->localRanks;
@@ -27,9 +27,14 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1;
channel->treeDn.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1;
+ channel->collTreeUp.up = -1;
+ for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeUp.down[i] = -1;
+ channel->collTreeDn.up = -1;
+ for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeDn.down[i] = -1;
int* ringIntra = ringGraph->intra+c*localRanks;
int* treeIntra = treeGraph->intra+c*localRanks;
+ int* collNetIntra = collNetGraph->intra+c*localRanks;
for (int i=0; i<localRanks; i++) {
if (ringIntra[i] == rank) {
@@ -57,6 +62,16 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
channel->treeUp.down[0] = sym ? channel->treeDn.down[0] : channel->treeDn.up ;
channel->treeUp.up = sym ? channel->treeDn.up : channel->treeDn.down[0];
}
+ if (collNetIntra[i] == rank) {
+ int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
+
+ // CollTrees are always symmetric, i.e.
+ // up/down go in reverse directions
+ channel->collTreeDn.up = collNetIntra[prev];
+ channel->collTreeDn.down[0] = collNetIntra[next];
+ channel->collTreeUp.down[0] = channel->collTreeDn.down[0];
+ channel->collTreeUp.up = channel->collTreeDn.up;
+ }
}
topoRanks->ringPrev[c] = channel->ring.prev;
topoRanks->ringNext[c] = channel->ring.next;
@@ -174,6 +189,40 @@ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* tr
return ncclSuccess;
}
+ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank) {
+ int nranks = comm->nRanks;
+ int depth = nranks/comm->nNodes;
+ int sendIndex = collNetGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern
+ int sendEndIndex = (sendIndex+comm->localRanks-1)%comm->localRanks;
+ for (int c=0; c<comm->nChannels/2; c++) {
+ struct ncclChannel* channel = comm->channels+c;
+ // Set root of collTree to id nranks
+ if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master
+ channel->collTreeUp.up = channel->collTreeDn.up = nranks;
+ }
+ if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
+ channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
+ }
+ channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
+ INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTreeUp.up, channel->collTreeUp.down[0]);
+ }
+ int recvIndex = 0; // recv GPU index is always 0
+ int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks;
+ for (int c=0; c<comm->nChannels/2; c++) {
+ struct ncclChannel* channel = comm->channels+comm->nChannels/2+c;
+ // Set root of collTree to id nranks
+ if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master
+ channel->collTreeUp.up = channel->collTreeDn.up = nranks;
+ }
+ if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
+ channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
+ }
+ channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
+ INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTreeDn.up, channel->collTreeDn.down[0]);
+ }
+ return ncclSuccess;
+}
+
// Legacy naming
NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
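A worked example for the index arithmetic in ncclTopoConnectCollNet, assuming 2 nodes of 4 local ranks each: depth = 8/2 = 4; with the split pattern, sendIndex = 1 so sendEndIndex = (1+4-1)%4 = 0, while the recv side always starts at 0 so recvEndIndex = 3; the master GPUs get rank id nranks (the network) as their up link. A self-contained sketch:

    #include <stdio.h>

    /* Worked example of the CollNet chain endpoints (toy values). */
    int main(void) {
      int localRanks = 4, nNodes = 2;
      int nranks = localRanks * nNodes;                              /* 8 */
      int depth = nranks / nNodes;                                   /* 4 */
      int sendIndex = 1;                       /* split pattern; TREE would use 0 */
      int sendEndIndex = (sendIndex + localRanks - 1) % localRanks;  /* 0 */
      int recvIndex = 0;                       /* recv GPU index is always 0 */
      int recvEndIndex = (recvIndex + localRanks - 1) % localRanks;  /* 3 */
      printf("depth %d, send master/end %d/%d, recv master/end %d/%d, root id %d\n",
             depth, sendIndex, sendEndIndex, recvIndex, recvEndIndex, nranks);
      return 0;
    }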
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index eba1964..0872ae7 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -42,7 +42,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
basePath->count = 0;
basePath->width = LOC_WIDTH;
- basePath->type = LINK_LOC;
+ basePath->type = PATH_LOC;
while (nodeList.count) {
nextNodeList.count = 0;
@@ -58,7 +58,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
}
struct ncclTopoLinkList* remPath;
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
- int width = std::min(path->width, link->width);
+ float width = std::min(path->width, link->width);
if (remPath->width < width) {
// Find reverse link
for (int l=0; l<remNode->nlinks; l++) {
@@ -68,8 +68,8 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
}
}
if (remPath->list[0] == NULL) {
- WARN("Failed to find reverse path from remNode id %d type %d nlinks %d to node id %d type %d",
- remNode->id, remNode->type, remNode->nlinks, node->id, node->type);
+ WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
+ remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
return ncclInternalError;
}
// Copy the rest of the path
@@ -77,9 +77,17 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
remPath->count = path->count + 1;
remPath->width = width;
- // Consider the path is QPI when going through the CPU
- // Also don't consider LINK_NET as we only care about the NIC->GPU path.
- int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type;
+ // Start with path type = link type. PATH and LINK types are supposed to match.
+ // Don't consider LINK_NET as we only care about the NIC->GPU path.
+ int type = link->type == LINK_NET ? 0 : link->type;
+ // Differentiate between one and multiple PCI switches
+ if (type == PATH_PIX && (node->type == PCI || link->remNode->type == PCI) && remPath->count > 3) type = PATH_PXB;
+ // Consider a path going through the CPU as PATH_PHB
+ if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
+ // Ignore Power CPU in an NVLink path
+ if (path->type == PATH_NVL && type == PATH_SYS && link->remNode->type == CPU &&
+ link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) type = 0;
+
remPath->type = std::max(path->type, type);
// Add to the list for the next iteration if not already in the list
@@ -117,9 +125,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
offset = strlen(line);
}
- INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width);
+ INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].width);
#else
- sprintf(line+offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type);
+ sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, topoPathTypeStr[node->paths[t][n].type]);
offset = strlen(line);
#endif
}
@@ -171,7 +179,7 @@ static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int
// Update path characteristics
srcNode->paths[t2][i2].count = l;
- srcNode->paths[t2][i2].type = LINK_QPI;
+ srcNode->paths[t2][i2].type = std::max(srcNode->paths[CPU][c].type, cpuNode->paths[t2][i2].type);
srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width);
return ncclSuccess;
}
@@ -194,6 +202,127 @@ static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType)
}
}
+static const int levelsOldToNew[] = { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS };
+ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelEnv) {
+ if (*level == -1) {
+ int l = -1;
+ if (disableEnv) {
+ char* str = getenv(disableEnv);
+ if (str) {
+ int disable = strtol(str, NULL, 0);
+ if (disable == 1) l = 0;
+ }
+ }
+ if (l == -1) {
+ char* str = getenv(levelEnv);
+ if (str) {
+ for (int i=0; i<PATH_NET; i++) {
+ if (strcmp(str, topoPathTypeStr[i]) == 0) {
+ l = i;
+ break;
+ }
+ }
+ // Old style numbering
+ if (l == -1 && str[0] >= '0' && str[0] <= '9') {
+ int oldLevel = strtol(str, NULL, 0);
+ const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1;
+ if (oldLevel > maxOldLevel) oldLevel = maxOldLevel;
+ l = levelsOldToNew[oldLevel];
+ }
+ }
+ }
+ if (l >= 0) INFO(NCCL_GRAPH, "%s set from environment to %s", levelEnv, topoPathTypeStr[l]);
+ *level = l >= 0 ? l : -2;
+ }
+ return ncclSuccess;
+}
+
+int ncclTopoUserP2pLevel = -1;
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p) {
+ *p2p = 0;
+
+ // Get GPUs from topology
+ int g1, g2;
+ NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1));
+ struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1;
+ if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) {
+ // GPU not found, we can't use p2p.
+ return ncclSuccess;
+ }
+ struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
+
+ // In general, use P2P whenever we can.
+ int p2pLevel = PATH_SYS;
+
+ // Don't use P2P through ARM CPUs
+ int arch, vendor, model;
+ NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
+ if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
+ if (arch == NCCL_TOPO_CPU_ARCH_X86 &&
+ vendor == NCCL_TOPO_CPU_VENDOR_INTEL &&
+ model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
+
+ // User override
+ NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL"));
+ if (ncclTopoUserP2pLevel != -2) p2pLevel = ncclTopoUserP2pLevel;
+
+ // Compute the PCI distance and compare with the p2pLevel.
+ if (path->type <= p2pLevel) *p2p = 1;
+
+ return ncclSuccess;
+}
+
+NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
+int ncclTopoUserGdrLevel = -1;
+
+ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) {
+ *useGdr = 0;
+
+ // Get GPU and NET
+ int n, g;
+ NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n));
+ struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+ NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+
+ // Check that both the NIC and GPUs support it
+ if (net->net.gdrSupport == 0) return ncclSuccess;
+ if (gpu->gpu.gdrSupport == 0) return ncclSuccess;
+
+ if (read) { // For reads (sends) only enable under certain conditions
+ int gdrReadParam = ncclParamNetGdrRead();
+ if (gdrReadParam == 0) return ncclSuccess;
+ if (gdrReadParam < 0) {
+ int nvlink = 0;
+ // Since we don't know whether there are other communicators,
+ // it's better to keep things local if we have a single GPU.
+ if (system->nodes[GPU].count == 1) nvlink = 1;
+ for (int i=0; i<system->nodes[GPU].count; i++) {
+ if (i == g) continue;
+ if (gpu->paths[GPU][i].type == PATH_NVL) {
+ nvlink = 1;
+ break;
+ }
+ }
+ if (!nvlink) return ncclSuccess;
+ }
+ }
+
+ // Check if we are close enough that it makes sense to enable GDR
+ int netGdrLevel = PATH_PXB;
+ NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL"));
+ if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel;
+ int distance = gpu->paths[NET][n].type;
+ if (distance > netGdrLevel) {
+ INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
+ return ncclSuccess;
+ }
+
+ *useGdr = 1;
+ INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
+ return ncclSuccess;
+}
+
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
// Precompute paths between GPUs/NICs.
@@ -210,26 +339,29 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
// Compute paths to GPU g
NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system));
- if (peerInfos == NULL) continue;
- // Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM
- struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank;
+ // Update path when we don't want to / can't use GPU Direct P2P
for (int p=0; p<system->nodes[GPU].count; p++) {
- if (p == g) continue;
- struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank;
int p2p;
- NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
+ NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p));
if (p2p == 0) {
- int shm;
- NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
- if (shm == 1) {
- // We cannot use GPU Direct, so we need all traffic to go through a CPU
- int cpu;
- NCCLCHECK(getLocalCpu(system, g, &cpu));
- NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
- } else {
- // We cannot communicate with that peer.
- system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
- }
+ // Divert all traffic through the CPU
+ int cpu;
+ NCCLCHECK(getLocalCpu(system, g, &cpu));
+ NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
+ }
+ }
+
+ if (peerInfos == NULL) continue;
+ // Remove GPUs we can't talk to because of containers.
+ struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank;
+ for (int p=0; p<system->nodes[GPU].count; p++) {
+ if (p == g) continue;
+ struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank;
+ int shm;
+ NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
+ if (shm == 0) {
+ // Mark this peer as inaccessible. We'll trim it later.
+ system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
}
}
}
@@ -239,11 +371,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;
NCCLCHECK(ncclTopoSetPaths(netNode, system));
- if (peerInfos == NULL) continue;
for (int g=0; g<system->nodes[GPU].count; g++) {
- if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) == 0) {
- // We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths
- // to go through a CPU
+ // Update path when we don't want to / can't use GPU Direct RDMA.
+ int gdr;
+ NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
+ if (gdr == 0) {
+ // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
NCCLCHECK(getLocalCpu(system, g, &localCpu));
NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g));
@@ -251,7 +384,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
}
}
}
-
return ncclSuccess;
}
@@ -270,7 +402,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
domains[g] = std::min(domains[g], domains[p]);
}
}
- if (gpu->rank == comm->rank) myDomain = domains[g];
+ if (gpu->gpu.rank == comm->rank) myDomain = domains[g];
}
int ngpus = system->nodes[GPU].count;
@@ -288,98 +420,19 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
free(ids);
return ncclInternalError;
}
-
- // Remove GPUs I can't access (even indirectly) from my view of the node
- for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
- for (int n=0; n<system->nodes[t].count; n++) {
- struct ncclTopoNode* node = system->nodes[t].nodes+n;
- if (node == gpu) continue;
- for (int l=0; l<node->nlinks; l++) {
- while (l<node->nlinks && node->links[l].remNode == gpu) {
- if (l<node->nlinks-1)
- memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
- node->nlinks--;
- }
- if (l<node->nlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) {
- node->links[l].remNode--;
- }
- }
- }
- }
- if (g != system->nodes[GPU].count-1)
- memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode));
- system->nodes[GPU].count--;
+ NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
}
comm->localRanks = system->nodes[GPU].count;
if (system->nodes[GPU].count == comm->nRanks) {
- // Trim network
- ncclTopoRemovePathType(system, NET);
- system->nodes[NET].count = 0;
- for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
- for (int n=0; n<system->nodes[t].count; n++) {
- struct ncclTopoNode* node = system->nodes[t].nodes+n;
- for (int l=0; l<node->nlinks; l++) {
- struct ncclTopoLink* link = &(node->links[l]);
- if (link->remNode->type == NET) {
- // Remove the link
- for (int i=l; i<(node->nlinks-1); i++) {
- memcpy(&(node->links[i]), &(node->links[i+1]), sizeof(ncclTopoLink));
- }
- node->nlinks--;
- l--; // revisit the same value of "l" for the next iteration, since we edited the list in the middle of the loop
- }
- }
- }
- }
+ for (int n=system->nodes[NET].count-1; n>=0; n--)
+ NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
}
free(domains);
free(ids);
return ncclSuccess;
}
-static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) {
- int nvlSpeed = 0;
- int nvlPeers = 0;
- int pciSpeed = 0;
- for (int l=0; l<node->nlinks; l++) {
- if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width;
- if (node->links[l].remNode->type == GPU) nvlPeers++; else nvlPeers = 2;
- if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width;
- }
- *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed));
- return ncclSuccess;
-}
-
-ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) {
- // Compute max speed to try to accelerate the search.
- system->maxSpeed = LOC_WIDTH;
-
- for (int g=0; g<system->nodes[GPU].count; g++) {
- NCCLCHECK(getGpuSpeed(system->nodes[GPU].nodes+g, &system->maxSpeed));
- }
- if (system->nodes[NET].count) {
- // Try to assign one NIC per GPU
- int netMaxSpeed = 0;
- int netMaxSpeedCount = 0;
- for (int n=0; n<system->nodes[NET].count; n++) {
- int maxSpeed = 0;
- struct ncclTopoNode* net = system->nodes[NET].nodes+n;
- for (int g=0; g<system->nodes[GPU].count; g++) {
- maxSpeed = std::max(maxSpeed, net->paths[GPU][g].width);
- }
- if (maxSpeed > netMaxSpeed) {
- netMaxSpeed = maxSpeed;
- netMaxSpeedCount = 1;
- } else if (maxSpeed == netMaxSpeed) {
- netMaxSpeedCount++;
- }
- }
- system->maxSpeed = std::min(system->maxSpeed, netMaxSpeedCount*NET_WIDTH);
- }
- return ncclSuccess;
-}
-
void ncclTopoFree(struct ncclTopoSystem* system) {
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
free(system);
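With ncclGetLevel in place, NCCL_P2P_LEVEL and NCCL_NET_GDR_LEVEL accept both the new textual path types (e.g. "NVL", "PXB") and the legacy numeric levels, which are translated through levelsOldToNew (old level 2 now means PXB, and anything above 4 saturates at SYS). A self-contained sketch of the dual parsing; the enum order mirrors the "LOC"/"NVL"/"PIX"/... string table assumed by the diff above:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Assumed path-type order, mirroring topoPathTypeStr. */
    enum { PATH_LOC, PATH_NVL, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_NET };
    static const char* pathStr[] = { "LOC", "NVL", "PIX", "PXB", "PHB", "SYS", "NET" };
    /* Same mapping as levelsOldToNew in the diff above. */
    static const int levelsOldToNew[] =
        { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS };

    /* Accept either a path-type name ("NVL") or a legacy numeric level ("2"). */
    static int parseLevel(const char* str) {
      for (int i = 0; i < PATH_NET; i++)
        if (strcmp(str, pathStr[i]) == 0) return i;
      if (str[0] >= '0' && str[0] <= '9') {
        int oldLevel = (int)strtol(str, NULL, 0);
        int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1;
        if (oldLevel > maxOldLevel) oldLevel = maxOldLevel;
        return levelsOldToNew[oldLevel];
      }
      return -1;  /* unset / unrecognized */
    }

    int main(void) {
      printf("NCCL_P2P_LEVEL=NVL -> %s\n", pathStr[parseLevel("NVL")]);  /* NVL */
      printf("NCCL_P2P_LEVEL=2   -> %s\n", pathStr[parseLevel("2")]);    /* PXB */
      return 0;
    }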
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 3a8b4e7..b4c3e35 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,29 +7,121 @@
#include "core.h"
#include "graph.h"
#include "topo.h"
+#include "xml.h"
+#include <math.h>
+
+// Initialize system->maxWidth. This is the per-channel (i.e. per-SM)
+// max speed.
+static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) {
+ float nvLinkWidth = gpu->gpu.cudaCompCap > 60 ? VOLTA_NVLINK_WIDTH : PASCAL_NVLINK_WIDTH;
+ float maxWidth = 0.0;
+ for (int i=0; i<system->nodes[type].count; i++) {
+ struct ncclTopoLinkList* path = gpu->paths[type]+i;
+ float width = path->width;
+ if (path->count == 0) continue;
+ if (path->type == PATH_NVL) width = std::min(nvLinkWidth, width);
+ maxWidth = std::max(maxWidth, width);
+ }
+ return maxWidth;
+}
+ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
+ system->maxWidth = 0.0;
+ int inter = system->nodes[NET].count;
+ if (inter == 0 && system->nodes[GPU].count == 1) {
+ system->maxWidth = LOC_WIDTH;
+ return ncclSuccess;
+ }
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ system->maxWidth = std::max(system->maxWidth, getMaxWidth(system, gpu, inter ? NET : GPU));
+ }
+ return ncclSuccess;
+}
-static ncclResult_t ncclTopoFollowPath(struct ncclTopoGraph* graph, struct ncclTopoLinkList* path, struct ncclTopoNode** node, int width, int typeSave) {
- if (path->count == 0) return ncclSuccess;
+static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, struct ncclTopoLink** revLink) {
+ for (int l=0; l<node2->nlinks; l++) {
+ struct ncclTopoLink* link = node2->links+l;
+ if (link->remNode == node1) {
+ *revLink = link;
+ return ncclSuccess;
+ }
+ }
+ WARN("Could not find rev link for %d/%d -> %d/%d\n", node1->type, node1->id, node2->type, node2->id);
+ return ncclInternalError;
+}
- *node = NULL;
- if (width > 0) {
- if (path->type > graph->type) return ncclSuccess;
- graph->type = std::max(graph->type, path->type);
- graph->nHops += path->count;
- } else {
- graph->type = typeSave;
- graph->nHops -= path->count;
+// This is unfortunately needed since manipulating floats often results in rounding errors.
+#define SUB_ROUND(a, b) (a = roundf((a-b)*1000)/1000)
+
+static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float speed, int* steps) {
+ float pciSpeed = speed;
+ for (int step=0; step<path->count; step++) {
+ struct ncclTopoNode* node = path->list[step]->remNode;
+ if (node->type == CPU) {
+ // Account for P2P inefficiency through Intel CPU RC
+ if (path->type == PATH_PHB && start->type == GPU &&
+ node->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 &&
+ node->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+ pciSpeed = INTEL_P2P_OVERHEAD(speed);
+ }
+ }
}
- for (int i=0; i<path->count; i++) {
- if (path->list[i]->width < width) {
- // Can't follow this path, rewind and exit
- for (int j=0; j<i; j++) path->list[j]->width += width;
- return ncclSuccess;
+ struct ncclTopoNode* node = start;
+ for (int step=0; step<maxSteps; step++) {
+ struct ncclTopoLink* link = path->list[step];
+ struct ncclTopoLink* revLink = NULL;
+ float fwSpeed = link->type == LINK_PCI ? pciSpeed : speed;
+ float revSpeed = 0;
+ if (link->remNode->type == GPU && start->type != GPU) {
+ if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
+ revSpeed += fwSpeed/8;
+ }
+ if (link->remNode->type == CPU && link->type == LINK_NVL) {
+ if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
+ revSpeed += fwSpeed;
}
- path->list[i]->width -= width;
+ if (link->width < fwSpeed || (revSpeed && revLink->width < revSpeed)) { *steps = step; return ncclSuccess; }
+ SUB_ROUND(link->width, fwSpeed);
+ if (revSpeed) SUB_ROUND(revLink->width, revSpeed);
+ node = link->remNode;
}
- *node = path->list[path->count-1]->remNode;
+ *steps = maxSteps;
+ return ncclSuccess;
+}
+
+// Try to go from node type1/index1 to node type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing (-1).
+static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct ncclTopoNode** node) {
+ // First handle easy cases
+ *node = system->nodes[type2].nodes+index2;
+ if (type1 == -1) return ncclSuccess;
+ struct ncclTopoNode* node1 = system->nodes[type1].nodes+index1;
+ struct ncclTopoLinkList* path = node1->paths[type2]+index2;
+ if (path->count == 0 ) return ncclSuccess;
+
+ // Now check link type
+ *node = NULL;
+ int intra = type1 == GPU && type2 == GPU;
+ float speed = intra ? graph->speedIntra : graph->speedInter;
+ int type = intra ? graph->typeIntra : graph->typeInter;
+
+ if (mult == 1 && (path->type > type)) return ncclSuccess;
+
+ speed *= mult;
+
+ // Check there is enough bandwidth on paths.
+ int step = 0;
+ NCCLCHECK(followPath(path, node1, path->count, speed, &step));
+ if (step < path->count) goto rewind;
+
+ // Enough bandwidth : return destination node.
+ graph->nHops += mult*path->count;
+ *node = system->nodes[type2].nodes+index2;
+ return ncclSuccess;
+
+rewind:
+ // Not enough bandwidth : rewind and exit.
+ NCCLCHECK(followPath(path, node1, step, -speed, &step));
return ncclSuccess;
}
@@ -80,22 +172,42 @@ static int cmpIntraScores(struct ncclGpuScore* scores, int count) {
return 0;
}
-static ncclResult_t getNetPaths(struct ncclTopoSystem* system, const uint64_t flag, struct ncclTopoLinkList** netPaths) {
+static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* index) {
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
+ *index = g;
+ return ncclSuccess;
+ }
+ }
+ WARN("Could not find gpu rank %d\n", rank);
+ return ncclInternalError;
+}
+
+static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int* index) {
for (int n=0; n<system->nodes[NET].count; n++) {
- if (system->nodes[NET].nodes[n].used & flag) {
- *netPaths=system->nodes[NET].nodes[n].paths[GPU];
+ if (system->nodes[NET].nodes[n].id == id) {
+ *index = n;
return ncclSuccess;
}
}
+ WARN("Could not find net id %lx\n", id);
return ncclInternalError;
}
+static ncclResult_t getNetPaths(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoLinkList** netPaths) {
+ int netId = graph->inter[graph->nChannels*2];
+ int n;
+ NCCLCHECK(getNetIndex(system, netId, &n));
+ *netPaths=system->nodes[NET].nodes[n].paths[GPU];
+ return ncclSuccess;
+}
+
ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* gpu, int* next, int* countPtr, int sortNet) {
const uint64_t flag = 1ULL<<(graph->nChannels);
int ngpus = system->nodes[GPU].count;
struct ncclTopoLinkList* paths = gpu->paths[GPU];
struct ncclTopoLinkList* netPaths = NULL;
- if (sortNet) NCCLCHECK(getNetPaths(system, flag, &netPaths));
+ if (sortNet) NCCLCHECK(getNetPaths(system, graph, &netPaths));
struct ncclGpuScore scores[NCCL_TOPO_MAX_NODES];
memset(scores, 0, ngpus*sizeof(struct ncclGpuScore));
@@ -130,9 +242,13 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
return ncclSuccess;
}
-ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time);
+ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time);
-#define NCCL_SEARCH_TIMEOUT (1ULL<<20) // This should get contain all search within a second or so.
+// Try to keep all searches within one second
+#define NCCL_SEARCH_GLOBAL_TIMEOUT (3ULL<<19)
+#define NCCL_SEARCH_TIMEOUT (1<<18)
+#define NCCL_SEARCH_TIMEOUT_TREE (1<<17)
+#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<10)
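For scale (my arithmetic, not from the patch): the global budget 3<<19 = 1,572,864 steps is six base searches of 1<<18 = 262,144 steps each; tree searches get half a base budget, and same-channel searches only 1<<10 = 1,024 steps, presumably because forcing identical channels already prunes most of the space. A two-line check:

    #include <cstdio>
    int main() {
      printf("global=%d ring=%d tree=%d same=%d ratio=%d\n",
             3<<19, 1<<18, 1<<17, 1<<10, (3<<19)/(1<<18));  // ratio = 6
      return 0;
    }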
#define FORCED_ORDER_PCI 1
#define FORCED_ORDER_REPLAY 2
@@ -142,7 +258,7 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo
if (graph->nChannels == 0) return ncclInternalError;
int ngpus = system->nodes[GPU].count;
int nextRank = graph->intra[(graph->nChannels-1)*ngpus+step+1];
- for (int i=0; i<ngpus; i++) if (system->nodes[GPU].nodes[i].rank == nextRank) {
+ for (int i=0; i<ngpus; i++) if (system->nodes[GPU].nodes[i].gpu.rank == nextRank) {
*g = i;
return ncclSuccess;
}
@@ -150,44 +266,37 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo
return ncclSuccess;
}
-ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time);
+ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time);
-ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoLinkList* paths, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time, int g, int speed) {
- int typeSave = graph->type;
+ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time, int type, int index, int g) {
const uint64_t flag = 1ULL<<(graph->nChannels);
- struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
- if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, speed, typeSave));
+ struct ncclTopoNode* gpu;
+ NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu));
if (gpu) {
gpu->used ^= flag;
- NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, maxSpeed, time));
+ NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, time));
gpu->used ^= flag;
- if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, -speed, typeSave));
+ NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, -1, &gpu));
}
return ncclSuccess;
}
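ncclTopoSearchTryGpu above is the classic backtracking step: mark the GPU as used for the current channel (one bit per channel in used), recurse, then unmark. A minimal sketch of the same pattern with a hypothetical Gpu type, enumerating all orderings of three GPUs for channel 0:

    #include <cstdint>
    #include <cstdio>

    struct Gpu { uint64_t used = 0; };  // one 'used' bit per channel, as above

    static void search(Gpu* gpus, int n, int channel, int depth) {
      if (depth == n) { printf("channel %d: found an ordering\n", channel); return; }
      uint64_t flag = 1ULL << channel;
      for (int g = 0; g < n; g++) {
        if (gpus[g].used & flag) continue;  // GPU already placed in this channel
        gpus[g].used ^= flag;               // take it
        search(gpus, n, channel, depth + 1);
        gpus[g].used ^= flag;               // release it when backtracking
      }
    }

    int main() { Gpu gpus[3]; search(gpus, 3, 0, 0); return 0; }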
ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
- // 0. When we are trying to increase speedIntra, do not copy if the solution has less channels
- // since it would likely impact the rings algorithms too.
- if (graph->speedIntra > graph->speedInter && graph->nChannels < refGraph->nChannels) return ncclSuccess;
+ // 1. Constraint to get the same nChannels between Rings and Trees
+ if (graph->nChannels < graph->minChannels) return ncclSuccess;
- // 1. Try to get better bandwidth
+ // 2. Try to get better bandwidth
if (graph->nChannels*graph->speedIntra < refGraph->nChannels*refGraph->speedIntra) return ncclSuccess;
if (graph->nChannels*graph->speedIntra > refGraph->nChannels*refGraph->speedIntra) {
*copy = 1;
return ncclSuccess;
}
- // 2. Give an advantage when all channels are the same
- if (graph->nChannels > 1 && graph->sameChannels && refGraph->sameChannels == 0) {
- *copy = 1;
- return ncclSuccess;
- }
- // 3. Less hops
- if (graph->nHops < refGraph->nHops) *copy = 1;
+ // 3. Less hops (but not at the price of going cross NICs)
+ if (graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
return ncclSuccess;
}
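In other words, a candidate graph replaces the reference when it has at least minChannels channels and strictly more total bandwidth (nChannels * speedIntra); on a bandwidth tie it must match the crossNic setting and use fewer hops. A compact sketch with a hypothetical Graph struct and illustrative numbers:

    #include <cstdio>

    struct Graph { int nChannels, minChannels, nHops, crossNic; float speedIntra; };

    static bool better(const Graph& g, const Graph& ref) {
      if (g.nChannels < g.minChannels) return false;  // 1. keep ring/tree channel counts compatible
      float bw = g.nChannels * g.speedIntra;
      float refBw = ref.nChannels * ref.speedIntra;
      if (bw != refBw) return bw > refBw;             // 2. total bandwidth first
      return g.crossNic == ref.crossNic && g.nHops < ref.nHops;  // 3. then fewer hops
    }

    int main() {
      Graph a = {2, 2, 8, 0, 12.0f};   // 2 channels x 12 GB/s
      Graph b = {4, 2, 12, 0, 6.0f};   // 4 channels x 6 GB/s, more hops
      printf("%s\n", better(b, a) ? "replace" : "keep");  // equal bandwidth, more hops -> keep
      return 0;
    }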
-ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time) {
+ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
if ((*time) <= 0) return ncclSuccess;
(*time)--;
@@ -195,51 +304,39 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
if (step == ngpus) {
// Determine whether we found a better solution or not
int copy = 0;
- int sameChannels = graph->sameChannels;
- if (graph->nChannels > 0) {
- int* intra = graph->intra+graph->nChannels*ngpus;
- for (int g=0; g<ngpus; g++) if (intra[g] != intra[g-ngpus]) graph->sameChannels = 0;
- }
graph->nChannels++;
NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, &copy));
if (copy) {
memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph));
- if (graph->nChannels*graph->speedIntra == maxSpeed) *time = -1;
+ if (graph->nChannels == graph->maxChannels) *time = -1;
}
- if (graph->nChannels < MAXCHANNELS/2) {
- NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, maxSpeed, time));
+ if (graph->nChannels < graph->maxChannels) {
+ NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, time));
}
graph->nChannels--;
- graph->sameChannels = sameChannels;
return ncclSuccess;
}
- graph->intra[graph->nChannels*ngpus+step] = gpu->rank;
+ graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
+ int g = gpu - system->nodes[GPU].nodes;
if (step == backToNet) {
// first get back to NIC
if (system->nodes[NET].count) {
- int maxWidth = 0;
- struct ncclTopoLinkList* paths = gpu->paths[NET];
- for (int n=0; n<system->nodes[NET].count; n++) {
- if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
- maxWidth = std::max(paths[n].width, maxWidth);
- }
+ int startNetIndex;
+ NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
+ struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
for (int n=0; n<system->nodes[NET].count; n++) {
- if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
- if (paths[n].width == maxWidth) {
- struct ncclTopoNode* net = system->nodes[NET].nodes+n;
- int typeSave = graph->type;
- NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, graph->speedInter, typeSave));
- if (net) {
- graph->inter[graph->nChannels*2+1] = net->id;
- NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, maxSpeed, time));
- NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, -graph->speedInter, typeSave));
- }
+ struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+ if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
+ NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
+ if (net) {
+ graph->inter[graph->nChannels*2+1] = net->id;
+ NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, time));
+ NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
}
}
}
} else if (step < system->nodes[GPU].count-1) {
// Go to next GPU
- struct ncclTopoLinkList* paths = gpu->paths[GPU];
int next[NCCL_TOPO_MAX_NODES];
int count;
if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
@@ -252,64 +349,59 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 ));
}
for (int i=0; i<count; i++) {
- int g = next[i];
- int nvlink = graph->nvlink;
- graph->nvlink &= paths[g].type <= LINK_NVL ? 1 : 0;
- int speed = graph->speedIntra;
- if (paths[g].type == LINK_QPI) speed = INTEL_P2P_OVERHEAD(speed);
- NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, step+1, backToNet, backToFirstRank, forcedOrder, maxSpeed, time, g, speed));
- graph->nvlink = nvlink;
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, step+1, backToNet, backToFirstRank, forcedOrder, time, GPU, g, next[i]));
}
} else if (step == backToFirstRank) {
// Find first GPU and loop back to it
- int g;
- int rank = graph->intra[graph->nChannels*ngpus];
- for (g=0; g<ngpus; g++) {
- if (system->nodes[GPU].nodes[g].rank == rank) break;
- }
- if (g == ngpus) {
- WARN("Could not find GPU with rank %d\n", rank);
- return ncclInternalError;
- }
- struct ncclTopoLinkList* paths = gpu->paths[GPU];
- struct ncclTopoNode* firstGpu = system->nodes[GPU].nodes+g;
- int typeSave = graph->type;
- NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, graph->speedIntra, typeSave));
+ int p;
+ NCCLCHECK(getGpuIndex(system, graph->intra[graph->nChannels*ngpus], &p));
+ struct ncclTopoNode* firstGpu;
+ NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, 1, &firstGpu));
if (firstGpu) {
- NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, maxSpeed, time));
- NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, -graph->speedIntra, typeSave));
+ NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, time));
+ NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, -1, &firstGpu));
}
} else {
// Next path
- NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, maxSpeed, time));
+ NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
}
return ncclSuccess;
}
-ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int maxSpeed, int* time) {
- const uint64_t flag = 1ULL<<(graph->nChannels);
+ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
const int speed = graph->speedInter;
for (int n=0; n<system->nodes[NET].count; n++) {
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
struct ncclTopoNode* gpu;
- if (net->used == 0) {
- graph->inter[graph->nChannels*2] = net->id;
- for (int i=0; i<system->nodes[NET].count; i++) {
- if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
+ if (graph->collNet && net->net.collSupport == 0) continue;
+ if (net->net.width < speed) continue;
+ if (net->net.maxChannels == 0) continue;
+
+ graph->inter[graph->nChannels*2] = net->id;
+ for (int i=0; i<system->nodes[NET].count; i++) {
+ if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
+ (system->nodes[NET].nodes[i].net.port == net->net.port)) {
+ system->nodes[NET].nodes[i].net.width -= speed;
}
- struct ncclTopoLinkList* paths = net->paths[GPU];
+ }
+ net->net.maxChannels--;
- // First try the PCI order to set a reference
- NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, speed));
- // Then try to replay the last channel
- if (graph->nChannels > 0) {
- int g;
- NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
- NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, g, speed));
+ // First try to replay the last channel
+ if (graph->nChannels > 0) {
+ int g;
+ NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
+ }
+ if (graph->nChannels == 0 || graph->sameChannels == 0) {
+ if (graph->nChannels == 0) {
+ // Always try the PCI order first to set a reference
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, NET, n, 0));
}
// Then try the most local GPUs
- int maxWidth = 0, minHops = 0xfffffff;
+ float maxWidth = 0;
+ int minHops = 0xfffffff;
+ struct ncclTopoLinkList* paths = net->paths[GPU];
for (int g=0; g<system->nodes[GPU].count; g++) {
if (paths[g].width > maxWidth) {
maxWidth = paths[g].width;
@@ -328,14 +420,19 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
gpu = system->nodes[GPU].nodes+g;
int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1;
if (tryGpuBidir == gpuUsed) {
- NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, 0, maxSpeed, time, g, speed));
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
}
}
}
}
}
- for (int i=0; i<system->nodes[NET].count; i++) {
- if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
+ }
+
+ net->net.maxChannels++;
+ for (int i=0; i<system->nodes[NET].count; i++) {
+ if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
+ (system->nodes[NET].nodes[i].net.port == net->net.port)) {
+ system->nodes[NET].nodes[i].net.width += speed;
}
}
}
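The bookkeeping above debits net.width on every NET node that shares the chosen NIC's ASIC and port, so two logical devices backed by one physical port cannot be double-booked, and restores the width on the way out. A standalone sketch (hypothetical Net and account):

    #include <cstdio>

    struct Net { unsigned long long asic; int port; float width; };

    // Debit (sign=+1) or restore (sign=-1) bandwidth on every NET device
    // that aliases the same physical port (same ASIC and port number).
    static void account(Net* nets, int n, const Net& used, float speed, int sign) {
      for (int i = 0; i < n; i++)
        if (nets[i].asic == used.asic && nets[i].port == used.port)
          nets[i].width -= sign * speed;
    }

    int main() {
      // Two logical devices exposing the same physical port of one NIC.
      Net nets[2] = { {0xabcULL, 1, 12.0f}, {0xabcULL, 1, 12.0f} };
      account(nets, 2, nets[0], 7.0f, +1);  // reserve while searching this channel
      printf("dev0=%g dev1=%g\n", nets[0].width, nets[1].width);  // 5 5
      account(nets, 2, nets[0], 7.0f, -1);  // restore when backtracking
      return 0;
    }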
@@ -374,126 +471,201 @@ ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, in
return ncclSuccess;
}
-ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time) {
+ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time) {
int backToNet, backToFirstRank;
NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
if (system->nodes[NET].count) {
// Start from NET
- ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, maxSpeed, time);
+ ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
} else {
- // Start from GPU 0
- NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, graph->speedIntra));
- if (graph->nChannels > 0) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, 0, graph->speedIntra));
- NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, 0, maxSpeed, time, 0, graph->speedIntra));
+ // Intra-node only.
+ if (graph->nChannels == 0) {
+ // Try PCI order first
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
+ } else {
+ // Also try to replay previous channel
+ int g;
+ NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, -1, -1, g));
+ }
+ if (graph->sameChannels == 0 || graph->nChannels == 0) {
+ // Finally, try all other possibilities unless we are forced to use the same channels
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
+ }
+ }
}
return ncclSuccess;
}
-/* Parse user defined rings. Format is like :
- * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
- * Rings with a non-matching number of ranks are ignored so we can provide
- * rings for multiple cases.
- */
-#define MAX_ENV_RANKS 512
-static ncclResult_t parseGraph(const char* str, int* nChannelsRet, int ngpus, int* channels) {
- int ranks[MAX_ENV_RANKS];
- int nChannels = 0;
- int rank = 0;
- int offset = 0;
- int status = 0; // 0 : between numbers, 1 : inside number
- do {
- int digit = str[offset] - '0';
- if (digit >= 0 && digit <= 9) {
- if (status == 0) {
- ranks[rank] = digit;
- status = 1;
- } else {
- ranks[rank] = ranks[rank]*10+digit;
- }
- } else {
- if (status == 1) {
- rank++;
- if (rank == MAX_ENV_RANKS) goto end;
+/************************************/
+/* User defined graph from XML file */
+/************************************/
+
+struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "LOC", PATH_LOC }, { NULL, 0 } };
+ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+ int ngpus = system->nodes[GPU].count;
+ int* inter = graph->inter+2*c;
+ int* intra = graph->intra+ngpus*c;
+ int n=0, g=0;
+ for (int s=0; s<xmlChannel->nSubs; s++) {
+ struct ncclXmlNode* sub = xmlChannel->subs[s];
+ int dev;
+ NCCLCHECK(xmlGetAttrInt(sub, "dev", &dev));
+ if (strcmp(sub->name, "net") == 0) {
+ inter[n++] = dev;
+ } else if (strcmp(sub->name, "gpu") == 0) {
+ int rank = -1;
+ for (int g=0; g<ngpus; g++) {
+ if (system->nodes[GPU].nodes[g].gpu.dev == dev) rank = system->nodes[GPU].nodes[g].gpu.rank;
}
- status = 0;
- if (str[offset] == '|' || str[offset] == '\0') {
- // Ignore if ngpus doesn't match
- if (rank != ngpus) goto newchannel;
-
- for (int r=0; r<ngpus; r++) {
- int rank = ranks[r];
- // Ignore if ranks are out of bounds
- if (rank < 0 || rank >= ngpus) goto newchannel;
- // Ignore if ranks are duplicate
- for (int i=0; i<r; i++)
- if (ranks[i] == rank) goto newchannel;
-
- channels[nChannels*ngpus+r] = rank;
- }
- nChannels++;
-newchannel:
- rank = 0;
+ if (rank == -1) {
+ WARN("XML Import Channel : dev %d not found.", dev);
+ return ncclSystemError;
}
+ intra[g++] = rank;
+ }
+ }
+ return ncclSuccess;
+}
+ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+ int id;
+ NCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &id));
+ if (graph->id != id) return ncclSuccess;
+
+ int crossNic;
+ NCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic));
+ if (graph->crossNic == 0 && crossNic == 1) return ncclSuccess;
+ graph->crossNic = crossNic;
+
+ NCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern));
+ NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
+ NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->speedIntra));
+ NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->speedInter));
+ const char* str;
+ NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str));
+ NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType));
+ NCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str));
+ NCCLCHECK(kvConvertToInt(str, &graph->typeInter, kvDictLinkType));
+ NCCLCHECK(xmlGetAttrInt(xmlGraph, "samechannels", &graph->sameChannels));
+ for (int s=0; s<xmlGraph->nSubs; s++) {
+ NCCLCHECK(ncclTopoGetChannelFromXml(xmlGraph->subs[s], s, system, graph));
+ }
+ return ncclSuccess;
+}
+ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+ for (int s=0; s<xmlGraphs->nSubs; s++) {
+ NCCLCHECK(ncclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph));
+ }
+ return ncclSuccess;
+}
+
+/* And the reverse : graph->xml */
+ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) {
+ struct ncclXmlNode* xmlChannel;
+ int ngpus = system->nodes[GPU].count;
+ int* inter = graph->inter+2*c;
+ int* intra = graph->intra+ngpus*c;
+ NCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel));
+ struct ncclXmlNode* node;
+ if (system->nodes[NET].count) {
+ NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
+ NCCLCHECK(xmlSetAttrInt(node, "dev", inter[0]));
+ }
+ for (int g=0; g<ngpus; g++) {
+ NCCLCHECK(xmlAddNode(xml, xmlChannel, "gpu", &node));
+ int dev = -1;
+ for (int i=0; i<ngpus; i++) {
+ if (system->nodes[GPU].nodes[i].gpu.rank == intra[g]) dev = system->nodes[GPU].nodes[i].gpu.dev;
+ }
+ if (dev == -1) {
+ WARN("XML Export Channel : rank %d not found.", intra[g]);
+ return ncclInternalError;
}
- } while (str[offset++] != 0);
-end:
- *nChannelsRet = nChannels;
+ NCCLCHECK(xmlSetAttrInt(node, "dev", dev));
+ }
+ if (system->nodes[NET].count) {
+ NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
+ NCCLCHECK(xmlSetAttrInt(node, "dev", inter[1]));
+ }
return ncclSuccess;
}
+ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) {
+ struct ncclXmlNode* xmlGraph;
+ NCCLCHECK(xmlAddNode(xml, parent, "graph", &xmlGraph));
+ NCCLCHECK(xmlSetAttrInt(xmlGraph, "id", graph->id));
+ NCCLCHECK(xmlSetAttrInt(xmlGraph, "pattern", graph->pattern));
+ NCCLCHECK(xmlSetAttrInt(xmlGraph, "crossnic", graph->crossNic));
+ NCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels));
+ NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->speedIntra));
+ NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->speedInter));
+ const char* str;
+ NCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType));
+ NCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str));
+ NCCLCHECK(kvConvertToStr(graph->typeInter, &str, kvDictLinkType));
+ NCCLCHECK(xmlSetAttr(xmlGraph, "typeinter", str));
+ NCCLCHECK(xmlSetAttrInt(xmlGraph, "samechannels", graph->sameChannels));
+ for (int c=0; c<graph->nChannels; c++) {
+ NCCLCHECK(ncclTopoGetXmlFromChannel(graph, c, system, xml, xmlGraph));
+ }
+ return ncclSuccess;
+}
+ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml) {
+ xml->maxIndex = 0;
+ struct ncclXmlNode* xmlGraphs;
+ NCCLCHECK(xmlAddNode(xml, NULL, "graphs", &xmlGraphs));
+ NCCLCHECK(xmlSetAttrInt(xmlGraphs, "version", NCCL_GRAPH_XML_VERSION));
+ for (int g=0; g<ngraphs; g++) {
+ NCCLCHECK(ncclTopoGetXmlFromGraph(graphs[g], system, xml, xmlGraphs));
+ }
+ return ncclSuccess;
+}
+
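For reference, the file exchanged by the import/export functions above — also what NCCL_GRAPH_DUMP_FILE writes and NCCL_GRAPH_FILE reads further below — has the following shape. Element and attribute names come straight from the code; the values, and the version number (which must match NCCL_GRAPH_XML_VERSION), are illustrative only:

    <graphs version="1">
      <graph id="0" pattern="4" crossnic="0" nchannels="1"
             speedintra="12" speedinter="12"
             typeintra="NVL" typeinter="PIX" samechannels="1">
        <channel>
          <net dev="0"/>
          <gpu dev="0"/>
          <gpu dev="1"/>
          <net dev="0"/>
        </channel>
      </graph>
    </graphs>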
+float speedArray[] = { 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+#define NSPEEDS (sizeof(speedArray)/sizeof(float))
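The search starts at the first rung of this ladder that does not exceed the system's maxWidth, and pass 1 walks down it until a solution appears (continuing only while the next rung is still at least roughly half the best inter-node speed found, per the .49 test below). A sketch of the starting-rung selection with a hypothetical maxWidth:

    #include <cstdio>

    int main() {
      float speeds[] = { 24.0f, 21.0f, 18.0f, 15.0f, 12.0f, 10.0f, 9.0f, 7.0f,
                         6.0f, 5.0f, 4.0f, 3.0f, 2.4f, 1.2f, 0.24f, 0.12f };
      const int n = sizeof(speeds) / sizeof(speeds[0]);
      float maxWidth = 10.0f;  // hypothetical system->maxWidth
      int i = 0;
      while (i < n - 1 && speeds[i] > maxWidth) i++;  // skip rungs the system can't sustain
      printf("start the search at %g GB/s\n", speeds[i]);  // 10
      return 0;
    }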
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
graph->speedIntra = graph->speedInter = 0;
if (graph->crossNic == 2) graph->crossNic = 0;
- graph->nvlink = 0;
- graph->type = LINK_LOC;
+ graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
+ graph->typeInter = PATH_PIX;
graph->nChannels = 0;
graph->sameChannels = 1;
- char* str = getenv("NCCL_GRAPH");
+ char* str = getenv("NCCL_GRAPH_FILE");
if (str) {
- NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra));
- for (int i=0; i<graph->nChannels*ngpus; i++) {
- // Translate gpu numbers into ranks
- graph->intra[i] = system->nodes[GPU].nodes[graph->intra[i]].rank;
- }
- // TODO : let user specify NICs
- graph->inter[0] = graph->inter[1] = 0;
- graph->speedIntra = graph->speedInter = PCI_WIDTH+2;
- graph->nvlink = 0;
- if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
- // Reverse the loop
- for (int c=0; c<graph->nChannels; c++) {
- for (int i=0; i<=ngpus/2; i++) {
- int tmp = graph->intra[ngpus*c+i];
- graph->intra[ngpus*c+i] = graph->intra[ngpus*c+(ngpus-i)%ngpus];
- graph->intra[ngpus*c+ngpus-i] = tmp;
- }
- }
- }
- if (graph->nChannels) return ncclSuccess;
+ struct ncclXml* xml;
+ NCCLCHECK(ncclCalloc(&xml, 1));
+ NCCLCHECK(ncclTopoGetXmlGraphFromFile(str, xml));
+ NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph));
+ free(xml);
+ if (graph->nChannels > 0) return ncclSuccess;
}
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
struct ncclTopoGraph tmpGraph;
memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
- int bestSpeed = 0;
// First try crossnic, then decrease speed and finally increase speedIntra.
- tmpGraph.speedIntra = tmpGraph.speedInter = system->maxWidth;
- int maxSpeed = system->maxSpeed;
tmpGraph.pattern = graph->pattern;
+ int pass = 1;
+ int speedIndex = 0;
+ while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
+ tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
+ int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
search:
- int time = NCCL_SEARCH_TIMEOUT;
- tmpGraph.nvlink = 1;
+ int time = tmpGraph.sameChannels ? NCCL_SEARCH_TIMEOUT_SAMECHANNELS :
+ tmpGraph.pattern == NCCL_TOPO_PATTERN_TREE ? NCCL_SEARCH_TIMEOUT_TREE : NCCL_SEARCH_TIMEOUT;
tmpGraph.nChannels = 0;
- tmpGraph.sameChannels = 1;
- NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, maxSpeed, &time));
+ globalTimeout -= time;
+
+ NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
#if 0
- printf("Pattern %d, crossNic %d, Speed %d/%d, type %d -> nChannels %dx%d/%d %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.type, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
+ printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
for (int c=0; c<graph->nChannels; c++) {
printf("%2d : ", c);
for (int g=0; g<ngpus; g++) {
@@ -502,13 +674,34 @@ search:
printf("\n");
}
#endif
- if (time == -1) goto done;
- // We already have a solution and we timed out so lower speed will just timeout as well
- if (time == 0 && graph->nChannels > 0) goto done;
- if ((graph->nChannels > 0) && (bestSpeed == 0)) bestSpeed = graph->speedIntra;
+ // Optimal solution, stop here
+ if (graph->nChannels == graph->maxChannels && graph->speedInter == system->maxWidth) goto done;
+
+ if (pass == 1) {
+ // First pass, we don't have a solution yet; try other options
+
+ // Try having different channels
+ if (tmpGraph.sameChannels == 1) {
+ tmpGraph.sameChannels = 0;
+ goto search;
+ }
+ tmpGraph.sameChannels = 1;
- if (tmpGraph.speedIntra == tmpGraph.speedInter) {
- // First pass, we don't have a solution yet ; try to go slower.
+ if (time != -1) globalTimeout += time;
+ else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
+ if (globalTimeout < 0) goto done;
+
+ int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
+ if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
+ tmpGraph.typeIntra += 1;
+ goto search;
+ }
+ tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
+ if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXB)) {
+ tmpGraph.typeInter += 1;
+ goto search;
+ }
+ tmpGraph.typeInter = PATH_PIX;
// Try a simpler tree
if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
@@ -521,50 +714,61 @@ search:
}
tmpGraph.pattern = graph->pattern;
- if (tmpGraph.type < LINK_QPI) {
- tmpGraph.type += 1;
- goto search;
- }
- tmpGraph.type = graph->type;
-
if (crossNic && tmpGraph.crossNic == 0) {
// Try again with crossNic if permitted
tmpGraph.crossNic = crossNic;
goto search;
}
- tmpGraph.crossNic = graph->crossNic;
+ tmpGraph.crossNic = 0;
+
+ // Decrease speed until we find a solution
+ if ((speedIndex < NSPEEDS-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->speedInter > .49))) {
+ tmpGraph.speedInter = tmpGraph.speedIntra = speedArray[++speedIndex];
+ goto search;
+ }
+ speedIndex = 0;
+ while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
+ tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
- // Try to reduce speed per channel
- tmpGraph.speedIntra = tmpGraph.speedInter -= 3;
- if (tmpGraph.speedIntra >= bestSpeed/2 && tmpGraph.speedIntra >= 3) goto search;
}
done:
- // We have a solution now. See if we can increase speedIntra
- if (tmpGraph.speedIntra == tmpGraph.speedInter) {
+ // We have a solution. Start from that solution and move to pass 2.
+ if (pass == 1) {
time = -1;
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
+ speedIndex = 0;
+ while (speedArray[speedIndex] > graph->speedInter && speedIndex < NSPEEDS-1) speedIndex++;
+ tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
+ tmpGraph.minChannels = graph->nChannels;
+ pass = 2;
}
- if (time != 0 && tmpGraph.pattern != NCCL_TOPO_PATTERN_RING && tmpGraph.speedIntra == graph->speedIntra) {
- // Try to increase the intra speed only but keeping nChannels the same
- tmpGraph.speedIntra += 3;
- maxSpeed = tmpGraph.speedIntra * graph->nChannels;
- if (tmpGraph.speedIntra <= tmpGraph.speedInter*2) goto search;
+
+ // 3. See if we can increase speedIntra for trees (2 nodes or collnet)
+ if (pass == 2) {
+ if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING &&
+ tmpGraph.speedIntra == graph->speedIntra && tmpGraph.speedIntra < tmpGraph.speedInter*2 &&
+ speedIndex > 0) {
+ tmpGraph.speedIntra = speedArray[--speedIndex];
+ goto search;
+ }
+ time = -1;
+ memcpy(&tmpGraph, graph, sizeof(tmpGraph));
}
- if (graph->nChannels == 0) {
+ if (graph->nChannels == 0 && graph->collNet == 0) {
WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
- for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].rank;
+ for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
graph->inter[0] = graph->inter[1] = 0;
- graph->speedIntra = graph->speedInter = 3;
- graph->nvlink = 0;
+ graph->speedIntra = graph->speedInter = 0.1;
+ graph->typeIntra = graph->typeInter = PATH_SYS;
graph->nChannels = 1;
}
return ncclSuccess;
}
ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
- INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %d/%d, nvlink %d, type %d, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, graph->nvlink, graph->type, graph->sameChannels);
+ INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels);
int ngpus = system->nodes[GPU].count;
char line[1024];
@@ -588,6 +792,18 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
return ncclSuccess;
}
+ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) {
+ char* str = getenv("NCCL_GRAPH_DUMP_FILE");
+ if (str) {
+ struct ncclXml* xml;
+ NCCLCHECK(ncclCalloc(&xml, 1));
+ NCCLCHECK(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml));
+ NCCLCHECK(ncclTopoDumpXmlToFile(str, xml));
+ free(xml);
+ }
+ return ncclSuccess;
+}
+
ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* dev) {
*dev = graph->inter[(channelId%graph->nChannels)*2+dir];
return ncclSuccess;
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index a1b3209..5cd8d4e 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,46 +10,22 @@
#include "comm.h"
#include "nvmlwrap.h"
#include "net.h"
+#include "coll_net.h"
#include <sys/stat.h>
#include <fcntl.h>
+#include "xml.h"
+#include "cpuset.h"
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
-const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
-
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
-const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" };
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "", "", "SYS", "NET" };
+const char* topoPathTypeStr[] = { "LOC", "NVL", "PIX", "PXB", "PHB", "SYS", "NET" };
/******************************************************************/
/******************* Graph Creation Functions *********************/
/******************************************************************/
-static int getNumaId(char *path) {
- char npath[PATH_MAX];
- snprintf(npath, PATH_MAX, "%s/numa_node", path);
- npath[PATH_MAX-1] = '\0';
-
- int numaId = -1;
- FILE *file = fopen(npath, "r");
- if (file == NULL) return -1;
- if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
- fclose(file);
-
- return numaId;
-}
-
-static ncclResult_t getPciPath(char* busId, char** path) {
- for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
- char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
- memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
- memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
- *path = realpath(busPath, NULL);
- if (*path == NULL) {
- WARN("Could not find real path of %s", busPath);
- return ncclSystemError;
- }
- return ncclSuccess;
-}
// Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
@@ -59,110 +35,43 @@ ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id)
// Find next /
while (*str != '/') str--;
str++;
- NCCLCHECK(busIdToInt64(str, id));
+ int64_t numid;
+ NCCLCHECK(busIdToInt64(str, &numid));
+ // Ignore the PCI subdevice ID: subdevices share the same PCI link, so we merge them into a single node.
+ numid -= numid & 0xf;
+ *id = numid;
return ncclSuccess;
}
-static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) {
- *index = -1;
- for (int i=0; i<system->nodes[GPU].count; i++) {
- if (system->nodes[GPU].nodes[i].id == id) {
- *index = i;
- }
+static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) {
+ *cpu = NULL;
+ if (node->type == CPU) {
+ *cpu = node;
+ return ncclSuccess;
+ }
+ for (int l=0; l<node->nlinks; l++) {
+ if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
+ if (*cpu != NULL) return ncclSuccess;
}
return ncclSuccess;
}
-
-static ncclResult_t getPath(int64_t id, char** path) {
- char busId[] = "0000:00:00.0";
- NCCLCHECK(int64ToBusId(id, busId));
- NCCLCHECK(getPciPath(busId, path));
- return ncclSuccess;
-}
-
-ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) {
- char busId[BUSID_SIZE];
- CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
- NCCLCHECK(getPciPath(busId, path));
- return ncclSuccess;
-}
-
-
int interCpuWidth = 0;
int cpuPciWidth = 0;
-static ncclResult_t getCpuWidths() {
- // Check if already detected
- if (interCpuWidth + cpuPciWidth) return ncclSuccess;
-
- // Defaults
- char cpu[256];
- sprintf(cpu, "Generic");
- cpuPciWidth = interCpuWidth = PCI_WIDTH;
-
-#ifdef __PPC__
- sprintf(cpu, "ppc64");
- interCpuWidth = P9_WIDTH;
-#endif
-#ifdef __x86_64__
- sprintf(cpu, "x86_64");
- union {
- struct {
- // CPUID 0 String register order
- uint32_t ebx;
- uint32_t edx;
- uint32_t ecx;
- };
- char vendor[12];
- } cpuid0;
-
- asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0));
- if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel");
-
- if (strcmp(cpu, "Intel") == 0) {
- union {
- struct {
- int steppingId:4;
- int model:4;
- int familyId:4;
- int processorType:2;
- int resv0:2;
- int extModelId:4;
- int modelId:8;
- int resv1:4;
- };
- uint32_t val;
- } cpuid1;
- asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1));
- if (cpuid1.familyId == 6 && cpuid1.modelId >= 0x55) { // Skylake
- sprintf(cpu, "Intel/Skylake (or later)");
- interCpuWidth = SKL_QPI_WIDTH;
- } else {
- interCpuWidth = QPI_WIDTH;
- }
+static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* width) {
+ *width = LOC_WIDTH;
+ if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) {
+ *width = P9_WIDTH;
+ return ncclSuccess;
+ }
+ if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_ARM) {
+ *width = ARM_WIDTH;
+ return ncclSuccess;
+ }
+ if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+ *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH;
}
-#endif
- INFO(NCCL_GRAPH, "%s CPU (PCI %d, InterCpu %d)", cpu, cpuPciWidth, interCpuWidth);
- return ncclSuccess;
-}
-
-static ncclResult_t ncclTopoGetInterCpuWidth(int* width) {
- NCCLCHECK(getCpuWidths());
- *width = interCpuWidth;
- return ncclSuccess;
-}
-static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) {
- NCCLCHECK(getCpuWidths());
- *width = cpuPciWidth;
- return ncclSuccess;
-}
-static ncclResult_t ncclTopoGetPciWidth(int* width) {
- *width = PCI_WIDTH;
- return ncclSuccess;
-}
-static ncclResult_t ncclTopoGetNetWidth(int* width) {
- *width = NET_WIDTH;
return ncclSuccess;
}
@@ -173,317 +82,101 @@ enum ncclNvLinkDeviceType {
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
-static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
- char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class";
- memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
- char* rPath = realpath(classPath, NULL);
- int fd;
- if ((fd = open(rPath, O_RDONLY)) == -1) {
- // Could not find device. It might be because we're in a VM and
- // we don't see the whole machine. This is handled silently so
- // we don't want to print an INFO error.
- TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
- return ncclSystemError;
- }
- free(rPath);
- char pciClass[9];
- strncpy(pciClass, "0x000000", 9);
- int len;
- SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
- SYSCHECK(close(fd), "close");
- if (strcmp(pciClass, "0x068000") == 0) {
- // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
- *type = ncclNvLinkDeviceSwitch;
- } else if (strcmp(pciClass, "0x068001") == 0) {
- // PCI device is of type "Bridge: IBM Device 04ea"
- *type = ncclNvLinkDeviceBridge;
- } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
- || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
- *type = ncclNvLinkDeviceGpu;
- } else {
- *type = ncclNvLinkDeviceUnknown;
+ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
+ for (int i=0; i<system->nodes[type].count; i++) {
+ if (system->nodes[type].nodes[i].id == id) {
+ *node = system->nodes[type].nodes+i;
+ return ncclSuccess;
+ }
}
return ncclSuccess;
}
-ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) {
- struct ncclTopoNode* cpuNode = NULL;
- for (int c=0; c<system->nodes[CPU].count; c++) {
- if (system->nodes[CPU].nodes[c].id == numaId) cpuNode = system->nodes[CPU].nodes+c;
+ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
+ if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
+ WARN("Error : tried to create too many nodes of type %d\n", type);
+ return ncclInternalError;
}
- if (cpuNode == NULL) { // Create CPU
- NCCLCHECK(ncclTopoCreateNode(system, &cpuNode, CPU, numaId));
+ struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
+ system->nodes[type].count++;
+ n->type = type;
+ n->id = id;
+ if (type == GPU) {
+ // Create link to itself (used in some corner cases)
+ n->nlinks=1;
+ n->links[0].type = LINK_LOC;
+ n->links[0].remNode = n;
+ n->links[0].width = LOC_WIDTH;
+ n->gpu.dev = NCCL_TOPO_UNDEF;
+ n->gpu.rank = NCCL_TOPO_UNDEF;
+ n->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
+ } else if (type == CPU) {
+ n->cpu.arch = NCCL_TOPO_UNDEF;
+ n->cpu.vendor = NCCL_TOPO_UNDEF;
+ n->cpu.model = NCCL_TOPO_UNDEF;
+ } else if (type == NET) {
+ n->net.asic = 0ULL;
+ n->net.port = NCCL_TOPO_UNDEF;
+ n->net.width = 0.0;
}
- NCCLCHECK(ncclTopoConnectNodes(node, cpuNode, linkType, linkWidth));
- NCCLCHECK(ncclTopoConnectNodes(cpuNode, node, linkType, linkWidth));
+ *node = n;
return ncclSuccess;
}
-ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) {
- struct ncclTopoNode* nvsNode = NULL;
-
- int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH;
- for (int g=0; g<system->nodes[GPU].count; g++) {
- struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
- int cudaMajor, cudaMinor;
- NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor));
- int maxNvLinks, width;
- if (cudaMajor < 6) {
- maxNvLinks = 0;
- width = 0;
- } else if (cudaMajor == 6) {
- maxNvLinks = 4;
- width = PASCAL_NVLINK_WIDTH;
- } else {
- maxNvLinks = 6;
- width = VOLTA_NVLINK_WIDTH;
- }
-
- int nvlinks = 0;
- for (int l=0; l<maxNvLinks; ++l) {
- // Check whether we can use this NVLink for P2P
- unsigned canP2P;
- if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDevs[g], l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
-
- // Make sure the Nvlink is up. The previous call should have trained the link.
- nvmlEnableState_t isActive;
- if ((wrapNvmlDeviceGetNvLinkState(nvmlDevs[g], l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
-
- // Try to figure out what's on the other side of the NVLink
- nvmlPciInfo_t remoteProc;
- if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevs[g], l, &remoteProc) != ncclSuccess) continue;
-
- // Make a lower case copy of the bus ID for calling ncclDeviceType
- // PCI system path is in lower case
- char* p = remoteProc.busId;
- char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
- lowerId[c] = tolower(p[c]);
- if (p[c] == 0) break;
- }
-
- enum ncclNvLinkDeviceType type;
- NCCLCHECK(ncclDeviceType(lowerId, &type));
- if (type == ncclNvLinkDeviceGpu) {
- int64_t remoteId;
- NCCLCHECK(busIdToInt64(lowerId, &remoteId));
- int peer;
- NCCLCHECK(idToIndex(system, remoteId, &peer));
- if (peer != -1) {
- NCCLCHECK(ncclTopoConnectNodes(gpu, system->nodes[GPU].nodes+peer, LINK_NVL, width));
- nvlinks++;
- }
- } else if (type == ncclNvLinkDeviceBridge) {
- // Nvlink between GPU and CPU (PPC)
- // Since the remote bridge does not have a valid numa_node, assume we
- // are connected to the closest CPU.
- char* path;
- NCCLCHECK(getPath(gpu->id, &path));
- int numaId = getNumaId(path);
- free(path);
- NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width));
- nvlinks++;
- } else { // Nvswitch
- if (type == ncclNvLinkDeviceUnknown) {
- // The NVLink is up but we couldn't find the PCI device on the other
- // side. Assume it's an NVswitch outside a VM.
- if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId);
+ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int index) {
+ struct ncclTopoNode* delNode = system->nodes[type].nodes+index;
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+ free(delNode->paths[t]);
+ for (int n=0; n<system->nodes[t].count; n++) {
+ struct ncclTopoNode* node = system->nodes[t].nodes+n;
+ if (node == delNode) continue;
+ for (int l=0; l<node->nlinks; l++) {
+ while (l<node->nlinks && node->links[l].remNode == delNode) {
+ memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
+ node->nlinks--;
}
- if (nvsNode == NULL) { // Create nvswitch
- NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0));
+ if (l<node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) {
+ node->links[l].remNode--;
}
- NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH));
- NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH));
- nvlinks++;
}
}
- minNvlinks = std::min(minNvlinks, nvlinks);
- minWidth = std::min(minWidth, width);
}
- int pciWidth;
- NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
- system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth;
- system->maxWidth = minNvlinks ? minWidth : pciWidth;
+ memmove(delNode, delNode+1, (system->nodes[type].count-index-1)*sizeof(struct ncclTopoNode));
+ system->nodes[type].count--;
return ncclSuccess;
}
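Because nodes live in a contiguous per-type array, deleting one shifts every later node one slot left, and raw remNode pointers into the array must be patched; that is what the remNode-- above does. A compact sketch of the same fixup with simplified, hypothetical types:

    #include <cstdio>
    #include <cstring>

    struct Node { int id; Node* peer; };

    static void removeNode(Node* nodes, int* count, int index) {
      Node* del = nodes + index;
      for (int n = 0; n < *count; n++) {
        if (nodes[n].peer == del) nodes[n].peer = nullptr;              // drop references to it
        else if (nodes[n].peer && nodes[n].peer > del) nodes[n].peer--; // shifted left by one
      }
      memmove(del, del + 1, (*count - index - 1) * sizeof(Node));
      (*count)--;
    }

    int main() {
      Node nodes[3] = { {0, nullptr}, {1, nullptr}, {2, nullptr} };
      nodes[0].peer = &nodes[2];
      int count = 3;
      removeNode(nodes, &count, 1);                               // delete node 1
      printf("node0 now peers with id %d\n", nodes[0].peer->id);  // still 2
      return 0;
    }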
-ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) {
- struct ncclTopoNode* lastNode = endNode;
- int pciWidth;
- NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
- // Find intermediate PCI switches
- int slashCount = 0;
- int offsetRC = 0;
- while (offsetRC < strlen(path)) {
- if (path[offsetRC] == '/') slashCount++;
- if (slashCount == 4) break;
- offsetRC++;
- }
- int offset = strlen(path);
- slashCount = 0;
- while (--offset > offsetRC) {
- if (path[offset] == '/') {
- slashCount++;
- // Find if already existing
- if ((slashCount%2) == 0) {
- int64_t pciId;
- NCCLCHECK(pciPathToInt64(path, offset, offsetRC, &pciId));
- for (int p=0; p<system->nodes[PCI].count; p++) {
- if (system->nodes[PCI].nodes[p].id == pciId) {
- // Found our PCI switch. Attach and stop since the rest should already
- // be connected
- NCCLCHECK(ncclTopoConnectNodes(system->nodes[PCI].nodes+p, lastNode, LINK_PCI, pciWidth));
- NCCLCHECK(ncclTopoConnectNodes(lastNode, system->nodes[PCI].nodes+p, LINK_PCI, pciWidth));
- return ncclSuccess;
- }
- }
- struct ncclTopoNode* pciNode;
- NCCLCHECK(ncclTopoCreateNode(system, &pciNode, PCI, pciId));
- NCCLCHECK(ncclTopoConnectNodes(pciNode, lastNode, LINK_PCI, pciWidth));
- NCCLCHECK(ncclTopoConnectNodes(lastNode, pciNode, LINK_PCI, pciWidth));
- lastNode = pciNode;
- }
- }
+ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) {
+ // Aggregate parallel links (e.g. multiple NVLinks) between the same two nodes into one higher-width link
+ struct ncclTopoLink* link;
+ for (link = node->links; link->remNode; link++) {
+ if (link->remNode == remNode && link->type == type) break;
}
- // Then attach to a CPU node
- int numaId = getNumaId(path);
- int width;
- NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
- NCCLCHECK(ncclTopoConnectCpu(system, numaId, lastNode, LINK_PCI, width));
- return ncclSuccess;
-}
-
-// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports.
-#include <glob.h>
-#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid"
-uint64_t getIbGuid(char* path) {
- uint64_t guid = 0ULL;
- char guidPath[PATH_MAX];
- snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path);
- // PATH has a wildcard in it so use glob()
- glob_t globbuf;
- glob(guidPath, 0, NULL, &globbuf);
- if (globbuf.gl_pathc > 0)
- strncpy(guidPath, globbuf.gl_pathv[0], PATH_MAX);
- globfree(&globbuf);
- guidPath[PATH_MAX-1] = '\0';
- FILE *file = fopen(guidPath, "r");
- if (file != NULL) {
- uint64_t a, b, c, d;
- if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) != EOF) {
- guid = (a << 48) + (b << 32) + (c<<16) + d;
- TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid);
- }
- fclose(file);
- }
- return guid;
-}
-
-struct netInfo {
- char* path;
- int64_t nic;
- uint64_t asic;
- int port;
- int net;
-};
-
-ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) {
- for (int n=0; n<ndev; n++) {
- struct netInfo* info = netInfos+n;
- uint64_t ibGuid;
- info->nic = n;
- info->asic = n;
- info->port = 0;
- info->net = n;
- if (info->path && (ibGuid = getIbGuid(info->path)) != 0) {
- info->asic = ibGuid;
-
- // Ignore PCI subdevice when computing the ID to merge multi-port cards
- // and make them use the same PCI link.
- char* path = strdup(info->path);
- path[strlen(path)-1]='0';
- NCCLCHECK(pciPathToInt64(path, strlen(path), 0, &info->nic));
- free(path);
-
- // Same PCI path -> different ports of the same NIC
- for (int i=0; i<n; i++) if (netInfos[i].nic == info->nic) info->port++;
-
- // Same GUID -> same network links as the other NIC
- for (int i=0; i<n; i++) if (netInfos[i].asic == info->asic && netInfos[i].port == info->port) info->net = netInfos[i].net;
- }
- INFO(NCCL_GRAPH, "%s -> %x/%lx/%d/%d", info->path, info->nic, info->asic, info->port, info->net);
+ if (link->remNode == NULL) node->nlinks++;
+ link->type = type;
+ link->remNode = remNode;
+ link->width += width;
+
+ // Sort links in BW descending order
+ struct ncclTopoLink linkSave;
+ memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
+ while (link != node->links) {
+ if ((link-1)->width >= linkSave.width) break;
+ memcpy(link, link-1, sizeof(struct ncclTopoLink));
+ link--;
}
+ memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
return ncclSuccess;
}
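A minimal sketch of the aggregate-then-sort behavior above, with a simplified Link and std::stable_sort standing in for the hand-rolled insertion pass: connecting the same pair twice over the same link type accumulates width rather than adding a parallel link.

    #include <cstdio>
    #include <algorithm>
    #include <vector>

    struct Link { int rem; int type; float width; };  // simplified ncclTopoLink

    static void connect(std::vector<Link>& links, int rem, int type, float width) {
      bool merged = false;
      for (Link& l : links)
        if (l.rem == rem && l.type == type) { l.width += width; merged = true; break; }
      if (!merged) links.push_back({rem, type, width});
      // Keep links sorted by width, descending.
      std::stable_sort(links.begin(), links.end(),
                       [](const Link& a, const Link& b) { return a.width > b.width; });
    }

    int main() {
      std::vector<Link> links;
      connect(links, /*rem=*/1, /*type=*/0, 25.0f);  // first NVLink to node 1
      connect(links, 1, 0, 25.0f);                   // second NVLink: aggregated
      printf("links=%zu width=%g\n", links.size(), links[0].width);  // 1, 50
      return 0;
    }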
-ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
- for (int g=0; g<system->nodes[GPU].count; g++) {
- struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
- char* path;
- NCCLCHECK(getPath(gpu->id, &path));
- NCCLCHECK(ncclTopoCreatePciPath(system, gpu, path));
- free(path);
- }
-
- // Connect the NICs
- int netDevCount;
- NCCLCHECK(ncclNetDevices(&netDevCount));
- int netWidth;
- NCCLCHECK(ncclTopoGetNetWidth(&netWidth));
-
- struct netInfo* netInfos;
- NCCLCHECK(ncclCalloc(&netInfos, netDevCount));
-
- for (int n=0; n<netDevCount; n++) {
- ncclResult_t res = ncclNetPciPath(n, &netInfos[n].path);
- if (res != ncclSuccess) netInfos[n].path = NULL;
- }
-
- NCCLCHECK(ncclTopoComputeNetInfo(netInfos, netDevCount));
-
- for (int n=0; n<netDevCount; n++) {
- struct netInfo* info = netInfos+n;
- // Create NIC and attach it to the PCI tree
- struct ncclTopoNode* nicNode = NULL;
- for (int i=0; i<system->nodes[NIC].count; i++) {
- if (system->nodes[NIC].nodes[i].id == info->nic) {
- nicNode = system->nodes[NIC].nodes+i;
- break;
- }
- }
- if (!nicNode) {
- NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic));
- if (info->path) {
- // Create the PCI path
- NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path));
- } else {
- // This is probably a virtual NIC. Just attach it directly to CPU 0
- int width;
- NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
- NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width));
- }
- }
- free(info->path);
-
- // Create the network side
- struct ncclTopoNode* netNode;
- NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, n));
-
- // Use rank to store the net information
- netNode->rank = info->net;
-
- NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth));
- NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth));
- }
- free(netInfos);
-
+ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
// And connect all CPU nodes together
for (int n=0; n<system->nodes[CPU].count; n++) {
for (int p=0; p<system->nodes[CPU].count; p++) {
if (n == p) continue;
- int width;
- NCCLCHECK(ncclTopoGetInterCpuWidth(&width));
- NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width));
+ float width;
+ NCCLCHECK(ncclTopoGetInterCpuWidth(system->nodes[CPU].nodes+n, &width));
+ NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, width));
}
}
return ncclSuccess;
@@ -491,7 +184,9 @@ ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
if (node->type == GPU) {
- sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank);
+ sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
+ } else if (node->type == CPU) {
+ sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
} else {
sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
}
@@ -501,14 +196,14 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
for (int l=0; l<node->nlinks; l++) {
struct ncclTopoLink* link = node->links+l;
if (link->type == LINK_LOC) continue;
- if (link->remNode != prevNode) {
- sprintf(line+offset, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width);
+ if (link->type != LINK_PCI || link->remNode != prevNode) {
+ sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->width);
int nextOffset = strlen(line);
if (link->type == LINK_PCI) {
NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
} else {
if (link->remNode->type == NET) {
- sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->rank);
+ sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.width);
} else {
sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
}
@@ -520,7 +215,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
}
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
- INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed);
+ INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f ===", s->maxWidth);
char line[1024];
for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
INFO(NCCL_GRAPH, "==========================================");
@@ -554,88 +249,400 @@ static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode*
// 1. NVLinks (already the case)
// 2. PCI down
// 3. PCI up
-// 4. QPI (already the case)
+// 4. SYS (already the case)
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
for (int n=0; n<system->nodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL));
return ncclSuccess;
}
-ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
- struct ncclTopoSystem* s;
- NCCLCHECK(ncclCalloc(&s, 1));
- nvmlDevice_t* nvmlDevs;
- int g = 0;
- NCCLCHECK(ncclCalloc(&nvmlDevs, comm->nRanks));
- for (int r=0; r<comm->nRanks; r++) {
- if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
- // Consider the GPU as outside of our node if we can't see it through NVML.
- char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
- if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+g) != ncclSuccess) continue;
- g++;
- struct ncclTopoNode* gpuNode;
- NCCLCHECK(ncclTopoCreateNode(s, &gpuNode, GPU, comm->peerInfo[r].busId));
- gpuNode->rank = r;
+ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
+ int dev;
+ NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));
+
+ struct ncclTopoNode* net;
+ NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev));
+ const char* str;
+ NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
+ if (str) sscanf(str, "0x%lx", &net->net.asic);
+ else net->net.asic = dev;
+
+ ncclDebugNoWarn = NCCL_GRAPH;
+ int mbps;
+ if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0;
+ if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
+ net->net.width = mbps / 8000.0;
+ if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0;
+ if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
+ if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
+ if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
+ ncclDebugNoWarn = 0;
+
+ NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
+ NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width));
+ return ncclSuccess;
+}
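
The speed attribute is reported in Mb/s; dividing by 8000 (8 bits per byte, 1000 Mb per Gb) gives the width in GB/s used throughout the topology code. A minimal standalone sketch of the conversion above, with illustrative values only:

    // Sketch of the Mb/s -> GB/s conversion in ncclTopoAddNet (illustrative).
    float netWidthGBs(int mbps) {
      if (mbps <= 0) mbps = 10000;  // some NICs report speed = -1
      return mbps / 8000.0f;        // e.g. 100000 Mb/s (100G) -> 12.5 GB/s
    }
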
+
+ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
+ for (int s=0; s<xmlNic->nSubs; s++) {
+ struct ncclXmlNode* xmlNet = xmlNic->subs[s];
+ if (strcmp(xmlNet->name, "net") != 0) continue;
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
+ if (index == -1) continue;
+ NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic));
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
+ NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));
+ NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
+ NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
+ NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
+ // Do not go any further; NVLinks will be added in a second pass
+ return ncclSuccess;
+}
+
+struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, 0 } };
+struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { NULL, 0 } }; // x100 Mbps per lane
+ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
+ const char* str;
+
+ int type;
+ NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
+ NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));
+
+ int64_t busId;
+ NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
+ NCCLCHECK(busIdToInt64(str, &busId));
+
+ struct ncclTopoNode* node = NULL;
+ if (type == GPU) {
+ struct ncclXmlNode* xmlGpu;
+ NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
+ if (xmlGpu == NULL) return ncclSuccess;
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
+ if (index == -1) return ncclSuccess;
+ NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
+ NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node));
+ }
+ if (type == NIC) {
+ struct ncclXmlNode* xmlNic;
+ NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
+ if (xmlNic == NULL) return ncclSuccess;
+
+ // Ignore sub device ID and merge multi-port NICs into one PCI device.
+ busId &= 0xfffffffffffffff0;
+ struct ncclTopoNode* nicNode = NULL;
+ NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId));
+ if (nicNode == NULL) {
+ NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
+ node = nicNode; // Connect it to parent later on
+ }
+ NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
+ } else if (type == PCI) {
+ NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
+ for (int s=0; s<xmlPci->nSubs; s++) {
+ struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
+ NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
}
}
- NCCLCHECK(ncclTopoConnectNVLink(nvmlDevs, s));
- NCCLCHECK(ncclTopoConnectPCI(s));
+ if (node) {
+ int width, speed;
+ NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
+ NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));
+
+ // Manage cases where speed was not indicated in /sys
+ if (width == 0) width = 16;
+ if (strlen(str) == 0 || strcasecmp(str, "Unknown speed") == 0) str = "8 GT/s";
- free(nvmlDevs);
- NCCLCHECK(ncclTopoSortSystem(s));
- *system = s;
+ NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)
+
+ NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0));
+ NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0));
+ }
return ncclSuccess;
}
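
kvDictPciGen stores effective per-lane rates in units of 100 Mb/s (derated from the raw signaling rate), so width*speed/80.0 yields GB/s. A worked sketch using the table above; the function name is illustrative:

    // PCI link width computation from ncclTopoAddPci, as a standalone sketch.
    // Gen3 "8 GT/s" maps to 60, i.e. ~6 Gb/s effective per lane, so a
    // Gen3 x16 link gives 16 * 60 / 80.0 = 12 GB/s (PCI_WIDTH in topo.h).
    float pciWidthGBs(int lanes, int per100Mbps) {
      return lanes * per100Mbps / 80.0f;
    }
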
-ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink) {
- int g1, g2;
- NCCLCHECK(idToIndex(system, busId1, &g1));
- NCCLCHECK(idToIndex(system, busId2, &g2));
- *nvlink = g1 != -1 && g2 != -1 && system->nodes[GPU].nodes[g1].paths[GPU][g2].type == LINK_NVL;
+struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
+struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { NULL, 0 } };
+
+ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
+ int numaId;
+ NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
+ struct ncclTopoNode* cpu;
+ NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
+ const char* str;
+ NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
+ if (str != NULL) {
+ NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity));
+ }
+
+ NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str));
+ NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch));
+ if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) {
+ NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str));
+ NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor));
+ if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+ int familyId, modelId;
+ NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
+ NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
+ cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_INTEL_SKL : NCCL_TOPO_CPU_INTEL_BDW;
+ }
+ }
+ for (int s=0; s<xmlCpu->nSubs; s++) {
+ struct ncclXmlNode* node = xmlCpu->subs[s];
+ if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu));
+ if (strcmp(node->name, "nic") == 0) {
+ struct ncclTopoNode* nic = NULL;
+ NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
+ if (nic == NULL) {
+ NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0));
+ NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH));
+ NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH));
+ }
+ NCCLCHECK(ncclTopoAddNic(node, system, nic));
+ }
+ }
return ncclSuccess;
}
-ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) {
- int g;
- NCCLCHECK(idToIndex(system, busId, &g));
- for (int i=0; i<system->nodes[GPU].count; i++) {
- if (i == g) continue;
- if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) {
- *nvlink = 1;
- return ncclSuccess;
+ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
+ if (strcmp(node->name, "nvlink") == 0) {
+ struct ncclTopoNode* gpu = NULL;
+ int64_t pBusId;
+ NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
+ NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
+ if (gpu == NULL) {
+ WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
+ return ncclInternalError;
+ }
+ int count;
+ NCCLCHECK(xmlGetAttrInt(node, "count", &count));
+ const char* targetClass;
+ NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
+ int targetType;
+ NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));
+ struct ncclTopoNode* remote = NULL;
+ if (targetType == GPU) {
+ // NVL P2P connection to another GPU
+ const char* target;
+ NCCLCHECK(xmlGetAttrStr(node, "target", &target));
+ int64_t busId;
+ NCCLCHECK(busIdToInt64(target, &busId));
+ NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
+ } else if (targetType == CPU) {
+ // NVL connection to the local CPU
+ NCCLCHECK(findLocalCpu(gpu, &remote));
+ } else {
+ if (system->nodes[NVS].count == 0) {
+ NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
+ } else {
+ remote = system->nodes[NVS].nodes;
+ }
+ }
+ if (remote) {
+ int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
+ NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
+ if (remote->type != GPU) {
+ NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
+ }
+ }
+ } else {
+ const char* busId;
+ NCCLCHECK(xmlGetAttr(node, "busid", &busId));
+ for (int s=0; s<node->nSubs; s++) {
+ NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId));
}
}
- *nvlink = 0;
return ncclSuccess;
}
-static int pathDistance(struct ncclTopoLinkList* links) {
- int distance = PATH_PIX;
- if (links->count > 2) distance = PATH_PXB;
- for (int l=0; l<links->count; l++) {
- // PHB if we go through 1 CPU, SYS if we go through 2 CPUs
- if (links->list[l]->remNode->type == CPU) distance = (distance == PATH_PHB) ? PATH_SYS : PATH_PHB;
+ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
+ NCCLCHECK(ncclCalloc(topoSystem, 1));
+ struct ncclXmlNode* topNode;
+ NCCLCHECK(xmlFindTag(xml, "system", &topNode));
+ for (int s=0; s<topNode->nSubs; s++) {
+ struct ncclXmlNode* node = topNode->subs[s];
+ if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
+ }
+ NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
+
+ NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
+ NCCLCHECK(ncclTopoSortSystem(*topoSystem));
+
+ return ncclSuccess;
+}
+
+NCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0);
+
+// Only set values if not already set
+static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
}
- return distance;
+ return ncclSuccess;
+}
+static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attrName, const uint64_t value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%lx", value);
+ }
+ return ncclSuccess;
}
-ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance) {
- int g1, g2;
- NCCLCHECK(idToIndex(system, busId1, &g1));
- NCCLCHECK(idToIndex(system, busId2, &g2));
- *distance = pathDistance(system->nodes[GPU].nodes[g1].paths[GPU]+g2);
+
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
+ struct ncclXml* xml;
+ NCCLCHECK(ncclCalloc(&xml, 1));
+ char* xmlTopoFile = getenv("NCCL_TOPO_FILE");
+ if (xmlTopoFile) {
+ NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml));
+ }
+ if (xml->maxIndex == 0) {
+ // Create top tag
+ struct ncclXmlNode* top;
+ NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
+ NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
+ }
+
+ // Auto-detect GPUs if needed
+ for (int r=0; r<comm->nRanks; r++) {
+ if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
+ struct ncclXmlNode* node;
+ NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
+ NCCLCHECK(xmlSetAttrInt(node, "rank", r));
+ NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
+ }
+ }
+ // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
+ // so we start with collnet to give it precedence.
+ int netDevCount = 0;
+ if (ncclCollNet) {
+ NCCLCHECK(collNetDevices(&netDevCount));
+ for (int n=0; n<netDevCount; n++) {
+ ncclNetProperties_t props;
+ NCCLCHECK(collNetGetProperties(n, &props));
+ struct ncclXmlNode* netNode;
+ NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
+ NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
+ NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
+ NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
+ NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
+ NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
+ NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
+ NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
+ }
+ }
+ if (netDevCount == 0) {
+ NCCLCHECK(ncclNetDevices(&netDevCount));
+ }
+ for (int n=0; n<netDevCount; n++) {
+ ncclNetProperties_t props;
+ NCCLCHECK(ncclNetGetProperties(n, &props));
+ struct ncclXmlNode* netNode;
+ NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
+ NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
+ NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
+ NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
+ NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
+ NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
+ NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
+ }
+
+ xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
+ if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
+ NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
+ }
+
+ NCCLCHECK(ncclTopoGetSystemFromXml(xml, system));
+ free(xml);
return ncclSuccess;
}
-ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) {
- int g;
- NCCLCHECK(idToIndex(system, busId, &g));
- *distance = pathDistance(system->nodes[GPU].nodes[g].paths[NET]+netDev);
+/****************************/
+/* External query functions */
+/****************************/
+
+ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model) {
+ *arch = system->nodes[CPU].nodes[0].cpu.arch;
+ *vendor = system->nodes[CPU].nodes[0].cpu.vendor;
+ *model = system->nodes[CPU].nodes[0].cpu.model;
return ncclSuccess;
}
-ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count) {
- *count = system->nodes[CPU].count;
+NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
+
+ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) {
+ struct ncclTopoNode* cpu = NULL, *gpu = NULL;
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
+ gpu = system->nodes[GPU].nodes+g;
+ // Find the closest CPU
+ int cpuIndex = -1, minHops = 0;
+ for (int c=0; c<system->nodes[CPU].count; c++) {
+ int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
+ if (cpuIndex == -1 || nHops < minHops) {
+ cpuIndex = c;
+ minHops = nHops;
+ }
+ }
+ cpu = system->nodes[CPU].nodes+cpuIndex;
+ }
+ }
+ if (cpu == NULL) {
+ WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
+ return ncclInternalError;
+ }
+
+ // Query the CPU affinity set we were provided
+ cpu_set_t mask;
+ SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
+
+#ifdef ENABLE_TRACE
+ {
+ char affinityStr[sizeof(cpu_set_t)*2];
+ NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
+ TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
+ }
+#endif
+
+ // Get the affinity of the CPU close to our GPU.
+ cpu_set_t cpuMask = cpu->cpu.affinity;
+
+#ifdef ENABLE_TRACE
+ {
+ char affinityStr[sizeof(cpu_set_t)*2];
+ NCCLCHECK(ncclCpusetToStr(&cpuMask, affinityStr));
+ TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
+ }
+#endif
+
+ cpu_set_t finalMask;
+ if (ncclParamIgnoreCpuAffinity())
+ // Ignore the CPU affinity set and use the GPU one instead
+ finalMask = cpuMask;
+ else
+ // Use a subset of the GPU affinity set
+ CPU_AND(&finalMask, &mask, &cpuMask);
+
+ // If there is a non-empty set, use it to set affinity
+ if (CPU_COUNT(&finalMask)) {
+ char affinityStr[sizeof(cpu_set_t)*2];
+ NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
+ INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
+ SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
+ }
return ncclSuccess;
}
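
The final mask is the intersection of the mask inherited from the process (sched_getaffinity) and the mask of the CPU closest to the GPU, unless NCCL_IGNORE_CPU_AFFINITY is set, in which case the GPU-local mask is used outright. A standalone sketch of that selection, assuming glibc's CPU_* macros (function and variable names are illustrative):

    #define _GNU_SOURCE
    #include <sched.h>

    // Sketch of the mask selection in ncclTopoSetAffinity above.
    void pickAffinity(cpu_set_t* inherited, cpu_set_t* numaLocal,
                      int ignoreInherited, cpu_set_t* out) {
      if (ignoreInherited) *out = *numaLocal;   // NCCL_IGNORE_CPU_AFFINITY=1
      else CPU_AND(out, inherited, numaLocal);  // default: subset of both
      // The caller only applies *out when CPU_COUNT(out) > 0.
    }
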
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 6b8a2f9..848fc03 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,22 +9,24 @@
#include "graph.h"
#include "core.h"
-
-#define LOC_WIDTH 5000
-#define PASCAL_NVLINK_WIDTH 18
-#define VOLTA_NVLINK_WIDTH 21
-#define PCI_WIDTH 12 // PCI Gen3 x16
-#define QPI_WIDTH 8
-#define SKL_QPI_WIDTH 12
-#define P9_WIDTH 32
-#define NET_WIDTH 12 // 100Gbit
-
-// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, to GPU
-// to GPU traffic consumed more PCI bandwidth.
+#include <sched.h>
+
+#define LOC_WIDTH 5000.0
+#define PASCAL_NVLINK_WIDTH 18.0
+#define VOLTA_NVLINK_WIDTH 21.0
+#define PCI_WIDTH 12.0 // PCI Gen3 x16
+#define QPI_WIDTH 6.0
+#define SKL_QPI_WIDTH 9.0
+#define P9_WIDTH 32.0
+#define ARM_WIDTH 6.0
+#define NET_WIDTH 12.0 // 100Gbit
+
+// Intel CPUs convert GPU P2P traffic into 64B PCI TLPs, so GPU
+// to GPU traffic consumes more PCI bandwidth.
#define INTEL_P2P(speed) (speed*9/12)
#define INTEL_P2P_OVERHEAD(speed) (speed*12/9)
-#define NCCL_TOPO_NODE_TYPES 6
+#define NCCL_TOPO_NODE_TYPES 7
#define GPU 0
#define PCI 1
#define NVS 2
@@ -33,37 +35,72 @@
#define NET 5
extern const char* topoNodeTypeStr[];
+// We want link types and path types to match as much as possible
#define LINK_LOC 0
#define LINK_NVL 1
#define LINK_PCI 2
-#define LINK_QPI 3
-#define LINK_NET 4
+// Skipping 3 for PATH_PXB
+// Skipping 4 for PATH_PHB
+#define LINK_SYS 5
+#define LINK_NET 6
extern const char* topoLinkTypeStr[];
+#define PATH_LOC 0
+#define PATH_NVL 1
+#define PATH_PIX 2
+#define PATH_PXB 3
+#define PATH_PHB 4
+#define PATH_SYS 5
+#define PATH_NET 6
+extern const char* topoPathTypeStr[];
+
struct ncclTopoNode;
struct ncclTopoLink {
int type;
- int width;
+ float width;
struct ncclTopoNode* remNode;
};
#define NCCL_TOPO_MAX_LINKS 32
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
-#define SELECT_PATH 1
-#define SELECT_LAST 2
-
-#define NET_GDR_MASK 0x70000000
struct ncclTopoLinkList {
struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS];
int count;
- int width;
+ float width;
int type;
};
+#define NCCL_TOPO_CPU_INTEL_BDW 1
+#define NCCL_TOPO_CPU_INTEL_SKL 2
+
+#define NCCL_TOPO_UNDEF (-1)
+
struct ncclTopoNode {
int type;
int64_t id;
- int rank;
+ // Type specific data
+ union {
+ struct {
+ int dev; // NVML dev number
+ int rank;
+ int cudaCompCap;
+ int gdrSupport;
+ }gpu;
+ struct {
+ uint64_t asic;
+ int port;
+ float width;
+ int gdrSupport;
+ int collSupport;
+ int maxChannels;
+ }net;
+ struct {
+ int arch;
+ int vendor;
+ int model;
+ cpu_set_t affinity;
+ }cpu;
+ };
int nlinks;
struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
// Pre-computed paths to GPUs and NICs
@@ -79,60 +116,29 @@ struct ncclTopoNodeSet {
struct ncclTopoSystem {
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
- int maxSpeed;
- int maxWidth;
- int searchInitDone;
+ float maxWidth;
};
-static ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
+ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
+ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
+ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id);
+ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width);
+ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
+ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
+
+ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
+ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
+ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);
+
+static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
+ *index = -1;
for (int i=0; i<system->nodes[type].count; i++) {
if (system->nodes[type].nodes[i].id == id) {
- *node = system->nodes[type].nodes+i;
+ *index = i;
return ncclSuccess;
}
}
- if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
- WARN("Error : tried to create too many nodes of type %d\n", type);
- return ncclInternalError;
- }
- struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
- system->nodes[type].count++;
- n->type = type;
- n->id = id;
- if (type == GPU) {
- // Create link to itself (used in some corner cases)
- n->nlinks=1;
- n->links[0].type = LINK_LOC;
- n->links[0].remNode = n;
- n->links[0].width = LOC_WIDTH;
- }
- *node = n;
- return ncclSuccess;
-}
-
-static ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, int width) {
- // Aggregate links into higher width for NVLink
- struct ncclTopoLink* link;
- for (link = node->links; link->remNode; link++) {
- if (link->remNode == remNode && link->type == type) break;
- }
- if (link->remNode == NULL) node->nlinks++;
- link->type = type;
- link->remNode = remNode;
- link->width += width;
-
- // Sort links in BW descending order
- struct ncclTopoLink linkSave;
- memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
- while (link != node->links) {
- if ((link-1)->width >= linkSave.width) break;
- memcpy(link, link-1, sizeof(struct ncclTopoLink));
- link--;
- }
- memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
- return ncclSuccess;
+ return ncclInternalError;
}
-ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
-
#endif
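
With node lookups now going through indices rather than raw pointers, a typical caller first resolves a bus ID to an index into the per-type node array. A short usage sketch in the style of the rest of the code base (variable names illustrative):

    // Translate a GPU bus ID into its index in system->nodes[GPU].
    int g;
    NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
    struct ncclTopoNode* gpu = system->nodes[GPU].nodes + g;
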
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index 87afb2f..8a0b4cd 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -52,12 +52,12 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
}
static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
-static const char* ncclAlgoStr[] = { "Tree", "Ring" };
+static const char* ncclAlgoStr[] = { "Tree", "Ring", "CollNet" };
static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" };
// Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
-static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 3.6, 8.4 } };
+static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 3.6, 8.4 }, { 4.4, 4.4, 0 } };
// NVLink, PCI, Network
#define NCCL_HW_NVLINK 0
@@ -66,29 +66,32 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4,
// Tree/Simple is the latency a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network).
static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
{ /* NVLINK */
- { /* Tree (LL/LL128/Simple)*/ { .5, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { .4, 2.5, 5.7 } },
+ { /* Tree (LL/LL128/Simple)*/ { .5, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { .4, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { .5, 1.9, 4.0 } },
/* PCI */
- { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 } },
+ { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 1.0, 1.9, 5.5 } },
/* NET */
- { /* Tree (LL/LL128/Simple)*/ { 5.0, 7.5, 50 }, /* Ring (LL/LL128/Simple)*/ { .9, 2.5, 6.6 } }
+ { /* Tree (LL/LL128/Simple)*/ { 5.0, 7.5, 50 }, /* Ring (LL/LL128/Simple)*/ { .9, 2.5, 6.6 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } }
};
// LL128 max BW for the different collectives
static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 };
-ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph) {
- int simpleDefaultThreads = (treeGraph->speedIntra*treeGraph->nChannels <= 12) ? 256 : NCCL_MAX_NTHREADS;
- comm->maxThreads[NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
- comm->maxThreads[NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
- comm->maxThreads[NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
-
- INFO(NCCL_INIT, "Threads per block : %d/%d/%d", comm->maxThreads[NCCL_PROTO_LL], comm->maxThreads[NCCL_PROTO_LL128], comm->maxThreads[NCCL_PROTO_SIMPLE]);
+ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
+ int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_MAX_NTHREADS;
+ comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
+ getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
+ comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
+ getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+ comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
+ getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+ comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
+ getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
if (comm->nRanks <= 1) return ncclSuccess;
- struct ncclTopoGraph* graphs[2] = { treeGraph, ringGraph };
- int intraHw[2], hw[2];
- for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->nvlink ? NCCL_HW_NVLINK : NCCL_HW_PCI;
+ struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph };
+ int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
@@ -97,21 +100,24 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
comm->nRanks;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
- if (coll != ncclCollAllReduce && a == NCCL_ALGO_TREE) continue;
+ if (coll != ncclCollAllReduce && a != NCCL_ALGO_RING) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- int speed = comm->nNodes <= 2 ? graphs[a]->speedIntra : graphs[a]->speedInter;
- float busBw = graphs[a]->nChannels * speed * 1.0;
+ float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
+ float busBw = graphs[a]->nChannels * speed;
// Various model refinements
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/4.0;
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw*120.0/128.0, ll128MaxBw[coll]);
- if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, comm->nNodes > 1 ? 70.0 : 90.0);
+ if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, comm->nNodes > 2 ? 80.0 : 110.0);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.0;
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0;
+ if (a == NCCL_ALGO_COLLNET) busBw *= .9;
+ if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
+ if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128
// Convert bus BW to algorithm BW
- float ratio = a == NCCL_ALGO_TREE ? .5 : (1.0 * comm->nRanks) / nsteps;
+ float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * comm->nRanks) / nsteps;
comm->bandwidths[coll][a][p] = busBw * ratio;
comm->latencies[coll][a][p] = baseLat[a][p];
@@ -127,11 +133,16 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
} else {
comm->latencies[coll][a][p] += nsteps*lat;
}
- } else {
+ } else if (a == NCCL_ALGO_TREE) {
float intraLat = hwLat[intraHw[a]][a][p];
float interLat = hwLat[NCCL_HW_NET][a][p];
comm->latencies[coll][a][p] +=
2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
+ } else {
+ float intraLat = hwLat[intraHw[a]][a][p];
+ float interLat = hwLat[NCCL_HW_NET][a][p];
+ comm->latencies[coll][a][p] +=
+ 2 * (comm->nRanks/comm->nNodes-1) * intraLat + interLat;
}
}
}
@@ -140,7 +151,7 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
// Protocols/Algorithms enable/disable, and user overrides.
// All are enabled except ll128 which is enabled by default only in certain cases.
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
- int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1 };
+ int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1 };
const char *protoStr = getenv("NCCL_PROTO");
if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
@@ -151,30 +162,32 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
int pEnable = protoEnable[p];
if (pEnable == 2 && p == NCCL_PROTO_LL128) {
// Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption.
- pEnable = (graphs[a]->type <= LINK_PCI) && graphs[a]->nvlink && minCompCap == 70 && maxCompCap == 70 ? 1 : 0;
+ pEnable = (graphs[a]->typeInter <= LINK_PCI) && graphs[a]->typeIntra == LINK_NVL && minCompCap == 70 && maxCompCap == 70 ? 1 : 0;
}
if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
}
if (comm->rank == 0) {
char line[1024];
- int offset = 0;
sprintf(line, "Latency/AlgBw |");
- offset = strlen(line);
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- sprintf(line+offset, " %4s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
- offset = strlen(line);
+ sprintf(line+strlen(line), " %7s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
+ }
+ }
+ INFO(NCCL_TUNING, "%s", line);
+ sprintf(line, " Max NThreads |");
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
}
}
INFO(NCCL_TUNING, "%s", line);
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
sprintf(line, "%13s |", ncclFuncStr[c]);
- offset = strlen(line);
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- sprintf(line+offset, "%7.1f/%5.1f|", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
- offset = strlen(line);
+ sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
}
}
INFO(NCCL_TUNING, "%s", line);
@@ -201,12 +214,34 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
}
}
- INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld",
+ INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld/%ld/%ld",
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL],
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128],
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE],
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL],
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128],
- comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
+ comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE],
+ comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL],
+ comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128],
+ comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE]);
+ return ncclSuccess;
+}
+
+// Trees do not stick perfectly to the model for medium sizes. Applying a static correction
+// factor is not ideal but works quite well. Entries are for power-of-two sizes, 64 B to 1 GB.
+static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
+ { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .5, .5, .6, .7, .8, .9, .9, 1.0, 1.0, 1.0 },
+ { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .8, .7, .7, .7, .6, .6, .7, .7, .8, .8, .9, .9, 1.0 },
+ { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .5, .5, .6, .6, .7, .8, .9 }
+};
+
+ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time) {
+ float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
+ if (bw == 0) {
+ *time = -1.0; return ncclSuccess;
+ }
+ int logSize = log2i(info->nBytes>>6);
+ if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize];
+ *time = info->comm->latencies[info->coll][algorithm][protocol] + (info->nBytes) / (1000 * bw);
return ncclSuccess;
}
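
Latencies are in microseconds and bandwidths in GB/s, so bw GB/s moves bw bytes per nanosecond and nBytes/(1000*bw) is the transfer time in microseconds. A worked sketch with illustrative numbers:

    #include <cstddef>

    // Time model from ncclTopoGetAlgoTime above, as a standalone sketch.
    // Example: 4 MB buffer, 20 us latency, 40 GB/s bandwidth:
    //   logSize = log2i(4194304 >> 6) = 16, so Tree would first scale bw
    //   by treeCorrectionFactor[proto][16];
    //   time = 20 + 4194304 / (1000 * 40) ~= 124.9 us.
    float algoTimeUs(float latUs, float bwGBs, size_t nBytes) {
      return latUs + nBytes / (1000.0f * bwGBs);
    }
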
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
new file mode 100644
index 0000000..550cfcd
--- /dev/null
+++ b/src/graph/xml.cc
@@ -0,0 +1,780 @@
+/*************************************************************************
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include "core.h"
+#include "nvmlwrap.h"
+#include "xml.h"
+
+/*******************/
+/* XML File Parser */
+/*******************/
+
+ncclResult_t xmlGetChar(FILE* file, char* c) {
+ if (fread(c, 1, 1, file) == 0) {
+ WARN("XML Parse : Unexpected EOF");
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t xmlGetValue(FILE* file, char* value, char* last) {
+ char c;
+ NCCLCHECK(xmlGetChar(file, &c));
+ if (c != '"' && c != '\'') {
+#if INT_OK
+ int o = 0;
+ do {
+ value[o++] = c;
+ NCCLCHECK(xmlGetChar(file, &c));
+ } while (c >= '0' && c <= '9');
+ value[o] = '\0';
+ *last = c;
+ return ncclSuccess;
+#else
+ WARN("XML Parse : Expected (double) quote.");
+ return ncclInternalError;
+#endif
+ }
+ int o = 0;
+ do {
+ NCCLCHECK(xmlGetChar(file, &c));
+ value[o++] = c;
+ } while (c != '"');
+ value[o-1] = '\0';
+ NCCLCHECK(xmlGetChar(file, last));
+ return ncclSuccess;
+}
+
+ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
+ char c;
+ char* ptr = name;
+ int o = 0;
+ do {
+ NCCLCHECK(xmlGetChar(file, &c));
+ if (c == '=') {
+ ptr[o] = '\0';
+ if (value == NULL) {
+ WARN("XML Parse : Unexpected value with name %s\n", ptr);
+ return ncclInternalError;
+ }
+ return xmlGetValue(file, value, last);
+ }
+ ptr[o] = c;
+ if (o == MAX_STR_LEN-1) {
+ ptr[o] = '\0';
+ WARN("Error : name %s too long (max %d)", ptr, MAX_STR_LEN);
+ return ncclInternalError;
+ }
+ o++;
+ } while (c != ' ' && c != '>' && c != '/' && c != '\n' && c != '\r');
+ ptr[o-1] = '\0';
+ *last = c;
+ return ncclSuccess;
+}
+
+// Shift the 3-char string by one char and append c at the end
+#define SHIFT_APPEND(s, c) do { s[0]=s[1]; s[1]=s[2]; s[2]=c; } while(0)
+ncclResult_t xmlSkipComment(FILE* file, char* start, char next) {
+ // Start from something neutral with \0 at the end.
+ char end[4] = "...";
+
+ // Inject all trailing chars from previous reads. We don't need
+ // to check for --> here because there cannot be a > in the name.
+ for (int i=0; i<strlen(start); i++) SHIFT_APPEND(end, start[i]);
+ SHIFT_APPEND(end, next);
+
+ // Stop when we find "-->"
+ while (strcmp(end, "-->") != 0) {
+ char c;
+ if (fread(&c, 1, 1, file) != 1) {
+ WARN("XML Parse error : unterminated comment");
+ return ncclInternalError;
+ }
+ SHIFT_APPEND(end, c);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
+ node->type = NODE_TYPE_NONE;
+ char c = ' ';
+ while (c == ' ' || c == '\n' || c == '\r') {
+ if (fread(&c, 1, 1, file) == 0) return ncclSuccess;
+ }
+ if (c != '<') {
+ WARN("XML Parse error : expecting '<', got '%c'", c);
+ return ncclInternalError;
+ }
+ // Read XML element name
+ NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
+
+ // Check for comments
+ if (strncmp(node->name, "!--", 3) == 0) {
+ NCCLCHECK(xmlSkipComment(file, node->name+3, c));
+ return xmlGetNode(file, node);
+ }
+
+ // Check for closing tag
+ if (node->name[0] == '\0' && c == '/') {
+ node->type = NODE_TYPE_CLOSE;
+ // Re-read the name, we got '/' in the first call
+ NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
+ if (c != '>') {
+ WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name);
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+ }
+
+ node->type = NODE_TYPE_OPEN;
+
+ // Get Attributes
+ int a = 0;
+ while (c == ' ') {
+ NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
+ if (a == MAX_ATTR_COUNT) {
+ INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT);
+ // We still need to consume the extra attributes, hence the spare slot.
+ } else a++;
+ }
+ node->nAttrs = a;
+ if (c == '/') {
+ node->type = NODE_TYPE_SINGLE;
+ char str[MAX_STR_LEN];
+ NCCLCHECK(xmlGetToken(file, str, NULL, &c));
+ }
+ if (c != '>') {
+ WARN("XML Parse : expected >, got '%c'", c);
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+}
+
+typedef ncclResult_t (*xmlHandlerFunc_t)(FILE*, struct ncclXml*, struct ncclXmlNode*);
+
+struct xmlHandler {
+ const char * name;
+ xmlHandlerFunc_t func;
+};
+
+ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head, struct xmlHandler handlers[], int nHandlers) {
+ if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
+ while (1) {
+ if (xml->maxIndex == MAX_NODES) {
+ WARN("Error : XML parser is limited to 1024 nodes\n");
+ return ncclInternalError;
+ }
+ struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
+ memset(node, 0, sizeof(struct ncclXmlNode));
+ NCCLCHECK(xmlGetNode(file, node));
+ if (node->type == NODE_TYPE_NONE) {
+ if (head) {
+ WARN("XML Parse : unterminated %s", head->name);
+ return ncclInternalError;
+ } else {
+ // All done
+ return ncclSuccess;
+ }
+ }
+ if (head && node->type == NODE_TYPE_CLOSE) {
+ if (strcmp(node->name, head->name) != 0) {
+ WARN("XML Mismatch : %s / %s", head->name, node->name);
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+ }
+ int found = 0;
+ for (int h=0; h<nHandlers; h++) {
+ if (strcmp(node->name, handlers[h].name) == 0) {
+ if (head) head->subs[head->nSubs++] = node;
+ node->parent = head;
+ node->nSubs = 0;
+ xml->maxIndex++;
+ NCCLCHECK(handlers[h].func(file, xml, node));
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ if (nHandlers) INFO(NCCL_GRAPH, "Ignoring element %s", node->name);
+ NCCLCHECK(xmlLoadSub(file, xml, node, NULL, 0));
+ }
+ }
+}
+
+/**************/
+/* XML Writer */
+/**************/
+
+ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNode* node) {
+ for (int i=0; i<indent; i++) fprintf(file, " ");
+ fprintf(file, "<%s", node->name);
+
+ for (int a=0; a<node->nAttrs; a++) {
+ fprintf(file, " %s=\"%s\"", node->attrs[a].key, node->attrs[a].value);
+ }
+ if (node->nSubs == 0) {
+ fprintf(file, "/>\n");
+ } else {
+ fprintf(file, ">\n");
+ for (int s=0; s<node->nSubs; s++) {
+ NCCLCHECK(ncclTopoDumpXmlRec(indent+2, file, node->subs[s]));
+ }
+ for (int i=0; i<indent; i++) fprintf(file, " ");
+ fprintf(file, "</%s>\n", node->name);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml) {
+ FILE* file = fopen(xmlTopoFile, "w");
+ if (file == NULL) {
+ WARN("Unable to open %s, not dumping topology.", xmlTopoFile);
+ return ncclSuccess;
+ }
+ NCCLCHECK(ncclTopoDumpXmlRec(0, file, xml->nodes));
+ fclose(file);
+ return ncclSuccess;
+}
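
Setting NCCL_TOPO_DUMP_FILE=<path> on the rank selected by NCCL_TOPO_DUMP_FILE_RANK writes the detected topology through this function, and the same file can later be fed back via NCCL_TOPO_FILE. The nesting looks like the following sketch; the version attribute must match NCCL_TOPO_XML_VERSION, and all values here are illustrative rather than taken from a real system:

    <system version="1">
      <cpu numaid="0" affinity="ffffffff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
        <pci busid="0000:1a:00.0" class="0x060400" link_speed="8 GT/s" link_width="16">
          <gpu dev="0" sm="70" rank="0" gdr="1">
            <nvlink target="0000:1b:00.0" count="2" tclass="0x030200"/>
          </gpu>
        </pci>
        <nic>
          <net name="mlx5_0" dev="0" speed="100000" port="1" guid="0x1234" maxconn="65536" gdr="1" coll="0"/>
        </nic>
      </cpu>
    </system>
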
+
+/****************************************/
+/* Parser rules for our specific format */
+/****************************************/
+
+ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink } };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlLoadNic(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ struct xmlHandler handlers[] = { { "net", ncclTopoXmlLoadNet } };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlLoadPci(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic} };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 3));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlLoadCpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "nic", ncclTopoXmlLoadNic } };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlLoadSystem(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ int version;
+ NCCLCHECK(xmlGetAttrInt(head, "version", &version));
+ if (version != NCCL_TOPO_XML_VERSION) {
+ WARN("XML Topology has wrong version %d, %d needed", version, NCCL_TOPO_XML_VERSION);
+ return ncclInvalidUsage;
+ }
+ const char* name;
+ NCCLCHECK(xmlGetAttr(head, "name", &name));
+ if (name != NULL) INFO(NCCL_GRAPH, "Loading topology %s", name);
+ else INFO(NCCL_GRAPH, "Loading unnamed topology");
+
+ struct xmlHandler handlers[] = { { "cpu", ncclTopoXmlLoadCpu } };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml) {
+ FILE* file = fopen(xmlTopoFile, "r");
+ if (file == NULL) {
+ WARN("Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno));
+ return ncclSuccess;
+ }
+ struct xmlHandler handlers[] = { { "system", ncclTopoXmlLoadSystem } };
+ xml->maxIndex = 0;
+ NCCLCHECK(xmlLoadSub(file, xml, NULL, handlers, 1));
+ fclose(file);
+ return ncclSuccess;
+}
+
+/**********************/
+/* XML creation */
+/* from autodetection */
+/**********************/
+
+#define BUSID_SIZE (sizeof("0000:00:00.0"))
+#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+static void memcpylower(char* dst, const char* src, const size_t size) {
+ for (int i=0; i<size; i++) dst[i] = tolower(src[i]);
+}
+static ncclResult_t getPciPath(const char* busId, char** path) {
+ char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
+ memcpylower(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
+ memcpylower(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
+ *path = realpath(busPath, NULL);
+ if (*path == NULL) {
+ WARN("Could not find real path of %s", busPath);
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) {
+ char filePath[PATH_MAX];
+ sprintf(filePath, "%s/%s", path, fileName);
+ int offset = 0;
+ FILE* file;
+ if ((file = fopen(filePath, "r")) != NULL) {
+ while (feof(file) == 0 && ferror(file) == 0 && offset < MAX_STR_LEN) {
+ int len = fread(strValue+offset, 1, MAX_STR_LEN-offset, file);
+ offset += len;
+ }
+ fclose(file);
+ }
+ if (offset == 0) {
+ strValue[0] = '\0';
+ INFO(NCCL_GRAPH, "Topology detection : could not read %s, ignoring", filePath);
+ } else {
+ strValue[offset-1] = '\0';
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* path, const char* fileName, const char* attrName) {
+ char strValue[MAX_STR_LEN];
+ NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue));
+ if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); }
+ TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml* xml) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(cpuNode, "affinity", &index));
+ if (index == -1) {
+ const char* numaId;
+ NCCLCHECK(xmlGetAttr(cpuNode, "numaid", &numaId));
+ if (numaId == NULL) {
+ WARN("GetXmlFromCpu : could not find CPU numa ID.");
+ return ncclInternalError;
+ }
+ // Set affinity
+ char cpumaskPath[] = "/sys/devices/system/node/node0000";
+ sprintf(cpumaskPath, "/sys/devices/system/node/node%s", numaId);
+ NCCLCHECK(ncclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity"));
+ }
+
+ NCCLCHECK(xmlGetAttrIndex(cpuNode, "arch", &index));
+ if (index == -1) {
+ // Fill CPU type / vendor / model
+#if defined(__PPC__)
+ NCCLCHECK(xmlSetAttr(cpuNode, "arch", "ppc64"));
+#elif defined(__aarch64__)
+ NCCLCHECK(xmlSetAttr(cpuNode, "arch", "arm64"));
+#elif defined(__x86_64__)
+ NCCLCHECK(xmlSetAttr(cpuNode, "arch", "x86_64"));
+#endif
+ }
+
+#if defined(__x86_64__)
+ NCCLCHECK(xmlGetAttrIndex(cpuNode, "vendor", &index));
+ if (index == -1) {
+ union {
+ struct {
+ // CPUID 0 String register order
+ uint32_t ebx;
+ uint32_t edx;
+ uint32_t ecx;
+ };
+ char vendor[12];
+ } cpuid0;
+
+ asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0) : "memory");
+ char vendor[13];
+ strncpy(vendor, cpuid0.vendor, 12);
+ vendor[12] = '\0';
+ NCCLCHECK(xmlSetAttr(cpuNode, "vendor", vendor));
+ }
+
+ NCCLCHECK(xmlGetAttrIndex(cpuNode, "familyid", &index));
+ if (index == -1) {
+ union {
+ struct {
+ unsigned steppingId:4;
+ unsigned modelId:4;
+ unsigned familyId:4;
+ unsigned processorType:2;
+ unsigned resv0:2;
+ unsigned extModelId:4;
+ unsigned extFamilyId:8;
+ unsigned resv1:4;
+ };
+ uint32_t val;
+ } cpuid1;
+ asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1) : "memory");
+ int familyId = cpuid1.familyId + (cpuid1.extFamilyId << 4);
+ int modelId = cpuid1.modelId + (cpuid1.extModelId << 4);
+ NCCLCHECK(xmlSetAttrInt(cpuNode, "familyid", familyId));
+ NCCLCHECK(xmlSetAttrInt(cpuNode, "modelid", modelId));
+ }
+#endif
+ return ncclSuccess;
+}
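
CPUID leaf 1 splits family and model across base and extended bit fields, which the code above recombines as base + (extended << 4). A worked example, with register values illustrative of a Skylake-SP part:

    // familyId = baseFamily + (extFamily << 4); modelId = baseModel + (extModel << 4).
    int familyId = 6 + (0 << 4);  // -> 6
    int modelId  = 5 + (5 << 4);  // -> 0x55 (85), classified as SKL by ncclTopoAddCpu
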
+
+ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct ncclXmlNode** pciNode) {
+ NCCLCHECK(xmlFindTagKv(xml, "pci", pciNode, "busid", busId));
+ if (*pciNode == NULL) {
+ NCCLCHECK(xmlAddNode(xml, NULL, "pci", pciNode));
+ }
+ NCCLCHECK(xmlSetAttr(*pciNode, "busid", busId));
+ return ncclSuccess;
+}
+
+// Check whether a string is in BDF format or not.
+// BDF (Bus-Device-Function) is "BBBB:BB:DD.F" where B, D and F are hex digits.
+// There can be trailing chars.
+int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); }
+int checkBDFFormat(char* bdf) {
+ if (bdf[4] != ':' || bdf[7] != ':' || bdf[10] != '.') return 0;
+ if (isHex(bdf[0]) == 0 || isHex(bdf[1]) == 0 || isHex(bdf[2]) == 0 || isHex(bdf[3]) == 0 ||
+ isHex(bdf[5]) == 0 || isHex(bdf[6]) == 0 || isHex(bdf[8]) == 0 || isHex(bdf[9]) == 0 ||
+ isHex(bdf[11]) == 0) return 0;
+ return 1;
+}
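
A quick usage sketch, assuming checkBDFFormat above is in scope; the strings are illustrative, and trailing characters after the 12-character BDF prefix are accepted by design:

    #include <assert.h>
    int main() {
      assert(checkBDFFormat((char*)"0000:1a:00.0") == 1);  // well-formed BDF
      assert(checkBDFFormat((char*)"0000-1a-00.0") == 0);  // wrong separators
      return 0;
    }
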
+
+ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* xml) {
+ // Fill info, then parent
+ const char* busId;
+ NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
+ char* path = NULL;
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(pciNode, "class", &index));
+ if (index == -1) {
+ if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+ NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
+ }
+ NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
+ if (index == -1) {
+ if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+ char deviceSpeedStr[MAX_STR_LEN];
+ float deviceSpeed;
+ NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_speed", deviceSpeedStr));
+ sscanf(deviceSpeedStr, "%f GT/s", &deviceSpeed);
+ char portSpeedStr[MAX_STR_LEN];
+ float portSpeed;
+ NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_speed", portSpeedStr));
+ sscanf(portSpeedStr, "%f GT/s", &portSpeed);
+ NCCLCHECK(xmlSetAttr(pciNode, "link_speed", portSpeed < deviceSpeed ? portSpeedStr : deviceSpeedStr));
+ }
+ NCCLCHECK(xmlGetAttrIndex(pciNode, "link_width", &index));
+ if (index == -1) {
+ if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+ char strValue[MAX_STR_LEN];
+ NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_width", strValue));
+ int deviceWidth = strtol(strValue, NULL, 0);
+ NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_width", strValue));
+ int portWidth = strtol(strValue, NULL, 0);
+ NCCLCHECK(xmlSetAttrInt(pciNode, "link_width", std::min(deviceWidth,portWidth)));
+ }
+ struct ncclXmlNode* parent = pciNode->parent;
+ if (parent == NULL) {
+ if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+
+ // Save that for later in case next step is a CPU
+ char numaIdStr[MAX_STR_LEN];
+ NCCLCHECK(ncclTopoGetStrFromSys(path, "numa_node", numaIdStr));
+
+ // Go up one level in the PCI tree. Rewind two "/" and follow the upper PCI
+ // switch, or stop if we reach a CPU root complex.
+ int slashCount = 0;
+ int parentOffset;
+ for (parentOffset = strlen(path)-1; parentOffset>0; parentOffset--) {
+ if (path[parentOffset] == '/') {
+ slashCount++;
+ path[parentOffset] = '\0';
+ int start = parentOffset - 1;
+ while (start>0 && path[start] != '/') start--;
+ // Check whether the parent path looks like "BBBB:BB:DD.F" or not.
+ if (checkBDFFormat(path+start+1) == 0) {
+ // This is a CPU root complex. Create a CPU tag and stop there.
+ struct ncclXmlNode* topNode;
+ NCCLCHECK(xmlFindTag(xml, "system", &topNode));
+ NCCLCHECK(xmlGetSubKv(topNode, "cpu", &parent, "numaid", numaIdStr));
+ if (parent == NULL) {
+ NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
+ NCCLCHECK(xmlSetAttr(parent, "numaid", numaIdStr));
+ }
+ } else if (slashCount == 2) {
+ // Continue on the upper PCI switch
+ for (int i = strlen(path)-1; i>0; i--) {
+ if (path[i] == '/') {
+ NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", path+i+1));
+ if (parent == NULL) {
+ NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
+ NCCLCHECK(xmlSetAttr(parent, "busid", path+i+1));
+ }
+ break;
+ }
+ }
+ }
+ }
+ if (parent) break;
+ }
+ pciNode->parent = parent;
+ parent->subs[parent->nSubs++] = pciNode;
+ }
+ if (strcmp(parent->name, "pci") == 0) {
+ NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
+ } else if (strcmp(parent->name, "cpu") == 0) {
+ NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml));
+ }
+ free(path);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvmlDev, struct ncclXml* xml, struct ncclXmlNode** gpuNodeRet) {
+ struct ncclXmlNode* gpuNode = NULL;
+ NCCLCHECK(xmlGetSub(pciNode, "gpu", &gpuNode));
+ if (gpuNode == NULL) NCCLCHECK(xmlAddNode(xml, pciNode, "gpu", &gpuNode));
+
+ int index = -1;
+
+ int dev = -1;
+ NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index));
+ if (index == -1) {
+ if (nvmlDev == NULL) {
+ WARN("No NVML, trying to use CUDA instead");
+ const char* busId;
+ NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
+ if (busId == NULL || cudaDeviceGetByPCIBusId(&dev, busId) != cudaSuccess) dev = -1;
+ } else {
+ NCCLCHECK(wrapNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev));
+ }
+ NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev));
+ }
+ NCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev));
+ if (dev == -1) return ncclSuccess;
+
+ NCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index));
+ if (index == -1) {
+ int cudaMajor, cudaMinor;
+ if (nvmlDev == NULL) {
+ cudaDeviceProp devProp;
+ CUDACHECK(cudaGetDeviceProperties(&devProp, dev));
+ cudaMajor = devProp.major; cudaMinor = devProp.minor;
+ } else {
+ NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor));
+ }
+ NCCLCHECK(xmlSetAttrInt(gpuNode, "sm", cudaMajor*10+cudaMinor));
+ }
+ int sm;
+ NCCLCHECK(xmlGetAttrInt(gpuNode, "sm", &sm));
+
+ struct ncclXmlNode* nvlNode = NULL;
+ NCCLCHECK(xmlGetSub(pciNode, "nvlink", &nvlNode));
+ if (nvlNode == NULL) {
+ // NVML NVLink detection
+ int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : 6;
+
+ if (maxNvLinks > 0 && nvmlDev == NULL) {
+ WARN("No NVML device handle. Skipping nvlink detection.\n");
+ maxNvLinks = 0;
+ }
+
+ for (int l=0; l<maxNvLinks; ++l) {
+ // Check whether we can use this NVLink for P2P
+ unsigned canP2P;
+ if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
+
+ // Make sure the NVLink is up. The previous call should have trained the link.
+ nvmlEnableState_t isActive;
+ if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+
+ // Try to figure out what's on the other side of the NVLink
+ nvmlPciInfo_t remoteProc;
+ if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
+
+ // Make a lower case copy of the bus ID for calling ncclDeviceType
+ // PCI system path is in lower case
+ char* p = remoteProc.busId;
+ char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+ lowerId[c] = tolower(p[c]);
+ if (p[c] == 0) break;
+ }
+
+ NCCLCHECK(xmlGetSubKv(gpuNode, "nvlink", &nvlNode, "target", lowerId));
+ if (nvlNode == NULL) {
+ NCCLCHECK(xmlAddNode(xml, gpuNode, "nvlink", &nvlNode));
+ NCCLCHECK(xmlSetAttr(nvlNode, "target", lowerId));
+ NCCLCHECK(xmlSetAttrInt(nvlNode, "count", 1));
+ } else {
+ int count;
+ NCCLCHECK(xmlGetAttrInt(nvlNode, "count", &count));
+ NCCLCHECK(xmlSetAttrInt(nvlNode, "count", count+1));
+ }
+ }
+ }
+ // Fill target classes
+ for (int s=0; s<gpuNode->nSubs; s++) {
+ struct ncclXmlNode* sub = gpuNode->subs[s];
+ if (strcmp(sub->name, "nvlink") != 0) continue;
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(sub, "tclass", &index));
+ if (index == -1) {
+ const char* busId;
+ NCCLCHECK(xmlGetAttr(sub, "target", &busId));
+ char* path;
+ NCCLCHECK(getPciPath(busId, &path));
+ NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass"));
+ }
+ }
+ *gpuNodeRet = gpuNode;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode) {
+ struct ncclXmlNode* node;
+ NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node));
+ NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
+ NCCLCHECK(wrapNvmlSymbols());
+ NCCLCHECK(wrapNvmlInit());
+ nvmlDevice_t nvmlDev;
+ if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
+ NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
+ return ncclSuccess;
+}
+
+// Returns the subsystem name of a path, i.e. the last component of
+// the path that sysPath/subsystem points to.
+ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) {
+ char subSysPath[PATH_MAX];
+ sprintf(subSysPath, "%s/subsystem", sysPath);
+ char* path = realpath(subSysPath, NULL);
+ if (path == NULL) {
+ subSys[0] = '\0';
+ } else {
+ int offset;
+ for (offset = strlen(path); offset > 0 && path[offset] != '/'; offset--);
+ strcpy(subSys, path+offset+1);
+ free(path);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode) {
+ NCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName));
+ if (*netNode != NULL) return ncclSuccess;
+
+ const char* pciSysPath = pciPath;
+ if (pciSysPath) {
+ char subSystem[PATH_MAX];
+ NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem));
+ // This is not a PCI device (virtual, usb, ...).
+ if (strcmp(subSystem, "pci") != 0) {
+ INFO(NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem);
+ pciSysPath = NULL;
+ }
+ }
+
+ struct ncclXmlNode* parent = NULL;
+ if (pciSysPath) {
+ int offset;
+ for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--);
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ strcpy(busId, pciSysPath+offset+1);
+ NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", busId));
+ if (parent == NULL) {
+ NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
+ NCCLCHECK(xmlSetAttr(parent, "busid", busId));
+ NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
+ }
+ } else {
+ // Virtual NIC, no PCI device, attach to first CPU
+ NCCLCHECK(xmlFindTag(xml, "cpu", &parent));
+ }
+
+ struct ncclXmlNode* nicNode = NULL;
+ NCCLCHECK(xmlGetSub(parent, "nic", &nicNode));
+ if (nicNode == NULL) {
+ NCCLCHECK(xmlAddNode(xml, parent, "nic", &nicNode));
+ }
+
+ // We know that this net does not exist yet (we searched for it at the
+ // beginning of this function), so we can add it.
+ NCCLCHECK(xmlAddNode(xml, nicNode, "net", netNode));
+ NCCLCHECK(xmlSetAttr(*netNode, "name", netName));
+ return ncclSuccess;
+}
+
+/**************************************************/
+/* Parser rules for the user-defined graph search */
+/**************************************************/
+
+ncclResult_t ncclTopoXmlGraphLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlGraphLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlGraphLoadChannel(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ struct xmlHandler handlers[] = { { "net", ncclTopoXmlGraphLoadNet }, { "gpu", ncclTopoXmlGraphLoadGpu } };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlGraphLoadGraph(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ struct xmlHandler handlers[] = { { "channel", ncclTopoXmlGraphLoadChannel } };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlGraphLoadGraphs(FILE* file, struct ncclXml* xmlGraph, struct ncclXmlNode* head) {
+ int version;
+ NCCLCHECK(xmlGetAttrInt(head, "version", &version));
+ if (version != NCCL_GRAPH_XML_VERSION) {
+ WARN("XML Graph has wrong version %d, %d needed", version, NCCL_GRAPH_XML_VERSION);
+ return ncclInvalidUsage;
+ }
+ const char* name;
+ NCCLCHECK(xmlGetAttr(head, "name", &name));
+ if (name != NULL) INFO(NCCL_GRAPH, "Loading graphs for topology %s", name);
+ else INFO(NCCL_GRAPH, "Loading graphs");
+
+ struct xmlHandler handlers[] = { { "graph", ncclTopoXmlGraphLoadGraph } };
+ NCCLCHECK(xmlLoadSub(file, xmlGraph, head, handlers, 1));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml) {
+ FILE* file = fopen(xmlGraphFile, "r");
+ if (file == NULL) {
+ WARN("Could not open XML graph file %s : %s", xmlGraphFile, strerror(errno));
+ return ncclSystemError;
+ }
+ struct xmlHandler handlers[] = { { "graphs", ncclTopoXmlGraphLoadGraphs } };
+ xml->maxIndex = 0;
+ NCCLCHECK(xmlLoadSub(file, xml, NULL, handlers, 1));
+ fclose(file);
+ return ncclSuccess;
+}
diff --git a/src/graph/xml.h b/src/graph/xml.h
new file mode 100644
index 0000000..fa04527
--- /dev/null
+++ b/src/graph/xml.h
@@ -0,0 +1,237 @@
+/*************************************************************************
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef XML_H_
+#define XML_H_
+
+// A few constraints to make the implementation easy
+#define MAX_STR_LEN 256
+#define MAX_ATTR_COUNT 16
+#define MAX_SUBS 32
+#define MAX_NODES 1024
+
+#define NODE_TYPE_NONE 0
+#define NODE_TYPE_OPEN 1
+#define NODE_TYPE_CLOSE 2
+#define NODE_TYPE_SINGLE 3
+
+struct ncclXmlNode {
+ char name[MAX_STR_LEN];
+ struct {
+ char key[MAX_STR_LEN];
+ char value[MAX_STR_LEN];
+ } attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
+ int nAttrs;
+ int type;
+ struct ncclXmlNode* parent;
+ struct ncclXmlNode* subs[MAX_SUBS];
+ int nSubs;
+};
+
+struct ncclXml {
+ struct ncclXmlNode nodes[MAX_NODES];
+ int maxIndex;
+};
+
+/* File functions */
+#define NCCL_TOPO_XML_VERSION 1
+ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml);
+ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml);
+#define NCCL_GRAPH_XML_VERSION 1
+ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml);
+
+/* Auto-detect functions */
+ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode);
+ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode);
+
+/**************/
+/* XML Struct */
+/* Functions */
+/**************/
+
+static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrName, int* index) {
+ *index = -1;
+ const int nAttrs = node->nAttrs;
+ for (int a=0; a<nAttrs; a++) {
+ if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN-1) == 0) {
+ *index = a;
+ return ncclSuccess;
+ }
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlGetAttr(struct ncclXmlNode* node, const char* attrName, const char** value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ *value = index == -1 ? NULL : node->attrs[index].value;
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlGetAttrStr(struct ncclXmlNode* node, const char* attrName, const char** value) {
+ NCCLCHECK(xmlGetAttr(node, attrName, value));
+ if (*value == NULL) {
+ WARN("Attribute %s of node %s not found", attrName, node->name);
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+}
+static ncclResult_t xmlGetAttrInt(struct ncclXmlNode* node, const char* attrName, int* value) {
+ const char* str;
+ NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
+ *value = strtol(str, NULL, 0);
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) {
+ const char* str;
+ NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
+ *value = strtof(str, NULL);
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlFindTag(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node) {
+ *node = NULL;
+ for (int i=0; i<xml->maxIndex; i++) {
+ struct ncclXmlNode* n = xml->nodes+i;
+ if (strcmp(n->name, tagName) == 0) {
+ *node = n;
+ return ncclSuccess;
+ }
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node, const char* attrName, const char* attrValue) {
+ *node = NULL;
+ for (int i=0; i<xml->maxIndex; i++) {
+ struct ncclXmlNode* n = xml->nodes+i;
+ if (strcmp(n->name, tagName) == 0) {
+ const char* value;
+ NCCLCHECK(xmlGetAttr(n, attrName, &value));
+ if (value && strcmp(value, attrValue) == 0) {
+ *node = n;
+ return ncclSuccess;
+ }
+ }
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, const char* value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ }
+ strncpy(node->attrs[index].value, value, MAX_STR_LEN);
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ }
+ snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ }
+ snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value);
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlGetSub(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub) {
+ *sub = NULL;
+ for (int s=0; s<node->nSubs; s++) {
+ if (strcmp(node->subs[s]->name, subName) == 0) {
+ *sub = node->subs[s];
+ return ncclSuccess;
+ }
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlGetSubKv(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub, const char* attrName, const char* attrValue) {
+ *sub = NULL;
+ for (int s=0; s<node->nSubs; s++) {
+ struct ncclXmlNode* subNode = node->subs[s];
+ if (strcmp(subNode->name, subName) == 0) {
+ const char* value;
+ NCCLCHECK(xmlGetAttr(subNode, attrName, &value));
+ if (value && strcmp(value, attrValue) == 0) {
+ *sub = node->subs[s];
+ return ncclSuccess;
+ }
+ }
+ }
+ return ncclSuccess;
+}
+static ncclResult_t xmlGetSubKvInt(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub, const char* attrName, const int attrValue) {
+ char strValue[10];
+ snprintf(strValue, 10, "%d", attrValue);
+ NCCLCHECK(xmlGetSubKv(node, subName, sub, attrName, strValue));
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent, const char* subName, struct ncclXmlNode** sub) {
+ if (xml->maxIndex == MAX_NODES) {
+ WARN("Error : too many XML nodes (max %d)", MAX_NODES);
+ return ncclInternalError;
+ }
+ struct ncclXmlNode* s = xml->nodes+xml->maxIndex++;
+ s->nSubs = 0;
+ s->nAttrs = 0;
+ *sub = s;
+ s->parent = parent;
+ if (parent) parent->subs[parent->nSubs++] = s;
+ strncpy(s->name, subName, MAX_STR_LEN);
+ return ncclSuccess;
+}
+
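A short sketch of how the node/attribute helpers above combine, mirroring the gpu/nvlink fragment built by the detection code; the bus IDs are made up and the function is illustrative only:

static ncclResult_t xmlExample(struct ncclXml* xml) {
  struct ncclXmlNode *gpu, *nvl;
  xml->maxIndex = 0;
  NCCLCHECK(xmlAddNode(xml, NULL, "gpu", &gpu));
  NCCLCHECK(xmlSetAttr(gpu, "busid", "0000:06:00.0"));
  NCCLCHECK(xmlAddNode(xml, gpu, "nvlink", &nvl));
  NCCLCHECK(xmlSetAttr(nvl, "target", "0000:07:00.0"));
  NCCLCHECK(xmlSetAttrInt(nvl, "count", 2));
  int count;
  NCCLCHECK(xmlGetAttrInt(nvl, "count", &count)); /* count == 2 */
  return ncclSuccess;
}
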
+// Dictionary for STR -> INT conversions. No dictionary size information;
+// the array must be terminated by an element with str == NULL.
+struct kvDict {
+ const char* str;
+ int value;
+};
+
+static ncclResult_t kvConvertToInt(const char* str, int* value, struct kvDict* dict) {
+ struct kvDict* d = dict;
+ while (d->str) {
+ if (strncmp(str, d->str, strlen(d->str)) == 0) {
+ *value = d->value;
+ return ncclSuccess;
+ }
+ d++;
+ }
+ WARN("KV Convert to int : could not find value of '%s' in dictionary", str);
+ return ncclInternalError;
+}
+static ncclResult_t kvConvertToStr(int value, const char** str, struct kvDict* dict) {
+ struct kvDict* d = dict;
+ while (d->str) {
+ if (value == d->value) {
+ *str = d->str;
+ return ncclSuccess;
+ }
+ d++;
+ }
+ WARN("KV Convert to str : could not find value %d in dictionary", value);
+ return ncclInternalError;
+}
+
+#endif
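
A standalone sketch of the kvDict convention above; the table contents are hypothetical:

#include <stdio.h>
#include <string.h>

struct kvDictExample { const char* str; int value; };

static struct kvDictExample cpuArchDict[] = {
  { "x86_64", 1 }, { "ppc64", 2 }, { "arm", 3 },
  { NULL, 0 } /* sentinel: str == NULL terminates the table */
};

int main() {
  const char* arch = "x86_64";
  for (struct kvDictExample* d = cpuArchDict; d->str; d++) {
    /* same prefix-match rule as kvConvertToInt above */
    if (strncmp(arch, d->str, strlen(d->str)) == 0) {
      printf("%s -> %d\n", arch, d->value); /* prints: x86_64 -> 1 */
      break;
    }
  }
  return 0;
}
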
diff --git a/src/include/align.h b/src/include/align.h
new file mode 100644
index 0000000..1c9e7aa
--- /dev/null
+++ b/src/include/align.h
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ALIGN_H_
+#define NCCL_ALIGN_H_
+
+#define DIVUP(x, y) \
+ (((x)+(y)-1)/(y))
+
+#define ROUNDUP(x, y) \
+ (DIVUP((x), (y))*(y))
+
+#define ALIGN_SIZE(size, align) \
+ size = ((size + (align) - 1) / (align)) * (align);
+
+#endif
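
What the three helpers compute, as a self-contained check (the values are chosen for illustration):

#include <assert.h>

#define DIVUP(x, y) (((x)+(y)-1)/(y))
#define ROUNDUP(x, y) (DIVUP((x), (y))*(y))
#define ALIGN_SIZE(size, align) size = ((size + (align) - 1) / (align)) * (align);

int main() {
  assert(DIVUP(10, 4) == 3);    /* ceiling division */
  assert(ROUNDUP(10, 4) == 12); /* next multiple of 4 */
  int size = 1000;
  ALIGN_SIZE(size, 4096);       /* rounds up in place */
  assert(size == 4096);
  return 0;
}
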
diff --git a/src/include/alloc.h b/src/include/alloc.h
index bcdbd18..27e206f 100644
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -9,6 +9,7 @@
#include "nccl.h"
#include "checks.h"
+#include "align.h"
#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
@@ -48,4 +49,18 @@ static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
return ncclSuccess;
}
+// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
+// allocated on separate pages as those pages will be marked DONTFORK
+// and if they are shared, that could cause a crash in a child process
+static ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
+ size_t page_size = sysconf(_SC_PAGESIZE);
+ void* p;
+ int size_aligned = ROUNDUP(size, page_size);
+ int ret = posix_memalign(&p, page_size, size_aligned);
+ if (ret != 0) return ncclSystemError;
+ memset(p, 0, size);
+ *ptr = p;
+ return ncclSuccess;
+}
+
#endif
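
For context on the DONTFORK comment above, a hedged sketch of the side effect being guarded against: when fork protection is enabled (ibv_fork_init), the IB stack marks registered pages MADV_DONTFORK, and madvise operates on whole pages, so a buffer sharing a page with unrelated heap data would make that data unavailable in a fork()ed child. Roughly:

#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main() {
  size_t page = sysconf(_SC_PAGESIZE);
  void* buf;
  if (posix_memalign(&buf, page, page) != 0) return 1; /* whole, dedicated pages */
  memset(buf, 0, page);
  /* Approximation of the registration side effect; real code calls ibv_reg_mr. */
  madvise(buf, page, MADV_DONTFORK);
  madvise(buf, page, MADV_DOFORK);  /* undo before returning pages to the heap */
  free(buf);
  return 0;
}
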
diff --git a/src/include/checks.h b/src/include/checks.h
index 50737b0..257e9ca 100644
--- a/src/include/checks.h
+++ b/src/include/checks.h
@@ -56,7 +56,7 @@
ncclResult_t res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
- INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
return res; \
} \
} while (0);
@@ -65,7 +65,7 @@
res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
- INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
goto label; \
} \
} while (0);
diff --git a/src/include/coll_net.h b/src/include/coll_net.h
new file mode 100644
index 0000000..3278560
--- /dev/null
+++ b/src/include/coll_net.h
@@ -0,0 +1,34 @@
+/*************************************************************************
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COLL_NET_H_
+#define COLL_NET_H_
+
+#include "nccl.h"
+#include "nccl_net.h"
+
+extern ncclCollNet_t* ncclCollNet;
+typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
+
+// Translation to external API
+static const char* collNetName() { return ncclCollNet->name; }
+static ncclResult_t collNetDevices(int* ndev) { NCCLCHECK(ncclCollNet->devices(ndev)); return ncclSuccess; }
+static ncclResult_t collNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
+static ncclResult_t collNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t collNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
+static ncclResult_t collNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
+static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclCollNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
+ NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
+static ncclResult_t collNetFlush(void* collComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclCollNet->flush(collComm, data, size, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
+static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
+
+static int collNetSupport() { return ncclCollNet != NULL ? 1 : 0; }
+
+#endif
diff --git a/src/include/collectives.h b/src/include/collectives.h
index 69c8e74..bd64106 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -30,7 +30,8 @@
#define DECL_COLL3(coll, op, dtype) \
DECL_COLL4(coll##Ring, op, dtype) \
- DECL_COLL4(coll##Tree, op, dtype)
+ DECL_COLL4(coll##Tree, op, dtype) \
+ DECL_COLL4(coll##CollNet, op, dtype)
#define DECL_COLL2(coll, op) \
DECL_COLL3(coll, op, i8) \
diff --git a/src/include/comm.h b/src/include/comm.h
index 7164dc0..cc87a42 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -20,8 +20,6 @@ struct cudaLaunchParams {
};
#endif
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-
#define CACHE_LINE_SIZE 128
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL
@@ -91,14 +89,11 @@ struct ncclComm {
// Channels for collectives
int nChannels;
- // Only nvlink is used for inter-GPU communication
- int nvlink;
-
// Algorithm/Protocols thresholds
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
- int maxThreads[NCCL_NUM_PROTOCOLS];
+ int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
@@ -136,6 +131,9 @@ struct ncclComm {
// Global proxy thread
pthread_t proxyThread;
struct ncclProxyState proxyState;
+
+ // Whether this communicator uses collNet
+ int collNetSupport;
};
#endif
diff --git a/src/include/core.h b/src/include/core.h
index 250f43b..ac5fa85 100644
--- a/src/include/core.h
+++ b/src/include/core.h
@@ -53,9 +53,10 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
#define NCCL_NUM_FUNCTIONS 5
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t;
-#define NCCL_NUM_ALGORITHMS 2 // Tree/Ring
+#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
+#define NCCL_ALGO_COLLNET 2
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_LL 0
diff --git a/src/include/cpuset.h b/src/include/cpuset.h
index 98b93de..40c1594 100644
--- a/src/include/cpuset.h
+++ b/src/include/cpuset.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -19,7 +19,7 @@ static int hexToInt(char c) {
#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
-ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) {
+ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
uint32_t cpumasks[CPU_SET_N_U32];
int m = CPU_SET_N_U32-1;
cpumasks[m] = 0;
diff --git a/src/include/debug.h b/src/include/debug.h
index 89b6e42..d88458c 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -29,11 +29,6 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
// Let code temporarily downgrade WARN into INFO
extern thread_local int ncclDebugNoWarn;
-#define NOWARN(a, ret) do { \
- ncclDebugNoWarn = 1; \
- ret = a; \
- ncclDebugNoWarn = 0; \
-} while (0)
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
index 46d236b..96c69ba 100644
--- a/src/include/devcomm.h
+++ b/src/include/devcomm.h
@@ -8,19 +8,12 @@
#define NCCL_DEVICE_H_
#include "nccl.h"
+#include "align.h"
#include <stdint.h>
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
-#define DIVUP(x, y) \
- (((x)+(y)-1)/(y))
-#define ROUNDUP(x, y) \
- (DIVUP((x), (y))*(y))
-
-#define ALIGN_SIZE(size, align) \
- size = ((size + (align) - 1) / (align)) * (align);
-
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
@@ -73,6 +66,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
+#define NCCL_DIRECT_GPU 0x01
+#define NCCL_DIRECT_NIC 0x10
+
struct ncclConnInfo {
// Regular comm mechanism
char *buff; // Local for recv, remote for send
@@ -171,6 +167,8 @@ struct ncclChannel {
struct ncclRing ring;
struct ncclTree treeUp;
struct ncclTree treeDn;
+ struct ncclTree collTreeUp;
+ struct ncclTree collTreeDn;
int id;
int nthreads;
diff --git a/src/include/graph.h b/src/include/graph.h
index 3c8ba19..b27ea35 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -14,17 +14,6 @@
#include <ctype.h>
#include <stdio.h>
-enum ncclPathDist {
- PATH_PIX = 0,
- PATH_PXB = 1,
- PATH_PHB = 2,
- PATH_NODE = 3,
- PATH_SYS = 4,
- PATH_ARRAY_SIZE = 5
-};
-
-extern const char* pathDists[PATH_ARRAY_SIZE];
-
ncclResult_t ncclTopoCudaPath(int cudaDev, char** path);
struct ncclTopoSystem;
@@ -36,32 +25,47 @@ ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
void ncclTopoFree(struct ncclTopoSystem* system);
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
-ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system);
// Query topology
-ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink);
-ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink);
-ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance);
ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* net);
-ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance);
-ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count);
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p);
+ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
+
+// Set CPU affinity
+ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
+
+#define NCCL_TOPO_CPU_ARCH_X86 1
+#define NCCL_TOPO_CPU_ARCH_POWER 2
+#define NCCL_TOPO_CPU_ARCH_ARM 3
+#define NCCL_TOPO_CPU_VENDOR_INTEL 1
+#define NCCL_TOPO_CPU_VENDOR_AMD 2
+#define NCCL_TOPO_CPU_TYPE_BDW 1
+#define NCCL_TOPO_CPU_TYPE_SKL 2
+ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
#define NCCL_TOPO_MAX_NODES 256
+// Init search. Needs to be done before calling ncclTopoCompute
+ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
+
#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Split tree (send/recv from different ranks) flowing in both directions
#define NCCL_TOPO_PATTERN_TREE 3 // Simple tree (send/recv from same rank) flowing in both directions
#define NCCL_TOPO_PATTERN_RING 4 // Ring
struct ncclTopoGraph {
// Input / output
+ int id; // ring : 0, tree : 1, collnet : 2
int pattern;
int crossNic;
+ int collNet;
+ int minChannels;
+ int maxChannels;
// Output
int nChannels;
- int speedIntra;
- int speedInter;
- int type;
- int nvlink;
+ float speedIntra;
+ float speedInter;
+ int typeIntra;
+ int typeInter;
int sameChannels;
int nHops;
int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES];
@@ -70,6 +74,7 @@ struct ncclTopoGraph {
ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
+ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs);
struct ncclTopoRanks {
int ringRecv[MAXCHANNELS];
@@ -83,12 +88,16 @@ struct ncclTopoRanks {
};
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
- struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
+ struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
struct ncclTopoRanks** allTopoRanks, int* rings);
-ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph);
+ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
+
+ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
+#include "info.h"
+ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time);
#endif
diff --git a/src/include/info.h b/src/include/info.h
index 9461759..46b9795 100644
--- a/src/include/info.h
+++ b/src/include/info.h
@@ -17,7 +17,9 @@ typedef enum {
ncclPatternPipelineTo,
ncclPatternTreeUp,
ncclPatternTreeDown,
- ncclPatternTreeUpDown
+ ncclPatternTreeUpDown,
+ ncclPatternCollTreeUp,
+ ncclPatternCollTreeDown
} ncclPattern_t;
// Used to pass NCCL call information between functions
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index d6ae9f8..95dce5b 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,6 +8,7 @@
#define NCCL_NET_H_
#include "nccl.h"
+#include <stdint.h>
#define NCCL_NET_HANDLE_MAXSIZE 64
@@ -20,18 +21,27 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
typedef struct {
+ char* name; // Used mostly for logging.
+ char* pciPath; // Path to the PCI device in /sys.
+ uint64_t guid; // Unique identifier for the NIC chip. Important for
+ // cards with multiple PCI functions (physical or virtual).
+ int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
+ int speed; // Port speed in Mbps.
+ int port; // Port number.
+ int maxComms; // Maximum number of comms we can create
+} ncclNetProperties_v3_t;
+
+typedef ncclNetProperties_v3_t ncclNetProperties_t;
+
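A hedged sketch of how an external v3 plugin might fill these properties; the function name and values are hypothetical, not part of the commit:

static ncclResult_t exampleGetProperties(int dev, ncclNetProperties_v3_t* props) {
  props->name = (char*)"example0";          /* used mostly for logging */
  props->pciPath = (char*)"/sys/devices/pci0000:00/0000:00:03.0/0000:0b:00.0";
  props->guid = 0xabcd0000ULL + dev;        /* unique per NIC chip */
  props->ptrSupport = NCCL_PTR_HOST | NCCL_PTR_CUDA; /* GDR-capable */
  props->speed = 100000;                    /* 100 Gb/s port speed, in Mbps */
  props->port = 1;
  props->maxComms = 65536;
  return ncclSuccess;
}
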
+typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
- // Return the device path in /sys. NCCL will call free on this path.
- ncclResult_t (*pciPath)(int dev, char** path);
- // Return whether this device supports host pointers and/or CUDA pointers
- // as data from the current GPU. Supported types should be composed with
- // NCCL_PTR_HOST and NCCL_PTR_CUDA.
- ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
+ // Get various device properties.
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
@@ -40,15 +50,19 @@ typedef struct {
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
- // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+ // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+ ncclResult_t (*deregMr)(void* comm, void* mhandle);
+ // Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
- ncclResult_t (*isend)(void* sendComm, void* data, int size, int type, void** request);
- // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
+ // Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
- ncclResult_t (*irecv)(void* recvComm, void* data, int size, int type, void** request);
+ ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
- ncclResult_t (*flush)(void* recvComm, void* data, int size);
+ ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
@@ -56,53 +70,52 @@ typedef struct {
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v1_t;
+} ncclNet_v3_t;
+
+typedef ncclNet_v3_t ncclNet_t;
+
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v3
typedef struct {
- // Name of the network (mainly for logs)
+ // Name of the collective network (mainly for logs)
const char* name;
- // Initialize the network.
+ // Initialize the collective network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
- // Return the number of adapters.
+ // Return the number of adapters capable of doing collective operations.
+ // If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
- // Return the device path in /sys. NCCL will call free on this path.
- ncclResult_t (*pciPath)(int dev, char** path);
- // Return whether this device supports host pointers and/or CUDA pointers
- // as data from the current GPU. Supported types should be composed with
- // NCCL_PTR_HOST and NCCL_PTR_CUDA.
- ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
+ // Get various device properties.
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
- // between ranks to create a connection.
+ // between ranks to create connections.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
- // Connect to a handle and return a sending comm object for that peer.
- ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
- // Finalize connection establishment after remote peer has called connectHandle
- ncclResult_t (*accept)(void* listenComm, void** recvComm);
- // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
- // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
- ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
- ncclResult_t (*deregMr)(void* comm, void* mhandle);
- // Asynchronous send to a peer.
- // May return request == NULL if the call cannot be performed (or would block)
- ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
- // Asynchronous recv from a peer.
- // May return request == NULL if the call cannot be performed (or would block)
- ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
+ // Create a group for collective operations. handles have been created
+ // using listen() above. rank indicates caller's rank in the collective network.
+ ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+ // Returns whether a reduction operation on a data type is supported.
+ // 1 for supported, 0 otherwise.
+ ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+ // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+ ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+ // Performs an asynchronous allreduce operation on the collective group.
+ // May return request == NULL if the call cannot be performed (or would block).
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+ ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
- ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
+ ncclResult_t (*flush)(void* collComm, void* data, int size, void* mhandle);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
- // Close and free send/recv comm objects
- ncclResult_t (*closeSend)(void* sendComm);
- ncclResult_t (*closeRecv)(void* recvComm);
+ // Close and free collective comm objects
+ ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v2_t;
+} ncclCollNet_v3_t;
-typedef ncclNet_v2_t ncclNet_t;
+typedef ncclCollNet_v3_t ncclCollNet_t;
-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v3
#endif // end include guard
diff --git a/src/include/net.h b/src/include/net.h
index 3d37c8c..bc81965 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -16,7 +16,7 @@ typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
// Translation to external API
static const char* ncclNetName() { return ncclNet->name; }
static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
-static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; }
+static ncclResult_t ncclNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
@@ -30,33 +30,40 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
+// Test whether the current GPU supports GPU Direct RDMA.
#define GPU_BUF_SIZE (2*1024*1024)
-static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) {
- int support;
- NCCLCHECK(ncclNet->ptrSupport(dev, &support));
- *supportedTypes = support & ~NCCL_PTR_CUDA;
- // The network supports GPU Direct RDMA ; verify the GPU supports it as well.
- if (support & NCCL_PTR_CUDA) {
+static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
+ int netDevs;
+ NCCLCHECK(ncclNetDevices(&netDevs));
+ *gdrSupport = 0;
+ for (int dev=0; dev<netDevs; dev++) {
+ // Find a net device which is GDR-capable
+ ncclNetProperties_t props;
+ NCCLCHECK(ncclNet->getProperties(dev, &props));
+ if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
+
+ // Allocate memory on the GPU and try to register it on the NIC.
void *lComm = NULL, *sComm = NULL, *rComm = NULL;
ncclNetHandle_t handle;
void* gpuPtr = NULL;
void* mHandle = NULL;
- ncclResult_t res;
- NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), res, cleanup);
- NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), res, cleanup);
- NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), res, cleanup);
- CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), res, cleanup);
- NOWARN(ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res);
- if (res != ncclSuccess) goto cleanup;
- NCCLCHECKGOTO(ncclNetDeregMr(sComm, mHandle), res, cleanup);
- NCCLCHECKGOTO(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res, cleanup);
- NCCLCHECKGOTO(ncclNetDeregMr(rComm, mHandle), res, cleanup);
- *supportedTypes |= NCCL_PTR_CUDA;
-cleanup:
- if (gpuPtr) cudaFree(gpuPtr);
- if (rComm) ncclNetCloseRecv(rComm);
- if (sComm) ncclNetCloseSend(sComm);
- if (lComm) ncclNetCloseListen(lComm);
+ NCCLCHECK(ncclNetListen(dev, &handle, &lComm));
+ NCCLCHECK(ncclNetConnect(dev, &handle, &sComm));
+ NCCLCHECK(ncclNetAccept(lComm, &rComm));
+ CUDACHECK(cudaMalloc(&gpuPtr, GPU_BUF_SIZE));
+ ncclDebugNoWarn = NCCL_NET;
+ if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
+ NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
+ NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
+ NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
+ *gdrSupport = 1;
+ }
+ ncclDebugNoWarn = 0;
+ CUDACHECK(cudaFree(gpuPtr));
+ NCCLCHECK(ncclNetCloseRecv(rComm));
+ NCCLCHECK(ncclNetCloseSend(sComm));
+ NCCLCHECK(ncclNetCloseListen(lComm));
+ break;
}
return ncclSuccess;
}
diff --git a/src/include/socket.h b/src/include/socket.h
index 96bf5db..9376062 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -283,6 +283,7 @@ static ncclResult_t GetSocketAddrFromString(union socketAddress* ua, const char*
}
static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
+ static int shownIfName = 0;
int nIfs = 0;
// Allow user to force the INET socket family selection
int sock_family = envSocketFamily();
@@ -290,6 +291,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam
char* env = getenv("NCCL_SOCKET_IFNAME");
if (env && strlen(env) > 1) {
// Specified by user : find or fail
+ if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
} else {
// Try to automatically pick the right one
diff --git a/src/include/transport.h b/src/include/transport.h
index 8f9bf0e..e25132f 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -53,6 +53,8 @@ struct ncclProxyArgs {
int nsteps;
uint64_t opCount;
int protocol;
+ ncclDataType_t dtype;
+ ncclRedOp_t redOp;
int state; // add component before this line -- it is left out during initialization
// Internal state
@@ -80,7 +82,7 @@ struct ncclProxyState {
struct ncclTransportComm {
ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
- ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
+ ncclResult_t (*connect)(struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
};
diff --git a/src/include/utils.h b/src/include/utils.h
index 266abca..86ab3a2 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -14,7 +14,7 @@ int ncclCudaCompCap();
// PCI Bus ID <-> int64 conversion functions
ncclResult_t int64ToBusId(int64_t id, char* busId);
-ncclResult_t busIdToInt64(char* busId, int64_t* id);
+ncclResult_t busIdToInt64(const char* busId, int64_t* id);
ncclResult_t getBusId(int cudaDev, int64_t *busId);
diff --git a/src/init.cc b/src/init.cc
index 627f6c7..0a02760 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,11 +11,10 @@
#include "transport.h"
#include "group.h"
#include "net.h"
+#include "coll_net.h"
#include "enqueue.h"
#include "graph.h"
#include "argcheck.h"
-#include "cpuset.h"
-#include <sched.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
@@ -43,6 +42,7 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
ncclNet_t* ncclNet = NULL;
+ncclCollNet_t* ncclCollNet = NULL;
// Returns ncclInternalError if anything fails, causing that network to be ignored.
ncclResult_t initNet(ncclNet_t* net) {
@@ -53,7 +53,15 @@ ncclResult_t initNet(ncclNet_t* net) {
return ncclSuccess;
}
-ncclResult_t initNetPlugin(ncclNet_t** net) {
+ncclResult_t initCollNet(ncclCollNet_t* collnet) {
+ int ndev;
+ if (collnet->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
+ if (collnet->devices(&ndev) != ncclSuccess) return ncclInternalError;
+ if (ndev <= 0) return ncclSystemError;
+ return ncclSuccess;
+}
+
+ncclResult_t initNetPlugin(ncclNet_t** net, ncclCollNet_t** collnet) {
void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
if (netPluginLib == NULL) {
// dlopen does not guarantee to set errno, but dlerror only gives us a
@@ -69,13 +77,17 @@ ncclResult_t initNetPlugin(ncclNet_t** net) {
ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
if (extNet == NULL) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
- goto cleanup;
- }
- if (initNet(extNet) == ncclSuccess) {
+ } else if (initNet(extNet) == ncclSuccess) {
*net = extNet;
+ // Check for CollNet
+ ncclCollNet_t* extCollNet = (ncclCollNet_t*) dlsym(netPluginLib, STR(NCCL_COLLNET_PLUGIN_SYMBOL));
+ if (extCollNet == NULL) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_COLLNET_PLUGIN_SYMBOL) " symbol.");
+ } else if (initCollNet(extCollNet) == ncclSuccess) {
+ *collnet = extCollNet;
+ }
return ncclSuccess;
}
-cleanup:
if (netPluginLib != NULL) dlclose(netPluginLib);
return ncclSuccess;
}
@@ -84,7 +96,7 @@ ncclResult_t initNet() {
// Always initialize bootstrap network
NCCLCHECK(bootstrapNetInit());
- NCCLCHECK(initNetPlugin(&ncclNet));
+ NCCLCHECK(initNetPlugin(&ncclNet, &ncclCollNet));
if (ncclNet != NULL) return ncclSuccess;
if (initNet(&ncclNetIb) == ncclSuccess) {
ncclNet = &ncclNetIb;
@@ -95,6 +107,8 @@ ncclResult_t initNet() {
return ncclSuccess;
}
+NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0);
+
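As elsewhere in NCCL, the NCCL_PARAM macro exposes this knob as an environment variable (here NCCL_COLLNET_ENABLE, default 0), read through ncclParamCollNetEnable(). A hedged usage sketch:

#include <stdlib.h>

/* Should be set before the first communicator is initialized, since the
 * parameter value is cached on first read. */
void enableCollNet(void) {
  setenv("NCCL_COLLNET_ENABLE", "1", 1);
}
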
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
static bool initialized = false;
static ncclResult_t ncclInit() {
@@ -103,6 +117,7 @@ static ncclResult_t ncclInit() {
if (!initialized) {
initEnv();
initNet();
+ INFO(NCCL_INIT, "Using network %s", ncclNetName());
initialized = true;
}
pthread_mutex_unlock(&initLock);
@@ -220,6 +235,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
*comm->abortFlag = 0;
comm->argsptr = &comm->args;
+ comm->collNetSupport = 0;
*comret = comm;
return ncclSuccess;
@@ -233,7 +249,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
// Copy userRanks and peers
for (int r=0; r<comm->nChannels; r++) {
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
- NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
+ NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks+1));
}
// Duplicate the dev comm on the device
@@ -269,14 +285,8 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
info->shmDev = statbuf.st_dev;
info->busId = comm->busId;
- int netDevs;
- NCCLCHECK(ncclNetDevices(&netDevs));
- for (int n=0; n<netDevs; n++) {
- int ptrSupport;
- NCCLCHECK(ncclNetPtrSupport(n, &ptrSupport));
- if (ptrSupport & NCCL_PTR_CUDA) info->gdrSupport |= (1 << n);
- }
+ NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
return ncclSuccess;
}
@@ -396,7 +406,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph,
struct ncclConnector* conn;
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
- if (peer == -1) continue;
+ if (peer == -1 || peer >= comm->nRanks) continue;
conn = &channel->peers[peer].recv;
if (conn->connected) { ++nSkippedRecv; continue; }
memset(&connect, 0, sizeof(connect));
@@ -405,7 +415,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph,
}
for (int i=0; i<nsend; i++) {
int peer = peerSend[i];
- if (peer == -1) continue;
+ if (peer == -1 || peer >= comm->nRanks) continue;
conn = &channel->peers[peer].send;
if (conn->connected) { ++nSkippedSend; continue; }
memset(&connect, 0, sizeof(connect));
@@ -414,29 +424,148 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph,
}
for (int i=0; i<nsend; i++) {
int peer = peerSend[i];
- if (peer == -1) continue;
+ if (peer == -1 || peer >= comm->nRanks) continue;
conn = &channel->peers[peer].send;
if (conn->connected) {++nSkippedSend; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
- NCCLCHECK(conn->transportComm->connect(&connect, conn));
+ NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
conn->connected = 1;
}
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
- if (peer == -1) continue;
+ if (peer == -1 || peer >= comm->nRanks) continue;
conn = &channel->peers[peer].recv;
if (conn->connected) {++nSkippedRecv; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
- NCCLCHECK(conn->transportComm->connect(&connect, conn));
+ NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
conn->connected = 1;
}
TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
return ncclSuccess;
}
+extern struct ncclTransport collNetTransport;
+
+// All ranks must participate in collNetSetup call
+// type: 0 for send, 1 for recv
+// return: 0 - unsupported, 1 - supported
+static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int collNetChannels, int rank, int nranks, int masterRank, int masterPeer, int nMasters, int type) {
+ int rankInCollNet = -1;
+ int supported = 0;
+ int isMaster = (rank == masterRank) ? 1 : 0;
+ struct {
+ int collNetRank;
+ ncclConnect connect;
+ } sendrecvExchange;
+
+ // check if we can connect to collnet, whose root is the nranks-th rank
+ struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
+ peerInfo->rank = nranks;
+ int ret = 1;
+ if (isMaster) {
+ NCCLCHECK(collNetTransport.canConnect(&ret, comm->topo, collNetGraph, myInfo, peerInfo));
+ }
+
+ // send master receives connect info from peer recv master
+ if (isMaster && type == 0) {
+ NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)));
+ rankInCollNet = sendrecvExchange.collNetRank;
+ INFO(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer);
+ }
+
+ // Select the connector (send or recv side) and its transport
+ struct ncclPeer* root = channel->peers+nranks;
+ struct ncclConnector* conn = (type == 1) ? &root->recv : &root->send;
+ struct ncclTransportComm* transportComm = (type == 1) ? &(collNetTransport.recv) : &(collNetTransport.send);
+ conn->transportComm = transportComm;
+ // setup
+ struct ncclConnect myConnect;
+ if (isMaster && ret > 0) {
+ NCCLCHECK(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->buffSize, channel->id));
+ }
+ // prepare connect handles
+ ncclResult_t res;
+ struct {
+ int isMaster;
+ ncclConnect connect;
+ } *allConnects = NULL;
+ ncclConnect *masterConnects = NULL;
+ NCCLCHECK(ncclCalloc(&masterConnects, nMasters));
+ if (type == 1) { // recv side: AllGather
+ // all ranks must participate
+ NCCLCHECK(ncclCalloc(&allConnects, nranks));
+ allConnects[rank].isMaster = isMaster;
+ memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect));
+ NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup);
+ // consolidate
+ int c = 0;
+ for (int r = 0; r < nranks; r++) {
+ if (allConnects[r].isMaster) {
+ memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect));
+ if (r == rank) rankInCollNet = c;
+ c++;
+ }
+ }
+ } else { // send side : copy in connect info received from peer recv master
+ if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
+ }
+ // connect
+ if (isMaster && ret > 0) {
+ NCCLCHECKGOTO(transportComm->connect(masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
+ }
+ // recv side sends connect info to send side
+ if (isMaster && type == 1) {
+ sendrecvExchange.collNetRank = rankInCollNet;
+ memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect));
+ NCCLCHECK(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)));
+ INFO(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
+ }
+ if (ret > 0) {
+ supported = 1;
+ }
+cleanup:
+ if (allConnects != NULL) free(allConnects);
+ if (masterConnects != NULL) free(masterConnects);
+ return supported;
+}
+
+static ncclResult_t checkCollNetSetup(struct ncclComm* comm, int rank, int collNetSetupFail) {
+ int nranks = comm->nRanks;
+ // AllGather collNet setup results
+ int* allGatherFailures;
+ NCCLCHECK(ncclCalloc(&allGatherFailures, nranks));
+ allGatherFailures[rank] = collNetSetupFail;
+ NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGatherFailures, sizeof(int)));
+ for (int i=0; i<nranks; i++) {
+ if (allGatherFailures[i] != 0) {
+ collNetSetupFail = 1;
+ break;
+ }
+ }
+ free(allGatherFailures);
+ if (collNetSetupFail) {
+ if (rank == 0) WARN("Cannot initialize CollNet, using %s instead", ncclNetName());
+ // Free collNet resources
+ for (int r=0; r<comm->nChannels; r++) {
+ struct ncclChannel* channel = comm->channels+r;
+ struct ncclPeer* peer = channel->peers+nranks;
+ if (peer->send.transportResources && peer->send.transportComm) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
+ if (peer->recv.transportResources && peer->recv.transportComm) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
+ peer->send.transportResources = NULL; // avoid double free
+ peer->recv.transportResources = NULL; // avoid double free
+ }
+ // Set support to 0
+ comm->collNetSupport = 0;
+ } else {
+ comm->collNetSupport = 1;
+ }
+ return ncclSuccess;
+}
+
NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
+NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
// We use 3 AllGathers
@@ -462,7 +591,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(fillInfo(comm, myInfo, commHash));
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
- NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
+ NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root
for (int i = 0; i < nranks; i++) {
memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
@@ -481,60 +610,82 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
// Recompute paths after trimming
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
- // Compute max speed to accelerate search
- NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo));
+ // Init search
+ NCCLCHECK(ncclTopoSearchInit(comm->topo));
// Print final topology
NCCLCHECK(ncclTopoPrint(comm->topo));
// Get rings and trees
- struct ncclTopoGraph treeGraph;
- treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
- treeGraph.crossNic = ncclParamCrossNic();
- // We communicate only half the data between node with trees on 2 nodes.
- NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
- NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
struct ncclTopoGraph ringGraph;
+ ringGraph.id = 0;
ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
ringGraph.crossNic = ncclParamCrossNic();
+ ringGraph.collNet = 0;
+ ringGraph.minChannels = 1;
+ ringGraph.maxChannels = MAXCHANNELS/2;
NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph));
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph));
+ struct ncclTopoGraph treeGraph;
+ treeGraph.id = 1;
+ treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+ treeGraph.crossNic = ncclParamCrossNic();
+ treeGraph.collNet = 0;
+ treeGraph.minChannels = 1;
+ treeGraph.maxChannels = ringGraph.nChannels;
+ NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
+ NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
+
+ struct ncclTopoGraph collNetGraph;
+ collNetGraph.id = 2;
+ collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
+ collNetGraph.collNet = 1;
+ collNetGraph.crossNic = ncclParamCrossNic();
+ collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
+ NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
+ NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
+
+ if (comm->rank == ncclParamGraphDumpFileRank()) {
+ struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
+ NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
+ }
+
// AllGather3 - begin
+ struct ncclGraphInfo {
+ int sameChannels;
+ float speedIntra;
+ float speedInter;
+ int typeIntra;
+ };
struct {
int cudaCompCap;
int fullCudaCompCap;
- int nvlink;
int nChannels;
- struct {
- int sameChannels;
- int speedIntra;
- int speedInter;
- int nvlink;
- } tree;
- struct {
- int sameChannels;
- int speedIntra;
- int speedInter;
- int nvlink;
- } ring;
+ struct ncclGraphInfo tree;
+ struct ncclGraphInfo ring;
+ struct ncclGraphInfo collNet;
struct ncclTopoRanks topoRanks;
} *allGather3Data;
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
- allGather3Data[rank].nvlink = treeGraph.nvlink;
- allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
+ allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
+ std::min(treeGraph.nChannels, ringGraph.nChannels);
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
- allGather3Data[rank].tree.nvlink = treeGraph.nvlink;
+ allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
- allGather3Data[rank].ring.nvlink = ringGraph.nvlink;
+ allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
+ allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
+ allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra;
+ allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter;
+ allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
- NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks));
+ NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks));
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
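
Note on the new fields used above: besides pattern and crossNic, each search now carries an id, a collNet flag and a min/max channel range, and ncclTopoCompute fills in the speeds and intra-node path type that are then min-reduced across ranks below. A condensed sketch of the structure, limited to what this patch touches (abridged; src/include/graph.h is authoritative):

struct ncclTopoGraph {
  // Input
  int id;                      // ring : 0, tree : 1, collnet : 2
  int pattern;                 // NCCL_TOPO_PATTERN_*
  int crossNic;
  int collNet;                 // search a graph usable by network collectives
  int minChannels, maxChannels;
  // Output of ncclTopoCompute()
  int nChannels;
  float speedIntra, speedInter;
  int typeIntra;               // intra-node path type
  int sameChannels;
  /* ... plus the per-channel intra/inter rank orderings used below,
     e.g. collNetGraph.intra[c*comm->localRanks+i] ... */
};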
@@ -562,9 +713,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap);
}
- comm->nvlink = 1;
- for (int i = 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink;
-
int nChannelsOrig = comm->nChannels;
struct ncclTopoRanks** allTopoRanks;
NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
@@ -575,11 +723,15 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
- treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink);
+ treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
- ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink);
+ ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
+ collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
+ collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
+ collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
+ collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
}
if (comm->nChannels < nChannelsOrig) {
@@ -592,6 +744,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
+ if (comm->nNodes > 1 &&
+ ncclParamCollNetEnable() == 1 &&
+ collNetSupport()) {
+ NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank));
+ }
free(allTopoRanks);
free(nodesFirstRank);
@@ -601,7 +758,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
- NCCLCHECK(ncclSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph));
+ NCCLCHECK(ncclTopoSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
char line[1024];
line[0]='\0';
@@ -615,21 +772,58 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
line[1023] = '\0';
INFO(NCCL_INIT, "Trees%s", line);
+ // Set affinity to a CPU local to our GPU, so that all memory we allocate
+ // on the host is local.
+ cpu_set_t affinitySave;
+ sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ NCCLCHECK(ncclTopoSetAffinity(comm->topo, comm->rank));
+ ncclResult_t ret;
+
// Connect with prev/next for each ring
struct ncclConnect *connect;
- NCCLCHECK(ncclCalloc(&connect, 2));
+ NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore);
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
- NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks));
+ NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
if (comm->nRanks == 1) continue;
- NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
- NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up));
- NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down));
+ NCCLCHECKGOTO(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
+ NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore);
+ NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore);
+ }
+
+ // Check if we can set up CollNet
+ if (comm->nNodes > 1 &&
+ ncclParamCollNetEnable() == 1 &&
+ collNetSupport()) {
+ int logicChannels = comm->nChannels/2;
+ int collNetSetupFail = 0;
+ const int recvIndex = 0; // recv GPU index is always 0
+ const int sendIndex = collNetGraph.pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern
+ for (int c=0; c<logicChannels; c++) {
+ struct ncclChannel* channelRecv = comm->channels+logicChannels+c;
+ struct ncclChannel* channelSend = comm->channels+c;
+ NCCLCHECK(p2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down));
+ NCCLCHECK(p2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up));
+ const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex];
+ const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex];
+ if (collNetSetup(comm, &collNetGraph, channelRecv, logicChannels, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
+ collNetSetupFail = 1;
+ if (collNetSetup(comm, &collNetGraph, channelSend, logicChannels, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
+ collNetSetupFail = 1;
+ }
+ // Verify CollNet setup across ranks
+ NCCLCHECK(checkCollNetSetup(comm, rank, collNetSetupFail));
}
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
free(connect);
free(rings);
+ // We should have allocated all buffers, collective fifos, ... we can
+ // restore the affinity.
+affinity_restore:
+ sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ if (ret != ncclSuccess) return ret;
+
// Compute intra ranks (using AllGather1 data)
int intraRank0 = -1, intraRank = -1, intraRanks = 0;
for (int i = 0; i < nranks; i++) {
@@ -658,98 +852,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
return ncclSuccess;
}
-static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
- CPU_ZERO_S(sizeof(cpu_set_t), mask);
- char* cudaPath;
- NCCLCHECK(ncclTopoCudaPath(cudaDev, &cudaPath));
- char path[PATH_MAX];
- strncpy(path, cudaPath, PATH_MAX-1);
- snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus");
- path[PATH_MAX-1] = '\0';
- int fd;
- SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
- char affinityStr[sizeof(cpu_set_t)*2 + 1];
- int r = read(fd, affinityStr, sizeof(cpu_set_t)*2);
- if (r > 0) {
- affinityStr[r] = '\0';
- NCCLCHECK(ncclStrToCpuset(affinityStr, mask));
- }
- close(fd);
- free(cudaPath);
- return ncclSuccess;
-}
-
-NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
-
-static ncclResult_t setCpuAffinity(int cudaDev) {
- // Query the CPU affinity set we were provided
- cpu_set_t mask;
- SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
-
-#ifdef ENABLE_TRACE
- {
- char affinityStr[sizeof(cpu_set_t)*2];
- NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
- TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr);
- }
-#endif
-
- // Find the CPUs that are local to the supplied GPU
- cpu_set_t gpuMask;
- NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
-
-#ifdef ENABLE_TRACE
- {
- char affinityStr[sizeof(cpu_set_t)*2];
- NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr));
- TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr);
- }
-#endif
-
- cpu_set_t finalMask;
- if (ncclParamIgnoreCpuAffinity())
- // Ignore the CPU affinity set and use the GPU one instead
- finalMask = gpuMask;
- else
- // Use a subset of the GPU affinity set
- CPU_AND(&finalMask, &mask, &gpuMask);
-
- // If there is a non empty set, use it to set affinity
- if (CPU_COUNT(&finalMask)) {
- char affinityStr[sizeof(cpu_set_t)*2];
- NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
- INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr);
- SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
- }
- return ncclSuccess;
-}
-
ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) {
- cpu_set_t affinitySave;
- sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
- NCCLCHECK(wrapNvmlSymbols());
- NCCLCHECK(wrapNvmlInit());
-
- // Make sure all host memory allocation are close to the GPU
- CUDACHECK(cudaSetDevice(cudaDev));
- NCCLCHECK(setCpuAffinity(cudaDev));
ncclResult_t res;
+ CUDACHECK(cudaSetDevice(cudaDev));
NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
- sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
- NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
-
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
return ncclSuccess;
cleanup:
if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap);
*newcomm = NULL;
- sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
return res;
}
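
The NCCLCHECK→NCCLCHECKGOTO conversion in initTransportsRank exists so that any failure between the sched_getaffinity save and the end of channel setup still reaches affinity_restore, leaving the caller's affinity intact. A minimal standalone sketch of the pattern (hypothetical names; not NCCL's actual macros):

#define _GNU_SOURCE
#include <sched.h>

typedef enum { testSuccess = 0, testInternalError = 1 } testResult_t;

// Record the error and jump to the cleanup label instead of returning.
#define CHECKGOTO(call, res, label) do { \
  (res) = (call); \
  if ((res) != testSuccess) goto label; \
} while (0)

static testResult_t doAllocations(void) { return testSuccess; }

testResult_t initWithLocalAffinity(void) {
  cpu_set_t affinitySave;
  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
  // ... restrict ourselves to the CPUs local to our GPU here ...
  testResult_t ret = testSuccess;
  CHECKGOTO(doAllocations(), ret, affinity_restore);
affinity_restore:
  // Runs on success and failure alike, so the caller's affinity survives.
  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
  return ret;
}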
diff --git a/src/misc/utils.cc b/src/misc/utils.cc
index 5158529..782e9c0 100644
--- a/src/misc/utils.cc
+++ b/src/misc/utils.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -24,7 +24,7 @@ ncclResult_t int64ToBusId(int64_t id, char* busId) {
return ncclSuccess;
}
-ncclResult_t busIdToInt64(char* busId, int64_t* id) {
+ncclResult_t busIdToInt64(const char* busId, int64_t* id) {
const int size = strlen(busId);
char* hexStr;
NCCLCHECK(ncclCalloc(&hexStr, size));
diff --git a/src/transport.cc b/src/transport.cc
index 4059849..cc8d5d1 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -100,6 +100,7 @@ static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
struct ncclPeer* peerComm = args->channel->peers+peer;
struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
+ if (connector->transportComm == NULL) return ncclInternalError;
if (connector->transportComm->proxy == NULL) return ncclSuccess;
struct ncclProxyArgs* op;
@@ -130,6 +131,18 @@ ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int r
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
}
+ if (pattern == ncclPatternCollTreeUp) {
+ // CollTree up
+ struct ncclTree* tree = &args->channel->collTreeUp;
+ NCCLCHECK(SaveProxy<proxyRecv>(tree->down[0], args));
+ NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+ }
+ if (pattern == ncclPatternCollTreeDown) {
+ // CollTree down
+ struct ncclTree* tree = &args->channel->collTreeDn;
+ NCCLCHECK(SaveProxy<proxySend>(tree->down[0], args));
+ NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
+ }
return ncclSuccess;
}
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
new file mode 100644
index 0000000..73e9fdd
--- /dev/null
+++ b/src/transport/coll_net.cc
@@ -0,0 +1,430 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "coll_net.h"
+#include "graph.h"
+#include <assert.h>
+
+struct collNetRecvConnectInfo {
+ collNetHandle_t collNetHandle;
+};
+
+struct collNetSendConnectInfo {
+ void* collNetComm;
+ void* mhandle;
+ void* llMhandle;
+ struct reqSlot* reqFifo;
+};
+
+struct ncclLLDataLine {
+ uint32_t data1;
+ uint32_t data2;
+};
+static_assert(sizeof(struct ncclLLDataLine) == sizeof(union ncclLLFifoLine)>>1, "ncclLLDataLine is not half size of ncclLLFifoLine");
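
The static_assert pins ncclLLDataLine to exactly half an LL fifo line: an LL line carries two 32-bit data words, each paired with a 32-bit flag, and collnet strips the flags before putting data on the wire (the recv proxy re-attaches them further down via the v[] view). The layout this file assumes, as a sketch (the authoritative union lives in NCCL's device headers):

union ncclLLFifoLine {
  struct {
    uint32_t data1;
    uint32_t flag1;
    uint32_t data2;
    uint32_t flag2;
  };
  uint64_t v[2]; // v[0] = flag1 in the high 32 bits, data1 in the low; same for v[1]
};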
+
+struct reqSlot {
+ volatile void* recvBuff;
+ volatile int size;
+};
+
+struct collNetSendResources {
+ void* collNetSendComm;
+ struct ncclSendMem* hostSendMem;
+ struct ncclRecvMem* hostRecvMem;
+ struct ncclSendMem* devHostSendMem;
+ struct ncclRecvMem* devHostRecvMem;
+ struct ncclLLDataLine* llData;
+ int netDev;
+ int useGdr;
+ int buffSize;
+ void* sendMhandle;
+ void* llSendMhandle;
+ void* recvMhandle;
+ void* llRecvMhandle;
+ struct ncclRecvMem* devRecvMem;
+ uint64_t step;
+ uint64_t llLastCleaning;
+ struct reqSlot* reqFifo;
+ int collNetRank;
+};
+
+struct collNetRecvResources {
+ void* netListenComm;
+ void* collNetRecvComm;
+ struct ncclSendMem* hostSendMem;
+ struct ncclRecvMem* hostRecvMem;
+ struct ncclSendMem* devHostSendMem;
+ struct ncclRecvMem* devHostRecvMem;
+ struct ncclLLDataLine* llData;
+ int netDev;
+ int useGdr;
+ int buffSize;
+ void* mhandle;
+ void* llMhandle;
+ struct ncclRecvMem* devRecvMem;
+ uint64_t step;
+ uint64_t llLastCleaning;
+ struct reqSlot* reqFifo;
+ int collNetRank;
+};
+
+/* Determine if we can communicate with the peer */
+ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+ *ret = 1;
+ return ncclSuccess;
+}
+
+/* Setup send connector, and return connect information for others in the coll communicator to connect to me */
+ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+ struct collNetSendResources* sendResources;
+ NCCLCHECK(ncclCalloc(&sendResources, 1));
+ send->transportResources = sendResources;
+
+ NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &sendResources->netDev));
+ NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, sendResources->netDev, 1, &sendResources->useGdr));
+
+ int sendSize = sizeof(struct ncclSendMem);
+ NCCLCHECK(ncclCudaHostAlloc((void**)&sendResources->hostSendMem, (void**)&sendResources->devHostSendMem, sendSize));
+
+ int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+ if (sendResources->useGdr) {
+ NCCLCHECK(ncclCudaCalloc((char**)(&sendResources->devRecvMem), recvSize));
+ }
+ NCCLCHECK(ncclCudaHostAlloc((void**)&sendResources->hostRecvMem, (void**)&sendResources->devHostRecvMem, recvSize));
+ NCCLCHECK(ncclIbMalloc((void**)&(sendResources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine)));
+ sendResources->buffSize = buffSize;
+
+ INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), sendResources->netDev,
+ sendResources->useGdr ? "/GDRDMA" : "");
+
+ return ncclSuccess;
+}
+
+/* Setup recv connector */
+ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+ struct collNetRecvResources* recvResources;
+ NCCLCHECK(ncclCalloc(&recvResources, 1));
+ recv->transportResources = recvResources;
+
+ NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &recvResources->netDev));
+ NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, recvResources->netDev, 0, &recvResources->useGdr));
+
+ int sendSize = sizeof(struct ncclSendMem);
+ NCCLCHECK(ncclCudaHostAlloc((void**)&recvResources->hostSendMem, (void**)&recvResources->devHostSendMem, sendSize));
+
+ int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+ if (recvResources->useGdr) {
+ NCCLCHECK(ncclCudaCalloc((char**)(&recvResources->devRecvMem), recvSize));
+ }
+ NCCLCHECK(ncclCudaHostAlloc((void**)&recvResources->hostRecvMem, (void**)&recvResources->devHostRecvMem, recvSize));
+ NCCLCHECK(ncclIbMalloc((void**)&(recvResources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine)));
+ recvResources->buffSize = buffSize;
+
+ INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), recvResources->netDev,
+ recvResources->useGdr ? "/GDRDMA" : "");
+
+ struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
+ NCCLCHECK(collNetListen(recvResources->netDev, &info->collNetHandle, &recvResources->netListenComm));
+
+ return ncclSuccess;
+}
+
+ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
+ // Setup device pointers
+ struct collNetSendResources* sendResources = (struct collNetSendResources*)send->transportResources;
+ sendResources->collNetRank = rank;
+
+ // Get info from recv side
+ struct collNetSendConnectInfo* sInfo = (struct collNetSendConnectInfo*)(connectInfos+rank);
+ sendResources->reqFifo = sInfo->reqFifo;
+ sendResources->collNetSendComm = sInfo->collNetComm;
+ sendResources->recvMhandle = sInfo->mhandle;
+ sendResources->llRecvMhandle = sInfo->llMhandle;
+
+ // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
+ struct ncclRecvMem* sRecvMem = sendResources->useGdr ? sendResources->devRecvMem : sendResources->devHostRecvMem;
+ // Register buffers
+ NCCLCHECK(collNetRegMr(sendResources->collNetSendComm, sRecvMem->buff, sendResources->buffSize,
+ sendResources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &sendResources->sendMhandle));
+ NCCLCHECK(collNetRegMr(sendResources->collNetSendComm, sendResources->llData,
+ NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &sendResources->llSendMhandle));
+
+ send->conn.buff = sRecvMem->buff;
+ send->conn.llBuff = sendResources->devHostRecvMem->llBuff;
+ send->conn.direct |= sendResources->useGdr ? NCCL_DIRECT_NIC : 0;
+
+ // Head/Tail/Opcount/Fifos are always on host
+ send->conn.tail = &sendResources->devHostRecvMem->tail;
+ send->conn.opCountRem = &sendResources->devHostRecvMem->opCount;
+ send->conn.fifo = sendResources->devHostRecvMem->sizesFifo;
+ send->conn.head = &sendResources->devHostSendMem->head;
+ send->conn.opCountLoc = &sendResources->devHostSendMem->opCount;
+ for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
+
+ return ncclSuccess;
+}
+
+ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
+ // Setup device pointers
+ struct collNetRecvResources* recvResources = (struct collNetRecvResources*)recv->transportResources;
+ struct collNetSendConnectInfo* sInfo = (struct collNetSendConnectInfo*)(connectInfos+rank);
+ recvResources->collNetRank = rank;
+
+ // Intermediate buffering on GPU for GPU Direct RDMA
+ struct ncclRecvMem* rRecvMem = recvResources->useGdr ? recvResources->devRecvMem : recvResources->devHostRecvMem;
+ recv->conn.buff = rRecvMem->buff;
+ recv->conn.llBuff = recvResources->devHostRecvMem->llBuff; // recv LL buff always on host
+ recv->conn.direct |= recvResources->useGdr ? NCCL_DIRECT_NIC : 0;
+
+ // Head/Tail/Opcount are always on host
+ recv->conn.tail = &recvResources->devHostRecvMem->tail;
+ recv->conn.opCountLoc = &recvResources->devHostRecvMem->opCount;
+ recv->conn.head = &recvResources->devHostSendMem->head;
+ recv->conn.opCountRem = &recvResources->devHostSendMem->opCount;
+
+ // Connect to coll comm
+ collNetHandle_t** handlePtrs = NULL;
+ NCCLCHECK(ncclCalloc(&handlePtrs, nranks));
+ for (int i = 0; i < nranks; i++) {
+ struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i);
+ handlePtrs[i] = &(info->collNetHandle);
+ }
+ ncclResult_t res;
+ NCCLCHECKGOTO(collNetConnect((void**)handlePtrs, nranks, rank, recvResources->netListenComm, &recvResources->collNetRecvComm), res, cleanup);
+
+ // Register buffers
+ NCCLCHECK(collNetRegMr(recvResources->collNetRecvComm, rRecvMem->buff, recvResources->buffSize,
+ recvResources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &recvResources->mhandle));
+ NCCLCHECK(collNetRegMr(recvResources->collNetRecvComm, recvResources->llData,
+ NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &recvResources->llMhandle));
+
+ // Create shared info between send and recv proxies
+ NCCLCHECK(ncclCalloc(&(recvResources->reqFifo), NCCL_STEPS));
+
+ // Pass info to send side
+ sInfo->reqFifo = recvResources->reqFifo;
+ sInfo->collNetComm = recvResources->collNetRecvComm;
+ sInfo->mhandle = recvResources->mhandle;
+ sInfo->llMhandle = recvResources->llMhandle;
+
+cleanup:
+ if (handlePtrs != NULL) free(handlePtrs);
+ // Close listen comm
+ NCCLCHECK(collNetCloseListen(recvResources->netListenComm));
+
+ return res;
+}
+
+ncclResult_t collNetSendFree(void* sendTransportResources) {
+ struct collNetSendResources* sendResources = (struct collNetSendResources*)sendTransportResources;
+ NCCLCHECK(ncclCudaHostFree(sendResources->hostSendMem));
+ NCCLCHECK(ncclCudaHostFree(sendResources->hostRecvMem));
+ if (sendResources->collNetSendComm) {
+ NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->sendMhandle));
+ NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->llSendMhandle));
+ }
+ if (sendResources->useGdr)
+ CUDACHECK(cudaFree(sendResources->devRecvMem));
+ free(sendResources->llData);
+ free(sendResources);
+ return ncclSuccess;
+}
+
+ncclResult_t collNetRecvFree(void* recvTransportResources) {
+ struct collNetRecvResources* recvResources = (struct collNetRecvResources*)recvTransportResources;
+ NCCLCHECK(ncclCudaHostFree(recvResources->hostSendMem));
+ if (recvResources->collNetRecvComm) {
+ NCCLCHECK(collNetDeregMr(recvResources->collNetRecvComm, recvResources->mhandle));
+ NCCLCHECK(collNetDeregMr(recvResources->collNetRecvComm, recvResources->llMhandle));
+ }
+ NCCLCHECK(ncclCudaHostFree(recvResources->hostRecvMem));
+ if (recvResources->useGdr)
+ CUDACHECK(cudaFree(recvResources->devRecvMem));
+ free(recvResources->llData);
+ free(recvResources->reqFifo);
+
+ // The collNet comm is shared with the send side, so SendFree (which
+ // deregisters its handles) must have been called before we close it here.
+ if (recvResources->collNetRecvComm) {
+ NCCLCHECK(collNetCloseColl(recvResources->collNetRecvComm));
+ }
+ free(recvResources);
+ return ncclSuccess;
+}
+
+ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
+ if (args->protocol == NCCL_PROTO_LL128) {
+ WARN("CollNet does not support LL128");
+ return ncclInternalError;
+ }
+ struct collNetSendResources* resources = (struct collNetSendResources*) (args->connector->transportResources);
+ if (args->state == ncclProxyOpReady) {
+ // Update opCount
+ resources->hostRecvMem->opCount = args->opCount;
+
+ // Round to next multiple of sliceSteps
+ resources->step = ROUNDUP(resources->step, args->chunkSteps);
+ args->head = resources->step;
+ args->tail = resources->step;
+ args->end = args->head + args->nsteps;
+ args->state = ncclProxyOpProgress;
+ }
+ if (args->state == ncclProxyOpProgress) {
+ args->idle = 1;
+ struct reqSlot* reqFifo = resources->reqFifo;
+ if (args->head < args->end) {
+ int buffSlot = args->tail%NCCL_STEPS;
+ if (args->tail < args->end && args->tail < args->head + NCCL_STEPS
+ && reqFifo[buffSlot].recvBuff != NULL) {
+ volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
+ volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
+ if (args->protocol == NCCL_PROTO_LL) {
+ int size = sizesFifo[buffSlot];
+ if (size != -1) {
+ uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
+ int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+ union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
+ int ready = 1;
+ for (int i=0; i<nFifoLines; i++) {
+ volatile uint32_t *f1 = &lines[i].flag1;
+ volatile uint32_t *f2 = &lines[i].flag2;
+ if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
+ }
+ if (ready) {
+ // Separate data from flag
+ struct ncclLLDataLine* sendBuff = resources->llData+buffSlot*NCCL_LL_SLICE_LINES;
+ for (int i=0; i<nFifoLines; i++) {
+ volatile uint32_t *d1 = &lines[i].data1;
+ volatile uint32_t *d2 = &lines[i].data2;
+ sendBuff[i].data1 = d1[0];
+ sendBuff[i].data2 = d2[0];
+ }
+ int count = nFifoLines*sizeof(struct ncclLLDataLine) / ncclTypeSize(args->dtype);
+ NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->llSendMhandle, resources->llRecvMhandle, args->requests+buffSlot));
+ if (args->requests[buffSlot] != NULL) {
+ TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce (LL) posted, req %p", args->head, buffSlot, args->requests[buffSlot]);
+ sizesFifo[buffSlot] = -1;
+ // Make sure the size is reset before we update the tail.
+ __sync_synchronize();
+ args->tail += args->sliceSteps;
+ args->idle = 0;
+ }
+ }
+ }
+ } else if (args->tail < *recvTail) {
+ int stepSize = args->channel->buffSize/NCCL_STEPS;
+ struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+ // Send through network
+ if (sizesFifo[buffSlot] != -1) {
+ int count = sizesFifo[buffSlot]/ncclTypeSize(args->dtype);
+ NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localMem->buff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->sendMhandle, resources->recvMhandle, args->requests+buffSlot));
+ if (args->requests[buffSlot] != NULL) {
+ TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p count %d", args->head, buffSlot, args->requests[buffSlot], count);
+ sizesFifo[buffSlot] = -1;
+ // Make sure the size is reset before we update the tail.
+ __sync_synchronize();
+ args->tail += args->sliceSteps;
+ args->idle = 0;
+ }
+ }
+ }
+ }
+ if (args->head < args->tail) {
+ int done, size;
+ int buffSlot = args->head%NCCL_STEPS;
+ NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
+ if (done) {
+ TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->head, buffSlot, args->requests[buffSlot], size);
+ reqFifo[buffSlot].size = size;
+ // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
+ // (reordered store after store is possible on POWER, though not on x86)
+ __sync_synchronize();
+ reqFifo[buffSlot].recvBuff = NULL; // Notify recvProxy
+ args->head += args->sliceSteps;
+ resources->hostSendMem->head = args->head;
+ args->idle = 0;
+ }
+ }
+ }
+ if (args->head == args->end) {
+ resources->step = args->end;
+ args->idle = 0;
+ args->state = ncclProxyOpNone;
+ }
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
+ if (args->protocol == NCCL_PROTO_LL128) {
+ WARN("CollNet does not support LL128");
+ return ncclInternalError;
+ }
+ struct collNetRecvResources* resources = (struct collNetRecvResources*) (args->connector->transportResources);
+ if (args->state == ncclProxyOpReady) {
+ // Update opCount
+ resources->hostSendMem->opCount = args->opCount;
+
+ // Round to next multiple of sliceSteps
+ resources->step = ROUNDUP(resources->step, args->chunkSteps);
+ args->head = resources->step;
+ args->tail = resources->step;
+ args->end = args->head + args->nsteps;
+ args->state = ncclProxyOpProgress;
+ }
+ if (args->state == ncclProxyOpProgress) {
+ args->idle = 1;
+ int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine) : args->channel->buffSize ) / NCCL_STEPS;
+ struct reqSlot* reqFifo = resources->reqFifo;
+ if (args->head < args->end) {
+ struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+ char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)resources->llData : localMem->buff;
+ void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : resources->mhandle;
+ if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
+ int buffSlot = args->tail%NCCL_STEPS;
+ reqFifo[buffSlot].recvBuff = localBuff+buffSlot*stepSize;
+ TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, localBuff+buffSlot*stepSize);
+ args->tail += args->sliceSteps;
+ args->idle = 0;
+ }
+ if (args->tail > args->head) {
+ int buffSlot = args->head%NCCL_STEPS;
+ if (reqFifo[buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete
+ TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->head, buffSlot, reqFifo[buffSlot].size);
+ args->head += args->sliceSteps;
+ if (args->protocol == NCCL_PROTO_LL) { // ll
+ // re-attach flag
+ uint32_t flag = args->head;
+ union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(resources->hostRecvMem->llBuff)+buffSlot*NCCL_LL_SLICE_LINES;
+ struct ncclLLDataLine* recvData = resources->llData+buffSlot*NCCL_LL_SLICE_LINES;
+ int nFifoLines = DIVUP(reqFifo[buffSlot].size, sizeof(struct ncclLLDataLine));
+ for (int i=0; i<nFifoLines; i++) {
+ lines[i].v[0] = ((uint64_t)flag << 32) + recvData[i].data1;
+ lines[i].v[1] = ((uint64_t)flag << 32) + recvData[i].data2;
+ }
+ } else if (args->protocol == NCCL_PROTO_SIMPLE) {
+ if (resources->useGdr) collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle);
+ resources->hostRecvMem->tail = args->head;
+ }
+ args->idle = 0;
+ }
+ }
+ }
+ if (args->head == args->end) {
+ resources->step = args->end;
+ args->idle = 0;
+ args->state = ncclProxyOpNone;
+ }
+ }
+ return ncclSuccess;
+}
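
Taken together, the two proxies implement a single-producer/single-consumer handshake over reqFifo, keyed on recvBuff. A summary of the exchange as coded above (no extra protocol implied):

/*
 * recv proxy                               send proxy
 * ----------                               ----------
 * reqFifo[slot].recvBuff = buf;   ------>  sees recvBuff != NULL
 *                                          collNetIallreduce(src -> recvBuff)
 *                                          collNetTest() until done
 *                                          reqFifo[slot].size = size;
 *                                          __sync_synchronize();
 *                                 <------  reqFifo[slot].recvBuff = NULL;
 * sees recvBuff == NULL : 'size' bytes
 * landed in buf; flush (GDR) or re-attach
 * LL flags, then advance head.
 */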
+
+struct ncclTransport collNetTransport = {
+ "COL",
+ collNetCanConnect,
+ { collNetSendSetup, collNetSendConnect, collNetSendFree, collNetSendProxy },
+ { collNetRecvSetup, collNetRecvConnect, collNetRecvFree, collNetRecvProxy }
+};
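
collNetTransport fills the same two ncclTransportComm tables as the transports below. A condensed sketch of the interface as used by this patch, with the new connect signature that passes the group size and rank so collnet can connect all ranks at once (see src/include/transport.h for the real definition):

struct ncclTransportComm {
  ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph,
      struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
      struct ncclConnect* connectInfo, struct ncclConnector* conn,
      int buffSize, int channelId);
  ncclResult_t (*connect)(struct ncclConnect* connectInfos, int nranks, int rank,
      struct ncclConnector* conn);
  ncclResult_t (*free)(void* transportResources);
  ncclResult_t (*proxy)(struct ncclProxyArgs* args);
};

struct ncclTransport {
  const char name[4];
  ncclResult_t (*canConnect)(int* ret, struct ncclTopoSystem* topo,
      struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2);
  struct ncclTransportComm send;
  struct ncclTransportComm recv;
};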
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 87fc9ce..e0db85e 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -53,40 +53,6 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
return ncclSuccess;
}
-NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
-NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
-
-static ncclResult_t netGetGdrSupport(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr) {
- *useGdr = 0;
-
- if (read) { // For reads (sends) only enable under certain conditions
- int gdrReadParam = ncclParamNetGdrRead();
- if (gdrReadParam == 0) return ncclSuccess;
- if (gdrReadParam < 0) {
- int nvlink;
- NCCLCHECK(ncclTopoHasNvlink(topo, busId, &nvlink));
- if (!nvlink) return ncclSuccess;
- }
- }
-
- // Check if we are close enough that it makes sense to enable GDR
- int netGdrLevel = ncclParamNetGdrLevel();
- int distance;
- NCCLCHECK(ncclTopoNetDistance(topo, busId, netDev, &distance));
- if (distance >= netGdrLevel) {
- INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), busId, netDev, distance, netGdrLevel);
- return ncclSuccess;
- }
-
- // Finally, check if the NIC supports it
- int flags;
- NCCLCHECK(ncclNetPtrSupport(netDev, &flags));
- if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
- *useGdr = 1;
- INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d < %d), read %d", ncclNetName(), busId, netDev, distance, netGdrLevel, read);
- return ncclSuccess;
-}
-
/* Determine if we will use this transport for this peer and return connect
* information for this peer */
ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
@@ -95,7 +61,7 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
send->transportResources = resources;
NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev));
- NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
+ NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
int sendSize = sizeof(struct ncclSendMem);
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -118,7 +84,7 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
recv->transportResources = resources;
NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev));
- NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
+ NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
int sendSize = sizeof(struct ncclSendMem);
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -137,7 +103,7 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
return ncclSuccess;
}
-ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers
struct netSendResources* resources = (struct netSendResources*)send->transportResources;
@@ -146,6 +112,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
send->conn.buff = recvMem->buff;
send->conn.llBuff = resources->devHostRecvMem->llBuff;
send->conn.ll128Buff = recvMem->ll128Buff;
+ send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
// Head/Tail/Opcount/Fifos are always on host
send->conn.tail = &resources->devHostRecvMem->tail;
@@ -170,7 +137,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
}
/* Connect to this peer */
-ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
// Setup device pointers
struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
@@ -179,6 +146,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
recv->conn.buff = recvMem->buff;
recv->conn.llBuff = recvMem->llBuff;
recv->conn.ll128Buff = recvMem->ll128Buff;
+ recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
// Head/Tail/Opcount are always on host
recv->conn.tail = &resources->devHostRecvMem->tail;
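
Note that conn.direct is now ORed into rather than assigned: it became a bitmask with separate bits for direct GPU access (p2p) and direct NIC access (GPU Direct RDMA), so a connector can carry both. Illustrative values (the real constants live in the device headers):

// Two independent "direct" capabilities :
#define NCCL_DIRECT_GPU 0x01  // peer GPU reads/writes our buffers directly (P2P)
#define NCCL_DIRECT_NIC 0x10  // NIC DMAs to/from GPU memory (GPU Direct RDMA)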
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index 0d5307c..1a832f2 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -28,13 +28,19 @@
#define MAXNAMESIZE 64
static char ncclIbIfName[MAX_IF_NAME_SIZE];
static union socketAddress ncclIbIfAddr;
+
static int ncclNIbDevs = -1;
struct ncclIbDev {
int device;
+ uint64_t guid;
uint8_t port;
uint8_t link;
+ int speed;
ibv_context* context;
char devName[MAXNAMESIZE];
+ char* pciPath;
+ int realPort;
+ int maxQp;
};
#define MAX_IB_PORT 15
@@ -53,20 +59,7 @@ NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14);
NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
NCCL_PARAM(IbSl, "IB_SL", 0);
NCCL_PARAM(IbTc, "IB_TC", 0);
-
-// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
-// allocated on separate pages as those pages will be marked DONTFORK
-// and if they are shared, that could cause a crash in a child process
-static ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
- size_t page_size = sysconf(_SC_PAGESIZE);
- void* p;
- int size_aligned = ROUNDUP(size, page_size);
- int ret = posix_memalign(&p, page_size, size_aligned);
- if (ret != 0) return ncclSystemError;
- memset(p, 0, size);
- *ptr = p;
- return ncclSuccess;
-}
+NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
pthread_t ncclIbAsyncThread;
static void* ncclIbAsyncThreadMain(void* args) {
@@ -85,6 +78,39 @@ static void* ncclIbAsyncThreadMain(void* args) {
NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
+static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) {
+ char devicePath[PATH_MAX];
+ snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName);
+ char* p = realpath(devicePath, NULL);
+ if (p == NULL) {
+ WARN("Could not find real path of %s", *devicePath);
+ } else {
+ // Merge multi-port NICs into the same PCI device
+ p[strlen(p)-1] = '0';
+ // And keep the real port aside (the ibv port is always 1 on recent cards)
+ *realPort = 0;
+ for (int d=0; d<ncclNIbDevs; d++) {
+ if (strcmp(p, ncclIbDevs[d].pciPath) == 0) (*realPort)++;
+ }
+ }
+ *path = p;
+ return ncclSuccess;
+}
+
+static int ibvWidths[] = { 1, 4, 8, 12 };
+static int ibvSpeeds[] = { 2500, 5000, 10000, 10000, 14000, 25000, 50000 };
+static int firstBitSet(int val, int max) {
+ int i = 0;
+ while (i<max && ((val & (1<<i)) == 0)) i++;
+ return i;
+}
+static int ncclIbWidth(int width) {
+ return ibvWidths[firstBitSet(width, sizeof(ibvWidths)/sizeof(int)-1)];
+}
+static int ncclIbSpeed(int speed) {
+ return ibvSpeeds[firstBitSet(speed, sizeof(ibvSpeeds)/sizeof(int)-1)];
+}
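
ibverbs reports active_width and active_speed as bitfields, so firstBitSet decodes the lowest set bit (clamping to the last table entry for unknown values) and the two lookups multiply into a per-port rate in Mbps. A worked example for an EDR x4 port:

// portAttr.active_speed = 32 (bit 5) -> ibvSpeeds[5] = 25000 Mbps per lane (EDR)
// portAttr.active_width = 2  (bit 1) -> ibvWidths[1]  = 4 lanes
// speed = 25000 * 4 = 100000 Mbps, i.e. a 100 Gbps port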
+
ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
if (ncclParamIbDisable()) return ncclInternalError;
@@ -145,10 +171,14 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
ncclIbDevs[ncclNIbDevs].device = d;
+ ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid;
ncclIbDevs[ncclNIbDevs].port = port;
ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
+ ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width);
ncclIbDevs[ncclNIbDevs].context = context;
strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
+ NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort));
+ ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp;
ncclNIbDevs++;
nPorts++;
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
@@ -180,17 +210,6 @@ ncclResult_t ncclIbDevices(int* ndev) {
return ncclSuccess;
}
-ncclResult_t ncclIbPciPath(int dev, char** path) {
- char devicepath[PATH_MAX];
- snprintf(devicepath, PATH_MAX, "/sys/class/infiniband/%s/device", ncclIbDevs[dev].devName);
- *path = realpath(devicepath, NULL);
- if (*path == NULL) {
- WARN("Could not find real path of %s", devicepath);
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
// Detect whether GDR can work on a given NIC with the current CUDA device
// Returns :
// ncclSuccess : GDR works
@@ -204,19 +223,24 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
return ncclSuccess;
}
-ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
- *supportedTypes = NCCL_PTR_HOST;
+static ncclResult_t GetSocketAddr(union socketAddress* addr) {
+ memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
+ return ncclSuccess;
+}
+ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
+ props->name = ncclIbDevs[dev].devName;
+ props->pciPath = ncclIbDevs[dev].pciPath;
+ props->guid = ncclIbDevs[dev].guid;
+ props->ptrSupport = NCCL_PTR_HOST;
if (ncclIbGdrSupport(dev) != ncclSuccess) {
INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
- return ncclSuccess;
+ } else {
+ props->ptrSupport |= NCCL_PTR_CUDA;
}
- *supportedTypes |= NCCL_PTR_CUDA;
- return ncclSuccess;
-}
-
-static ncclResult_t GetSocketAddr(union socketAddress* addr) {
- memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
+ props->speed = ncclIbDevs[dev].speed;
+ props->port = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort;
+ props->maxComms = ncclIbDevs[dev].maxQp;
return ncclSuccess;
}
@@ -325,7 +349,8 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce
qpInitAttr.send_cq = verbs->cq;
qpInitAttr.recv_cq = verbs->cq;
qpInitAttr.qp_type = IBV_QPT_RC;
- qpInitAttr.cap.max_send_wr = MAX_REQUESTS;
+ // We might send 2 requests per send (RDMA_WRITE+RDMA_WRITE_WITH_IMM)
+ qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS;
qpInitAttr.cap.max_recv_wr = MAX_REQUESTS;
qpInitAttr.cap.max_send_sge = 1;
qpInitAttr.cap.max_recv_sge = 1;
@@ -627,6 +652,10 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
wr.opcode = IBV_WR_SEND;
wr.send_flags = IBV_SEND_SIGNALED;
+ int useAr = 0;
+ if (size > ncclParamIbArThreshold()) {
+ useAr = 1;
+ }
#if USE_RDMA_WRITE
__sync_synchronize(); // order the readyPtr load against rkey load below
// Sanity checks to catch user collective call count/size mismatches
@@ -636,7 +665,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead);
return ncclInternalError;
}
- wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+ wr.opcode = useAr ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_WRITE_WITH_IMM;
wr.wr.rdma.remote_addr = slot->addr;
wr.wr.rdma.rkey = slot->rkey;
wr.imm_data = size; // Send the message size via imm_data
@@ -651,6 +680,19 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
+
+#if USE_RDMA_WRITE
+ // When using adaptive routing, send the bulk of the data first as an
+ // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
+ // completion.
+ if (useAr) {
+ wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+ wr.sg_list = NULL;
+ wr.num_sge = 0;
+ wr.send_flags &= ~IBV_SEND_SIGNALED;
+ NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
+ }
+#endif
*request = req;
return ncclSuccess;
}
@@ -835,8 +877,7 @@ ncclNet_t ncclNetIb = {
"IB",
ncclIbInit,
ncclIbDevices,
- ncclIbPciPath,
- ncclIbPtrSupport,
+ ncclIbGetProperties,
ncclIbListen,
ncclIbConnect,
ncclIbAccept,
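
The old pciPath/ptrSupport callbacks are folded into a single getProperties entry point. A sketch of the property set that both this plugin and the socket plugin below fill in (field names taken from the assignments above; see src/include/nccl_net.h for the authoritative layout):

typedef struct {
  char* name;     // used for logging
  char* pciPath;  // /sys device path, used for topology detection
  uint64_t guid;  // identical for ports of the same NIC
  int ptrSupport; // NCCL_PTR_HOST, optionally | NCCL_PTR_CUDA
  int speed;      // port speed in Mbps
  int port;       // port number
  int maxComms;   // max concurrent comms (e.g. queue pairs for IB)
} ncclNetProperties_t;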
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
index 1b1fc4f..5bc22c3 100644
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -19,16 +19,31 @@
#include <fcntl.h>
/* Init functions */
-static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
-static union socketAddress ncclNetIfAddrs[MAX_IFS];
static int ncclNetIfs = -1;
+struct ncclSocketDev {
+ union socketAddress addr;
+ char devName[MAX_IF_NAME_SIZE];
+ char* pciPath;
+};
+static struct ncclSocketDev ncclSocketDevs[MAX_IFS];
+
pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
+static ncclResult_t ncclSocketGetPciPath(char* devName, char** pciPath) {
+ char devicePath[PATH_MAX];
+ snprintf(devicePath, PATH_MAX, "/sys/class/net/%s/device", devName);
+ // May return NULL if the file doesn't exist.
+ *pciPath = realpath(devicePath, NULL);
+ return ncclSuccess;
+}
+
ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
if (ncclNetIfs == -1) {
pthread_mutex_lock(&ncclSocketLock);
if (ncclNetIfs == -1) {
- ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
+ char names[MAX_IF_NAME_SIZE*MAX_IFS];
+ union socketAddress addrs[MAX_IFS];
+ ncclNetIfs = findInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS);
if (ncclNetIfs <= 0) {
WARN("NET/Socket : no interface found");
return ncclInternalError;
@@ -37,8 +52,11 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
char addrline[1024];
line[0] = '\0';
for (int i=0; i<ncclNetIfs; i++) {
- snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE,
- socketToString(&ncclNetIfAddrs[i].sa, addrline));
+ strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
+ memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union socketAddress));
+ NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath));
+ snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
+ socketToString(&addrs[i].sa, addrline));
}
line[1023] = '\0';
INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
@@ -49,30 +67,44 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
return ncclSuccess;
}
-ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
- *supportedTypes = NCCL_PTR_HOST;
- return ncclSuccess;
-}
-
ncclResult_t ncclSocketDevices(int* ndev) {
*ndev = ncclNetIfs;
return ncclSuccess;
}
-ncclResult_t ncclSocketPciPath(int dev, char** path) {
- char devicepath[PATH_MAX];
- snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
- *path = realpath(devicepath, NULL);
- if (*path == NULL) {
- INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
- return ncclSystemError;
+static ncclResult_t ncclSocketGetSpeed(char* devName, int* speed) {
+ *speed = 0;
+ char speedPath[PATH_MAX];
+ sprintf(speedPath, "/sys/class/net/%s/speed", devName);
+ int fd = open(speedPath, O_RDONLY);
+ if (fd != -1) {
+ char speedStr[] = " ";
+ if (read(fd, speedStr, sizeof(speedStr)-1) > 0) {
+ *speed = strtol(speedStr, NULL, 0);
+ }
+ close(fd);
+ }
+ if (*speed <= 0) {
+ INFO(NCCL_NET, "Could not get speed from %s. Defaulting to 10 Gbps.", speedPath);
+ *speed = 10000;
}
return ncclSuccess;
}
+ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) {
+ props->name = ncclSocketDevs[dev].devName;
+ props->pciPath = ncclSocketDevs[dev].pciPath;
+ props->guid = dev;
+ props->ptrSupport = NCCL_PTR_HOST;
+ NCCLCHECK(ncclSocketGetSpeed(props->name, &props->speed));
+ props->port = 0;
+ props->maxComms = 65536;
+ return ncclSuccess;
+}
+
ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
if (dev >= ncclNetIfs) return ncclInternalError;
- memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
+ memcpy(addr, &ncclSocketDevs[dev].addr, sizeof(*addr));
return ncclSuccess;
}
@@ -196,7 +228,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
// Auto-detection
int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
char vendorPath[PATH_MAX];
- snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+ snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclSocketDevs[dev].devName);
char* rPath = realpath(vendorPath, NULL);
int fd = open(rPath, O_RDONLY);
free(rPath);
@@ -486,8 +518,7 @@ ncclNet_t ncclNetSocket = {
"Socket",
ncclSocketInit,
ncclSocketDevices,
- ncclSocketPciPath,
- ncclSocketPtrSupport,
+ ncclSocketGetProperties,
ncclSocketListen,
ncclSocketConnect,
ncclSocketAccept,
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 0cc92f3..6586ce7 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -28,9 +28,6 @@ struct p2pRecvResources {
#include <sys/types.h>
-NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
-NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
-
/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
static int busIdToCudaDev(int64_t busId) {
int ndev;
@@ -50,73 +47,44 @@ static int busIdToCudaDev(int64_t busId) {
/* Determine if two peers can communicate through p2p */
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
- int cpuCount;
- NCCLCHECK(ncclTopoCpuCount(topo, &cpuCount));
- // Do not use P2P across sockets by default (provided CUDA permits it).
- // When we are on a single socket, don't even use P2P through the CPU as
- // it should be able to sustain two flows to sysmem faster than PCI P2P.
- int p2pLevel = cpuCount == 1 ? PATH_PHB : PATH_NODE;
- if (ncclParamP2pDisable() == 1) p2pLevel = 0;
- if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
-
- // Disable P2P
- *ret = 0;
-
- if (p2pLevel == 0) return ncclSuccess;
-
// Rule out different nodes
- if (info1->hostHash != info2->hostHash) return ncclSuccess;
+ if (info1->hostHash != info2->hostHash) {
+ *ret = 0;
+ return ncclSuccess;
+ }
+
+ // Check topology / p2p level.
+ NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret));
+ if (*ret == 0) return ncclSuccess;
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
int cudaDev1 = busIdToCudaDev(info1->busId);
int cudaDev2 = busIdToCudaDev(info2->busId);
if (cudaDev1 == -1 || cudaDev2 == -1) {
- // Peer's CUDA device is not visible in this process
#if CUDART_VERSION >= 10010
- // But in CUDA 10.1 we can still communicate with 'invisible' devices
- TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %lx and %lx", info1->busId, info2->busId);
- // Check for NVLink/NVswitch including P2P access
- int nvlink;
- NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
- if (nvlink > 0) {
- *ret = 1;
- return ncclSuccess;
- }
-#endif
+ // CUDA 10.1 and later can use P2P with invisible devices.
return ncclSuccess;
- }
-
- TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%lx] and [%d=%lx]", cudaDev1, info1->busId, cudaDev2, info2->busId);
-
- // Do not detect topology if we're on the same GPU. Note this is not really supported.
- if (cudaDev1 == cudaDev2) {
- *ret = 1;
+#else
+ // Peer's CUDA device is not visible in this process : we can't communicate with it.
+ *ret = 0;
return ncclSuccess;
+#endif
}
- // See if CUDA can do P2P
+ // Check that CUDA can do P2P
int p2p;
if (cudaDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != cudaSuccess) {
INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)",
cudaDev1, info1->busId, cudaDev2, info2->busId);
+ *ret = 0;
return ncclSuccess;
}
- if (p2p == 0) return ncclSuccess;
-
- // Check for NVLink/NVswitch
- int nvlink;
- NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
- if (nvlink > 0) {
- *ret = 1;
+ if (p2p == 0) {
+ INFO(NCCL_INIT|NCCL_P2P,"Could not enable P2P between dev %d(=%lx) and dev %d(=%lx)",
+ cudaDev1, info1->busId, cudaDev2, info2->busId);
+ *ret = 0;
return ncclSuccess;
}
-
- // Finally compute the PCI distance and compare with the p2pLevel.
- int distance;
- NCCLCHECK(ncclTopoGpuDistance(topo, info1->busId, info2->busId, &distance));
- if (distance < p2pLevel) {
- *ret = 1;
- }
return ncclSuccess;
}
@@ -227,13 +195,13 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
}
/* Connect/Send to this peer */
-static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
struct ncclRecvMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
if (info->direct) {
remDevMem = (struct ncclRecvMem*)(info->directPtr);
- send->conn.direct = 1;
+ send->conn.direct |= NCCL_DIRECT_GPU;
} else {
//TRACE_DUMP_IPC(&info->devIpc);
cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
@@ -257,13 +225,13 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
}
/* Connect/Recv from this peer */
-ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
struct ncclSendMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
if (info->direct) {
remDevMem = (struct ncclSendMem*)(info->directPtr);
- recv->conn.direct = 1;
+ recv->conn.direct |= NCCL_DIRECT_GPU;
recv->conn.ptrExchange = &remDevMem->ptrExchange;
} else {
//TRACE_DUMP_IPC(&info->devIpc);
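
With the distance/level policy moved into ncclTopoCheckP2p, p2pCanConnect keeps only the CUDA-side checks. The resulting decision flow, recapping the code above:

// 1. Different hostHash                 -> no P2P (different nodes)
// 2. ncclTopoCheckP2p() says no         -> no P2P (topology / p2p level policy)
// 3. Peer not visible to this process :
//      CUDA >= 10.1 -> keep the topology verdict (P2P works with invisible devices)
//      older CUDA   -> no P2P
// 4. cudaDeviceCanAccessPeer() fails/0  -> no P2P
// 5. Otherwise                          -> P2P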
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index 60f16c8..0b1d8ee 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -104,7 +104,7 @@ ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
}
/* Connect to this peer */
-ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
@@ -129,7 +129,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
return ncclSuccess;
}
-ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
// Setup device pointers
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;