66 files changed, 3700 insertions, 3205 deletions
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 83a2a39..d0e2ca8 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -15,8 +15,7 @@ PROFAPI ?= 0
 NVCC = $(CUDA_HOME)/bin/nvcc

 CUDA_LIB ?= $(CUDA_HOME)/lib64
-CUDA_INC ?= $(CUDA_HOME)/include
-CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
 #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
 CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
 CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
@@ -36,14 +35,14 @@ CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70

 # Include Volta support if we're using CUDA9 or above
-ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0)
 NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
 else
 NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
 endif
 #$(info NVCC_GENCODE is ${NVCC_GENCODE})

-CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
+CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
 CXXFLAGS += -Wall -Wno-sign-compare
 NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
 # Use addprefix so that we can specify more than one path
diff --git a/makefiles/version.mk b/makefiles/version.mk
index f9cee6a..a8c6e3a 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 3
-NCCL_PATCH   := 7
+NCCL_MINOR   := 4
+NCCL_PATCH   := 2
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in
index 65a2c60..f9d83a3 100644
--- a/pkg/redhat/nccl.spec.in
+++ b/pkg/redhat/nccl.spec.in
@@ -1,6 +1,6 @@
 Name:           libnccl
-Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
-Release:        ${pkg:Revision}
+Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
+Release:        ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
 Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime

 Group:          Development/Libraries
diff --git a/pkg/srctxz/Makefile b/pkg/srctxz/Makefile
index 1cb7c06..ed677fe 100644
--- a/pkg/srctxz/Makefile
+++ b/pkg/srctxz/Makefile
@@ -36,4 +36,5 @@ $(TXZPREPDIR)/% : %.in
 	-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
 	-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
 	-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
 	$< > $@
diff --git a/pkg/srctxz/create_srctxz.sh.in b/pkg/srctxz/create_srctxz.sh.in
index 0b8e6d2..ae7d01f 100644
--- a/pkg/srctxz/create_srctxz.sh.in
+++ b/pkg/srctxz/create_srctxz.sh.in
@@ -25,8 +25,9 @@ NCCL_MAJOR=${nccl:Major}
 NCCL_MINOR=${nccl:Minor}
 NCCL_PATCH=${nccl:Patch}
 NCCL_SUFFIX=${nccl:Suffix}
+NCCL_BUILD=${pkg:Revision}

-NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}"
+NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"

 tar --exclude build \
     --exclude ".git*" \
diff --git a/src/Makefile b/src/Makefile
index 481000a..fe60b11 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -9,8 +9,8 @@ include ../makefiles/version.mk
 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
-		misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
+LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \
+		misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \
 		transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
 		collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
@@ -29,11 +29,10 @@ LIBTARGET  := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
 STATICLIBTARGET := $(STATICLIBNAME)
 LIBOBJ     := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
 DEPFILES   := $(LIBOBJ:%.o=%.d)
-LDFLAGS    += -L${CUDA_LIB} -lcudart_static -lrt
+LDFLAGS    += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
 DEVICELIB  := $(BUILDDIR)/obj/collectives/device/colldevice.a
-
 ##### rules

 build : lib staticlib
@@ -41,9 +40,12 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)

 staticlib : $(LIBDIR)/$(STATICLIBTARGET)

-devicelib: $(INCDIR)/nccl.h
+$(DEVICELIB): ALWAYS_REBUILD
 	$(MAKE) -C collectives/device

+# Empty target to force rebuild
+ALWAYS_REBUILD:
+
 -include $(DEPFILES)

 $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
@@ -59,14 +61,14 @@ $(INCDIR)/nccl.h : nccl.h.in
 	-e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
 	$< > $@

-$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib
+$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
 	@printf "Linking   %-35s > %s\n" $(LIBTARGET) $@
 	mkdir -p $(LIBDIR)
 	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
 	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
 	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)

-$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
+$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
 	@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
 	mkdir -p $(LIBDIR)
 	$(eval TMP := $(shell mktemp -d))
diff --git a/src/bootstrap.cu b/src/bootstrap.cu
index 13c6e92..6b1d573 100644
--- a/src/bootstrap.cu
+++ b/src/bootstrap.cu
@@ -15,27 +15,31 @@
 // Always use sockets for bootstrap
 ncclNet_t* ncclBootstrapNet = &ncclNetSocket;

-static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
-static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }

 // Additional sync functions based on async + test for bootstrap, using host ptrs.
-static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) {
-  void* request;
-  NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &request));
+static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
+  void* request, *mhandle;
+  NCCLCHECK(ncclBootstrapNet->regMr(sendComm, data, size, NCCL_PTR_HOST, &mhandle));
+  NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, mhandle, &request));
+  NCCLCHECK(ncclBootstrapNet->deregMr(sendComm, mhandle));
   int done = 0;
-  while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
+  while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL));
   return ncclSuccess;
 }
-static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) {
-  void* request;
-  NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &request));
+static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
+  void* request, *mhandle;
+  NCCLCHECK(ncclBootstrapNet->regMr(recvComm, data, size, NCCL_PTR_HOST, &mhandle));
+  NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, mhandle, &request));
+  NCCLCHECK(ncclBootstrapNet->deregMr(recvComm, mhandle));
   int done = 0;
-  while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
+  while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL));
   return ncclSuccess;
 }

@@ -51,8 +55,8 @@ struct extId {
 struct extInfo {
   int rank;
   int nranks;
-  ncclNetHandle_t extHandleListenFromRoot;
-  ncclNetHandle_t extHandleRing;
+  ncclNetHandle_t extHandleListenRoot;
+  ncclNetHandle_t extHandleListen;
 };

 #include <sys/resource.h>
@@ -68,28 +72,25 @@ static ncclResult_t setFilesLimit() {
 static void *bootstrapRoot(void* commId) {
   struct extInfo info;
   struct extId* id = (struct extId*)commId;
-  ncclNetHandle_t *extHandleBstrap = NULL; // for initial rank <-> root information exchange
-  ncclNetHandle_t *extHandleRing = NULL; // for bootstrap ring creation
+  ncclNetHandle_t *rankHandles = NULL;
+  ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
   ncclNetHandle_t zero = { 0 }; // for sanity checking
   void* tmpComm;
   ncclResult_t res;
   setFilesLimit();

+  TRACE(NCCL_INIT, "BEGIN");
   /* Receive addresses from all ranks */
   int nranks = 0, c = 0;
   do {
-    NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpComm), res, out);
-    NCCLCHECKGOTO(bootstrapRecv(tmpComm, &info, sizeof(info)), res, out);
-    NCCLCHECKGOTO(bootstrapCloseRecv(tmpComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
+    NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);

     if (c == 0) {
-      extHandleBstrap = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
-      extHandleRing = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
-      if (extHandleBstrap == NULL || extHandleRing == NULL) {
-        WARN("Bootstrap thread : failed to allocate memory");
-        goto out;
-      }
       nranks = info.nranks;
+      NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
+      NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
     }

     if (nranks != info.nranks) {
@@ -97,40 +98,43 @@ static void *bootstrapRoot(void* commId) {
       goto out;
     }

-    if (memcmp(&zero, &extHandleBstrap[info.rank], sizeof(ncclNetHandle_t)) != 0) {
+    if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
       WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
       goto out;
     }

-    // Save the connection handle for connecting back to the ranks
-    memcpy(&extHandleBstrap[info.rank], info.extHandleListenFromRoot, sizeof(ncclNetHandle_t));
-    // Save the connection handle for the AllGather ring
-    memcpy(&extHandleRing[info.rank], info.extHandleRing, sizeof(ncclNetHandle_t));
+    // Save the connection handle for that rank
+    memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
+    memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));

     ++c;
   } while (c < nranks);
+  TRACE(NCCL_INIT, "COLLECTED HANDLES");

   // Send the connect handle for the next rank in the AllGather ring
   for (int r=0; r<nranks; ++r) {
     int next = (r+1) % nranks;
     void *tmpSendComm;
-    NCCLCHECKGOTO(bootstrapConnect(0, extHandleBstrap[r], &tmpSendComm), res, out);
-    NCCLCHECKGOTO(bootstrapSend(tmpSendComm, &extHandleRing[next], sizeof(ncclNetHandle_t)), res, out);
-    NCCLCHECKGOTO(bootstrapCloseSend(tmpSendComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
+    NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
   }
+  TRACE(NCCL_INIT, "SENT OUT HANDLES");

 out:
-  bootstrapCloseListen(id->extListenComm);
+  bootstrapNetCloseListen(id->extListenComm);
   free(commId);
-  free(extHandleBstrap);
-  free(extHandleRing);
+  if (rankHandles) free(rankHandles);
+  if (rankHandlesRoot) free(rankHandlesRoot);
+
+  TRACE(NCCL_INIT, "DONE");
   return NULL;
 }

 ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
   struct extId* id = (struct extId*)commId;
   id->hostHash = getHostHash();
-  NCCLCHECK(bootstrapListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
+  NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
   ncclUniqueId* threadIdCopy;
   NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
   memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
@@ -157,10 +161,18 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
   return ncclSuccess;
 }

+struct unexConn {
+  int peer;
+  void* comm;
+  struct unexConn* next;
+};
+
 struct extState {
+  void* extBstrapListenComm;
   void* extBstrapRingRecvComm;
   void* extBstrapRingSendComm;
-  ncclNetHandle_t extBstrapRootHandle;
+  ncclNetHandle_t* peerBstrapHandles;
+  struct unexConn* unexpectedConnections;
   int rank;
   int nranks;
   int dev;
@@ -174,39 +186,56 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
   state->rank = rank;
   state->nranks = nranks;
   *commState = state;
-  void* extBstrapRootListenComm; // comm on which we accept root's connections
+
+  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);

   struct extInfo info = { 0 };
   info.rank = rank;
   info.nranks = nranks;
-  void *tmpSendComm, *extBstrapRingListenComm, *tmpRecvComm;
+  void *tmpSendComm, *tmpRecvComm;
   // Pass the remote address to listen via info
   if (idFromEnv) {
-    memcpy(&info.extHandleListenFromRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
-    memcpy(&info.extHandleRing, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+    memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+    memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
   }
   // listen will return the local address via info (specify interface type 'findSubnetIf')
   state->dev = idFromEnv ? findSubnetIf : 0;
-  NCCLCHECK(bootstrapListen(state->dev, &info.extHandleListenFromRoot, &extBstrapRootListenComm));
-  NCCLCHECK(bootstrapListen(state->dev, &info.extHandleRing, &extBstrapRingListenComm)); // AllGather Ring
+  void* extBstrapListenCommRoot;
+  NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm));
+  NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot));
+
+  // stagger connection times to avoid an overload of the root at very high rank counts
+  if (nranks > 128) {
+    long msec = rank;
+    struct timespec tv;
+    tv.tv_sec = msec / 1000;
+    tv.tv_nsec = 1000000 * (msec % 1000);
+    TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec);
+    (void) nanosleep(&tv, NULL);
+  }

-  memcpy(&state->extBstrapRootHandle, &id->extHandleRoot, sizeof(ncclNetHandle_t));
-  // send info on my listening sockets to root
-  NCCLCHECK(bootstrapConnect(state->dev, id->extHandleRoot, &tmpSendComm));
-  NCCLCHECK(bootstrapSend(tmpSendComm, &info, sizeof(info)));
-  NCCLCHECK(bootstrapCloseSend(tmpSendComm));
+  // send info on my listening socket to root
+  NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm));
+  NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
+  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));

   // get info on my "next" rank in the bootstrap ring from root
   ncclNetHandle_t extHandleNext;
-  NCCLCHECK(bootstrapAccept(extBstrapRootListenComm, &tmpRecvComm));
-  NCCLCHECK(bootstrapRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
-  NCCLCHECK(bootstrapCloseRecv(tmpRecvComm));
+  NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
+  NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
+  NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+  NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));

-  NCCLCHECK(bootstrapConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
+  NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
   // Accept the connect request from the previous rank in the AllGather ring
-  NCCLCHECK(bootstrapAccept(extBstrapRingListenComm, &state->extBstrapRingRecvComm));
-  NCCLCHECK(bootstrapCloseListen(extBstrapRingListenComm));
-  NCCLCHECK(bootstrapCloseListen(extBstrapRootListenComm));
+  NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
+
+  // AllGather all listen handlers
+  NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks));
+  memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t));
+  NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t)));
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);

   return ncclSuccess;
 }
@@ -224,25 +253,106 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
    * and send previous step's data from (rank-i) to right
    */
   for (int i=0; i<nranks-1; i++) {
-    int rslice = (rank - i - 1 + nranks) % nranks;
-    int sslice = (rank - i + nranks) % nranks;
+    size_t rslice = (rank - i - 1 + nranks) % nranks;
+    size_t sslice = (rank - i + nranks) % nranks;

     // Send slice to the right
-    NCCLCHECK(bootstrapSend(state->extBstrapRingSendComm, data+sslice*size, size));
+    NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, data+sslice*size, size));
     // Recv slice from the left
-    NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
+    NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
   }

   TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
   return ncclSuccess;
 }

-ncclResult_t bootstrapClose(void* commState) {
+ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
   struct extState* state = (struct extState*)commState;
+  void* tmpSendComm;
+  NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm));
+  NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
+  NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
+  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
+  return ncclSuccess;
+}
+
+ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
+  // New unex
+  struct unexConn* unex;
+  NCCLCHECK(ncclCalloc(&unex, 1));
+  unex->peer = peer;
+  unex->comm = comm;
+
+  // Enqueue
+  struct unexConn* list = state->unexpectedConnections;
+  if (list == NULL) {
+    state->unexpectedConnections = unex;
+    return ncclSuccess;
+  }
+  while (list->next) list = list->next;
+  list->next = unex;
+  return ncclSuccess;
+}

-  NCCLCHECK(bootstrapCloseSend(state->extBstrapRingSendComm));
-  NCCLCHECK(bootstrapCloseRecv(state->extBstrapRingRecvComm));
+void* unexpectedDequeue(struct extState* state, int peer) {
+  struct unexConn* elem = state->unexpectedConnections;
+  struct unexConn* prev = NULL;
+  while (elem) {
+    if (elem->peer == peer) {
+      if (prev == NULL) {
+        state->unexpectedConnections = elem->next;
+      } else {
+        prev->next = elem->next;
+      }
+      void* comm = elem->comm;
+      free(elem);
+      return comm;
+    }
+    prev = elem;
+    elem = elem->next;
+  }
+  return NULL;
+}
+
+// We can't know who we'll receive from, so we need to receive everything at once
+ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
+  struct extState* state = (struct extState*)commState;
+
+  void* tmpRecvComm;
+
+  // Search unexpected connections first
+  if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) {
+    NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
+    NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+    return ncclSuccess;
+  }
+
+  // Then look for new connections
+  while (1) {
+    NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm));
+    int newPeer;
+    NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int)));
+    if (newPeer == peer) {
+      NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
+      NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+      return ncclSuccess;
+    }
+    // Unexpected connection. Save for later.
+    NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm));
+  }
+}
+
+ncclResult_t bootstrapClose(void* commState) {
+  struct extState* state = (struct extState*)commState;
+  if (state->unexpectedConnections != NULL) {
+    WARN("Unexpected connections are not empty.\n");
+    return ncclInternalError;
+  }
+  NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm));
+  NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm));
+  NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm));
+  free(state->peerBstrapHandles);

   free(state);

   return ncclSuccess;
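Editorial note: the new bootstrapSend/bootstrapRecv pair above implements out-of-band point-to-point messaging by prefixing every connection with the sender's rank; a receiver that accepts a connection from the "wrong" peer parks it on the unexpectedConnections list and drains it later. A minimal standalone C++ model of that demultiplexing idea follows (the types and names here are illustrative, not NCCL's; the real code parks an open socket, this sketch parks the payload):

    #include <cstdio>
    #include <list>
    #include <utility>

    struct Demux {
      std::list<std::pair<int,int>> parked; // (peer, payload) seen for other peers

      // Deliver the next message from `peer`, parking out-of-order arrivals.
      int recvFrom(int peer, const std::pair<int,int>* wire, int n, int* pos) {
        // Search "unexpected" arrivals first, as bootstrapRecv does.
        for (auto it = parked.begin(); it != parked.end(); ++it)
          if (it->first == peer) { int v = it->second; parked.erase(it); return v; }
        // Otherwise accept new arrivals until the right peer shows up.
        while (*pos < n) {
          std::pair<int,int> msg = wire[(*pos)++];
          if (msg.first == peer) return msg.second;
          parked.push_back(msg);   // unexpected connection: save for later
        }
        return -1; // nothing from this peer
      }
    };

    int main() {
      std::pair<int,int> wire[] = {{2, 20}, {1, 10}}; // rank 2 connects first
      Demux d; int pos = 0;
      printf("%d\n", d.recvFrom(1, wire, 2, &pos)); // 10 (rank 2's msg parked)
      printf("%d\n", d.recvFrom(2, wire, 2, &pos)); // 20 (taken from parked list)
    }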
diff --git a/src/channel.cu b/src/channel.cu
new file mode 100644
index 0000000..937e84e
--- /dev/null
+++ b/src/channel.cu
@@ -0,0 +1,51 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "channel.h"
+#include "param.h"
+
+NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
+
+ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
+  struct ncclChannel* channel = comm->channels+channelid;
+  channel->id = channelid;
+
+  // Setup intermediate buffering
+  channel->buffSize = ncclParamBuffsize();
+
+  // Ring index to user rank table.
+  NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
+
+  // Communication structures with peers.
+  NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
+  for (size_t i=0; i<comm->nRanks; ++i) {
+    channel->peers[i].send.comm = comm;
+    channel->peers[i].recv.comm = comm;
+  }
+
+  // Per-channel operation list.
+  NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
+
+  return ncclSuccess;
+}
+
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
+  // Operation list
+  NCCLCHECK(ncclCudaHostFree(channel->collectives));
+
+  // Free Ring index to rank tables
+  free(channel->ring.userRanks);
+  CUDACHECK(cudaFree(channel->ring.devUserRanks));
+
+  // Free transport proxy resources
+  for (int r=0; r<nRanks; r++) {
+    struct ncclPeer* peer = channel->peers+r;
+    if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
+    if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
+  }
+
+  return ncclSuccess;
+}
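Editorial note: NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES) above generates the ncclParamBuffsize() accessor used by initChannel, and its value can be overridden at runtime through the NCCL_BUFFSIZE environment variable. A simplified sketch of that pattern (the real macro lives in param.h and additionally handles thread-safe caching; the default value shown is illustrative, not taken from this diff):

    #include <cstdint>
    #include <cstdlib>

    // Read "NCCL_<NAME>" once, fall back to the compiled-in default, cache it.
    static int64_t paramValue(const char* env, int64_t deflt) {
      static int64_t cache = -2; // sentinel meaning "not parsed yet" (assumption of this sketch)
      if (cache == -2) {
        const char* str = getenv(env);
        cache = str ? strtoll(str, nullptr, 0) : deflt;
      }
      return cache;
    }

    int64_t ncclParamBuffsizeSketch() {
      return paramValue("NCCL_BUFFSIZE", 1 << 22); // default shown here is illustrative
    }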
diff --git a/src/collectives/all_gather.cu b/src/collectives/all_gather.cu
index 8dec28e..db21dee 100644
--- a/src/collectives/all_gather.cu
+++ b/src/collectives/all_gather.cu
@@ -4,29 +4,15 @@
  * See LICENSE.txt for license information
  ************************************************************************/

-#include "core.h"
-#include "common_coll.h"
 #include "enqueue.h"
 #include "collectives.h"

-ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
-    NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes*comm->nRanks, 1));
-  }
-  return ncclSuccess;
-}
-
 NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
     ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
     ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
-  return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype,
-      ncclSum, 0, comm, stream);
+  struct ncclInfo info = { ncclCollAllGather, "AllGather",
+    sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
+    ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
 }
diff --git a/src/collectives/all_reduce.cu b/src/collectives/all_reduce.cu
index cc14083..1492c90 100644
--- a/src/collectives/all_reduce.cu
+++ b/src/collectives/all_reduce.cu
@@ -4,29 +4,15 @@
  * See LICENSE.txt for license information
  ************************************************************************/

-#include "core.h"
-#include "common_coll.h"
 #include "enqueue.h"
 #include "collectives.h"

-ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, nbytes, proxyPatternRing, comm));
-    NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, comm->nRanks));
-  }
-  return ncclSuccess;
-}
-
 NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
 ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
-  return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype,
-      op, 0, comm, stream);
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+  struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
+    sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
+    ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
 }
diff --git a/src/collectives/broadcast.cu b/src/collectives/broadcast.cu
index 91ce905..6a3d0a8 100644
--- a/src/collectives/broadcast.cu
+++ b/src/collectives/broadcast.cu
@@ -4,39 +4,23 @@
  * See LICENSE.txt for license information
  ************************************************************************/

-#include "core.h"
-#include "common_coll.h"
 #include "enqueue.h"
 #include "collectives.h"

-ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, nbytes, proxyPatternFrom(root), comm));
-    NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes, 1));
-  }
-
-  return ncclSuccess;
+NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream) {
+  struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
+    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
+    BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
 }
-
 /* Deprecated original "in place" function, similar to MPI */
 NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream) {
-  return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype,
-      ncclSum, root, comm, stream);
+  return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
 }
-NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream) {
-  return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype,
-      ncclSum, root, comm, stream);
-}
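Editorial note: with the per-collective *Func wrappers gone, every public entry point now just packs its arguments into an ncclInfo and defers to ncclEnqueueCheck, and the deprecated ncclBcast is a thin in-place call into ncclBroadcast. Caller-side, the equivalence looks like this (a usage sketch against the public API; error handling elided):

    #include <nccl.h>
    #include <cuda_runtime.h>

    // In-place broadcast: passing the same pointer for send and recv is the
    // documented equivalent of the legacy ncclBcast entry point.
    void bcastInPlace(float* buf, size_t count, int root,
                      ncclComm_t comm, cudaStream_t stream) {
      ncclBroadcast(buf, buf, count, ncclFloat, root, comm, stream);
    }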
diff --git a/src/collectives/collectives.h b/src/collectives/collectives.h
index 4a5cb7a..e6b19cb 100644
--- a/src/collectives/collectives.h
+++ b/src/collectives/collectives.h
@@ -7,9 +7,7 @@
 #ifndef NCCL_COLLECTIVES_H_
 #define NCCL_COLLECTIVES_H_

-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
-
-#define FUNC_INDEX(coll, redop, dtype, ll) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
+#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll))

 #define NCCL_COLL_NAME(coll, op, dtype) \
   coll##_##op##_##dtype
@@ -18,13 +16,17 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
   coll##Kernel_##op##_##dtype

 /* Declare all collective operations */
-#define DECL_COLL4(coll, op, dtype) \
+#define DECL_COLL5(coll, op, dtype) \
   extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
-  extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll); \
+  extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \
+
+#define DECL_COLL4(coll, op, dtype) \
+  DECL_COLL5(coll, op, dtype) \
+  DECL_COLL5(coll##LL, op, dtype)

 #define DECL_COLL3(coll, op, dtype) \
-  DECL_COLL4(coll##LL, op, dtype) \
-  DECL_COLL4(coll, op, dtype)
+  DECL_COLL4(coll##Ring, op, dtype) \
+  DECL_COLL4(coll##Tree, op, dtype)

 #define DECL_COLL2(coll, op) \
   DECL_COLL3(coll, op, i8) \
@@ -52,15 +54,16 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed

 DECL_ALL_COLLS

-#define ALLREDUCE_SUBSTEPS 2
-#define ALLREDUCE_BUFCHUNKS 2
-#define ALLGATHER_SUBSTEPS 2
-#define ALLGATHER_BUFCHUNKS 2
-#define REDUCESCATTER_SUBSTEPS 2
-#define REDUCESCATTER_BUFCHUNKS 2
-#define BROADCAST_SUBSTEPS 8
-#define BROADCAST_BUFCHUNKS 2
-#define REDUCE_SUBSTEPS 8
-#define REDUCE_BUFCHUNKS 2
+// CHUNKSIZE must be a multiple of SLICESIZE
+#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
+#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
+#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
+#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
+#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
+#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
+#define BROADCAST_SLICESTEPS 1
+#define BROADCAST_CHUNKSTEPS 1
+#define REDUCE_SLICESTEPS 1
+#define REDUCE_CHUNKSTEPS 1

 #endif
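Editorial note: the new `al` argument to FUNC_INDEX doubles the kernel table, so each (collective, op, datatype) now selects among four variants: Ring or Tree algorithm, LL or regular protocol. A worked check of the index layout, assuming ncclNumOps == 4 and ncclNumTypes == 9 as in this release (the function below mirrors the macro, it is not NCCL code):

    // Mirror of FUNC_INDEX(coll, redop, dtype, ll, al) for a worked example.
    constexpr int numOps = 4, numTypes = 9;
    constexpr int funcIndex(int coll, int op, int type, int ll, int al) {
      return ((((coll * numOps + op) * numTypes + type) * 2 + al) * 2 + ll);
    }

    // ll is the fastest-varying bit; al strides by 2; dtype strides by 4.
    static_assert(funcIndex(0, 0, 0, 1, 0) - funcIndex(0, 0, 0, 0, 0) == 1, "ll adjacent");
    static_assert(funcIndex(0, 0, 0, 0, 1) - funcIndex(0, 0, 0, 0, 0) == 2, "al stride 2");
    static_assert(funcIndex(0, 0, 1, 0, 0) - funcIndex(0, 0, 0, 0, 0) == 4, "dtype stride 4");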
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
index e2bcd49..8e92596 100644
--- a/src/collectives/device/Makefile
+++ b/src/collectives/device/Makefile
@@ -12,18 +12,13 @@ OBJDIR := $(BUILDDIR)/obj/collectives/device

 LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu

-LIBOBJ     := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \
-              $(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \
-              $(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \
-              $(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \
-              $(OBJDIR)/functions.o
-
 LIBSRCFILES += functions.cu

 DEPFILES   := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
-DEPENDFILES := $(DEPFILES:%.d=%.dep)
+DEPENDFILES:= $(DEPFILES:%.d=%.dep)
 STATICLIB  := $(OBJDIR)/colldevice.a
 DEVOBJ     := $(OBJDIR)/devlink.o
+RULESFILE  := $(OBJDIR)/Makefile.rules

 NVCUFLAGS  += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden"

@@ -33,6 +28,16 @@ all: $(STATICLIB)
 # Dummy rule so that the extra dependency (%.dep) files are preserved by make
 all_deps: $(DEPENDFILES)

+# Auto-generating the rules per op/reduction/datatype/algorithm
+$(RULESFILE) :
+	@printf "Generating %-35s > %s\n" rules $@
+	@mkdir -p $(OBJDIR)
+	@./gen_rules.sh $(OBJDIR) > $@
+
+-include $(RULESFILE)
+
+LIBOBJ     := $(GENOBJS) $(OBJDIR)/functions.o
+
 -include $(DEPFILES)

 $(STATICLIB): $(LIBOBJ) $(DEVOBJ)
@@ -58,26 +63,6 @@ $(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
 	mkdir -p `dirname $@`
 	$(NVCC) $(NVCUFLAGS) -dc $< -o $@

-$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@
-
 # ... and create the device-side linked object with all those.
 $(DEVOBJ) : $(LIBOBJ)
 	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu
index 0f572ce..530bf14 100644
--- a/src/collectives/device/all_gather.cu
+++ b/src/collectives/device/all_gather.cu
@@ -4,12 +4,8 @@
  * See LICENSE.txt for license information
  ************************************************************************/

-#include "common.h"
 #include "all_gather.h"
+#include "common.h"
 #include "collectives.h"

-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
-#endif
+IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index a30e575..36809c9 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -8,72 +8,35 @@
 #include "primitives.h"
 #include "collectives.h"

-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  __shared__ T* sharedNextOutput;
   struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  int prevdirect = ring->recv.conn.direct;
-  int nextdirect = ring->send.conn.direct;
-
-  WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
-  PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
-
-  typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
-
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
   const int nranks = comm->nRanks;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    *ring->recv.conn.opCount = args->opCount;
-    // Wait for next to be ready
-    WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-    waitOpCountNext.wait(args->opCount);
-    if (prevdirect) {
-      *ring->recv.conn.ptrExchange = args->ThisOutput;
-    }
-    if (nextdirect) {
-      void* volatile* ptr = &(ring->devMemSend->ptrExchange);
-      while (*ptr == nullptr);
-      sharedNextOutput = (T*)*ptr;
-      *ptr = nullptr;
-    }
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int poffset, noffset = 0;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;

   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t chunkOffset = gridOffset + bid*realChunkSize;

     /////////////// begin AllGather steps ///////////////
     ssize_t offset;
-    int maxOffset = min(chunkSize, size-chunkOffset);
+    int nelem = min(realChunkSize, size-chunkOffset);
    int rankDest;

     // step 0: push data to next GPU
     rankDest = ring->devUserRanks[0];
     offset = chunkOffset + rankDest * size;

     if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      Prims::Copy(tid, nthreads,
-          thisInput + chunkOffset,
-          nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext,
-          postReadyToNext);
+      prims.directSend(thisInput+chunkOffset, offset, nelem);
     } else {
-      Prims::DoubleCopy(tid, nthreads,
-          thisInput + chunkOffset,
-          thisOutput + offset,
-          nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext,
-          postReadyToNext);
+      prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
     }
-    NEXT_STEP; // Increases step, poffset, noffset
-
     // k-2 steps: copy to next GPU
-    if (prevdirect) {
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ring->devUserRanks[nranks-j];
-        offset = chunkOffset + rankDest * size;
-
-        Prims::Copy(tid, nthreads,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-      Prims::Copy(tid, nthreads,
-          NULL,
-          NULL,
-          0, 0,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
-    } else {
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ring->devUserRanks[nranks-j];
-        offset = chunkOffset + rankDest * size;
-
-        Prims::DoubleCopy(tid, nthreads,
-            prevInput + poffset,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-
-      // Make final copy from buffer to dest.
-      rankDest = ring->devUserRanks[1];
+    for (int j=1; j<nranks-1; ++j) {
+      rankDest = ring->devUserRanks[nranks-j];
       offset = chunkOffset + rankDest * size;

-      // Here we need to copy from buffer to this output.
-      Prims::Copy(tid, nthreads,
-          prevInput + poffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
+      prims.directRecvCopySend(thisOutput+offset, offset, nelem);
     }
-  }

-  if (tid == 0) {
-    waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS));
-    *ring->send.conn.head = 0ULL;
-    *ring->recv.conn.tail = 0ULL;
-    __threadfence_system();
-    *ring->recv.conn.opCount = args->opCount+1;
+    // Make final copy from buffer to dest.
+    rankDest = ring->devUserRanks[1];
+    offset = chunkOffset + rankDest * size;
+
+    // Final wait/copy.
+    prims.directRecv(thisOutput+offset, offset, nelem);
   }
 }

-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
-  poffset = noffset; \
-  pflag = nflag; \
-  noffset += NCCL_LL_SLICE_LINES; \
-  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
-  nflag++; \
-  step++;
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }

 template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
-  const int llNthreads = args->nThreads;
+  const int nthreads = args->nThreads;
   struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
-  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
-  volatile int * sizesFifo = ring->send.conn.llFifo;
-  uint64_t sendHead = sendHeadPtr[0];
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;

-  typedef LLPrimitives<T, FUNC> LL;
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);

   const ssize_t size = args->N;
   //const int rank = comm->rank;
   const int nranks = comm->nRanks;
   ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nRings*chunkSize;
-
-  uint64_t step = ring->send.conn.llStep;
-  uint32_t pflag, nflag = step + 1;
-  int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+  const ssize_t loopSize = args->nChannels*chunkSize;

   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
-  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     if (size-gridOffset < loopSize) {
@@ -213,57 +98,34 @@ __device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {

     /////////////// begin AllGather steps ///////////////
     ssize_t offset;
-    int maxOffset = min(chunkSize, size-chunkOffset);
+    int nelem = min(chunkSize, size-chunkOffset);
     int rankDest;

     // step 0: push data to next GPU
     rankDest = ring->devUserRanks[0];
     offset = chunkOffset + rankDest * size;

-    WAIT_NEXT;
     if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      LL::ReduceCopy(
-          thisInput + chunkOffset,
-          nextOutput + noffset,
-          maxOffset, nflag, llNthreads);
+      LLprims.send(thisInput+chunkOffset, nelem);
     } else {
-      LL::ReduceCopy(
-          thisInput + chunkOffset,
-          thisOutput + offset,
-          nextOutput + noffset,
-          maxOffset, nflag, llNthreads);
+      LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
     }
-    POST_SIZE;
-
-    NEXT_STEP_LL;

     // k-2 steps: copy to next GPU
     for (int j=1; j<nranks-1; ++j) {
       rankDest = ring->devUserRanks[nranks-j];
       offset = chunkOffset + rankDest * size;

-      WAIT_NEXT;
-      LL::ReduceCopy(
-          prevInput + poffset,
-          thisOutput + offset,
-          nextOutput + noffset,
-          maxOffset, pflag, nflag, llNthreads);
-      POST_SIZE;
-      ACK_PREV;
-
-      NEXT_STEP_LL;
+      LLprims.recvCopySend(thisOutput+offset, nelem);
     }

     // step k-1: final store
     rankDest = ring->devUserRanks[1];
     offset = chunkOffset + rankDest * size;

-    LL::ReduceCopy(
-        prevInput + poffset,
-        thisOutput + offset,
-        maxOffset, pflag, llNthreads);
-    ACK_PREV;
+    LLprims.recv(thisOutput+offset, nelem);
   }
-
-  FIFO_CLEANING_AND_SAVE_STEP(nflag);
 }
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
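Editorial note: the rewritten kernel keeps the classic ring all-gather schedule, now phrased through ncclPrimitives calls: rank r injects its own chunk (directSend or directCopySend), forwards each received chunk for nranks-2 steps (directRecvCopySend), and stores the last one (directRecv). A host-side model of that data movement, assuming nothing about the real primitives:

    #include <cassert>
    #include <vector>

    // CPU model of the ring all-gather schedule: at step j, rank r receives the
    // chunk that originated at rank (r - 1 - j) mod n and forwards it onward.
    int main() {
      const int n = 4;
      std::vector<std::vector<int>> out(n, std::vector<int>(n, -1));
      for (int r = 0; r < n; r++) out[r][r] = 100 + r;  // each rank's own chunk
      for (int j = 0; j < n - 1; j++) {
        for (int r = 0; r < n; r++) {
          int src = ((r - 1 - j) % n + n) % n;  // chunk arriving from the left
          out[r][src] = 100 + src;              // recv (+ copy, + send onward)
        }
      }
      // After n-1 steps every rank holds all n chunks.
      for (int r = 0; r < n; r++)
        for (int c = 0; c < n; c++) assert(out[r][c] == 100 + c);
    }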
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
index caa1479..aaa96b4 100644
--- a/src/collectives/device/all_reduce.cu
+++ b/src/collectives/device/all_reduce.cu
@@ -4,18 +4,8 @@
  * See LICENSE.txt for license information
  ************************************************************************/

-#include "common.h"
 #include "all_reduce.h"
+#include "common.h"
 #include "collectives.h"

-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum);
-#elif NCCL_OP == 1
-IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
-#elif NCCL_OP == 2
-IMPL_COLL2(ncclAllReduce, min, FuncMin, ncclCollAllReduce, ncclMin);
-#elif NCCL_OP == 3
-IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax);
-#endif
+IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index d7abc64..ea89a71 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -8,233 +8,152 @@
 #include "primitives.h"
 #include "collectives.h"

-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  __shared__ T* sharedNextOutput;
   struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  int prevdirect = ring->recv.conn.direct;
-  int nextdirect = ring->send.conn.direct;
-
-  WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
-  PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
-
-  typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
-
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
-  //const int rank = comm->rank;
   const int nranks = comm->nRanks;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    *ring->recv.conn.opCount = args->opCount;
-    // Wait for next to be ready
-    WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-    waitOpCountNext.wait(args->opCount);
-    if (prevdirect) {
-      *ring->recv.conn.ptrExchange = args->ThisOutput;
-    }
-    if (nextdirect) {
-      void* volatile* ptr = &(ring->devMemSend->ptrExchange);
-      while (*ptr == nullptr);
-      sharedNextOutput = (T*)*ptr;
-      *ptr = nullptr;
-    }
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int poffset, noffset = 0;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;

   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;

     /////////////// begin AllReduce steps ///////////////
     ssize_t offset;
-    int maxOffset;
+    int nelem;
     int slice;

     // step 0: push data to next GPU
     slice = ring->devUserRanks[nranks-1];
-    offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    offset = chunkOffset + slice * realChunkSize;
+    nelem = min(realChunkSize, size-offset);

-    Prims::Copy(tid, nthreads,
-        thisInput + offset,
-        nextOutput + noffset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext,
-        postReadyToNext);
-
-    NEXT_STEP; // Increases step, poffset, noffset
+    prims.send(thisInput+offset, nelem);

     // k-2 steps: reduce and copy to next GPU
     for (int j=2; j<nranks; ++j) {
       slice = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
-
-      Prims::Reduce(tid, nthreads,
-          prevInput + poffset,
-          thisInput + offset,
-          nextOutput + noffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-
-      NEXT_STEP;
+      offset = chunkOffset + slice * realChunkSize;
+      nelem = min(realChunkSize, size-offset);
+
+      prims.recvReduceSend(thisInput+offset, nelem);
     }

     // step k-1: reduce this buffer and data, which will produce the final
     // result that we store in this data and push to the next GPU
     slice = ring->devUserRanks[0];
-    offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    offset = chunkOffset + slice * realChunkSize;
+    nelem = min(realChunkSize, size-offset);

-    Prims::ReduceCopy(tid, nthreads,
-        prevInput + poffset,
-        thisInput + offset,
-        nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-        thisOutput + offset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext, waitReadyFromPrev,
-        postReadyToNext, postDoneToPrev);
-
-    NEXT_STEP;
+    prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);

     // k-2 steps: copy to next GPU
-    if (prevdirect) {
-      for (int j=1; j<nranks-1; ++j) {
-        slice = ring->devUserRanks[nranks - j];
-        offset = chunkOffset + slice * chunkSize;
-        maxOffset = min(chunkSize, size-offset);
-
-        Prims::Copy(tid, nthreads,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-      Prims::Copy(tid, nthreads,
-          NULL,
-          NULL,
-          0, 0,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
-    } else {
-      for (int j=1; j<nranks-1; ++j) {
-        slice = ring->devUserRanks[nranks - j];
-        offset = chunkOffset + slice * chunkSize;
-        maxOffset = min(chunkSize, size-offset);
-
-        Prims::DoubleCopy(tid, nthreads,
-            prevInput + poffset,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
+    for (int j=1; j<nranks-1; ++j) {
+      slice = ring->devUserRanks[nranks-j];
+      offset = chunkOffset + slice * realChunkSize;
+      nelem = min(realChunkSize, size-offset);

-      // Make final copy from buffer to dest.
-      slice = ring->devUserRanks[1];
-      offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
-
-      // Here we need to copy from buffer to this output.
-      Prims::Copy(tid, nthreads,
-          prevInput + poffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
+      prims.directRecvCopySend(thisOutput+offset, offset, nelem);
     }
-  }

-  if (tid == 0) {
-    // Wait for next to have consumed all data before we reset the flag
-    waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS));
-    *ring->send.conn.head = 0ULL;
-    *ring->recv.conn.tail = 0ULL;
-    __threadfence_system();
-    *ring->recv.conn.opCount = args->opCount+1;
+    // Make final copy from buffer to dest.
+    slice = ring->devUserRanks[1];
+    offset = chunkOffset + slice * realChunkSize;
+    nelem = min(realChunkSize, size-offset);
+
+    // Final wait/copy.
+    prims.directRecv(thisOutput+offset, offset, nelem);
   }
 }

-#include "ll_kernel.h"
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int nthreads = blockDim.x - 1;
+  const int bid = args->bid;
+  struct ncclComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclTree* tree = &channel->tree;
+  const ssize_t size = args->N;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = args->lastChunkSize;
+  const ssize_t loopSize = args->nChannels*chunkSize;

-#define NEXT_STEP_LL \
-  poffset = noffset; \
-  pflag = nflag; \
-  noffset += NCCL_LL_SLICE_LINES; \
-  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
-  nflag++; \
-  step++;
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  do {
+    // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+    ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Up
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      if (tree->up == -1) {
+        prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) {
+        prims.send(thisInput+offset, nelem);
+      } else {
+        prims.recvReduceSend(thisInput+offset, nelem);
+      }
+    }
+  } while(0);
+
+  do {
+    // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+    ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Down
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      if (tree->up == -1) {
+        prims.send(thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) {
+        prims.recv(thisOutput+offset, nelem);
+      } else {
+        prims.recvCopySend(thisOutput+offset, nelem);
+      }
+    }
+  } while(0);
+}
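Editorial note: ncclAllReduceTreeKernel splits allreduce into a reduce toward the tree root (leaves send, interior nodes recvReduceSend, the root recvReduceCopy) followed by a broadcast back down the same edges. A scalar host model of one chunk on an arbitrary example tree (the topology below is illustrative only):

    #include <cassert>
    #include <vector>

    // Tree: rank 0 is the root; 1 and 2 are children of 0; 3 is a child of 1.
    int main() {
      std::vector<int> up  = {-1, 0, 0, 1};  // parent of each rank
      std::vector<int> sum = {1, 2, 3, 4};   // local contributions
      // Reduce phase: children before parents (parent index < child index here).
      for (int r = 3; r >= 1; r--) sum[up[r]] += sum[r];
      // Broadcast phase: the root's total flows back down the same edges.
      std::vector<int> res(4);
      res[0] = sum[0];
      for (int r = 1; r < 4; r++) res[r] = res[up[r]];
      for (int r = 0; r < 4; r++) assert(res[r] == 1 + 2 + 3 + 4);
    }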
args->nChannels*nranks*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -244,89 +163,99 @@ __device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) { /////////////// begin AllReduce steps /////////////// ssize_t offset; - int maxOffset; + int nelem; int slice; // step 0: push data to next GPU slice = ring->devUserRanks[nranks-1]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - nextOutput + noffset, - maxOffset, nflag, llNthreads); - POST_SIZE; - - NEXT_STEP_LL; + LLprims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; j<nranks; ++j) { slice = ring->devUserRanks[nranks-j]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); - - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + nelem = min(chunkSize, size-offset); + + LLprims.recvReduceSend(thisInput+offset, nelem); } // step k-1: reduce this buffer and data, which will produce the final // result that we store in this data and push to the next GPU slice = ring->devUserRanks[0]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - thisOutput + offset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem); // k-2 steps: copy to next GPU for (int j=1; j<nranks-1; ++j) { - slice = ring->devUserRanks[nranks - j]; + slice = ring->devUserRanks[nranks-j]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); - - WAIT_NEXT; - LL::ReduceCopy( - prevInput + poffset, - thisOutput + offset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + nelem = min(chunkSize, size-offset); + + LLprims.recvCopySend(thisOutput+offset, nelem); } // Make final copy from buffer to dest. slice = ring->devUserRanks[1]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); // Here we need to copy from buffer to this output. 
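// Why chunkSize above is NCCL_LL_SLICE_LINES*sizeof(uint64_t)/sizeof(T):
// per storeLL below, each 16-byte ncclLLFifoLine interleaves two 4-byte
// data words with two 4-byte flags, i.e. 8 bytes of payload per line, so
// the wire size is twice the payload (hence the nbytes*2 passed to
// waitSend in LLGenericOp further down). A minimal sketch of the
// accounting, assuming an illustrative value for NCCL_LL_SLICE_LINES:
#include <cstdio>
#include <cstdint>

int main() {
  const int kSliceLines = 4096;                         // assumed, not NCCL's actual constant
  size_t payloadBytes = kSliceLines * sizeof(uint64_t); // user data per slice
  printf("chunk elements (float): %zu\n", payloadBytes / sizeof(float));
  printf("wire bytes per slice:   %d\n", kSliceLines * 16); // 2x the payload
  return 0;
}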
- LL::ReduceCopy( - prevInput + poffset, - thisOutput + offset, - maxOffset, pflag, llNthreads); - ACK_PREV; + LLprims.recv(thisOutput+offset, nelem); } +} - FIFO_CLEANING_AND_SAVE_STEP(nflag); +template<int UNUSED, class FUNC, typename T> +__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int nthreads = args->nThreads; + const int bid = args->bid; + struct ncclComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclTree* tree = &channel->tree; + const ssize_t size = args->N; + ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); + const ssize_t loopSize = args->nChannels*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + do { + // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) + ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Up + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + LLprims.send(thisInput+offset, nelem); + } else { + LLprims.recvReduceSend(thisInput+offset, nelem); + } + } + } while(0); + + do { + // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) + ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Down + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + LLprims.send(thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + LLprims.recv(thisOutput+offset, nelem); + } else { + LLprims.recvCopySend(thisOutput+offset, nelem); + } + } + } while(0); } diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu index 4125de4..b83ee70 100644 --- a/src/collectives/device/broadcast.cu +++ b/src/collectives/device/broadcast.cu @@ -4,12 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "common.h" #include "broadcast.h" +#include "common.h" #include "collectives.h" -#define UNROLL 4 - -#if NCCL_OP == 0 -IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8); -#endif +IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast); diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index c2f6d00..fb18312 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -8,174 +8,74 @@ #include "primitives.h" #include "collectives.h" -// Increase Step and boffset for buffer sync -#define NEXT_STEP \ - step++; \ - boffset += sliceSize; \ - if (boffset == buffSize) boffset = 0; - template<int UNROLL, class FUNC, typename T> -__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) { +__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - __shared__ T* sharedNextOutput; struct ncclComm* comm = args->comm; - struct ncclRing* ring = 
comm->rings+blockIdx.x; - int prevdirect = ring->recv.conn.direct; - int nextdirect = ring->send.conn.direct; - - WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0); - PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS); - - typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims; - + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / BROADCAST_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; const int rank = ring->devUserRanks[0]; const int nextRank = ring->devUserRanks[1]; const int root = args->root; - if (tid == 0) { - // Update in case we skipped some collectives - *ring->recv.conn.opCount = args->opCount; - if (nextRank != root) { - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - } - if (rank != root && prevdirect) { - *ring->recv.conn.ptrExchange = args->ThisOutput; - } - if (nextRank != root && nextdirect) { - void* volatile* ptr = &(ring->devMemSend->ptrExchange); - while (*ptr == nullptr); - sharedNextOutput = (T*)*ptr; - *ptr = nullptr; - } - } - __syncthreads(); - - uint64_t step = 0ULL; - int boffset = 0; - // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC> + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t offset = gridOffset + bid*realChunkSize; + int nelem = min(realChunkSize, size-offset); if (rank == root) { if (thisInput == thisOutput) { - Prims::Copy(tid, nthreads, - thisInput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.send(thisInput+offset, nelem); } else { - Prims::DoubleCopy(tid, nthreads, - thisInput + offset, - thisOutput + offset, - nextdirect ? 
(sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.copySend(thisInput+offset, thisOutput+offset, nelem); } } else if (nextRank == root) { - if (prevdirect) maxOffset = 0; // Only wait for signals - Prims::Copy(tid, nthreads, - prevInput + boffset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); + prims.recv(thisOutput+offset, nelem); } else { - if (prevdirect) { - Prims::Copy(tid, nthreads, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } else { - Prims::DoubleCopy(tid, nthreads, - prevInput + boffset, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } - } - NEXT_STEP; // Increases step, boffset - } - - if (tid == 0) { - if (nextRank != root) { - // Wait for next to have consumed data before resetting the flag - waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1)); - *ring->send.conn.head = 0ULL; + prims.recvCopySend(thisOutput+offset, nelem); } - *ring->recv.conn.tail = 0ULL; - __threadfence_system(); - *ring->recv.conn.opCount = args->opCount+1; } } -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - boffset += NCCL_LL_SLICE_LINES; \ - if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \ - flag++; \ - step++; +template<int UNROLL, class FUNC, typename T> +__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { } template<int UNUSED, class FUNC, typename T> -__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) { +__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; + const int nthreads = args->nThreads; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; - const int rank = comm->rank; - const int nextRank = ring->devUserRanks[1]; - const int root = args->root; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; - typedef LLPrimitives<T, FUNC> LL; + ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); const ssize_t size = args->N; - ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; + const int rank = ring->devUserRanks[0]; + const int nextRank = ring->devUserRanks[1]; + const int root = args->root; - uint64_t step = ring->send.conn.llStep; - uint32_t flag = step + 1; - int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; 
gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -183,46 +83,20 @@ __device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) { } ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int nelem = min(chunkSize, size-offset); if (rank == root) { - WAIT_NEXT; if (thisInput == thisOutput) { - LL::ReduceCopy( - thisInput + offset, - nextOutput + boffset, - maxOffset, flag, llNthreads); + LLprims.send(thisInput+offset, nelem); } else { - LL::ReduceCopy( - thisInput + offset, - thisOutput + offset, - nextOutput + boffset, - maxOffset, flag, llNthreads); + LLprims.copySend(thisInput + offset, thisOutput + offset, nelem); } - POST_SIZE; - NEXT_STEP_LL; } else if (nextRank == root) { - LL::ReduceCopy( - prevInput + boffset, - thisOutput + offset, - maxOffset, flag, llNthreads); - NEXT_STEP_LL; - ACK_PREV; + LLprims.recv(thisOutput + offset, nelem); } else { - WAIT_NEXT; - LL::ReduceCopy( - prevInput + boffset, - thisOutput + offset, - nextOutput + boffset, - maxOffset, flag, flag, llNthreads); - POST_SIZE; - NEXT_STEP_LL; - ACK_PREV; + LLprims.recvCopySend(thisOutput + offset, nelem); } } - - // We need everyone to acknowledge data even if they didn't receive anything - // so that the next collective can start right away. - ACK_PREV; - - FIFO_CLEANING_AND_SAVE_STEP(flag); } + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index c988913..e4aecbd 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -11,13 +11,29 @@ #include "core.h" #include "nccl.h" +// Exit If Abort Barrier across CTA: make sure all threads exit consistently +// Each thread sets a predicate to true if abort == 1 +// all CTA's threads enter the barrier and do a popc on their predicates being True +// If any of the thread's predicate was True, all the threads call exit() +static inline __device__ void exitIfAbortBarrier(int abort) { + uint32_t popc; + asm ("{"); + asm volatile (" .reg .pred barr_pred;"); + asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort)); + asm volatile (" bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc)); + asm ("}"); + if (popc) { asm volatile ("exit;"); } +} + typedef void(*ncclKern_t)(struct CollectiveArgs* args); extern __device__ ncclKern_t ncclFuncs[]; static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) { int* d = (int*)dst; int* s = (int*)src; - __syncthreads(); + // When aggregation is effective, if some threads have aborted inside the LL kernel, + // make sure the rest of the threads abort as well + exitIfAbortBarrier(0); for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o]; __syncthreads(); } @@ -27,12 +43,14 @@ static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* ho } /* Functions for aggregation case */ -#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \ +#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \ - coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(args); \ + coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \ } + +#if NCCL_OP == 0 /* Kernels with the first operation inlined */ -#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \ +#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \ __launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \ 
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ int tid = threadIdx.x; \ @@ -40,25 +58,25 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ __shared__ struct ncclColl localColl; \ \ struct ncclComm* comm = firstColl.args.comm; \ - struct ncclRing* ring = comm->rings+bid; \ + struct ncclChannel* channel = comm->channels+bid; \ struct ncclColl* c; \ if (bid == 0) { \ /* To optimize for latency, (only) the first operation is passed as argument.*/ \ c = &firstColl; \ } else { \ c = &localColl; \ - load_coll(c, ring->devCollectives+ring->collFifoHead, tid); \ + load_coll(c, channel->devCollectives+channel->collFifoHead, tid); \ } \ while (1) { \ - if (tid < c->nThreads) { \ + if (tid < c->args.nThreads) { \ if (c->funcIndex == fIndex) { \ - coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(&c->args); \ + coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \ } else { \ ncclFuncs[c->funcIndex](&c->args); \ } \ } \ int nextIndex = c->nextIndex; \ - if (tid == 0) ring->collFifoHead = nextIndex; \ + if (tid == 0) channel->collFifoHead = nextIndex; \ \ if (c->active == 2) { \ return; \ @@ -66,25 +84,75 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ \ /* Load next collective operation*/ \ c = &localColl; /* for bid 0 */ \ - load_coll(c, ring->devCollectives+nextIndex, tid); \ + load_coll(c, channel->devCollectives+nextIndex, tid); \ } \ } +#else +#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) +#endif + +// Only generate inline kernels for LL +#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \ + IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ + IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \ + IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \ #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \ - IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \ - IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \ - IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \ - IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0)) \ + IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \ + IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1) +#if NCCL_TYPE == 0 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) +#elif NCCL_TYPE == 1 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) +#elif NCCL_TYPE == 2 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) +#elif NCCL_TYPE == 3 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) +#elif NCCL_TYPE == 4 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) +#elif NCCL_TYPE == 5 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) +#elif NCCL_TYPE == 6 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) +#elif NCCL_TYPE == 7 +#define 
IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) +#elif NCCL_TYPE == 8 #define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ - IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \ - IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) \ - IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) \ - IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \ - IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) \ - IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \ - IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) \ - IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \ IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64) +#endif + +// Reduction define all functions +#if NCCL_OP == 0 +#define IMPL_COLL_R(collf, colln) \ + IMPL_COLL2(collf, sum, FuncSum, colln, ncclSum); +#elif NCCL_OP == 1 +#define IMPL_COLL_R(collf, colln) \ + IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd); +#elif NCCL_OP == 2 +#define IMPL_COLL_R(collf, colln) \ + IMPL_COLL2(collf, min, FuncMin, colln, ncclMin); +#elif NCCL_OP == 3 +#define IMPL_COLL_R(collf, colln) \ + IMPL_COLL2(collf, max, FuncMax, colln, ncclMax); +#endif + +// Copy primitives only define one +#if NCCL_OP == 0 && NCCL_TYPE == 0 +#define IMPL_COLL_C(collf, colln) \ + IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8); +#else +#define IMPL_COLL_C(collf, colln) +#endif + +#define COLL_UNROLL 4 #endif diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index 0eaa061..e1fb096 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -192,14 +192,6 @@ struct MULTI<FUNC, int64_t> { } }; -#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a)) - -template<typename T> -__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) { - size_t ptrval = reinterpret_cast<size_t>(ptr); - return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align)); -} - template<typename T> inline __device__ T vFetch(const volatile T* ptr) { return *ptr; @@ -236,25 +228,6 @@ void vStore<half>(volatile half* ptr, const half val) { } #endif -template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS> -__device__ inline void ReduceCopy( - const int tid, const int nthreads, - const volatile T * __restrict__ const src0, - const volatile T * __restrict__ const src1, - volatile T * __restrict__ const dest0, - volatile T * __restrict__ const dest1, const int N) { - for (int idx = tid; idx < N; idx += nthreads) { - T val = vFetch(src0+idx); - if (TWO_INPUTS) { - val = FUNC()(val, vFetch(src1+idx)); - } - vStore(dest0+idx, val); - if (TWO_OUTPUTS) { - vStore(dest1+idx, val); - } - } -} - typedef ulong2 Pack128; template<class FUNC, typename T> @@ -265,72 +238,111 @@ struct MULTI128 { } }; -inline __device__ void Fetch128(Pack128& v, Pack128* p) { +inline __device__ void Fetch128(Pack128& v, const Pack128* p) { asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory"); } inline __device__ void Store128(Pack128* p, Pack128& v) { asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory"); } +template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS> +__device__ __forceinline__ void ReduceCopyMulti(const 
int tid, const int nthreads, + int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS], + const int offset, const int N) { + for (int idx = offset+tid; idx < offset+N; idx += nthreads) { + T val = vFetch(srcs[0]+idx); + #pragma unroll + for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx)); + #pragma unroll 1 + for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx)); + + #pragma unroll + for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val); + #pragma unroll 1 + for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val); + } +} + #define WARP_SIZE 32 -template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL> -__device__ inline void ReduceCopy128b( const int w, const int nw, const int t, - Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1, - const int N) { - Pack128 t0[UNROLL]; - Pack128 t1[UNROLL]; - const Pack128* src0_end = src0 + N; + +template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS> +__device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t, + int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS], + const int elemOffset, const int Npack) { const int inc = nw * UNROLL * WARP_SIZE; - const int offset = w * UNROLL * WARP_SIZE + t; - src0 += offset; if (TWO_INPUTS) src1 += offset; - dest0 += offset; if (TWO_OUTPUTS) dest1 += offset; - - while (src0 < src0_end) { -#pragma unroll - for (int u = 0; u < UNROLL; ++u) { - Fetch128(t0[u], src0+u*WARP_SIZE); - if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE); + int offset = w * UNROLL * WARP_SIZE + t; + + const Pack128* srcs[MAXSRCS]; + for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset; + Pack128* dsts[MAXDSTS]; + for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset; + + while (offset < Npack) { + Pack128 vals[UNROLL]; + // Load and reduce + for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE); + + for (int i=1; i<MINSRCS; i++) { + Pack128 vals2[UNROLL]; + for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE); + for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]); } -#pragma unroll - for (int u = 0; u < UNROLL; ++u) { - if (TWO_INPUTS) MULTI128<FUNC, T>()(t0[u], t1[u]); - Store128(dest0+u*WARP_SIZE, t0[u]); - if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]); + #pragma unroll 1 + for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) { + Pack128 vals2[UNROLL]; + for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE); + for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]); } - src0 += inc; if (TWO_INPUTS) src1 += inc; - dest0 += inc; if (TWO_OUTPUTS) dest1 += inc; + + // Store + for (int i = 0; i < MINDSTS; i++) { + for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]); + } + #pragma unroll 1 + for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) { + for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]); + } + for (int i=0; i<MAXSRCS; i++) srcs[i] += inc; + for (int i=0; i<MAXDSTS; i++) dsts[i] += inc; + offset += inc; } } -template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1> -__device__ inline void ReduceOrCopy(const int tid, const int nthreads, - volatile T * __restrict__ dest0, volatile T * __restrict__ dest1, - const volatile T * __restrict__ src0, const volatile T * __restrict__ src1, +template <typename T> +__device__ int ptrAlign128(T* ptr) { return 
(uint64_t)ptr % alignof(Pack128); } + +// Try to limit consecutive load/stores to 8. +// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise +#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS))) + +template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS> +__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads, + int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS], int N) { int Nrem = N; if (Nrem <= 0) return; - int Npreamble = (Nrem<alignof(Pack128)) ? Nrem : AlignUp(dest0, alignof(Pack128)) - dest0; + int alignDiff = 0; + int align = ptrAlign128(srcs[0]); + #pragma unroll + for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i])); + for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i])); + #pragma unroll + for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i])); + for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i])); - // stage 0: check if we'll be able to use the fast, 128-bit aligned path. - // If not, we'll just use the slow preamble path for the whole operation - bool alignable = (((AlignUp(src0, alignof(Pack128)) == src0 + Npreamble)) && - (!HAS_DEST1 || (AlignUp(dest1, alignof(Pack128)) == dest1 + Npreamble)) && - (!HAS_SRC1 || (AlignUp(src1, alignof(Pack128)) == src1 + Npreamble))); - - if (!alignable) { - Npreamble = Nrem; - } + int Npreamble = alignDiff ? Nrem : + N < alignof(Pack128) ? N : + (alignof(Pack128) - align) % alignof(Pack128); // stage 1: preamble: handle any elements up to the point of everything coming // into alignment - ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Npreamble); - - Nrem -= Npreamble; - if (Nrem == 0) return; - - dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; } - src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; } + if (Npreamble) { + ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble); + Nrem -= Npreamble; + if (Nrem == 0) return; + } + int offset = Npreamble; // stage 2: fast path: use 128b loads/stores to do the bulk of the work, // assuming the pointers we have are all 128-bit alignable. 
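For intuition, here is a minimal sketch of how this staged split plays out for a concrete element count (plain C++; the values of N, the alignment preamble, and AUTOUNROLL are assumptions local to the example): the misaligned preamble is handled element-wise by ReduceCopyMulti, the bulk goes through the unrolled 128-bit path in whole AUTOUNROLL*WARP_SIZE groups of Pack128 loads, a second 128-bit pass without unrolling consumes the remaining full packs, and the scalar tail finishes whatever is left.

#include <cstdio>
#include <cstdint>

int main() {
  const int N = 100000;             // total elements (example value)
  const int packFactor = 16 / 4;    // sizeof(Pack128) / sizeof(float)
  const int WARP_SIZE = 32;
  const int AUTOUNROLL = 4;         // assumed; real value depends on MINSRCS+MINDSTS

  int Nrem = N;
  int Npreamble = 8;                // depends on pointer alignment; assumed here
  Nrem -= Npreamble;

  // stage 2a: round down to whole AUTOUNROLL*WARP_SIZE groups of packs
  int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
              * (AUTOUNROLL * WARP_SIZE);
  Nrem -= Npack2a * packFactor;

  // stage 2b: remaining full packs, no unrolling
  int Npack2b = Nrem / packFactor;
  Nrem -= Npack2b * packFactor;

  // stage 2c: scalar tail
  printf("preamble=%d elems, 2a=%d packs, 2b=%d packs, tail=%d elems\n",
         Npreamble, Npack2a, Npack2b, Nrem);
  return 0;
}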
@@ -338,35 +350,33 @@ __device__ inline void ReduceOrCopy(const int tid, const int nthreads, int nw = nthreads / WARP_SIZE; // Number of warps int t = tid % WARP_SIZE; // Thread (inside the warp) - const int PackFactor = sizeof(Pack128) / sizeof(T); + const int packFactor = sizeof(Pack128) / sizeof(T); // stage 2a: main loop - int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads)) - * (UNROLL * nthreads); // round down + int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE)) + * (AUTOUNROLL * WARP_SIZE); // round down + int Nelem2a = Npack2a * packFactor; - ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a); + ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a); - int Ndone2a = Nalign2a * PackFactor; - Nrem -= Ndone2a; + Nrem -= Nelem2a; if (Nrem == 0) return; - dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; } - src0 += Ndone2a; if (HAS_SRC1) { src1 += Ndone2a; } + offset += Nelem2a; // stage 2b: slightly less optimized for section when we don't have full - // UNROLLs + // unrolling - int Nalign2b = Nrem / PackFactor; + int Npack2b = Nrem / packFactor; + int Nelem2b = Npack2b * packFactor; - ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, 1>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b); + ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b); - int Ndone2b = Nalign2b * PackFactor; - Nrem -= Ndone2b; + Nrem -= Nelem2b; if (Nrem == 0) return; - dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; } - src0 += Ndone2b; if (HAS_SRC1) { src1 += Ndone2b; } + offset += Nelem2b; // stage 2c: tail - ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Nrem); + ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem); } #endif // COMMON_KERNEL_H_ diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index 1fb8108..ea06b68 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -8,9 +8,13 @@ #include "collectives.h" #include "common.h" -#define NCCL_FUNC4(coll, op, dtype) \ +#define NCCL_FUNC5(coll, op, dtype) \ NCCL_COLL_NAME(coll, op, dtype), \ - NCCL_COLL_NAME(coll##LL, op, dtype) \ + NCCL_COLL_NAME(coll##LL, op, dtype) + +#define NCCL_FUNC4(coll, op, dtype) \ + NCCL_FUNC5(coll##Ring, op, dtype), \ + NCCL_FUNC5(coll##Tree, op, dtype) // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(coll, op) \ @@ -55,7 +59,7 @@ NCCL_FUNCS2A(ncclAllReduce) } // Must be consistent with the ncclFuncSet enum -__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2] = { +__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = { // Don't try to initialize the host shadow copy of this device-side global // variable. There is no host pointer to a device-side function, which // confuses clang. This will be fixed in the next clang release. diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh new file mode 100755 index 0000000..3942c8c --- /dev/null +++ b/src/collectives/device/gen_rules.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +dir=$1 + +targets="GENOBJS := \\\\\n" + +for base in all_reduce all_gather broadcast reduce reduce_scatter; do + opn=0 + for op in sum prod min max; do + dtn=0 + for dt in i8 u8 i32 u32 i64 u64 f16 f32 f64; do + echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep" + echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o" + echo " mkdir -p ${dir}" + echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o" + echo "" + targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n" + dtn=$(($dtn + 1)) + done + opn=$(($opn + 1)) + done +done +echo -e "$targets" diff --git a/src/collectives/device/ll_kernel.h b/src/collectives/device/ll_kernel.h deleted file mode 100644 index 5ec3c9a..0000000 --- a/src/collectives/device/ll_kernel.h +++ /dev/null @@ -1,154 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_LL_KERNEL_H_ -#define NCCL_LL_KERNEL_H_ - -static __device__ uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) { - uint32_t data1, flag1, data2, flag2; - do { - asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); - } while ((flag1 != flag) || (flag2 != flag)); - uint64_t val64 = data1 + (((uint64_t)data2) << 32); - return val64; -} - -static __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { - asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); -} - -// Using memcpy handles misaligned pointers. -static __device__ uint64_t readAL(uint64_t* src) { - uint64_t val; - memcpy((char*)&val, (char*)src, sizeof(uint64_t)); - return val; -} -static __device__ void storeAL(uint64_t* dst, uint64_t val) { - memcpy((char*)dst, (char*)&val, sizeof(uint64_t)); -} - -template <typename T, class FUNC> -class LLPrimitives { - private: - template <int HAS_SRC1, int HAS_SRC2, int HAS_DST1, int HAS_DST2> - static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - if (size <= 0) return; - size_t size64 = size * sizeof(T) / sizeof(uint64_t); - uint64_t* src1A = (uint64_t*)src1; - uint64_t* dst1A = (uint64_t*)dst1; - int offset = threadIdx.x; - // Do multiples of 64 bits -#pragma unroll 1 - for (; offset < size64; offset += nthreads) { - uint64_t val; - if (HAS_SRC1) { - val = readAL(src1A+offset); - if (HAS_SRC2) val = MULTI<FUNC, T>()(readLL(src2+offset, iflag), val); - } else if (HAS_SRC2) { - val = readLL(src2+offset, iflag); - } - if (HAS_DST1) storeAL(dst1A+offset, val); - if (HAS_DST2) storeLL(dst2+offset, val, oflag); - } - // Finish last word - int sizeDone = size64*(sizeof(uint64_t)/sizeof(T)); - int sizeRem = size - sizeDone; - if (threadIdx.x == 0 && sizeRem) { - const T* src1B = src1 + sizeDone; - T* dst1B = dst1 + sizeDone; - - uint64_t lastVal; - T* vals = (T*)&lastVal; - - if (HAS_SRC2) { - uint64_t lastVal2 = readLL(src2+size64, iflag); - T* src2B = (T*)&lastVal2; - for (int offset = 0; offset < sizeRem; offset++) { - vals[offset] = HAS_SRC1 ? 
FUNC()(src2B[offset], src1B[offset]) : src2B[offset]; - } - } else if (HAS_SRC1) { - for (int offset = 0; offset < sizeRem; offset++) { - vals[offset] = src1B[offset]; - } - } - if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag); - if (HAS_DST1) { - for (int offset = 0; offset < sizeRem; offset++) { - dst1B[offset] = vals[offset]; - } - } - } - } - public: - static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads); - } - - static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) { - return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads); - } - - static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads); - } - - static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) { - return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads); - } - - static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads); - } - - static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads); - } - - static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, oflag, nthreads); - } -}; - -// Common macros - -#define STEP_TO_SLOT(step) \ - (step % NCCL_LL_CHUNKS) - -#define WAIT_NEXT \ - if (tid == 0) { \ - while (sendHead + NCCL_LL_CHUNKS <= step) { \ - sendHead = sendHeadPtr[0]; \ - } \ - } \ - asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)); - -#define POST_SIZE \ - if (tid == 0 && sizesFifo) sizesFifo[step % NCCL_LL_CHUNKS] = (maxOffset <= 0) ? 
-1 : (maxOffset*2*(int)sizeof(T)); - -#define ACK_PREV \ - asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)); \ - if (tid == 0) recvHeadPtr[0] = step; - -#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \ - if (step > ring->send.conn.llLastCleaning + NCCL_LL_CLEAN_FREQ) { \ - /* Reset all flags */ \ - static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \ - static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \ - const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \ - for (int i=0; i<NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*llNthreads); i++) { \ - prevInput[tid+i*llNthreads].i4 = resetLine.i4; \ - } \ - __threadfence_system(); \ - /* Restart from the same slot, only make sure sender waits for data to be reset */ \ - step += NCCL_LL_CHUNKS; \ - ACK_PREV; \ - while (sendHeadPtr[0] < step); \ - if (tid == 0) ring->send.conn.llLastCleaning = step; \ - } \ - ring->send.conn.llStep = step; \ -} while (0); - -#endif diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index e2baa4b..c5aaf54 100644 --- a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -9,218 +9,579 @@ #include <type_traits> #include "reduce_kernel.h" // for reduction funcs +#include "common.h" + +#define SPINS_BEFORE_CHECK_ABORT 1000000 + +// Unroll unconditionally the first send/recv since nsend/nrecv should be at +// least 1 if SEND/RECV is set. +#define FOR_SEND(func, ...) do { \ + if (SEND) { \ + /* Send to far first, then close */ \ + for (int i=1; i<NSEND && i<nsend; i++) func(i, ##__VA_ARGS__); \ + func(0, ##__VA_ARGS__); \ + } \ +} while (0) + +#define FOR_RECV(func, ...) do { \ + if (RECV) { \ + /* Recv from close first, then far */ \ + func(0, ##__VA_ARGS__); \ + for (int i=1; i<NRECV && i<nrecv; i++) func(i, ##__VA_ARGS__); \ + } \ +} while (0) +// Implementation of primitive types +template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, class FUNC> +class ncclPrimitives { + private: + const int tid; + const int nthreads; + int nrecv = 0; + int nsend = 0; + const int stepSize; + struct ncclConnInfo* recvConn[NRECV]; + struct ncclConnInfo* sendConn[NSEND]; + volatile uint64_t* waitPtr; + uint64_t recvStep[NRECV]; + uint64_t sendStep[NSEND]; + uint64_t sendConnHead[NSEND]; + const T* recvDirectBuff[NRECV]; + T* sendDirectBuff[NSEND]; + const T* recvBuff[NRECV]; + T* sendBuff[NSEND]; + struct ncclComm* comm; + + inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } + inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } + inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); } + inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); } + + inline __device__ void barrier() { + asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); + } -/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy. 
- * - * In order to reduce the reptetion of template arguments, the operations - * are bundled as static methods of the Primitives class. - * - * Each primitive operation copies/reduces a contiguous buffer and syncs - * an optional set of flags against a sub-step counter. The sync value is - * based on the step parameter. Sync flags must be of type WaitFlag or - * PostFlag. The primitive routines wait for all WaitFlag args to attain - * at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of - * corresponding substep by previous step) before executing the transfer. - * After each substep is transfered, all PostFlag arguments get updated to - * the value SUBSTEPS*step+substep+1. - */ - - -class WaitFlag { - volatile uint64_t * const flag; - const int shift; - public: - __device__ __forceinline__ - WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { } - __device__ __forceinline__ - void wait(uint64_t val) { while ((*flag + shift) < val) /*SPIN*/; } -}; + uint32_t mismatch = 0; + const uint64_t opCount; + inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) { + if (mismatch) { + // In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch + *(comm->fatalDevError) = ncclDevAssertedMismatch; + } else if (remoteOpCount && *remoteOpCount > opCount) { + mismatch += 1; + } + } + + uint32_t spins = 0; + uint32_t abort = 0; + + inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) { + spins++; + if (spins == SPINS_BEFORE_CHECK_ABORT) { + abort = *(comm->abortFlag); + checkMismatch(remoteOpCount); + spins = 0; + } + return abort; + } + + inline __device__ void waitRecv(int i) { + spins = 0; + mismatch = 0; + recvStep[i] += SLICESTEPS; + if (tid == i) { + while (*(waitPtr) < recvStep[i]) { + if (checkAbort(recvConn[i]->opCountRem)) break; + } + } + } + + inline __device__ void waitSend(int i) { + spins = 0; + mismatch = 0; + sendStep[i] += SLICESTEPS; + if (tid == WARP_SIZE+i) { + while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) { + sendConnHead[i] = *waitPtr; + if (checkAbort(sendConn[i]->opCountRem)) break; + } + } + } + + inline __device__ void postRecv(int i) { + *(recvConn[i]->head) = recvStep[i] += SLICESTEPS; + } + + inline __device__ void postSend(int i) { + *(sendConn[i]->tail) = sendStep[i] += SLICESTEPS; + } + + inline __device__ void postSendSize(int i, int size) { + if (sendConn[i]->fifo) sendConn[i]->fifo[sendStep[i]%NCCL_STEPS] = size; + } + + template <int DIRECTRECV> + inline __device__ const T* directRecvPtr(int i, int directOffset) { + return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i); + } + + template <int DIRECTSEND> + inline __device__ T* directSendPtr(int i, int directOffset) { + return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i); + } + + template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST> + inline __device__ void + GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) { + int offset = 0; + int sliceSize = stepSize * SLICESTEPS; + + const T* srcs[RECV*NRECV+SRC]; + srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset); + if (RECV) { + if (SRC) srcs[1] = recvPtr(0); + for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i); + } + + T* dsts[SEND*NSEND+DST]; + dsts[0] = DST ? 
dstPtr : directSendPtr<DIRECTSEND>(0, directOffset); + if (SEND) { + if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset); + for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset); + } + + #pragma unroll 1 + for (int slice=0; slice<SLICESPERCHUNK; ++slice) { + int realSize = max(0, min(sliceSize, nelem-offset)); + if (tid < nthreads) { + FOR_SEND(waitSend); + FOR_RECV(waitRecv); + if (realSize > 0) { + barrier(); + if (DIRECTRECV && recvDirectBuff[0]) { + // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy + if (SEND) { + ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize); + } + } else { + ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize); + } + } + exitIfAbortBarrier(abort); + } else { + exitIfAbortBarrier(abort); + FOR_SEND(postSendSize, realSize*sizeof(T)); + if (SEND) __threadfence_system(); + FOR_SEND(postSend); + FOR_RECV(postRecv); + } + for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize; + for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize; + offset += sliceSize; + } + } + + __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) { + recvConn[i] = conn; + recvBuff[i] = (const T*)recvConn[i]->buff; + recvStep[i] = recvConn[i]->step; + recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS); + // Return credits in case we rounded up. + if (tid == nthreads) *recvConn[i]->head = recvStep[i]; + if (tid == i) { + waitPtr = recvConn[i]->tail; + *(recvConn[i]->opCountLoc) = opCount; + } + recvDirectBuff[i] = NULL; + if (directBuff && recvConn[i]->direct) { + recvDirectBuff[i] = directBuff; + if (tid == 0) *recvConn[i]->ptrExchange = directBuff; + } + nrecv++; + } + + __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) { + sendConn[i] = conn; + sendBuff[i] = (T*)sendConn[i]->buff; + sendStep[i] = sendConn[i]->step; + sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS); + if (tid == WARP_SIZE+i) { + waitPtr = sendConn[i]->head; + sendConnHead[i] = *waitPtr; + *(sendConn[i]->opCountLoc) = opCount; + } + sendDirectBuff[i] = NULL; + if (directBuff && sendConn[i]->direct) { + void* volatile* ptr = sendConn[i]->ptrExchange; + while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL); + __syncthreads(); + if (tid == 0) *ptr = NULL; + } + nsend++; + } + + __device__ __forceinline__ void saveRecvConn(int i) { + if (tid == i) { + recvConn[i]->step = recvStep[i]; + __threadfence_system(); + *(recvConn[i]->opCountLoc) += 1; + } + } + + __device__ __forceinline__ void saveSendConn(int i) { + if (tid == WARP_SIZE+i) { + sendConn[i]->step = sendStep[i]; + __threadfence_system(); + *(sendConn[i]->opCountLoc) += 1; + } + } -class PostFlag { - volatile uint64_t * const flag; - const int shift; - volatile int * const fifo; - const int fifo_size; public: __device__ __forceinline__ - PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { } - __device__ __forceinline__ - void post(uint64_t val) { *flag = (val - shift); } - __device__ __forceinline__ - void postSize(uint64_t step, int size) { if (fifo != NULL) fifo[step%fifo_size] = size; }; -}; + ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct 
ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) { + // Make sure step is updated before we read it + __syncthreads(); + for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff); + for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff); + } -// Helper to check if any argument is of type T. -// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...) -template<typename T> __device__ __forceinline__ -bool AnyAre() { return false; } + __device__ __forceinline__ void + send(const T* src, int nelem) { + GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0); + } + __device__ __forceinline__ void + directSend(const T* src, int directOffset, int nelem) { + GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset); + } -template<typename T, typename FIRST_T, typename... TAIL_Ts> -__device__ __forceinline__ -bool AnyAre(FIRST_T first, TAIL_Ts... tail) { - return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...); -} + __device__ __forceinline__ void + recv(T* dst, int nelem) { + GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0); + } + __device__ __forceinline__ void + directRecv(T* dst, int directOffset, int nelem) { + GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset); + } + __device__ __forceinline__ void + copySend(const T* src, T* dst, int nelem) { + GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0); + } + __device__ __forceinline__ void + directCopySend(const T* src, T* dst, int directOffset, int nelem) { + GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset); + } -// Wait on all WaitFlags, ignore PostFlags -__device__ __forceinline__ -void WaitOnFlags(uint64_t val) { } + __device__ __forceinline__ void + recvCopySend(T* dst, int nelem) { + GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0); + } + __device__ __forceinline__ void + directRecvCopySend(T* dst, int directOffset, int nelem) { + GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset); + } -template <typename... TAIL_Ts> __device__ __forceinline__ -void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) { - flag.wait(val); - WaitOnFlags(val, tail...); -} + __device__ __forceinline__ void + recvReduceCopy(const T* src, T* dst, int nelem) { + GenericOp<0, 0, 1, 0, 1, 1>(src, dst, nelem, 0); + } -template <typename... TAIL_Ts> __device__ __forceinline__ -void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) { - WaitOnFlags(val, tail...); -} + __device__ __forceinline__ void + recvReduceSend(const T* src, int nelem) { + GenericOp<0, 0, 1, 1, 1, 0>(src, NULL, nelem, 0); + } + __device__ __forceinline__ void + recvReduceCopySend(const T* src, T* dst, int nelem) { + GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0); + } + __device__ __forceinline__ void + directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) { + // Direct is only for the send part + GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset); + } -// Post all PostFlags, ignore WaitFlags -__device__ __forceinline__ -void PostToFlags(uint64_t val) { } + __device__ __forceinline__ ~ncclPrimitives() { + // Save steps for next collective. Have thread 0 do it to be compatible + // with the way LL works. + for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i); + for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i); + } +}; -template <typename... 
TAIL_Ts> __device__ __forceinline__ -void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) { - PostToFlags(val, tail...); -} +template <typename T, class FUNC, int NRECV, int NSEND> +class ncclLLPrimitives { + private: + const int tid; + const int nthreads; + int nrecv = 0; + int nsend = 0; + struct ncclConnInfo* recvConn[NRECV]; + struct ncclConnInfo* sendConn[NSEND]; + volatile uint64_t* waitPtr; + volatile uint64_t* postPtr; + volatile int* fifoPtr; + uint64_t recvStep[NRECV]; + uint64_t sendStep[NSEND]; + uint64_t sendConnHead; + union ncclLLFifoLine* recvBuff[NRECV]; + union ncclLLFifoLine* sendBuff[NSEND]; + struct ncclComm* comm; + + inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } + inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } + inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } + inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } + inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; } + inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; } + + // Exit If Abort Barrier : make sure all threads exit consistently + // Each thread sets a predicate to true if val == 1 + // all CTA's threads enter the barrier and do a popc on their predicates being True + // If any of the thread's predicate was True, all the threads call exit() + inline __device__ void exitIfAbortLocalBarrier() { + uint32_t popc; + asm ("{"); + asm volatile (" .reg .pred barr_pred;"); + asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort)); + asm volatile (" bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads)); + asm ("}"); + if (popc) { + // Make sure threads not participating in the operation get the abort and all threads exit + exitIfAbortBarrier(1); + } + } + + inline __device__ void barrier() { + asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); + } + + uint32_t mismatch = 0; + const uint64_t opCount; + + inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) { + if (mismatch > 20) { + // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch + // Note that we are not using _threadfence_system in LL so the error cannot be asserted + *(comm->fatalDevError) = ncclDevSuspectedMismatch; + } else if (remoteOpCount && *remoteOpCount > opCount) { + mismatch += 1; + } + } -template <typename... TAIL_Ts> __device__ __forceinline__ -void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) { - flag.post(val); - PostToFlags(val, tail...); -} + uint32_t spins = 0; + uint32_t abort = 0; + inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) { + spins++; + if (spins == SPINS_BEFORE_CHECK_ABORT) { + abort = *(comm->abortFlag); + checkMismatch(remoteOpCount); + spins = 0; + } + return abort; + } -// Post sizes for PostFlags, ignore WaitFlags -__device__ __forceinline__ -void PostSizeToFlags(uint64_t step, int size) { } + inline __device__ void waitSend(int i, int nbytes) { + spins = 0; + mismatch = 0; + if (tid == WARP_SIZE+i) { + while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) { + sendConnHead = *waitPtr; + if (checkAbort(sendConn[i]->opCountRem)) break; + } + if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes; + } + } -template <typename... TAIL_Ts> __device__ __forceinline__ -void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... 
tail) { - PostSizeToFlags(step, size, tail...); -} + inline __device__ void postRecv(int i) { + recvStep[i]++; + if (tid == i) *postPtr = recvStep[i]; + } -template <typename... TAIL_Ts> __device__ __forceinline__ -void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) { - flag.postSize(step, size); - PostSizeToFlags(step, size, tail...); -} + inline __device__ void postSend(int i) { + sendStep[i]++; + } + __device__ uint64_t readLL(int i, int offset) { + union ncclLLFifoLine* src = recvPtr(i) + offset; + uint32_t flag = recvFlag(i); + uint32_t data1, flag1, data2, flag2; + spins = 0; + mismatch = 0; + do { + asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); + if (checkAbort(recvConn[i]->opCountRem)) break; + } while ((flag1 != flag) || (flag2 != flag)); + uint64_t val64 = data1 + (((uint64_t)data2) << 32); + return val64; + } -// Create pointer arithmetic syntax that doesn't break for std::nullptr_t -template <typename Tptr> __device__ __forceinline__ -Tptr ptradd(Tptr ptr, int i) { - return ptr + i; -} + __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { + asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); + } -__device__ __forceinline__ -std::nullptr_t ptradd(std::nullptr_t ptr, int i) { - return nullptr; -} + // Using memcpy handles misaligned pointers. + __device__ uint64_t readAL(uint64_t* src) { + uint64_t val; + memcpy((char*)&val, (char*)src, sizeof(uint64_t)); + return val; + } + __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) { + memcpy((char*)dst, (char*)&val, nbytes); + } -// Implementation of primitive types -template <int UNROLL, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> > -class Primitives { - private: - template <typename SRC2_T, // either T* or std::nullptr_t - typename DST2_T, // either T* or std::nullptr_t - typename... SYNC_Ts> // either WaitFunc or PostFunc - static __device__ __forceinline__ void - GenericOp(const int tid, const int nthreads, - const T* src1, - const SRC2_T src2, - T* dst1, - DST2_T dst2, - int len, int maxoffset, uint64_t step, SYNC_Ts... flags) { - - enum { noSrc2 = std::is_same<SRC2_T, std::nullptr_t>::value }; - enum { noDst2 = std::is_same<DST2_T, std::nullptr_t>::value }; - static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value, - "src2 must be of type T* or std::nullptr_t"); - static_assert(noDst2 || std::is_same<DST2_T, T*>::value, - "dst2 must be of type T* or std::nullptr_t"); - - using OpType = typename std::conditional<noSrc2, FuncSum<T>, REDOP>::type; - - int sliceSize = len / SUBSTEPS; - int sliceOffset = 0; - -#pragma unroll 1 - for (int sub=0; sub<SUBSTEPS; ++sub) { - int realSize = max(0, min(sliceSize, maxoffset-sliceOffset)); - if (tid < nthreads) { - if (AnyAre<WaitFlag>(flags...)) { - if (tid == 0) { - WaitOnFlags(SUBSTEPS*step + sub + 1, flags...); - } - asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); + template <int RECV, int SEND, int SRC, int DST> + __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) { + uint32_t nbytes = nelem < 0 ? 
0 : nelem*sizeof(T); + FOR_SEND(waitSend, nbytes*2); + barrier(); + uint32_t npack = DIVUP(nbytes, sizeof(uint64_t)); + uint64_t* srcPack = (uint64_t*)srcPtr; + uint64_t* dstPack = (uint64_t*)dstPtr; + // Do multiples of 64 bits + #pragma unroll 2 + for (int offset=tid; offset<npack; offset+=nthreads) { + // Recv : local, then intra-node, then inter-node + uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset); + if (RECV) { + if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val); + for (int i=1; i<NRECV && i<nrecv; i++) { + val = MULTI<FUNC, T>()(readLL(i, offset), val); } - ReduceOrCopy - < - UNROLL, - OpType, - T, - !std::is_same<DST2_T, std::nullptr_t>::value, // HAS_DEST1 - !std::is_same<SRC2_T, std::nullptr_t>::value // HAS_SRC1 - > - ( - tid, nthreads, - ptradd(dst1, sliceOffset), - ptradd(dst2, sliceOffset), - ptradd(src1, sliceOffset), - ptradd(src2, sliceOffset), - realSize - ); - if (AnyAre<PostFlag>(flags...)) { - __syncthreads(); + } + + // Send : inter-node, then intra-node, then local + if (SEND) { + for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i)); + storeLL(sendPtr(0)+offset, val, sendFlag(0)); + } + if (DST) { + if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) { + // Last incomplete word + storeAL(dstPack+offset, val, nbytes & 0x7); + } else { + storeAL(dstPack+offset, val, sizeof(uint64_t)); } - } else { - if (AnyAre<PostFlag>(flags...)) { - __syncthreads(); - PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...); - __threadfence_system(); - PostToFlags(SUBSTEPS*step + sub + 1, flags...); + } + } + exitIfAbortLocalBarrier(); + FOR_RECV(postRecv); + FOR_SEND(postSend); + } + + __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { + recvConn[i] = conn; + recvBuff[i] = recvConn[i]->llBuff; + recvStep[i] = recvConn[i]->step; + if (tid == i) { + postPtr = recvConn[i]->head; + *(recvConn[i]->opCountLoc) = opCount; + } + nrecv++; + } + + __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { + sendConn[i] = conn; + sendBuff[i] = sendConn[i]->llBuff; + sendStep[i] = sendConn[i]->step; + if (tid == WARP_SIZE+i) { + waitPtr = sendConn[i]->head; + fifoPtr = sendConn[i]->fifo; + sendConnHead = *waitPtr; + *(sendConn[i]->opCountLoc) = opCount; + } + nsend++; + } + + __device__ __forceinline__ void saveRecvConn(int i) { + if (tid == i) { + recvConn[i]->step = recvStep[i]; + *(recvConn[i]->opCountLoc) += 1; + __threadfence_block(); + } + } + + __device__ __forceinline__ void saveSendConn(int i) { + if (tid == WARP_SIZE+i) { + sendConn[i]->step = sendStep[i]; + *(sendConn[i]->opCountLoc) += 1; + __threadfence_block(); + } + } + + __device__ __forceinline__ void llSendCleaning(int i) { + if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) { + /* Reset all flags */ + static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); + static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); + for (int s=0; s<NCCL_STEPS; s++) { + waitSend(i, 0); + for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) { + const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) }; + sendPtr(i)[o].i4 = resetLine.i4; } } - sliceOffset += sliceSize; + if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i]; + } + } + + __device__ __forceinline__ void llRecvCleaning(int i) { + if (recvStep[i] > recvConn[i]->llLastCleaning + 
NCCL_LL_CLEAN_FREQ) { + recvStep[i] += NCCL_STEPS; + if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i]; } } public: - template <typename... SYNC_Ts> - static __device__ __forceinline__ void - Copy(const int tid, const int nthreads, const T* src, T* dst, - int len, int maxOffset, uint64_t step, SYNC_Ts... flags) { - GenericOp(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...); + __device__ __forceinline__ + ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) { + // Make sure step is updated before we read it. + barrier(); + + for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i); + for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i); } - template <typename... SYNC_Ts> - static __device__ __forceinline__ void - DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2, - int len, int maxOffset, uint64_t step, SYNC_Ts... flags) { - GenericOp(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...); + __device__ void send(const T* src, int nelem) { + return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem); } - template <typename... SYNC_Ts> - static __device__ __forceinline__ void - Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst, - int len, int maxOffset, uint64_t step, SYNC_Ts... flags) { - GenericOp(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...); + __device__ void recv(T* dst, int nelem) { + return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem); } - template <typename... SYNC_Ts> - static __device__ __forceinline__ void - ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2, - int len, int maxOffset, uint64_t step, SYNC_Ts... 
flags) { - GenericOp(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...); + __device__ void recvReduceSend(const T* src, int nelem) { + return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recvReduceCopy(const T* src, T* dst, int nelem) { + return LLGenericOp<1, 0, 1, 1>(src, dst, nelem); } -}; -#endif // end include guard + __device__ void copySend(const T* src, T* dst, int nelem) { + return LLGenericOp<0, 1, 1, 1>(src, dst, nelem); + } + + __device__ void recvCopySend(T* dst, int nelem) { + return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) { + return LLGenericOp<1, 1, 1, 1>(src, dst, nelem); + } + + __device__ __forceinline__ ~ncclLLPrimitives() { + for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i); + for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i); + // Save steps for the next operation + for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i); + for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i); + } +}; +#endif diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu index bd1d23c..1ef66d4 100644 --- a/src/collectives/device/reduce.cu +++ b/src/collectives/device/reduce.cu @@ -4,18 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "common.h" #include "reduce.h" +#include "common.h" #include "collectives.h" -#define UNROLL 4 - -#if NCCL_OP == 0 -IMPL_COLL2(ncclReduce, sum, FuncSum, ncclCollReduce, ncclSum); -#elif NCCL_OP == 1 -IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd); -#elif NCCL_OP == 2 -IMPL_COLL2(ncclReduce, min, FuncMin, ncclCollReduce, ncclMin); -#elif NCCL_OP == 3 -IMPL_COLL2(ncclReduce, max, FuncMax, ncclCollReduce, ncclMax); -#endif +IMPL_COLL_R(ncclReduce, ncclCollReduce); diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index f5694b1..302d053 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -8,143 +8,71 @@ #include "primitives.h" #include "collectives.h" -// Increase Step and boffset for buffer sync -#define NEXT_STEP \ - step++; \ - boffset += sliceSize; \ - if (boffset == buffSize) boffset = 0; - template<int UNROLL, class FUNC, typename T> -__device__ void ncclReduceKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - - WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0); - PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS); - - typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims; - + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; const int nranks = comm->nRanks; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / REDUCE_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * REDUCE_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; const int rank = 
ring->devUserRanks[0]; const int prevRank = ring->devUserRanks[nranks-1]; const int root = args->root; - if (tid == 0) { - // Update in case we skipped some collectives - *ring->recv.conn.opCount = args->opCount; - - if (rank != root) { - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - } - } - __syncthreads(); - - uint64_t step = 0ULL; - int boffset = 0; - // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC> + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t offset = gridOffset + bid*realChunkSize; + int nelem = min(realChunkSize, size-offset); if (prevRank == root) { - Prims::Copy(tid, nthreads, - thisInput + offset, - nextOutput + boffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.send(thisInput+offset, nelem); } else if (rank == root) { - Prims::Reduce(tid, nthreads, - prevInput + boffset, - thisInput + offset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); + prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); } else { - Prims::Reduce(tid, nthreads, - prevInput + boffset, - thisInput + offset, - nextOutput + boffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } - NEXT_STEP; // Increases step, boffset - } - - if (tid == 0) { - if (rank != root) { - // Wait for next to have consumed data before resetting the flag - waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1)); - *ring->send.conn.head = 0ULL; + prims.recvReduceSend(thisInput+offset, nelem); } - *ring->recv.conn.tail = 0ULL; - __threadfence_system(); - *ring->recv.conn.opCount = args->opCount+1; } } -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - boffset += NCCL_LL_SLICE_LINES; \ - if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \ - flag++; \ - step++; +template<int UNROLL, class FUNC, typename T> +__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { } template<int UNUSED, class FUNC, typename T> -__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; + const int nthreads = args->nThreads; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; - const int nranks = comm->nRanks; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = 
&channel->ring; + + ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; const int rank = comm->rank; + const int nranks = comm->nRanks; const int prevRank = ring->devUserRanks[nranks-1]; const int root = args->root; - typedef LLPrimitives<T, FUNC> LL; - - const ssize_t size = args->N; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t flag = step + 1; - int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -152,39 +80,16 @@ __device__ void ncclReduceLLKernel(struct CollectiveArgs* args) { } ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int nelem = min(chunkSize, size-offset); if (prevRank == root) { - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - nextOutput + boffset, - maxOffset, flag, llNthreads); - POST_SIZE; - NEXT_STEP_LL; + LLprims.send(thisInput+offset, nelem); } else if (rank == root) { - LL::ReduceCopy( - thisInput + offset, - prevInput + boffset, - thisOutput + offset, - maxOffset, flag, llNthreads); - NEXT_STEP_LL; - ACK_PREV; + LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); } else { - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + boffset, - nextOutput + boffset, - maxOffset, flag, flag, llNthreads); - POST_SIZE; - NEXT_STEP_LL; - ACK_PREV; + LLprims.recvReduceSend(thisInput+offset, nelem); } } - - // We need everyone to acknowledge data even if they didn't receive anything - // so that the next collective can start right away. - ACK_PREV; - - FIFO_CLEANING_AND_SAVE_STEP(flag); } + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/device/reduce_kernel.h b/src/collectives/device/reduce_kernel.h index 0cb8f13..0e90793 100644 --- a/src/collectives/device/reduce_kernel.h +++ b/src/collectives/device/reduce_kernel.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -46,30 +46,28 @@ struct FuncMin { } }; +#define MASK0 0x00ff00ff +#define MASK1 0xff00ff00 +static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) { + /* This can be used both for signed and unsigned 8-bit addition */ + const uint32_t x0 = x & MASK0; + const uint32_t x1 = x & MASK1; + const uint32_t y0 = y & MASK0; + const uint32_t y1 = y & MASK1; + const uint32_t r0 = (x0+y0); + const uint32_t r1 = (x1+y1); + return (r0 & MASK0) | (r1 & MASK1); +} + template<> struct FuncSum<int8_t> { - union converter { uint32_t storage; char4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vadd.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - cr.a.x = cx.a.x + cy.a.x; - cr.a.y = cx.a.y + cy.a.y; - cr.a.z = cx.a.z + cy.a.z; - cr.a.w = cx.a.w + cy.a.w; - return cr.storage; + return addChar4(x, y); #endif } __device__ int8_t operator()(const int8_t x, const int8_t y) const { @@ -78,28 +76,13 @@ struct FuncSum<int8_t> { }; template<> struct FuncSum<uint8_t> { - union converter { uint32_t storage; uchar4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vadd.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - "vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - cr.a.x = cx.a.x + cy.a.x; - cr.a.y = cx.a.y + cy.a.y; - cr.a.z = cx.a.z + cy.a.z; - cr.a.w = cx.a.w + cy.a.w; - return cr.storage; + return addChar4(x, y); #endif } __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const { @@ -109,22 +92,6 @@ struct FuncSum<uint8_t> { static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) { /* This can be used both for signed and unsigned 8-bit multiplication */ -#if (__CUDA_ARCH__ >= 300) - uint32_t rv; - asm("{ .reg .u32 t0, t1, t2, t3;\n\t" - " vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t" - " vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t" - " shl.b32 t3, t3, 16;\n\t" - " shl.b32 t2, t2, 16;\n\t" - " vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t" - " shl.b32 t1, t1, 8;\n\t" - " vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t" - " and.b32 t1, t1, 0xff00ff00;\n\t" - " and.b32 t0, t0, 0x00ff00ff;\n\t" - " or.b32 %0, t0, t1;\n\t" - "}" : "=r"(rv) : "r"(x), "r"(y)); - return rv; -#else union converter { uint32_t storage; char4 a; }; converter cx, cy, cr; cx.storage = x; @@ -134,7 +101,6 @@ static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) { cr.a.z = cx.a.z * cy.a.z; cr.a.w = cx.a.w * cy.a.w; return cr.storage; -#endif } template<> @@ -164,13 +130,6 @@ struct FuncMax<int8_t> { int32_t rv, z=0; 
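// The masked add above (addChar4) performs four independent byte additions in
// one 32-bit register: MASK0 isolates bytes 0 and 2, MASK1 isolates bytes 1
// and 3, so every byte being summed has a zeroed byte above it to absorb the
// carry, which the final mask discards. Worked example (values chosen purely
// for illustration):
//   x  = 0x01FF02FE, y = 0x01010101              // per-byte sums: FF,03,00,02
//   x0 = x & MASK0 = 0x00FF00FE, y0 = 0x00010001 -> r0 = 0x010000FF
//   r0 & MASK0 = 0x000000FF                      // byte2 wrapped; carry masked off
//   x1 = x & MASK1 = 0x01000200, y1 = 0x01000100 -> r1 = 0x02000300
//   r1 & MASK1 = 0x02000300
//   result = 0x000000FF | 0x02000300 = 0x020003FF, as expected
// On SM 3.x the vadd4/vmax4/vmin4 video instructions used in these #if
// branches do the same per-byte work in a single hardware instruction.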
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; @@ -194,13 +153,6 @@ struct FuncMax<uint8_t> { int32_t rv, z=0; asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; @@ -225,13 +177,6 @@ struct FuncMin<int8_t> { int32_t rv, z=0; asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; @@ -255,13 +200,6 @@ struct FuncMin<uint8_t> { int32_t rv, z=0; asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu index b16053c..10857ed 100644 --- a/src/collectives/device/reduce_scatter.cu +++ b/src/collectives/device/reduce_scatter.cu @@ -4,18 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "common.h" #include "reduce_scatter.h" +#include "common.h" #include "collectives.h" -#define UNROLL 4 - -#if NCCL_OP == 0 -IMPL_COLL2(ncclReduceScatter, sum, FuncSum, ncclCollReduceScatter, ncclSum); -#elif NCCL_OP == 1 -IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd); -#elif NCCL_OP == 2 -IMPL_COLL2(ncclReduceScatter, min, FuncMin, ncclCollReduceScatter, ncclMin); -#elif NCCL_OP == 3 -IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax); -#endif +IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter); diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index cad011b..c70c845 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -8,156 +8,82 @@ #include "primitives.h" #include "collectives.h" -// Increase Step and poffset/noffset for buffer sync -#define NEXT_STEP \ - step++; \ - poffset = noffset; \ - noffset += sliceSize; \ - if (noffset == buffSize) noffset = 0; - template<int UNROLL, class FUNC, typename T> -__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) { const int tid = 
threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - - WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS); - PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS); - - typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims; - + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; const int nranks = comm->nRanks; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; - - if (tid == 0) { - // Update in case we skipped some collectives - *ring->recv.conn.opCount = args->opCount; - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - } - __syncthreads(); - - uint64_t step = 0ULL; - int poffset, noffset = 0; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC> + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + bid*chunkSize; + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t chunkOffset = gridOffset + bid*realChunkSize; /////////////// begin ReduceScatter steps /////////////// ssize_t offset; - int maxOffset = min(chunkSize, size-chunkOffset); + int nelem = min(realChunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU rankDest = ring->devUserRanks[nranks-1]; offset = chunkOffset + rankDest * size; - Prims::Copy(tid, nthreads, - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); - - NEXT_STEP; // Increases step, poffset, noffset + prims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; j<nranks; ++j) { rankDest = ring->devUserRanks[nranks-j]; offset = chunkOffset + rankDest * size; - Prims::Reduce(tid, nthreads, - prevInput + poffset, - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; + prims.recvReduceSend(thisInput+offset, nelem); } - // step k-1: reduce this buffer and data, which will produce the final - // result that we store in this data and push to the next GPU + // step k-1: reduce this buffer and data, which will produce the 
final result rankDest = ring->devUserRanks[0]; offset = chunkOffset + rankDest * size; - Prims::Reduce(tid, nthreads, - prevInput + poffset, - thisInput + offset, - thisOutput + chunkOffset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); - } - - if (tid == 0) { - waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS)); - *ring->send.conn.head = 0ULL; - *ring->recv.conn.tail = 0ULL; - __threadfence_system(); - *ring->recv.conn.opCount = args->opCount+1; + prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem); } } -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - poffset = noffset; \ - pflag = nflag; \ - noffset += NCCL_LL_SLICE_LINES; \ - if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \ - nflag++; \ - step++; +template<int UNROLL, class FUNC, typename T> +__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { } template<int UNUSED, class FUNC, typename T> -__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; + const int nthreads = args->nThreads; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; - typedef LLPrimitives<T, FUNC> LL; + ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); const ssize_t size = args->N; //const int rank = comm->rank; const int nranks = comm->nRanks; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t pflag, nflag = step + 1; - int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -167,37 +93,21 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) { /////////////// begin ReduceScatter steps /////////////// ssize_t offset; - int maxOffset = min(chunkSize, size-chunkOffset); + int nelem = min(chunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU rankDest = ring->devUserRanks[nranks-1]; offset = chunkOffset + rankDest * size; - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - nextOutput + noffset, - maxOffset, nflag, llNthreads); - POST_SIZE; - - NEXT_STEP_LL; + LLprims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; j<nranks; ++j) { rankDest = ring->devUserRanks[nranks-j]; offset = chunkOffset + rankDest * size; - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + 
LLprims.recvReduceSend(thisInput+offset, nelem); } // step k-1: reduce this buffer and data, which will produce the final @@ -205,13 +115,9 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) { rankDest = ring->devUserRanks[0]; offset = chunkOffset + rankDest * size; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - thisOutput + chunkOffset, - maxOffset, pflag, llNthreads); - ACK_PREV; + LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem); } - - FIFO_CLEANING_AND_SAVE_STEP(nflag); } + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/reduce.cu b/src/collectives/reduce.cu index d8fde80..302d4bc 100644 --- a/src/collectives/reduce.cu +++ b/src/collectives/reduce.cu @@ -4,30 +4,15 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "common_coll.h" #include "enqueue.h" #include "collectives.h" -ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm)); - NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1)); - } - - return ncclSuccess; -} - NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype, - op, root, comm, stream); + struct ncclInfo info = { ncclCollReduce, "Reduce", + sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ + REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; + return ncclEnqueueCheck(&info); } diff --git a/src/collectives/reduce_scatter.cu b/src/collectives/reduce_scatter.cu index 1447d4a..4ee77ef 100644 --- a/src/collectives/reduce_scatter.cu +++ b/src/collectives/reduce_scatter.cu @@ -4,29 +4,15 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "common_coll.h" #include "enqueue.h" #include "collectives.h" -ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - 
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm)); - NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1)); - } - return ncclSuccess; -} - NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, - op, 0, comm, stream); + struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter", + sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ + REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; + return ncclEnqueueCheck(&info); } diff --git a/src/enqueue.cu b/src/enqueue.cu new file mode 100644 index 0000000..d283223 --- /dev/null +++ b/src/enqueue.cu @@ -0,0 +1,442 @@ +/************************************************************************* + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "enqueue.h" +#include "checks.h" +#include "param.h" + +#include "collectives/collectives.h" + +// Only generate inline kernels for LL +#define NCCL_FUNC5(coll, op, dtype) \ + (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \ + (void*)NCCL_KERN_NAME(coll##LL, op, dtype) + +#define NCCL_FUNC4(coll, op, dtype) \ + (void*)NCCL_FUNC5(coll##Ring, op, dtype), \ + (void*)NCCL_FUNC5(coll##Tree, op, dtype) + +// Must be consistent with ncclDataType_t +#define NCCL_FUNCS3A(coll, op) \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, u8), \ + (void*)NCCL_FUNC4(coll, op, i32), \ + (void*)NCCL_FUNC4(coll, op, u32), \ + (void*)NCCL_FUNC4(coll, op, i64), \ + (void*)NCCL_FUNC4(coll, op, u64), \ + (void*)NCCL_FUNC4(coll, op, f16), \ + (void*)NCCL_FUNC4(coll, op, f32), \ + (void*)NCCL_FUNC4(coll, op, f64) +#define NCCL_FUNCS3B(coll, op) \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8) + +// Must be consistent with ncclRedOp_t -- but we only generate kernel for sums. 
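// The flat table below is addressed with FUNC_INDEX(coll, op, dtype, llMode,
// treeMode) at enqueue time (see computeColl). FUNC_INDEX itself is defined in
// collectives.h; given the macro nesting -- coll outermost, then op, then
// dtype, then Ring/Tree, then the two LL slots -- a flattening consistent with
// the table's ncclCollCount*ncclNumOps*ncclNumTypes*2*2 layout would be the
// following (hypothetical sketch, not the actual macro):
//   static inline int funcIndexSketch(int coll, int op, int dtype,
//                                     int llMode, int treeMode) {
//     return (((coll*ncclNumOps + op)*ncclNumTypes + dtype)*2 + treeMode)*2 + llMode;
//   }
// Because only the sum kernels are compiled inline here, every op slot of the
// reduction collectives points at the "sum" variant: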
+#define NCCL_FUNCS2A(coll) \ + NCCL_FUNCS3A(coll, sum), \ + NCCL_FUNCS3A(coll, sum), \ + NCCL_FUNCS3A(coll, sum), \ + NCCL_FUNCS3A(coll, sum) +#define NCCL_FUNCS2B(coll) \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy) + +// Must be consistent with the ncclFuncSet enum +static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = { + NCCL_FUNCS2B(ncclBroadcast), + NCCL_FUNCS2A(ncclReduce), + NCCL_FUNCS2B(ncclAllGather), + NCCL_FUNCS2A(ncclReduceScatter), + NCCL_FUNCS2A(ncclAllReduce) +}; + +/*****************************************************************************/ +/* Launch system : synchronization and CUDA kernel launch */ +/*****************************************************************************/ + +ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) { +#if CUDART_VERSION >= 9000 + if (cgMode & 0x01) { + CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices, + // These flags are to reduce the latency of using this API + cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync)); + return ncclSuccess; + } +#endif + int savedDev; + CUDACHECK(cudaGetDevice(&savedDev)); + for (int i = 0; i < numDevices; i++) { + struct cudaLaunchParams* params = paramsList+i; + CUDACHECK(cudaSetDevice(cudaDevs[i])); + CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); + } + CUDACHECK(cudaSetDevice(savedDev)); + return ncclSuccess; +} + +ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) { + params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels); + + // Set active = 2 for the last operation + for (int r=0; r<params->gridDim.x; r++) { + struct ncclChannel* channel = comm->channels+r; + channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active = 2; + } + + // Find the first operation, choose the kernel accordingly and pass it + // as the first argument. + struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart; + memcpy(&comm->args, coll, sizeof(struct ncclColl)); + // As we pass that coll directly, we can free it immediately. + coll->active = 0; + + params->func = ncclKerns[coll->funcIndex]; + return ncclSuccess; +} + +ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) { + volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); + int val = *ptr; + bool done = false; + while (done == false) { + if (val >= comm->intraRanks) { + WARN("Trying to launch too many collectives"); + return ncclInvalidUsage; + } + if (val+1 == comm->intraRanks) { + // Reset the barrier. 
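// The barrier is double-buffered over intraPhase: ranks count up in
// intraBarrier[intraPhase], and the last arriver clears the *other* slot
// (intraPhase^1) below, so the next barrier round starts from zero while
// stragglers may still be reading the current slot. ncclCpuBarrierOut then
// spins until the current slot reaches intraRanks and flips intraPhase.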
+ comm->intraBarrier[comm->intraPhase^1] = 0; + *isLast = 1; + return ncclSuccess; + } + done = __sync_bool_compare_and_swap(ptr, val, val+1); + val++; + } + *isLast = 0; + return ncclSuccess; +} + +ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) { + volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); + int val = *ptr; + if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) { + WARN("Trying to launch too many collectives"); + return ncclInternalError; + } + return ncclSuccess; +} + +ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { + volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); + while (*ptr < comm->intraRanks) pthread_yield(); + comm->intraPhase ^= 1; + return ncclSuccess; +} + +ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) { + if (comm->nRanks == 1) return ncclSuccess; + struct cudaLaunchParams* params = comm->myParams; + + NCCLCHECK(setupLaunch(comm, params)); + + // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL + if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) { + // Enqueue event in user stream + CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream)); + // Create dependency between user stream and internal NCCL stream + CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0)); + params->stream = comm->groupStream; + } else { + if (comm->userStream != params->stream) { + // Stream changed from last call, create dependency against last NCCL kernel launch + CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); + } + params->stream = comm->userStream; + } + + int isLast = 0; + NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); + + if (isLast) { + if (comm->launchMode == ncclComm::GROUP) { + // I'm the last. Launch all operations. + NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode)); + } + NCCLCHECK(ncclCpuBarrierLast(comm)); + } + return ncclSuccess; +} + +ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { + if (comm->nRanks == 1) return ncclSuccess; + // We can't print the CG mode before the first barrier happened. + if (comm->rank == 0 && *comm->intraCGMode & 0x10) { + *comm->intraCGMode ^= 0x10; + INFO(NCCL_INIT,"Launch mode %s%s%s", + comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel", + *comm->intraCGMode ? "/CGMD" : "", + (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : ""); + } + + NCCLCHECK(ncclCpuBarrierOut(comm)); + + struct cudaLaunchParams *params = comm->myParams; + if (comm->launchMode == ncclComm::PARALLEL) { + CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); + } + // Start the network proxies as soon as the kernel has been launched. We can't + // perform any CUDA call between the two or having a cudaFree between the CUDA + // launch and the transportStartProxy call could cause a deadlock. + // Also, starting the proxies after the CUDA launch seems to be better for + // performance (latency). 
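// Concretely, the sequence that must not be interleaved with other CUDA calls
// is:
//   cudaLaunchKernel(...);      // kernel may spin waiting on network data
//   transportStartProxy(comm);  // proxies feed the network and unblock it
// A cudaFree() in between would presumably synchronize on the device while the
// kernel is still spinning on data the proxies have not started sending.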
+ for (int r=0; r<params->gridDim.x; r++) { + struct ncclChannel* channel = comm->channels+r; + channel->collStart = channel->collFifoTail; + channel->collCount = 0; + } + params->gridDim.x = params->blockDim.x = 0; + NCCLCHECK(transportStartProxy(comm)); + return ncclSuccess; +} + +ncclResult_t ncclEnqueueEvents(ncclComm_t comm) { + struct cudaLaunchParams *params = comm->myParams; + // Enqueue event after NCCL kernel + CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream)); + // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL + if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) { + // Create dependency between NCCL internal stream and user stream + CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); + } + comm->userStreamSet = false; + return ncclSuccess; +} + +/*****************************************************************************/ +/* Enqueueing system : computation of kernel and proxy operations parameters */ +/*****************************************************************************/ + +static ncclResult_t getPatternInfo(struct ncclInfo* info) { + if (info->coll == ncclCollBroadcast) info->pattern = ncclPatternPipelineFrom; + else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo; + else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing; + else if (info->coll == ncclCollAllReduce) { + if (info->nBytes <= info->comm->treeThreshold) + info->pattern = ncclPatternTreeUpDown; + else + info->pattern = ncclPatternRingTwice; + } + else { + WARN("Unknown collective %d", info->coll); + return ncclInternalError; + } + return ncclSuccess; +} + +static ncclResult_t getLoopInfo(struct ncclInfo* info) { + switch (info->pattern) { + case ncclPatternTreeUp: + case ncclPatternTreeDown: + case ncclPatternTreeUpDown: + case ncclPatternPipelineFrom: + case ncclPatternPipelineTo: + info->nstepsPerLoop = info-> nchunksPerLoop = 1; break; + case ncclPatternRing: + info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break; + case ncclPatternRingTwice: + info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break; + default: + WARN("Unknown pattern %d\n", info->pattern); + return ncclInternalError; + } + return ncclSuccess; +} + +static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) { + // Compute thresholds and limits that users can override + int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD); + int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads); + + // First compute nThreads + int nt = NCCL_LL_MIN_NTHREADS; + while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2; + + // Then compute nChannels + int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold); + if (nc == 0) nc = 1; + if (nc > info->comm->nChannels) nc = info->comm->nChannels; + + // Check if we have a fixed LL threshold, otherwise compute it. + int perThreadThreshold = info->comm->threadThreshold; + if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4; + ssize_t llThreshold = info->comm->llThreshold >= 0 ? 
+ info->comm->llThreshold : + nc*nt*info->nchunksPerLoop*perThreadThreshold; + + if (info->nBytes <= llThreshold) { + *llMode = 1; + *nChannels = nc; + *nThreads = nt; + } else { + *llMode = 0; + *nChannels = info->comm->nChannels; + *nThreads = info->comm->nThreads+1; + } +} + +static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) { + // Set nstepsPerLoop and nchunksPerLoop + NCCLCHECK(getPatternInfo(info)); + NCCLCHECK(getLoopInfo(info)); + + coll->args.root = info->root; + coll->args.N = info->count; + coll->args.ThisInput = info->sendbuff; + coll->args.ThisOutput = info->recvbuff; + coll->args.comm = info->comm->devComm; + coll->args.opCount = info->comm->opCount; + + // Compute llMode, nChannels, nThreads + int llMode; + getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode); + + int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0; + coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode); + + int stepSize = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS; + int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps; + int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps; + int chunkSize = stepSize*chunkSteps; + + // Compute lastChunkSize + if (treeMode == 1 && llMode == 0) { + if (info->pattern == ncclPatternTreeUpDown) { + // Optimize chunkSize / nSteps + while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2; + while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2; + while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2; + } + // Use lastChunkSize as chunkSize + coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); + } else if (llMode == 1) { + int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t); + const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; + coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop); + ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t)); + coll->args.lastChunkSize /= ncclTypeSize(info->datatype); + } + + // Compute nSteps for proxies + size_t nBytes = llMode ? 
info->nBytes*2 : info->nBytes; + + int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize))); + proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps; + proxyArgs->sliceSteps = sliceSteps; + proxyArgs->chunkSteps = chunkSteps; + proxyArgs->llMode = llMode; + proxyArgs->opCount = info->comm->opCount; + TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p", + coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads, + nLoops, proxyArgs->nsteps, info->comm); + return ncclSuccess; +} + +static ncclResult_t saveKernel(struct ncclInfo* info) { + if (info->comm->nRanks == 1) { + if (info->sendbuff != info->recvbuff) + CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream)); + return ncclSuccess; + } + + struct ncclColl coll; + struct ncclProxyArgs proxyArgs; + memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs)); + NCCLCHECK(computeColl(info, &coll, &proxyArgs)); + + info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads); + if (info->comm->userStreamSet == false) { + info->comm->userStream = info->stream; + info->comm->userStreamSet = true; + } else if (info->stream != info->comm->userStream) { + WARN("Error : mixing different streams within a group call is not supported."); + return ncclInvalidUsage; + } + for (int bid=0; bid<coll.args.nChannels; bid++) { + struct ncclChannel* channel = info->comm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels); + + if (channel->collCount == NCCL_MAX_OPS) { + WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS); + return ncclInvalidUsage; + } + + // Proxy + proxyArgs.channel = channel; + NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks)); + + info->comm->myParams->gridDim.x++; + + int opIndex = channel->collFifoTail; + struct ncclColl* c = channel->collectives+opIndex; + volatile uint8_t* activePtr = (volatile uint8_t*)&c->active; + while (activePtr[0] != 0) sched_yield(); + + memcpy(c, &coll, sizeof(struct ncclColl)); + + c->args.bid = bid; + c->active = 1; + opIndex = (opIndex+1)%NCCL_MAX_OPS; + c->nextIndex = opIndex; + channel->collFifoTail = opIndex; + channel->collCount++; + } + /*if (llMode == 0)*/ info->comm->opCount++; + return ncclSuccess; +} + + +ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { + if (info->comm == NULL) return ncclInvalidArgument; + + INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", + info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, + info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); + + // Launch asynchronously if needed + if (ncclAsyncMode()) { + ncclResult_t ret = ncclSuccess; + int savedDev = -1; + if (info->comm->checkPointers) { + CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end); + CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end); + } + // Check arguments + NCCLCHECKGOTO(ArgsCheck(info), ret, end); + // Always register comm even in case of error to make sure ncclGroupEnd + // cleans it up. 
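// NCCLCHECKGOTO/CUDACHECKGOTO presumably expand to a check-and-branch pattern
// along these lines (hypothetical sketch; the real definitions live in the
// error-checking headers):
//   #define NCCLCHECKGOTO(call, res, label) do { \
//     res = (call); \
//     if (res != ncclSuccess) goto label; \
//   } while (0)
// so any failure below jumps to 'end', which restores the saved device and
// records the error through ncclAsyncErrCheck instead of returning early.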
+ NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end); + NCCLCHECKGOTO(saveKernel(info), ret, end); +end: + if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev)); + ncclAsyncErrCheck(ret); + return ret; + } else { + NCCLCHECK(ArgsCheck(info)); + NCCLCHECK(saveKernel(info)); + NCCLCHECK(ncclBarrierEnqueue(info->comm)); + NCCLCHECK(ncclBarrierEnqueueWait(info->comm)); + NCCLCHECK(ncclEnqueueEvents(info->comm)); + return ncclSuccess; + } +} diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 278593c..a1aaf50 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -13,5 +13,7 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv); ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out); ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState); ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); +ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size); +ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size); ncclResult_t bootstrapClose(void* commState); #endif diff --git a/src/include/channel.h b/src/include/channel.h new file mode 100644 index 0000000..76c5e8a --- /dev/null +++ b/src/include/channel.h @@ -0,0 +1,14 @@ +/************************************************************************* + * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CHANNEL_H_ +#define NCCL_CHANNEL_H_ +#include "core.h" + +ncclResult_t initChannel(struct ncclComm* comm, int channelid); +ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks); + +#endif diff --git a/src/include/checks.h b/src/include/checks.h new file mode 100644 index 0000000..bf7750e --- /dev/null +++ b/src/include/checks.h @@ -0,0 +1,10 @@ +/************************************************************************* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" + +ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); +ncclResult_t ArgsCheck(struct ncclInfo* info); diff --git a/src/include/common_coll.h b/src/include/common_coll.h deleted file mode 100644 index 3ec7354..0000000 --- a/src/include/common_coll.h +++ /dev/null @@ -1,195 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef COMMON_COLL_H_ -#define COMMON_COLL_H_ - -#include "core.h" -#include "enqueue.h" -#include "collectives/collectives.h" - -static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { - cudaPointerAttributes attr; - cudaError_t err = cudaPointerGetAttributes(&attr, pointer); - if (err != cudaSuccess || attr.devicePointer == NULL) { - WARN("%s : %s is not a valid pointer", opname, ptrname); - return ncclInvalidArgument; - } -#if CUDART_VERSION >= 10000 - if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { -#else - if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { -#endif - WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev); - return ncclInvalidArgument; - } - return ncclSuccess; -} - -static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { - if (ptr == NULL) { - WARN("%s : %s argument is NULL", opname, ptrname); - return ncclInvalidArgument; - } - return ncclSuccess; -} - -static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) { - NCCLCHECK(PtrCheck(comm, opname, "comm")); - // First, the easy ones - if (root < 0 || root >= comm->nRanks) { - WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks); - return ncclInvalidArgument; - } - if (type < 0 || type >= ncclNumTypes) { - WARN("%s : invalid type %d", opname, type); - return ncclInvalidArgument; - } - if (op < 0 || op >= ncclNumOps) { - WARN("%s : invalid reduction operation %d", opname, op); - return ncclInvalidArgument; - } - - if (comm->checkPointers) { - // Check CUDA device pointers - if (strcmp(opname, "Broadcast") != 0 || comm->rank == root) { - NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname)); - } - if (strcmp(opname, "Reduce") != 0 || comm->rank == root) { - NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname)); - } - } - return ncclSuccess; -} - -static __inline__ int ncclTypeSize(ncclDataType_t type) { - switch (type) { - case ncclInt8: - case ncclUint8: - return 1; - case ncclFloat16: - return 2; - case ncclInt32: - case ncclUint32: - case ncclFloat32: - return 4; - case ncclInt64: - case ncclUint64: - case ncclFloat64: - return 8; - default: - return -1; - } -} - -// In : comm, nbytes ; Out : nrings, nthreads, ll -// - We start with the minimum number of threads possible (64) and see if the size fits in LL; -// If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default) -// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads -// This ensures we don't use a large number of rings with a small number of threads -// - We use the NCCL_LL_RING_THRESHOLD as the per-thread threshold before we reach the max number of threads -// we use NCCL_THREAD_THRESHOLD when we reach the max -// - If by the max number of LL threads, the size still cannot fit in LL, then we use non-LL setting -// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too -static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) { - *ll = 0; - int 
llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */ - if (comm->llThreshold >= 0) { /* user sets total LL threshold */ - if (nbytes > comm->llThreshold) { /* non-LL */ - *nthreads = comm->nThreads+1; - *nrings = comm->nRings; - return; - } else { - llEnforced = 1; /* user wants to use LL */ - } - } - int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */ - size_t nr; - int ll_max_nthreads = std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* respect user's setting or platform's default setting */ - int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS; - ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD); - while (nt < ll_max_nthreads && *ll == 0) { - nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks)); - if (nr <= maxRings) { /* avoid using few threads but many rings */ - nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr; - *ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1; - } - if (*ll == 0) { - nt = nt << 1; - } - } - if (*ll == 1) { - *nthreads = nt; - *nrings = (int)nr; - return; /* we can use smaller number of threads to make LL work, stop here */ - } - nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */ - nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr; - *ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1; - *nthreads = *ll ? ll_max_nthreads : comm->nThreads+1; - *nrings = *ll ? (int)nr : comm->nRings; -} - -static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, size_t nbytes, int loopFactor) { - int llMode, nBlocks, nThreads; - ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode); - comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads); - if (comm->userStreamSet == false) { - comm->userStream = stream; - comm->userStreamSet = true; - } else if (stream != comm->userStream) { - WARN("Error : mixing different streams within a group call is not supported."); - return ncclInvalidUsage; - } - int lastChunkSize = 0; - if (llMode == 1) { - int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype); - const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize; - lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor); - ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype)); - } - for (int bid=0; bid<nBlocks; bid++) { - struct ncclRing* ring = comm->rings+(comm->myParams->gridDim.x % comm->nRings); - if (ring->collCount == NCCL_MAX_OPS) { - WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS); - return ncclInvalidUsage; - } - - comm->myParams->gridDim.x++; - - int opIndex = ring->collFifoTail; - struct ncclColl* c = ring->collectives+opIndex; - volatile uint8_t* activePtr = (volatile uint8_t*)&c->active; - while (activePtr[0] != 0) sched_yield(); - - struct CollectiveArgs* args = &c->args; - args->root = root; - args->N = count; - args->ThisInput = sendbuff; - args->ThisOutput = recvbuff; - args->comm = comm->devComm; - args->opCount = comm->opCount; - args->bid = bid; - args->nRings = nBlocks; - args->nThreads = nThreads; - args->lastChunkSize = lastChunkSize; - - c->nThreads = nThreads; - c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode); - c->active = 1; - opIndex = (opIndex+1)%NCCL_MAX_OPS; - c->nextIndex = opIndex; - 
ring->collFifoTail = opIndex; - ring->collCount++; - } - /*if (llMode == 0)*/ comm->opCount++; - return ncclSuccess; -} - -extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl); - -#endif diff --git a/src/include/core.h b/src/include/core.h index 8285df5..d57d271 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -8,6 +8,7 @@ #define NCCL_CORE_H_ #define NCCL_MAX_OPS 2048 +#define NCCL_STEPS 8 #include "nccl.h" #include "transport.h" @@ -29,15 +30,15 @@ struct cudaLaunchParams { }; #endif -#define MAXRINGS 16 +#define MAXCHANNELS 16 #define MAXTHREADS 256 #define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ -// Rings / LL tuning -#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings -#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL for Volta and above +// Channels / LL tuning +#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings +#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL #define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs -#define NCCL_LL_MAX_NTHREADS 256 +#define NCCL_LL_MAX_NTHREADS MAXTHREADS #define NCCL_LL_MIN_NTHREADS 64 #define DIVUP(x, y) \ @@ -63,43 +64,84 @@ union ncclLLFifoLine { int4 i4; }; +typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t; + +typedef enum { + ncclPatternRing, + ncclPatternRingTwice, + ncclPatternPipelineFrom, + ncclPatternPipelineTo, + ncclPatternTreeUp, + ncclPatternTreeDown, + ncclPatternTreeUpDown +} ncclPattern_t; + +typedef enum { + ncclDevSuccess, + ncclDevAssertedMismatch, + ncclDevSuspectedMismatch +} ncclDevError_t; + +// Used to pass NCCL call information between functions +struct ncclInfo { + ncclColl_t coll; + const char* opName; + // NCCL Coll Args + const void* sendbuff; + void* recvbuff; + size_t count; + ncclDataType_t datatype; + ncclRedOp_t op; + int root; + ncclComm_t comm; + cudaStream_t stream; + // Algorithm details + int chunkSteps; + int sliceSteps; + // Computed later + ncclPattern_t pattern; + size_t nBytes; + int nstepsPerLoop; + int nchunksPerLoop; +}; + struct ncclConnInfo { // Regular comm mechanism char *buff; // Local for recv, remote for send uint64_t *tail; // Local for recv, remote for send uint64_t *head; // Local for send, remote for recv - uint64_t *opCount; // Local for recv, remote for send + uint64_t *opCountLoc; // opCount of local rank + uint64_t *opCountRem; // opCount of remote rank int direct; // Direct communication void **ptrExchange; // Pointer exchange for direct communication int *fifo; // Size fifo for proxy + uint64_t step; // Keep where we are + // Low latency mechanism - char *llBuff; // Local for recv, remote for send - uint64_t *llHead; // Local for send, remote for recv - int *llFifo; // LL Size fifo for proxy - uint64_t llStep; // Keep where we are + union ncclLLFifoLine *llBuff; // Local for recv, remote for send uint64_t llLastCleaning; }; struct ncclConnector { - struct transportProxyInfo* proxyInfo; - struct ncclTransport* transport; + int connected; + struct ncclProxyArgs *proxyAppend; + struct ncclTransportComm* transportComm; void* transportResources; // Host-side resources struct ncclConnInfo conn; + struct ncclComm *comm; }; #define CACHE_LINE_SIZE 128 #define MEM_ALIGN 4096 -#define SIZES_FIFO_SIZE 32 #define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */ -#define NCCL_LL_CHUNKS 8 
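// Editor's note: an illustration, not part of the patch. The LL buffer
// macros here are rebased from per-chunk onto the NCCL_STEPS pipeline
// constant, and the arithmetic shows the totals are unchanged. With
// NUM_LINES_PER_THREAD = 8, NCCL_LL_MAX_NTHREADS = 256, NCCL_STEPS = 8
// and sizeof(union ncclLLFifoLine) = 16 bytes:
//   old: NCCL_LL_BUFF_SIZE   = 8*256*8*16           = 256KB
//        NCCL_LL_BUFF_LINES  = 256KB/16B            = 16384
//        NCCL_LL_SLICE_LINES = 16384/NCCL_LL_CHUNKS = 2048
//   new: NCCL_LL_SLICE_LINES = 8*256                = 2048
//        NCCL_LL_BUFF_LINES  = 2048*NCCL_STEPS      = 16384
//        NCCL_LL_BUFF_SIZE   = 16384*16B            = 256KB
// A slice now corresponds to one pipeline step, matching the step-based
// flow control (uint64_t step) added to ncclConnInfo above.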
#define NUM_LINES_PER_THREAD 8 -#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 256K -#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t))) -#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS) +#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) +#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS) +#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine)) #define NCCL_LL_CLEAN_FREQ 0x10000000 struct ncclSendMem { @@ -109,7 +151,7 @@ struct ncclSendMem { char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; void* ptrExchange; char pad2[CACHE_LINE_SIZE-sizeof(void*)]; - uint64_t llHead; + uint64_t opCount; }; char pad3[MEM_ALIGN]; }; @@ -119,37 +161,54 @@ struct ncclRecvMem { union { struct { uint64_t tail; - char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; uint64_t opCount; - char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)]; - int sizesFifo[SIZES_FIFO_SIZE]; - int llSizesFifo[SIZES_FIFO_SIZE]; + char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; + int sizesFifo[NCCL_STEPS]; }; - char pad5[MEM_ALIGN]; + char pad4[MEM_ALIGN]; }; - char llBuff[NCCL_LL_BUFF_SIZE]; + ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; char buff[1]; // Actually larger than that }; struct ncclRing { + // Shortcuts for userRanks[1] and userRanks[n-1] + int prev; + int next; + + // Maps an internal nccl index to user-specified rank order. This is necessary + // since we need to know how the user expects data to be ordered across + // devices. Ordered from current device. + int* userRanks; + int* devUserRanks; +}; + +#define NCCL_MAX_TREE_ARITY 3 +struct ncclTree { + int depth; + int up; + int down[NCCL_MAX_TREE_ARITY]; +}; + +struct ncclPeer { + struct ncclConnector send; + struct ncclConnector recv; +}; + +struct ncclChannel { union { struct { + struct ncclRing ring; + struct ncclTree tree; + int id; int nthreads; - // Per ring resources - struct ncclSendMem* devMemSend; // CUDA-size resources - struct ncclRecvMem* devMemRecv; // CUDA-size resources int buffSize; - int devMemSendSize; // Keep the size for IPCs - int devMemRecvSize; // Keep the size for IPCs - struct ncclConnector send; - struct ncclConnector recv; - // Maps an internal nccl index to user-specified rank order. This is necessary - // since we need to know how the user expects data to be ordered across - // devices. Ordered from current device. - int* userRanks; - int* devUserRanks; + // Communication structures + struct ncclPeer* peers; + struct ncclPeer* devPeers; // Operation list for aggregation struct ncclColl* collectives; @@ -162,7 +221,7 @@ struct ncclRing { int data[0x80]; }; }; -static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size"); +static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); /* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */ /* to make sure reads to host from the CUDA kernel are aligned. 
*/ @@ -179,7 +238,7 @@ struct CollectiveArgs { size_t N; uint32_t root; uint8_t bid; - uint8_t nRings; + uint8_t nChannels; uint16_t nThreads; int lastChunkSize; @@ -188,7 +247,6 @@ struct ncclColl { union { struct { struct CollectiveArgs args; - uint16_t nThreads; uint16_t funcIndex; uint16_t nextIndex; uint8_t active; @@ -199,11 +257,16 @@ struct ncclColl { static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size"); struct ncclComm { - struct ncclRing rings[MAXRINGS]; + struct ncclChannel channels[MAXCHANNELS]; + + struct ncclPeerInfo* peerInfo; + + void* bootstrap; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator int cudaDev; // my cuda device index + int nvmlDev; // my NVML device number enum { GROUP, PARALLEL } launchMode; cudaStream_t userStream; @@ -215,18 +278,31 @@ struct ncclComm { // where syncs are not symmetric). uint64_t opCount; - // Rings for collectives - int nRings; + // Channels for collectives + int nChannels; int nThreads; // Low-latency algorithm threshold ssize_t llThreshold; ssize_t threadThreshold; + // Tree algorithm threshold + ssize_t treeThreshold; + // An internal CUDA stream for NCCL kernel CGMD launches int groupCudaStream; cudaStream_t groupStream; + // Whether there has been a fatal error in this communicator. + ncclResult_t fatalError; + + // Error reported by GPU + volatile ncclDevError_t* fatalDevError; + + // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped) + // On device: this pointer has been obtained from cudaHostGetDevicePointer() + volatile uint32_t *abortFlag; + // Device copy of the communicator struct ncclComm *devComm; @@ -244,6 +320,10 @@ struct ncclComm { int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not struct ncclColl args; void* argsptr; + + // Global proxy thread + pthread_t proxyThread; + struct ncclProxyState proxyState; }; // Check CUDA calls @@ -324,6 +404,28 @@ struct ncclComm { #endif // end PROFAPI int ncclCudaCompCap(); +ncclResult_t ncclNvlinkGpu(int* nvlink); +int64_t ncclTreeThreshold(); + +static __inline__ int ncclTypeSize(ncclDataType_t type) { + switch (type) { + case ncclInt8: + case ncclUint8: + return 1; + case ncclFloat16: + return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: + return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: + return 8; + default: + return -1; + } +} #include <sys/mman.h> static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { diff --git a/src/include/cpuset.h b/src/include/cpuset.h new file mode 100644 index 0000000..f70d1d8 --- /dev/null +++ b/src/include/cpuset.h @@ -0,0 +1,61 @@ +/************************************************************************* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CPUSET_H_ +#define NCCL_CPUSET_H_ + +// Convert local_cpus, e.g. 
0003ff,f0003fff to cpu_set_t + +static int hexToInt(char c) { + int v = c - '0'; + if (v < 0) return -1; + if (v > 9) v = 10 + c - 'a'; + if ((v < 0) || (v > 15)) return -1; + return v; +} + +#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) + +ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) { + uint32_t cpumasks[CPU_SET_N_U32]; + int m = CPU_SET_N_U32-1; + cpumasks[m] = 0; + for (int o=0; o<strlen(str); o++) { + char c = str[o]; + if (c == ',') { + m--; + cpumasks[m] = 0; + } else { + int v = hexToInt(c); + if (v == -1) break; + cpumasks[m] <<= 4; + cpumasks[m] += v; + } + } + // Copy cpumasks to mask + for (int a=0; m<CPU_SET_N_U32; a++,m++) { + memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t)); + } + return ncclSuccess; +} + +ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) { + int c = 0; + uint8_t* m8 = (uint8_t*)mask; + for (int o=sizeof(cpu_set_t)-1; o>=0; o--) { + if (c == 0 && m8[o] == 0) continue; + sprintf(str+c, "%02x", m8[o]); + c+=2; + if (o && o%4 == 0) { + sprintf(str+c, ","); + c++; + } + } + str[c] = '\0'; + return ncclSuccess; +} + +#endif diff --git a/src/include/debug.h b/src/include/debug.h index 55dee18..3acdf8c 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -25,6 +25,7 @@ extern uint64_t ncclDebugMask; extern pthread_mutex_t ncclDebugOutputLock; extern FILE *ncclDebugFile; extern ncclResult_t getHostName(char* hostname, int maxlen); +extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev); extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...); diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 69d0463..4db7094 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -10,12 +10,7 @@ #include "core.h" #include "group.h" -typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); - -ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff, - void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, - ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast); ncclResult_t ncclCpuBarrierLast(ncclComm_t comm); ncclResult_t ncclCpuBarrierOut(ncclComm_t comm); diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index ce3f6ca..89edbf5 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -58,8 +58,50 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v1_t; -typedef ncclNet_v1_t ncclNet_t; +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Return the device path in /sys. NCCL will call free on this path. + ncclResult_t (*pciPath)(int dev, char** path); + // Return whether this device supports host pointers and/or CUDA pointers + // as data from the current GPU. Supported types should be composed with + // NCCL_PTR_HOST and NCCL_PTR_CUDA. + ncclResult_t (*ptrSupport)(int dev, int* supportedTypes); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. 
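// Editor's note: an illustrative calling sequence, not part of the patch.
// The main change from v1 is memory registration: buffers are registered
// once with regMr() and the opaque mhandle replaces v1's per-call pointer
// type argument to isend/irecv. Going by the comments in this struct, a
// transfer between two peers proceeds roughly as follows:
//   receiver: listen(dev, handle, &listenComm);    // handle exchanged out-of-band
//   sender:   connect(dev, handle, &sendComm);
//   receiver: accept(listenComm, &recvComm);
//   both:     regMr(comm, buf, size, NCCL_PTR_HOST /*or NCCL_PTR_CUDA*/, &mhandle);
//   sender:   isend(sendComm, buf, size, mhandle, &request);
//   receiver: irecv(recvComm, buf, size, mhandle, &request);
//   both:     test(request, &done, &size);         // poll until done != 0
//   receiver: flush(recvComm, buf, size, mhandle); // if data landed in CUDA memory
//   both:     deregMr(comm, mhandle), then closeSend/closeRecv/closeListen.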
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connectHandle + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); + // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v2_t; + +typedef ncclNet_v2_t ncclNet_t; -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1 +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2 #endif // end include guard diff --git a/src/include/net.h b/src/include/net.h index ebc9677..e75e6bb 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -26,9 +26,11 @@ static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK( static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } -static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; } -static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; } -static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; } +static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } +static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; } +static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; } +static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int 
size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; } +static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; } static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; } static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } diff --git a/src/include/nvlink.h b/src/include/nvlink.h index 7eb74c9..1baf9e5 100644 --- a/src/include/nvlink.h +++ b/src/include/nvlink.h @@ -67,18 +67,15 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) { if (res != ncclSuccess) return 0; for(int l=0; l<maxNvLinks; ++l) { - // nvmlDeviceGetNvLinkCapability(NVML_NVLINK_CAP_P2P_SUPPORTED) would seem to - // report whether the NVLink connects to a peer GPU (versus a POWER CPU?). I - // don't know whether nvmlDeviceGetNvLinkRemotePciInfo() would succeed in - // the POWER CPU case, so it seems best to check this as well. + // Check whether we can use this NVLink for P2P unsigned canP2P; if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue; - // nvmlDeviceGetNvLinkRemotePciInfo() will return NVML_ERROR_NOT_SUPPORTED - // if the links don't exist, or are disabled. So checking for that return - // here would probably make the nvmlDeviceGetNvLinkCapability check above - // redundant. Presumably, we still need to check the P2P capability above, - // since even non-GPUs would possess PCI info. + // Make sure the Nvlink is up. The previous call should have trained the link. + nvmlEnableState_t isActive; + if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue; + + // Try to figure out what's on the other side of the NVLink nvmlPciInfo_t remoteProc; if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue; @@ -89,7 +86,7 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) { p[c] = toupper(p[c]); } - if (strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) { + if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) { links++; } else { // Make a lower case copy of the bus ID for calling ncclDeviceType @@ -101,11 +98,21 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) { lowerId[c] = tolower(p[c]); } - // Determine if the remote side is NVswitch + // Determine if the remote side is NVswitch or a GPU enum ncclNvLinkDeviceType type; - if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) { - //TODO: we are making an assumption that all GPUs are connected to this switch - //This assumption may change for future architectures + ncclResult_t ret = ncclDeviceType(lowerId, &type); + if (ret == ncclSuccess) { + if (type == ncclNvLinkDeviceSwitch) { + //TODO: we are making an assumption that all GPUs are connected to this switch + //This assumption may change for future architectures + nvswitch_links++; + } else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) { + links++; + } + } else { + // The NVLink is up but we couldn't find the PCI device on the other + // side. 
Assume it's an NVswitch outside a VM. + if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch"); nvswitch_links++; } } @@ -113,43 +120,4 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) { return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links; } -static int getNumNvlinks(const char* busId) { - nvmlDevice_t nvmlDev; - ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev); - if (res != ncclSuccess) return 0; - - int nvlinks = 0, nvswitch_links = 0; - int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4; - for(int l=0; l<maxNvLinks; ++l) { - unsigned canP2P; - nvmlEnableState_t isActive; - if (wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) == ncclSuccess && canP2P && - wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) == ncclSuccess && isActive == NVML_FEATURE_ENABLED) { - nvlinks++; - } else { - continue; - } - - nvmlPciInfo_t remoteProc; - if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue; - - // Make a lower case copy of the bus ID for calling ncclDeviceType - // PCI system path is in lower case - char* p = remoteProc.busId; - char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) { - if (p[c] == 0) break; - lowerId[c] = tolower(p[c]); - } - - // Determine if the remote side is NVswitch - enum ncclNvLinkDeviceType type; - if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) { - //TODO: we are making an assumption that all GPUs are connected to this switch - //This assumption may change for future architectures - nvswitch_links++; - } - } - return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*nvlinks; -} #endif diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index ddfd233..0b6198a 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -7,7 +7,7 @@ #ifndef NCCL_NVMLWRAP_H_ #define NCCL_NVMLWRAP_H_ -#include "core.h" +#include "nccl.h" //#define NVML_DIRECT 1 #ifdef NVML_DIRECT @@ -32,14 +32,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) NVMLCHECK(nvmlDeviceGetIndex(device, index)); return ncclSuccess; } -static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) { - NVMLCHECK(nvmlDeviceSetCpuAffinity(device)); - return ncclSuccess; -} -static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) { - NVMLCHECK(nvmlDeviceClearCpuAffinity(device)); - return ncclSuccess; -} static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) { NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device)); return ncclSuccess; @@ -61,6 +53,10 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult)); return ncclSuccess; } +static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { + NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber)); + return ncclSuccess; +} #else // Dynamically handle dependencies on NVML @@ -136,14 +132,14 @@ ncclResult_t wrapNvmlInit(void); ncclResult_t wrapNvmlShutdown(void); ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); -ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device); -ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device); ncclResult_t 
wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci); ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); +ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber); + #endif // NVML_DIRECT #endif // End include guard diff --git a/src/include/ring.h b/src/include/ring.h deleted file mode 100644 index fa5e099..0000000 --- a/src/include/ring.h +++ /dev/null @@ -1,14 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_RING_H_ -#define NCCL_RING_H_ -#include "core.h" - -ncclResult_t initRing(struct ncclComm* comm, int ringid); -ncclResult_t freeRing(struct ncclRing* ring); - -#endif diff --git a/src/include/rings.h b/src/include/rings.h index 751846c..43fc595 100644 --- a/src/include/rings.h +++ b/src/include/rings.h @@ -12,6 +12,6 @@ static int getDefaultThreads() { return ncclCudaCompCap() == 3 ? 128 : 256; } -ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next); +ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut); #endif diff --git a/src/include/socket.h b/src/include/socket.h index 624af40..fb5cfc0 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -60,7 +60,9 @@ static inline int envSocketFamily(void) { } static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) { +#ifdef ENABLE_TRACE char line[1024]; +#endif struct netIf userIfs[MAX_IFS]; bool searchNot = prefixList && prefixList[0] == '^'; int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); @@ -106,7 +108,6 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre // Store the IP address int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6); memcpy(addrs+found, interface->ifa_addr, salen); - INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line)); found++; } } @@ -336,8 +337,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line)); #endif - /* Put the socket in listen mode */ - SYSCHECK(listen(sockfd, 128), "listen"); + /* Put the socket in listen mode + * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn + */ + SYSCHECK(listen(sockfd, 16384), "listen"); *fd = sockfd; return ncclSuccess; } diff --git a/src/include/transport.h b/src/include/transport.h index 59f83c9..6231a71 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -9,6 +9,7 @@ #include "nccl.h" #include <stdint.h> +#include "nvmlwrap.h" #define NTRANSPORTS 3 @@ -19,11 +20,13 @@ struct ncclRing; struct ncclConnector; struct ncclComm; -#define RANK_INFO_SIZE 64 -typedef char ncclTinfo_t[RANK_INFO_SIZE]; - -struct ncclInfo { - ncclTinfo_t tinfo[NTRANSPORTS]; +struct ncclPeerInfo { + int rank; + int cudaDev; + int nvmlDev; + uint64_t hostHash; + uint64_t pidHash; + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; }; // Used to hold the transport connection values @@ -34,18 +37,47 @@ struct ncclConnect { char data[CONNECT_SIZE]; }; +enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone }; + +struct ncclProxyArgs; +typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*); + struct ncclProxyArgs { - struct ncclRing* ring; - int substeps; + proxyProgressFunc_t progress; + struct ncclChannel* channel; + struct ncclConnector* connector; + int sliceSteps; + int chunkSteps; int nsteps; uint64_t opCount; int llMode; - bool needProxy; - int active; // add component before this line -- it is left out during initialization + int state; // add component before this line -- it is left out during initialization + + // Internal state + uint64_t head; + uint64_t tail; + uint64_t end; + void* requests[NCCL_STEPS]; + int idle; + + // Element linking + pthread_mutex_t mutex; + struct ncclProxyArgs* next; + struct ncclProxyArgs* nextPeer; +}; + +struct ncclProxyPool; +struct ncclProxyState { + pthread_cond_t cond; + pthread_mutex_t mutex; + bool stop; + struct ncclProxyArgs* ops; + struct ncclProxyArgs* pool; + struct ncclProxyPool* pools; }; struct ncclTransportComm { - ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*); + ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId); ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*); ncclResult_t (*free)(void*); ncclResult_t (*proxy)(struct ncclProxyArgs*); @@ -53,8 +85,7 @@ struct ncclTransportComm { struct ncclTransport { const char name[4]; - ncclResult_t (*fillInfo)(ncclTinfo_t*, int); - ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*); + ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*); ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*); struct ncclTransportComm send; struct ncclTransportComm recv; @@ -64,37 +95,17 @@ struct ncclTransport { typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); -#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS - -struct transportProxyInfo { - struct ncclComm* comm; - pthread_t thread; - threadFunc_t func; - volatile int proxyReady; - struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE]; - volatile uint64_t argsFifoHead; - volatile uint64_t argsFifoTail; - pthread_cond_t cond; - pthread_mutex_t mutex; -}; - -ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm); -ncclResult_t transportDestroyProxy(struct ncclConnector* connector); - enum proxyMode { proxyRing = 0, proxyFrom = 1, proxyTo = 2 }; -static int proxyPatternRing = proxyRing; -static inline int proxyPatternFrom(int root) { return 1+root; } -static inline int proxyPatternTo(int root) { return -1-root; } -static inline enum 
proxyMode proxyPatternMode(int pattern) { return (pattern == 0) ? proxyRing : ((pattern > 0) ? proxyFrom : proxyTo); } -static inline int proxyPatternRoot(int pattern) { return (pattern > 0) ? pattern-1 : -pattern-1; } - -ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm); -ncclResult_t transportStartProxies(struct ncclComm* comm); +ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr); +ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks); +ncclResult_t transportStartProxy(struct ncclComm* comm); +ncclResult_t transportCreateProxy(struct ncclComm* comm); +ncclResult_t transportDestroyProxy(struct ncclComm* comm); #include <unistd.h> diff --git a/src/include/trees.h b/src/include/trees.h new file mode 100644 index 0000000..1a151d1 --- /dev/null +++ b/src/include/trees.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TREES_H_ +#define NCCL_TREES_H_ + +ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0); +ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1); + +#endif diff --git a/src/init.cu b/src/init.cu index 9d0188e..75822e6 100644 --- a/src/init.cu +++ b/src/init.cu @@ -1,21 +1,26 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #include "core.h" -#include "ring.h" +#include "channel.h" #include "param.h" #include "nvmlwrap.h" #include "rings.h" +#include "trees.h" #include "bootstrap.h" #include "transport.h" -#include "common_coll.h" #include "group.h" #include "utils.h" #include "net.h" +#include "checks.h" +#include "enqueue.h" +#include "topo.h" +#include "nvlink.h" +#include "cpuset.h" #include <stdio.h> #include <stdlib.h> #include <sys/mman.h> @@ -55,6 +60,16 @@ NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); ncclNet_t* ncclNet = NULL; // We define this as weak to let tests redefine their own +#pragma weak ncclNvlinkGpu +ncclResult_t ncclNvlinkGpu(int* nvlink) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev)); + *nvlink = getNvlinkGpu(busId, NULL); + return ncclSuccess; +} +// We define this as weak to let tests redefine their own #pragma weak ncclCudaCompCap int ncclCudaCompCap() { int cudaDev; @@ -77,10 +92,7 @@ ncclResult_t initNet(ncclNet_t* net) { int ndev; if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError; if (net->devices(&ndev) != ncclSuccess) return ncclInternalError; - if (ndev <= 0) { - INFO(NCCL_INIT|NCCL_NET, "Net/%s: call to devices() returned 0 devices.", net->name); - return ncclSystemError; - } + if (ndev <= 0) return ncclSystemError; return ncclSuccess; } @@ -91,15 +103,15 @@ ncclResult_t initNetPlugin(ncclNet_t** net) { // string, so checking errno doesn't hurt to try to provide a better // error message if (errno == ENOENT) { - INFO(NCCL_INIT|NCCL_NET, "No network plugin found."); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so)."); } else { - INFO(NCCL_INIT|NCCL_NET, "Unable to load libnccl-net.so : %s", dlerror()); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); } return ncclSuccess; } ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL)); if (extNet == NULL) { - INFO(NCCL_INIT|NCCL_NET, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol"); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol."); goto cleanup; } if (initNet(extNet) == ncclSuccess) { @@ -116,21 +128,18 @@ ncclResult_t initNet() { NCCLCHECK(initNet(&ncclNetSocket)); NCCLCHECK(initNetPlugin(&ncclNet)); - if (ncclNet != NULL) { - INFO(NCCL_INIT|NCCL_NET, "Using network plugin %s", ncclNetName()); - return ncclSuccess; - } + if (ncclNet != NULL) return ncclSuccess; if (initNet(&ncclNetIb) == ncclSuccess) { ncclNet = &ncclNetIb; } else { ncclNet = &ncclNetSocket; } - INFO(NCCL_INIT|NCCL_NET,"Using network %s", ncclNetName()); return ncclSuccess; } NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2); NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2); +NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", -2); int ncclThreadThreshold(int minCompCap, int multiNode) { int threshold = ncclParamThreadThreshold(); @@ -177,10 +186,15 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + free(comm->peerInfo); + + if (comm->bootstrap) + NCCLCHECK(bootstrapClose(comm->bootstrap)); + CUDACHECK(cudaFree(comm->devComm)); - for (int ring=0; ring<comm->nRings; ring++) - NCCLCHECK(freeRing(comm->rings+ring)); + for (int channel=0; channel<comm->nChannels; channel++) + 
NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks)); if (comm->doneEvent != NULL) CUDACHECK(cudaEventDestroy(comm->doneEvent)); @@ -199,6 +213,8 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->intraCGMode); free(comm->intraCC); } + CUDACHECK(cudaFreeHost((void *)comm->abortFlag)); + CUDACHECK(cudaFreeHost((void *)comm->fatalDevError)); free(comm); return ncclSuccess; @@ -222,12 +238,15 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { struct ncclComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); - INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev); comm->rank = rank; comm->nRanks = ndev; cudaGetDevice(&comm->cudaDev); + getNvmlDevice(comm->cudaDev, &comm->nvmlDev); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev); + comm->doneEvent = doneEvent; comm->llThreshold = ncclParamLlThreshold(); + comm->treeThreshold = ncclParamTreeThreshold(); comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; #if CUDART_VERSION >= 9200 comm->groupCudaStream = ncclParamGroupCudaStream(); @@ -235,6 +254,13 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { // Don't allow the user to overload the default setting in older CUDA builds comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM; #endif + comm->fatalError = ncclSuccess; + + CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped)); + *comm->fatalDevError = ncclDevSuccess; + + CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped)); + *comm->abortFlag = 0; comm->argsptr = &comm->args; @@ -248,9 +274,18 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { // Copy the comm on the device NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1)); // Copy userRanks - for (int r=0; r<comm->nRings; r++) { - NCCLCHECK(ncclCudaMemcpy(comm->rings[r].devUserRanks, comm->rings[r].userRanks, comm->nRanks)); + for (int r=0; r<comm->nChannels; r++) { + NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); + NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks)); } + // Copy the device-accessible pointer to comm->abortFlag + void *devAbortFlag; + CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0)); + CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice)); + // Copy the device-accessible pointer to comm->fatalDevError + void *devFatalError; + CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0)); + CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice)); return ncclSuccess; } @@ -267,35 +302,81 @@ static void showVersion() { } } -static ncclResult_t fillInfo(struct ncclInfo* info, int rank) { - for (int t=0; t<NTRANSPORTS; t++) { - NCCLCHECK(ncclTransports[t].fillInfo(info->tinfo+t, rank)); - } +static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) { + info->rank = rank; + CUDACHECK(cudaGetDevice(&info->cudaDev)); + NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev)) + info->hostHash=getHostHash(); + info->pidHash=getPidHash(); + + // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the + // cudaDev is a CUDA runtime dev number which could be different from the + // NVML device number. 
Then we get the busID from NVML to be sure it is + // consistent with NVML remote PCI bus Ids. + CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev)); + nvmlDevice_t nvmlDevice; + NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice)); + nvmlPciInfo_t pciInfo; + NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo)); + strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE); return ncclSuccess; } template <int type> -static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) { +static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) { for (int t=0; t<NTRANSPORTS; t++) { struct ncclTransport *transport = ncclTransports+t; struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv; ncclTvalue_t ret = 0; - NCCLCHECK(transport->canConnect(&ret, myInfo->tinfo+t, peerInfo->tinfo+t)); + NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo)); if (ret > 0) { - NCCLCHECK(transportComm->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring)); - *transportRet = transport; + connector->transportComm = transportComm; + NCCLCHECK(transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId)); return ncclSuccess; } } WARN("No transport found !"); - *transportRet = NULL; return ncclInternalError; } -static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) { - NCCLCHECK(initRing(comm, ringid)); +static int log2(int n) { + int l = 0; + while (n>>=1) l++; + return l; +} + +static ncclResult_t ncclTreeThreshold(int nnodes, int nranks, int nChannels, ssize_t *treeThreshold) { + int nvlink; + NCCLCHECK(ncclNvlinkGpu(&nvlink)); + float ringbw = nvlink ? 5000*nChannels : 5000; // approx, in MB/s or B/us + float ringlatinter = 6; + float treelatintra = 4; + float treelatinter = 15; + float treebw; + if (!nvlink) { + treebw = ringbw * 2 / 3; + } else { + treebw = ringbw * 3 / 4; + if (nnodes == 2) treebw *= 2; + } + float ringlat = ringlatinter*(nranks-1); + float treelat = treelatinter*log2(nnodes)+treelatintra*(nranks/nnodes-1); + if (nnodes < 2 || ringlat <= treelat) + *treeThreshold = 0; + else if (treebw > ringbw) + *treeThreshold = 0x7fffffffffffffff; + else + *treeThreshold = (ssize_t)(((ringbw*treebw/(ringbw-treebw)))*(ringlat-treelat)); + return ncclSuccess; +} + +static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks, int* treeMasters) { + TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); + NCCLCHECK(initChannel(comm, channelId)); + + struct ncclChannel* channel = comm->channels+channelId; + struct ncclRing* ring = &channel->ring; - struct ncclRing* ring = comm->rings+ringid; // Reorganize ranks to start with rank. 
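// Editor's note: a worked example, not part of the patch. With
// ringRanks = {2,0,3,1} and rank = 3, the loop below finds shift = 2 and
// userRanks becomes {3,1,2,0}: this rank first, then the ring order as
// seen from it. Hence prev = userRanks[nranks-1] = 0 and
// next = userRanks[1] = 1 for this rank.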
int shift; for (shift = 0; shift<nranks; shift++) { @@ -306,21 +387,85 @@ static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int n for (int i=0; i<nranks; i++) { ring->userRanks[i] = ringRanks[(i+shift)%nranks]; } - int prev = ring->userRanks[nranks-1]; - int next = ring->userRanks[1]; + int prev = ring->prev = ring->userRanks[nranks-1]; + int next = ring->next = ring->userRanks[1]; + + struct ncclTree* tree = &channel->tree; + tree->up = -1; + tree->down[0] = tree->down[1] = tree->down[2] = -1; + + // + // Find per-node masters and connect them via a binary tree + // + + int nMasters = 0; + for (int r=0; r<nranks; r++) nMasters += treeMasters[r]; + if (nMasters == 0) { + nMasters = 1; + treeMasters[0] = 1; + } + + if (comm->treeThreshold == -2) + NCCLCHECK(ncclTreeThreshold(nMasters, comm->nRanks, comm->nChannels, &comm->treeThreshold)); + + if (comm->treeThreshold > 0) { + // Compute tree depth. Not an exact value but a good approximation in most + // cases and consistent across nodes + tree->depth = nranks/nMasters + log2(nMasters); + + // Find my master : go backwards in the ring to find my root + int master = 0; + for (int i = 0; i<nranks; i++) { + int r = ring->userRanks[(nranks-i)%nranks]; + if (treeMasters[r]) { + master = r; + break; + } + } - NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+0, &ring->recv.transport, ring)); - NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+1, &ring->send.transport, ring)); - NCCLCHECK(transportCreateProxy(0, ring, comm)); - NCCLCHECK(transportCreateProxy(1, ring, comm)); + int ranks[nMasters]; + int i = 0, masterIndex = -1; + // Build binary tree + for (int r=0; r<nranks; r++) { + // Create index table + if (r == master) masterIndex = i; + if (treeMasters[r]) ranks[i++] = r; + } + int btreeUp, btreeDown0, btreeDown1; + int u0, d0_0, d0_1, u1, d1_0, d1_1; + NCCLCHECK(ncclGetDtree(nMasters, masterIndex, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1)); + if (channelId < DIVUP(comm->nChannels, 2)) { + btreeUp = u0; btreeDown0 = d0_0; btreeDown1 = d0_1; + } else { + btreeUp = u1; btreeDown0 = d1_0; btreeDown1 = d1_1; + } + + // + // Now build the full tree, combining the intra-node ring and the + // inter-node binary tree. 
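// Editor's note: a concrete example, not part of the patch, assuming
// ncclGetDtree pairs the masters into two complementary binary trees.
// Take 2 nodes of 4 GPUs, ring 0->1->2->3->4->5->6->7->0, masters {0,4}.
// For a channel in the first half (using u0/d0_*), the code below yields:
//   rank 0 (master): up=-1, down={1,4}    (intra-node chain + btree child)
//   ranks 1,2:       up=prev, down={next}, e.g. rank 1: up=0, down={2}
//   rank 3:          up=2, down={}         (next==4 is the other master)
//   rank 4 (master): up=0, down={5}, with 5,6,7 chained below it
// Each node is thus a chain hanging off its master, masters are linked by
// the binary tree, and the second half of the channels uses the
// complementary tree rooted at the other master.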
+ // + + if (rank == master) { + int nDown = 0; + if (btreeUp != -1) tree->up = ranks[btreeUp]; + if (treeMasters[next] == 0) tree->down[nDown++] = next; + if (btreeDown0 != -1) tree->down[nDown++] = ranks[btreeDown0]; + if (btreeDown1 != -1) tree->down[nDown++] = ranks[btreeDown1]; + } else { + tree->up = prev; + if (treeMasters[next] == 0) tree->down[0] = next; + } + } + + TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); return ncclSuccess; } -static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) { +static ncclResult_t fillConnect(struct ncclPeerInfo* peerInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) { for (int r=0; r<nranks; r++) { connectTransport[r] = -1; for (int t=0; t<NTRANSPORTS; t++) { - NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, allInfo[rank].tinfo+t, allInfo[r].tinfo+t)); + NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, peerInfo+rank, peerInfo+r)); if (connectValue[r] > 0) { connectTransport[r] = t; break; @@ -330,11 +475,6 @@ static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, return ncclSuccess; } -static void swap(void* mem1, void* mem2, int size) { - char tmp[size]; - memcpy(tmp, mem1, size); memcpy(mem1, mem2, size); memcpy(mem2, tmp, size); -} - #define MAXWIDTH 20 #define PREFIXLEN 15 #define STRLENGTH (PREFIXLEN+5*MAXWIDTH) @@ -380,9 +520,9 @@ void dumpLine(int* values, int nranks, const char* prefix) { static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { for (int r=0; r<nrings; r++) { char prefix[30]; - /*sprintf(prefix, "[%d] Ring %d Prev : ", rank, r); + /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r); dumpLine(prev+r*nranks, nranks, prefix); - sprintf(prefix, "[%d] Ring %d Next : ", rank, r); + sprintf(prefix, "[%d] Channel %d Next : ", rank, r); dumpLine(next+r*nranks, nranks, prefix);*/ int current = rank; @@ -390,7 +530,7 @@ static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int rings[r*nranks+i] = current; current = next[r*nranks+current]; } - sprintf(prefix, "Ring %02d : ", r); + sprintf(prefix, "Channel %02d : ", r); if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix); if (current != rank) { WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank); @@ -488,140 +628,274 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct return ncclSuccess; } +static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { + TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); + uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */ + struct ncclConnect connect; + struct ncclConnector* conn; + for (int i=0; i<nrecv; i++) { + int peer = peerRecv[i]; + if (peer == -1) continue; + conn = &channel->peers[peer].recv; + if (conn->connected) { ++nSkippedRecv; continue; } + NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + } + for (int i=0; i<nsend; i++) { + int peer = peerSend[i]; + if (peer == -1) continue; + conn = &channel->peers[peer].send; + if (conn->connected) { ++nSkippedSend; continue; } + NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + 
+    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+  }
+  for (int i=0; i<nsend; i++) {
+    int peer = peerSend[i];
+    if (peer == -1) continue;
+    conn = &channel->peers[peer].send;
+    if (conn->connected) {++nSkippedSend; continue; }
+    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+    NCCLCHECK(conn->transportComm->connect(&connect, conn));
+    conn->connected = 1;
+  }
+  for (int i=0; i<nrecv; i++) {
+    int peer = peerRecv[i];
+    if (peer == -1) continue;
+    conn = &channel->peers[peer].recv;
+    if (conn->connected) {++nSkippedRecv; continue; }
+    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+    NCCLCHECK(conn->transportComm->connect(&connect, conn));
+    conn->connected = 1;
+  }
+  TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
+  return ncclSuccess;
+}
+
 static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
+  // We use 3 AllGathers
+  // 1. { peerInfo, comm }
+  // 2. ConnectTransport[nranks], ConnectValue[nranks]
+  // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
+
   int rank = comm->rank;
   int nranks = comm->nRanks;
-  void* commState;
-  NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState));
+  TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
+  NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));

-  struct ncclInfo* allInfo;
-  NCCLCHECK(ncclCalloc(&allInfo, nranks));
-  NCCLCHECK(fillInfo(allInfo+rank, rank));
-  NCCLCHECK(bootstrapAllGather(commState, allInfo, sizeof(struct ncclInfo)));
+  // AllGather1 - begin
+  struct {
+    struct ncclPeerInfo peerInfo;
+    struct ncclComm* comm;
+  } *allGather1Data;
+
+  NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
+  allGather1Data[rank].comm = comm;
+  NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
+  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
+
+  NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
+  for (int i = 0; i < nranks; i++) {
+    memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
+  }
+  // AllGather1 data is used again below
+  // AllGather1 - end
+
+  // AllGather2 - begin
+  size_t allGather2DataRowSize = sizeof(int)*nranks + sizeof(ncclTvalue_t)*nranks;
+  void *allGather2Data;
+  NCCLCHECK(ncclCalloc((char **)&allGather2Data, allGather2DataRowSize*nranks));
+  int *myTransportRow = (int *)((char *)allGather2Data + allGather2DataRowSize*rank);
+  ncclTvalue_t *myValueRow = (ncclTvalue_t *)(myTransportRow + nranks);
+
+  NCCLCHECK(fillConnect(comm->peerInfo, nranks, rank, myTransportRow, myValueRow));
+  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather2Data, allGather2DataRowSize));

   int* connectTransport;
   ncclTvalue_t* connectValue;
   NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
   NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
+  for (int i = 0; i < nranks; i++) {
+    memcpy(connectTransport + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize, sizeof(int)*nranks);
+    memcpy(connectValue + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize + nranks*sizeof(int), sizeof(ncclTvalue_t)*nranks);
+  }
+  free(allGather2Data);
+  // AllGather2 - end

-  NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
-  NCCLCHECK(bootstrapAllGather(commState, connectTransport, nranks*(sizeof(int))));
-  NCCLCHECK(bootstrapAllGather(commState, connectValue, nranks*(sizeof(ncclTvalue_t))));
   //if (rank == 0) dumpMatrix(connectTransport, nranks);
   //if (rank == 0) dumpMatrixTvalue(connectValue, nranks);

   // Get my rings
   int nrings;
-  int* prev, *next;
-  NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
+  int* prev, *next, *treeIn, *treeOut;
+  NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
   comm->nThreads = getDefaultThreads();
-  NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next));
+  NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
+  TRACE(NCCL_INIT, "rank %d nranks %d - BUILD %d RINGS", rank, nranks, nrings);
+  assert(nrings <= MAXCHANNELS);
   free(connectTransport);
   free(connectValue);

+  // AllGather3 - begin
+  struct {
+    int nThreads;
+    int nrings;
+    int cudaCompCap;
+    int prev[MAXCHANNELS];
+    int next[MAXCHANNELS];
+  } *allGather3Data;
+
+  NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
+  allGather3Data[rank].nThreads = comm->nThreads;
+  allGather3Data[rank].nrings = nrings;
+  allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
+  for (int r=0; r<nrings; r++) {
+    allGather3Data[rank].prev[r] = *(prev+r*nranks+rank);
+    allGather3Data[rank].next[r] = *(next+r*nranks+rank);
+  }
+  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
+
   // Find max nThreads
-  int allData[nranks];
-  allData[rank] = comm->nThreads;
-  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
   for (int i=0; i<nranks; i++)
-    comm->nThreads = std::max(allData[i], comm->nThreads);
-  if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads);
+    comm->nThreads = std::max(allGather3Data[i].nThreads, comm->nThreads);

   // Determine the minimum CUDA Compute capability of all GPUs
-  int myCompCap = ncclCudaCompCap();
+  int myCompCap = allGather3Data[rank].cudaCompCap;
   int minCompCap = myCompCap;
-  allData[rank] = myCompCap;
-  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
-  for (int i=0; i<nranks; i++)
-    minCompCap = std::min(allData[i], minCompCap);
-  if (rank == 0) INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
+  for (int i = 0; i < nranks; i++)
+    minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
+
+  // Determine thread threshold across all GPUs
+  int nnodes = 0;
+  for (int r=0; r<nranks; r++) nnodes += treeIn[r];
+  comm->threadThreshold = ncclThreadThreshold(minCompCap, nnodes);

   // Find min nrings across ranks
-  allData[rank] = nrings;
-  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
   for (int i=0; i<nranks; i++)
-    nrings = std::min(allData[i], nrings);
-
-  // Exchange data with others to build complete rings
-  comm->nRings = nrings;
-  for (int r=0; r<nrings; r++) {
-    NCCLCHECK(bootstrapAllGather(commState, prev+r*nranks, sizeof(int)));
-    NCCLCHECK(bootstrapAllGather(commState, next+r*nranks, sizeof(int)));
+    nrings = std::min(allGather3Data[i].nrings, nrings);
+  comm->nChannels = nrings;
+
+  // Unpack the per ring prev/next arrays
+  for (int i = 0; i < nranks; i++) {
+    for (int r = 0; r < nrings; r++) {
+      prev[r*nranks+i] = allGather3Data[i].prev[r];
+      next[r*nranks+i] = allGather3Data[i].next[r];
+    }
   }
+  free(allGather3Data);
+  // AllGather3 - end
+
   int *rings;
-  NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
+  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
   NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
   free(prev);
   free(next);
+  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d RINGS", rank, nranks, nrings);

   // Connect with prev/next for each ring
-  struct ncclConnect *connectData;
-  NCCLCHECK(ncclCalloc(&connectData, 2*nranks));
+  struct ncclConnect *connect;
+  NCCLCHECK(ncclCalloc(&connect, 2));
   for (int r=0; r<nrings; r++) {
-    int* ringRanks = rings+r*nranks;
-    struct ncclRing *ring = comm->rings+r;
-    NCCLCHECK(setupRing(comm, r, rank, nranks, ringRanks, allInfo, connectData+2*rank));
-    int prev_offset = ring->userRanks[nranks-1]*2+1;
-    int next_offset = ring->userRanks[1]*2;
-    NCCLCHECK(bootstrapAllGather(commState, connectData, sizeof(struct ncclConnect)*2));
-    NCCLCHECK(ring->send.transport->send.connect(connectData+next_offset, &ring->send));
-    NCCLCHECK(ring->recv.transport->recv.connect(connectData+prev_offset, &ring->recv));
-  }
-  free(connectData);
-  free(rings);
-  free(allInfo);
+    struct ncclChannel* channel = comm->channels+r;
+    NCCLCHECK(setupChannel(comm, r, rank, nranks, rings+r*nranks, treeIn+r*nranks));
+    NCCLCHECK(p2pSetup(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
+    NCCLCHECK(p2pSetup(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up));
+    NCCLCHECK(p2pSetup(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down));
+  }
+  if (comm->treeThreshold > 0) {
+    char line[1024];
+    line[0]='\0';
+    for (int c=0; c<nrings; c++) {
+      struct ncclTree* tree = &comm->channels[c].tree;
+      snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d/%d/%d",
+          c, tree->up, rank, tree->down[0], tree->down[1], tree->down[2]);
+    }
+    line[1023] = '\0';
+    INFO(NCCL_INIT, "Trees%s", line);
+  }
+  if (rank == 0) {
+    char treeline[64];
+    snprintf(treeline, 64, "enabled up to size %ld", comm->treeThreshold);
+    INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees %s", comm->nThreads, minCompCap,
+        comm->treeThreshold == 0 ? "disabled" :
+        comm->treeThreshold == 0x7fffffffffffffff ? "enabled for all sizes" :
+        treeline);
+  }

-  // Intra-process barrier setup
-  struct rankInfo {
-    uint64_t hostHash;
-    uint64_t pidHash;
-    struct ncclComm* comm;
-  } rankInfos[nranks];
-  rankInfos[rank].hostHash = getHostHash();
-  rankInfos[rank].pidHash = getPidHash();
-  rankInfos[rank].comm = comm;
-  NCCLCHECK(bootstrapAllGather(commState, rankInfos, sizeof(struct rankInfo)));
+  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, nrings);
+  free(connect);
+  free(rings);
+  free(treeIn);
+  free(treeOut);

-  // Compute intra ranks
+  // Compute intra ranks (using AllGather1 data)
   int intraRank0 = -1, intraRank = -1, intraRanks = 0;
-  int multiNode = 0;
-  for (int r=0; r<nranks; r++) {
-    if ((rankInfos[r].hostHash == rankInfos[rank].hostHash) &&
-        (rankInfos[r].pidHash == rankInfos[rank].pidHash)) {
-      if (intraRanks == 0) intraRank0 = r;
-      if (r == rank) intraRank = intraRanks;
+  for (int i = 0; i < nranks; i++) {
+    if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
+        (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
+      if (intraRanks == 0) intraRank0 = i;
+      if (i == rank) intraRank = intraRanks;
       intraRanks++;
-    } else if (rankInfos[r].hostHash != rankInfos[rank].hostHash) {
-      multiNode = 1;
     }
   }
   TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-      rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
-  if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
+      rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+  if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
     WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-        rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
+        rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
     return ncclInternalError;
   }
-  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, rankInfos[intraRank0].comm));
+  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));

-  // Determine thread threshold across all GPUs
-  comm->threadThreshold = ncclThreadThreshold(minCompCap, multiNode);
+  // Done with AllGather1 data
+  free(allGather1Data);

-  // Barrier
-  bootstrapClose(commState);
+  if (nnodes) NCCLCHECK(transportCreateProxy(comm));
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
   return ncclSuccess;
 }

-bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) {
-  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-  if (cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != cudaSuccess) return false;
-  if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false;
-  if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) {
-    WARN("Failed to set CPU affinity");
-    return false;
+static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
+  CPU_ZERO_S(sizeof(cpu_set_t), mask);
+  char* cudaPath;
+  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+  char path[PATH_MAX];
+  strncpy(path, cudaPath, PATH_MAX-1);
+  snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus");
+  path[PATH_MAX-1] = '\0';
+  int fd;
+  SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
+  char affinityStr[sizeof(cpu_set_t)*2];
+  int r = read(fd, affinityStr, sizeof(cpu_set_t)*2);
+  if (r > 0)
+    NCCLCHECK(ncclStrToCpuset(affinityStr, mask));
+  close(fd);
+  free(cudaPath);
+  return ncclSuccess;
+}
+
+static ncclResult_t setCpuAffinity(int cudaDev) {
+  // Work within the envelope we were provided
+  cpu_set_t mask;
+  SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
+
+  // Find the subpart that is local to our GPU
+  cpu_set_t gpuMask;
+  NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
+  cpu_set_t finalMask;
+  CPU_AND(&finalMask, &mask, &gpuMask);
+
+  // If those are not disjoint, try to stay local
+  if (CPU_COUNT(&finalMask)) {
+    char affinityStr[sizeof(cpu_set_t)*2];
+    NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
+    INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr);
+    SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
   }
-  return true;
+  return ncclSuccess;
 }

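setCpuAffinity keeps only the CPUs that are both in the process's current mask and local to the GPU. A self-contained sketch of the glibc mask operations it relies on (hypothetical CPU ranges):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void) {
      cpu_set_t proc, gpu, both;
      CPU_ZERO(&proc); CPU_ZERO(&gpu);
      for (int c = 0; c < 16; c++) CPU_SET(c, &proc);   /* process may run on 0-15 */
      for (int c = 8; c < 24; c++) CPU_SET(c, &gpu);    /* CPUs local to the GPU   */
      CPU_AND(&both, &proc, &gpu);                      /* intersection: 8-15      */
      printf("%d usable GPU-local CPUs\n", CPU_COUNT(&both));
      return 0;
    }

If the intersection is empty, the patch above leaves the process mask untouched rather than pinning the process to a foreign socket.
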
 ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
@@ -633,9 +907,8 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId

   // Make sure all host memory allocations are close to the GPU
   int cudaDev;
-  nvmlDevice_t nvmlDevice;
   CUDACHECK(cudaGetDevice(&cudaDev));
-  SetCpuAffinity(cudaDev, &nvmlDevice);
+  NCCLCHECK(setCpuAffinity(cudaDev));
   ncclResult_t res;

   NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
@@ -645,7 +918,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
   sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
   NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);

-  INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->nvmlDev);
   return ncclSuccess;
 cleanup:
@@ -664,8 +937,6 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
   NCCLCHECK(ncclInit());
   if (myrank == 0) showVersion();

-  INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks);
-
   // Make sure the CUDA runtime is initialized.
   CUDACHECK(cudaFree(NULL));
@@ -685,7 +956,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
 }

 static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
-  struct ncclInfo* allInfo;
+  struct ncclPeerInfo* allInfo;
   NCCLCHECK(ncclCalloc(&allInfo, nranks));
   for (int rank=0; rank<nranks; rank++) {
     CUDACHECK(cudaSetDevice(devs[rank]));
@@ -699,12 +970,14 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
   for (int rank=0; rank<nranks; rank++)
     NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));

-  int* prev, *prevFinal, *next, *nextFinal;
-  NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXRINGS));
-  int nrings = MAXRINGS;
+  int* prev, *prevFinal, *next, *nextFinal, *treeIn, *treeOut;
+  NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
+  int nrings = MAXCHANNELS;
   int nthreads=0;
   int myCompCap = ncclCudaCompCap();
   int minCompCap = myCompCap;
@@ -713,7 +986,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
     int nringsRank;
     int nthreadsRank = getDefaultThreads();
     myCompCap = ncclCudaCompCap();
-    NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next));
+    NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
     nrings = std::min(nrings, nringsRank);
     nthreads = std::max(nthreads, nthreadsRank);
     minCompCap = std::min(minCompCap, myCompCap);
@@ -728,11 +1001,10 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
   free(prev);
   free(next);

-  INFO(NCCL_INIT,"Using %d threads", nthreads);
-  INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
+  INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees disabled", nthreads, minCompCap);

   int* rings;
-  NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
+  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
   NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
   free(prevFinal);
   free(nextFinal);
@@ -741,7 +1013,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
   int threadThreshold = ncclThreadThreshold(minCompCap, 0);
   for (int rank=0; rank<nranks; rank++) {
-    comms[rank]->nRings = nrings;
+    comms[rank]->nChannels = nrings;
     comms[rank]->nThreads = nthreads;
     comms[rank]->threadThreshold = threadThreshold;
   }
@@ -751,26 +1023,32 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
     int* ringRanks = rings+r*nranks;
     for (int rank=0; rank<nranks; rank++) {
       CUDACHECK(cudaSetDevice(devs[rank]));
-      NCCLCHECK(setupRing(comms[rank], r, rank, nranks, ringRanks, allInfo, connect+2*rank));
-    }
-    // RingExchange connect information
-    for (int rank=0; rank<nranks; rank++) {
-      // Swap rank->prev and prevRank->next
-      struct ncclRing *ring = comms[rank]->rings+r;
-      int prevRank = ring->userRanks[nranks-1];
-      struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1;
-      struct ncclConnect* rankPrevConnect = connect+2*rank;
-      swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect));
+      struct ncclChannel* channel = comms[rank]->channels+r;
+      struct ncclRing *ring = &channel->ring;
+      NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn));
+      // Make sure we don't use trees, we cannot use them with initAll
+      comms[rank]->treeThreshold = 0;
+      int prev = channel->ring.prev = ring->userRanks[nranks-1];
+      int next = channel->ring.next = ring->userRanks[1];
+      struct ncclConnector* recv = &channel->peers[prev].recv;
+      struct ncclConnector* send = &channel->peers[next].send;
+      NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+rank*2+0, recv, channel->buffSize, channel->id));
+      NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id));
     }
     for (int rank=0; rank<nranks; rank++) {
       CUDACHECK(cudaSetDevice(devs[rank]));
-      struct ncclRing *ring = comms[rank]->rings+r;
-      NCCLCHECK(ring->send.transport->send.connect(connect+2*rank+1, &ring->send));
-      NCCLCHECK(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv));
+      struct ncclChannel* channel = comms[rank]->channels+r;
+      struct ncclRing *ring = &channel->ring;
+      struct ncclConnector* recv = &channel->peers[ring->prev].recv;
+      struct ncclConnector* send = &channel->peers[ring->next].send;
+      NCCLCHECK(recv->transportComm->connect(connect+ring->prev*2+1, recv));
+      NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
     }
   }
-  free(rings);
   free(allInfo);
+  free(rings);
+  free(treeIn);
+  free(treeOut);
   return ncclSuccess;
 }

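initTransportsAll backs the single-process, multi-GPU path. For reference, the public entry point it serves is used like this (error handling elided):

    ncclComm_t comms[4];
    int devs[4] = { 0, 1, 2, 3 };
    ncclCommInitAll(comms, 4, devs);   /* one communicator per local GPU, one thread */
    /* ... enqueue collectives on each comm ... */
    for (int i = 0; i < 4; i++) ncclCommDestroy(comms[i]);
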
@@ -794,7 +1072,6 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
   int savedDevice;
   int rank, cudaDev;
   ncclComm_t comm = NULL;
-  nvmlDevice_t nvmlDevice;
   int ncclDevList[ndev];
   for (int i=0; i<ndev; i++) {
     ncclDevList[i] = devlist ? devlist[i] : i;
   }
@@ -812,7 +1089,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
     cudaDev = ncclDevList[rank];
     CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);

-    SetCpuAffinity(cudaDev, &nvmlDevice);
+    NCCLCHECK(setCpuAffinity(cudaDev));

     NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
     comms[rank] = comm;
@@ -848,27 +1125,50 @@ final:
   return res;
 }

-NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
-ncclResult_t ncclCommDestroy(ncclComm_t comm) {
-  if (comm == NULL)
-    return ncclSuccess;
+static ncclResult_t commDestroy(ncclComm_t comm) {
   int savedDevice;
   CUDACHECK(cudaGetDevice(&savedDevice));
   int commDevice = comm->cudaDev;
+  int rank = comm->rank;

   if (savedDevice != commDevice) {
     CUDACHECK(cudaSetDevice(commDevice));
   }

+  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError);
+
+  CUDACHECK(cudaStreamSynchronize(comm->groupStream));
+  NCCLCHECK(transportDestroyProxy(comm));
   NCCLCHECK(commFree(comm));

   if (savedDevice != commDevice)
     CUDACHECK(cudaSetDevice(savedDevice));

+  INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
+
   return ncclSuccess;
 }

+NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
+ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+  if (comm == NULL)
+    return ncclSuccess;
+
+  return commDestroy(comm);
+}
+
+NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
+ncclResult_t ncclCommAbort(ncclComm_t comm) {
+  if (comm == NULL)
+    return ncclSuccess;
+
+  // Ask anything that might still be running on the device to quit
+  *comm->abortFlag = 1;
+
+  return commDestroy(comm);
+}
+
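Both tear-down paths now funnel into commDestroy: ncclCommDestroy synchronizes the group stream first, while ncclCommAbort raises the abort flag so kernels and the proxy thread bail out before the same cleanup runs. A typical caller-side pattern (sketch):

    if (jobFailed) ncclCommAbort(comm);   /* don't wait for possibly hung device work */
    else           ncclCommDestroy(comm); /* graceful: waits for completion */
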
 NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
 const char* ncclGetErrorString(ncclResult_t code) {
   switch (code) {
@@ -882,6 +1182,39 @@ const char* ncclGetErrorString(ncclResult_t code) {
   }
 }

+NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
+  NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
+  NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
+
+  // Check device reported error
+  static ncclDevError_t printedDevErr = ncclDevSuccess;
+  switch(*comm->fatalDevError) {
+    case ncclDevSuccess :
+      break;
+    case ncclDevAssertedMismatch :
+      if (printedDevErr != ncclDevAssertedMismatch) {
+        WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+        printedDevErr = ncclDevAssertedMismatch;
+      }
+      if (comm->fatalError == ncclSuccess) {
+        comm->fatalError = ncclInvalidUsage;
+      }
+      break;
+    case ncclDevSuspectedMismatch :
+      if (printedDevErr != ncclDevSuspectedMismatch) {
+        WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+        printedDevErr = ncclDevSuspectedMismatch;
+      }
+      break;
+    default:
+      WARN("Unknown device error %d", *comm->fatalDevError);
+      return ncclInternalError;
+  }
+  *asyncError = comm->fatalError;
+  return ncclSuccess;
+}
+
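One plausible consumer of this call is a host-side polling loop that watches for asynchronous failures while NCCL work enqueued on a stream is still in flight (sketch):

    ncclResult_t asyncErr = ncclSuccess;
    while (cudaStreamQuery(stream) == cudaErrorNotReady) {
      ncclCommGetAsyncError(comm, &asyncErr);
      if (asyncErr != ncclSuccess) { ncclCommAbort(comm); break; }
    }
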
 NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
 ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
   NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
diff --git a/src/misc/checks.cu b/src/misc/checks.cu
new file mode 100644
index 0000000..a07e577
--- /dev/null
+++ b/src/misc/checks.cu
@@ -0,0 +1,69 @@
+/*************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "checks.h"
+
+static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
+  cudaPointerAttributes attr;
+  cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
+  if (err != cudaSuccess || attr.devicePointer == NULL) {
+    WARN("%s : %s is not a valid pointer", opname, ptrname);
+    return ncclInvalidArgument;
+  }
+#if CUDART_VERSION >= 10000
+  if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+#else
+  if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+#endif
+    WARN("%s : %s allocated on device %d mismatches with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
+    return ncclInvalidArgument;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
+  if (ptr == NULL) {
+    WARN("%s : %s argument is NULL", opname, ptrname);
+    return ncclInvalidArgument;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ArgsCheck(struct ncclInfo* info) {
+  NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
+  // First, the easy ones
+  if (info->root < 0 || info->root >= info->comm->nRanks) {
+    WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks);
+    return ncclInvalidArgument;
+  }
+  if (info->datatype < 0 || info->datatype >= ncclNumTypes) {
+    WARN("%s : invalid type %d", info->opName, info->datatype);
+    return ncclInvalidArgument;
+  }
+  // Type is OK, compute nbytes. Convert AllGather/Broadcast calls to chars.
+  info->nBytes = info->count * ncclTypeSize(info->datatype);
+  if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) {
+    info->count = info->nBytes;
+    info->datatype = ncclInt8;
+  }
+  if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
+
+  if (info->op < 0 || info->op >= ncclNumOps) {
+    WARN("%s : invalid reduction operation %d", info->opName, info->op);
+    return ncclInvalidArgument;
+  }
+
+  if (info->comm->checkPointers) {
+    // Check CUDA device pointers
+    if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
+      NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
+    }
+    if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
+      NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
+    }
+  }
+  return ncclSuccess;
+}
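Worked example of the conversion above: an ncclAllGather of count=1024 with ncclFloat gives nBytes = 1024 * 4 = 4096, so the call is rewritten internally as count=4096 of ncclInt8; and because the count is per rank, nBytes is then scaled by nranks (with 8 ranks, nBytes = 32768).
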
diff --git a/src/misc/enqueue.cu b/src/misc/enqueue.cu
deleted file mode 100644
index 80846dd..0000000
--- a/src/misc/enqueue.cu
+++ /dev/null
@@ -1,248 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "common_coll.h"
-#include "param.h"
-
-#include "collectives/collectives.h"
-
-#define NCCL_FUNC4(coll, op, dtype) \
-  (void*)NCCL_KERN_NAME(coll, op, dtype), \
-  (void*)NCCL_KERN_NAME(coll##LL, op, dtype)
-
-// Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(coll, op) \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, u8), \
-  (void*)NCCL_FUNC4(coll, op, i32), \
-  (void*)NCCL_FUNC4(coll, op, u32), \
-  (void*)NCCL_FUNC4(coll, op, i64), \
-  (void*)NCCL_FUNC4(coll, op, u64), \
-  (void*)NCCL_FUNC4(coll, op, f16), \
-  (void*)NCCL_FUNC4(coll, op, f32), \
-  (void*)NCCL_FUNC4(coll, op, f64)
-#define NCCL_FUNCS3B(coll, op) \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8)
-
-// Must be consistent with ncclRedOp_t
-#define NCCL_FUNCS2A(coll) \
-  NCCL_FUNCS3A(coll, sum ), \
-  NCCL_FUNCS3A(coll, prod), \
-  NCCL_FUNCS3A(coll, max ), \
-  NCCL_FUNCS3A(coll, min )
-#define NCCL_FUNCS2B(coll) \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy)
-
-// Must be consistent with the ncclFuncSet enum
-static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
-  NCCL_FUNCS2B(ncclBroadcast),
-  NCCL_FUNCS2A(ncclReduce),
-  NCCL_FUNCS2B(ncclAllGather),
-  NCCL_FUNCS2A(ncclReduceScatter),
-  NCCL_FUNCS2A(ncclAllReduce)
-};
-
-ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
-#if CUDART_VERSION >= 9000
-  if (cgMode & 0x01) {
-    CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
-          // These flags are to reduce the latency of using this API
-          cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync));
-    return ncclSuccess;
-  }
-#endif
-  int savedDev;
-  CUDACHECK(cudaGetDevice(&savedDev));
-  for (int i = 0; i < numDevices; i++) {
-    struct cudaLaunchParams* params = paramsList+i;
-    CUDACHECK(cudaSetDevice(cudaDevs[i]));
-    CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
-  }
-  CUDACHECK(cudaSetDevice(savedDev));
-  return ncclSuccess;
-}
-
-ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
-  params->gridDim.x = std::min((int) params->gridDim.x, comm->nRings);
-
-  // Set active = 2 for the last operation
-  for (int r=0; r<params->gridDim.x; r++) {
-    struct ncclRing* ring = comm->rings+r;
-    ring->collectives[(ring->collStart+ring->collCount-1)%NCCL_MAX_OPS].active = 2;
-  }
-
-  // Find the first operation, choose the kernel accordingly and pass it
-  // as the first argument.
-  struct ncclColl* coll = comm->rings[0].collectives+comm->rings[0].collStart;
-  memcpy(&comm->args, coll, sizeof(struct ncclColl));
-  // As we pass that coll directly, we can free it immediately.
-  coll->active = 0;
-
-  params->func = ncclKerns[coll->funcIndex];
-  return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
-  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  int val = *ptr;
-  bool done = false;
-  while (done == false) {
-    if (val >= comm->intraRanks) {
-      WARN("Trying to launch too many collectives");
-      return ncclInvalidUsage;
-    }
-    if (val+1 == comm->intraRanks) {
-      // Reset the barrier.
-      comm->intraBarrier[comm->intraPhase^1] = 0;
-      *isLast = 1;
-      return ncclSuccess;
-    }
-    done = __sync_bool_compare_and_swap(ptr, val, val+1);
-    val++;
-  }
-  *isLast = 0;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
-  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  int val = *ptr;
-  if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
-    WARN("Trying to launch too many collectives");
-    return ncclInternalError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
-  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  while (*ptr < comm->intraRanks) pthread_yield();
-  comm->intraPhase ^= 1;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
-  if (comm->nRanks == 1) return ncclSuccess;
-  struct cudaLaunchParams* params = comm->myParams;
-
-  NCCLCHECK(setupLaunch(comm, params));
-
-  // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
-  if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
-    // Enqueue event in user stream
-    CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream));
-    // Create dependency between user stream and internal NCCL stream
-    CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
-    params->stream = comm->groupStream;
-  } else {
-    if (comm->userStream != params->stream) {
-      // Stream changed from last call, create dependency against last NCCL kernel launch
-      CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
-    }
-    params->stream = comm->userStream;
-  }
-
-  int isLast = 0;
-  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
-
-  if (isLast) {
-    if (comm->launchMode == ncclComm::GROUP) {
-      // I'm the last. Launch all operations.
-      NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
-    }
-    NCCLCHECK(ncclCpuBarrierLast(comm));
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
-  if (comm->nRanks == 1) return ncclSuccess;
-  // We can't print the CG mode before the first barrier happened.
-  if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
-    *comm->intraCGMode ^= 0x10;
-    INFO(NCCL_INIT,"Launch mode %s%s%s",
-        comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
-        *comm->intraCGMode ? "/CGMD" : "",
-        (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
-  }
-
-  NCCLCHECK(ncclCpuBarrierOut(comm));
-
-  struct cudaLaunchParams *params = comm->myParams;
-  if (comm->launchMode == ncclComm::PARALLEL) {
-    CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
-  }
-  // Start the network proxies as soon as the kernel has been launched. We can't
-  // perform any CUDA call between the two or having a cudaFree between the CUDA
-  // launch and the transportStartProxies call could cause a deadlock.
-  // Also, starting the proxies after the CUDA launch seems to be better for
-  // performance (latency).
-  for (int r=0; r<params->gridDim.x; r++) {
-    struct ncclRing* ring = comm->rings+r;
-    ring->collStart = ring->collFifoTail;
-    ring->collCount = 0;
-  }
-  params->gridDim.x = params->blockDim.x = 0;
-  NCCLCHECK(transportStartProxies(comm));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
-  struct cudaLaunchParams *params = comm->myParams;
-  // Enqueue event after NCCL kernel
-  CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
-  // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
-  if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
-    // Create dependency between NCCL internal stream and user stream
-    CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
-  }
-  comm->userStreamSet = false;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
-    void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
-    ncclComm_t comm, cudaStream_t stream) {
-  if (comm == NULL) return ncclInvalidArgument;
-  // Launch asynchronously if needed
-  if (ncclAsyncMode()) {
-    ncclResult_t ret = ncclSuccess;
-    int savedDev = -1;
-    if (comm->checkPointers) {
-      CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
-      CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, end);
-    }
-    // Check arguments
-    NCCLCHECKGOTO(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName), ret, end);
-    // Always register comm even in case of error to make sure ncclGroupEnd
-    // cleans it up.
-    NCCLCHECK(ncclAsyncColl(comm));
-    NCCLCHECKGOTO(func(sendbuff, recvbuff, count, type, op, root, comm, stream), ret, end);
-end:
-    if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
-    ncclAsyncErrCheck(ret);
-    return ret;
-  } else {
-    NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName));
-    NCCLCHECK(func(sendbuff, recvbuff, count, type, op, root, comm, stream));
-    NCCLCHECK(ncclBarrierEnqueue(comm));
-    NCCLCHECK(ncclBarrierEnqueueWait(comm));
-    NCCLCHECK(ncclEnqueueEvents(comm));
-    return ncclSuccess;
-  }
-}
diff --git a/src/misc/group.cu b/src/misc/group.cu
index 1716a75..c428a22 100644
--- a/src/misc/group.cu
+++ b/src/misc/group.cu
@@ -179,13 +179,13 @@ group_cleanup:
     // an atomic operation, we need to cancel all operations.
     for (int i=0; i<ncclGroupIndex; i++) {
       struct ncclComm* comm = ncclGroupArgs[i].coll.comm;
-      for (int r=0; r<comm->nRings; r++) {
-        struct ncclRing* ring = comm->rings+r;
-        for (int i=0; i<ring->collCount; i++) {
-          ring->collectives[(ring->collStart + i)%NCCL_MAX_OPS].active = 0;
+      for (int c=0; c<comm->nChannels; c++) {
+        struct ncclChannel* channel = comm->channels+c;
+        for (int i=0; i<channel->collCount; i++) {
+          channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
         }
-        ring->collFifoTail = ring->collStart;
-        ring->collCount = 0;
+        channel->collFifoTail = channel->collStart;
+        channel->collCount = 0;
       }
       comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
       comm->userStreamSet = false;
diff --git a/src/misc/nvmlwrap.cu b/src/misc/nvmlwrap.cu
index d9407f4..635f332 100644
--- a/src/misc/nvmlwrap.cu
+++ b/src/misc/nvmlwrap.cu
@@ -16,14 +16,14 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
 static nvmlReturn_t (*nvmlInternalShutdown)(void);
 static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
 static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
-static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
 static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
 static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult);
+static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
+

 ncclResult_t wrapNvmlSymbols(void) {
   if (nvmlState == nvmlInitialized)
@@ -70,10 +70,9 @@ ncclResult_t wrapNvmlSymbols(void) {
   LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
   LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
+  LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
@@ -86,9 +85,8 @@ teardown:
   nvmlInternalShutdown = NULL;
   nvmlInternalDeviceGetHandleByPciBusId = NULL;
   nvmlInternalDeviceGetIndex = NULL;
-  nvmlInternalDeviceSetCpuAffinity = NULL;
-  nvmlInternalDeviceClearCpuAffinity = NULL;
   nvmlInternalDeviceGetPciInfo = NULL;
+  nvmlInternalDeviceGetMinorNumber = NULL;
   nvmlInternalDeviceGetNvLinkState = NULL;
   nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
   nvmlInternalDeviceGetNvLinkCapability = NULL;
@@ -155,46 +153,28 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
   return ncclSuccess;
 }

-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
-  if (nvmlInternalDeviceSetCpuAffinity == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  // Workaround : it seems SetCpuAffinity is not thread safe.
-  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
-  pthread_mutex_lock(&lock);
-  nvmlReturn_t ret = nvmlInternalDeviceSetCpuAffinity(device);
-  pthread_mutex_unlock(&lock);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceSetCpuAffinity() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
-  if (nvmlInternalInit == NULL) {
+ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
+  if (nvmlInternalDeviceGetPciInfo == NULL) {
     WARN("lib wrapper not initialized.");
     return ncclInternalError;
   }
-  nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device);
+  nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
   if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceClearCpuAffinity() failed: %s ",
+    WARN("nvmlDeviceGetPciInfo() failed: %s ",
        nvmlInternalErrorString(ret));
     return ncclSystemError;
   }
   return ncclSuccess;
 }

-ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
-  if (nvmlInternalDeviceGetPciInfo == NULL) {
+ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
+  if (nvmlInternalDeviceGetMinorNumber == NULL) {
     WARN("lib wrapper not initialized.");
     return ncclInternalError;
   }
-  nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
+  nvmlReturn_t ret = nvmlInternalDeviceGetMinorNumber(device, minorNumber);
   if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceGetPciInfo() failed: %s ",
+    WARN("nvmlDeviceGetMinorNumber() failed: %s ",
        nvmlInternalErrorString(ret));
     return ncclSystemError;
   }
@@ -208,8 +188,9 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
   }
   nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
   if (ret != NVML_SUCCESS) {
-    INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
-        nvmlInternalErrorString(ret));
+    if (ret != NVML_ERROR_NOT_SUPPORTED)
+      INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
+          nvmlInternalErrorString(ret));
     return ncclSystemError;
   }
   return ncclSuccess;
 }
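The LOAD_SYM/LOAD_SYM_OPTIONAL machinery reduces to dlopen/dlsym; a stripped-down sketch of the pattern (minimal error handling, not the real macros):

    #include <dlfcn.h>
    void* h = dlopen("libnvidia-ml.so.1", RTLD_NOW);
    nvmlReturn_t (*initFn)(void) =
        h ? (nvmlReturn_t (*)(void))dlsym(h, "nvmlInit") : NULL;
    if (initFn == NULL) { /* mandatory symbol: fail; optional: leave NULL, skip feature */ }

The optional variant is what lets the library keep working on older drivers whose NVML predates the NvLink queries.
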
diff --git a/src/misc/rings.cu b/src/misc/rings.cu
index a5d4616..a7b122c 100644
--- a/src/misc/rings.cu
+++ b/src/misc/rings.cu
@@ -160,7 +160,10 @@ static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankTo
     while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) {
       current[transport] = 0;
       transport++;
-      if (transport == NTRANSPORTS) { free(p2pConnected); return ncclInternalError; }
+      if (transport == NTRANSPORTS) {
+        WARN("Error : Could not find transport to connect next group\n");
+        free(p2pConnected);
+        return ncclInternalError; }
     }
     curRank = rank;
     current[transport]++;
@@ -179,8 +182,20 @@ ncclResult_t getEnvThreads(int* nthreads) {
   return ncclSuccess;
 }

+static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) {
+  if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS;
+  for (int r=nrings; r<newNrings; r++) {
+    for (int i=0; i<nranks; i++) {
+      a[r*nranks+i] = a[(r-nrings)*nranks+i];
+      b[r*nranks+i] = b[(r-nrings)*nranks+i];
+      c[r*nranks+i] = c[(r-nrings)*nranks+i];
+      d[r*nranks+i] = d[(r-nrings)*nranks+i];
+    }
+  }
+  return newNrings;
+}
 /* Main ring creation function */
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next) {
+ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) {
   *nrings = 0;
   if (nranks == 1) return ncclSuccess;

@@ -191,6 +206,12 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
   if (ret == ncclSuccess && *nrings > 0) {
     if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
     NCCLCHECK(getEnvThreads(nthreads));
+    for (int r = 0; r<*nrings; r++) {
+      for (int i = 0; i<nranks; i++) {
+        if (transports[i*nranks+prev[i]] == 2) treeIn[i] = 1;
+        if (transports[i*nranks+next[i]] == 2) treeOut[i] = 1;
+      }
+    }
     return ncclSuccess;
   }
   if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
@@ -210,8 +231,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
   int minScore = NCCL_MAX_SCORE;
   int nringsTmp;
   int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
-  NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXRINGS));
+  NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS));
   NCCLCHECK(ncclCalloc(&idxToRank, nranks));
   NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
   NCCLCHECK(ncclCalloc(&groups, nranks));
@@ -220,8 +241,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
   int nThreads;
   do {
     nThreads = *nthreads;
-    for (int i=0; i<nranks*MAXRINGS; i++) prevTmp[i] = nextTmp[i] = -1;
-    nringsTmp = MAXRINGS;
+    for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1;
+    nringsTmp = MAXCHANNELS;
     // Loop over transports to connect groups
     for (int t=NTRANSPORTS-1; t>=0; t--) {
       for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;
@@ -282,6 +303,11 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
       for (int i=0; i<nidx; i++) {
         if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
         if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
+        if (t == NTRANSPORTS-1) {
+          // Save node-level masters for trees
+          treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
+          treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
+        }
       }
     }
     //for (int r=0; r<nringsTmp; r++) {
@@ -316,6 +342,15 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*

   *nthreads = nThreads;

+  /* Duplicate the rings in case of multinode+NVLink */
+  int nnodes = 0;
+  for (int r=0; r<nranks; r++) nnodes += treeIn[r];
+  int nvlink;
+  NCCLCHECK(ncclNvlinkGpu(&nvlink));
+  if (nnodes > 1 && nvlink) {
+    *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut);
+  }
+
   if (*nrings == 0) {
     WARN("Could not create rings, falling back on simple ring");
     *nrings = 1;
@@ -329,9 +364,9 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
     if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
     minNrings = 0;
   }
-  if (minNrings > MAXRINGS) {
-    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXRINGS, MAXRINGS);
-    minNrings = MAXRINGS;
+  if (minNrings > MAXCHANNELS) {
+    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS);
+    minNrings = MAXCHANNELS;
   }
   if (maxNrings > 0 && maxNrings <= *nrings) {
     if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
@@ -341,13 +376,7 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
   if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
   if (minNrings > 0 && minNrings > *nrings) {
     if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
-    for (int r=*nrings; r<MAXRINGS && r <minNrings; r++) {
-      for (int i=0; i<nranks; i++) {
-        prev[r*nranks+i] = prev[(r-*nrings)*nranks+i];
-        next[r*nranks+i] = next[(r-*nrings)*nranks+i];
-      }
-    }
-    *nrings = minNrings;
+    *nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut);
   }
 }
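For context, copyRings only fills new slots by repeating existing ones: with 2 rings and a requested minimum of 5, rings 2 and 3 copy rings 0 and 1, and ring 4 copies ring 2 (itself a copy of ring 0). The user-facing knobs referenced here are the existing NCCL_MIN_NRINGS and NCCL_MAX_NRINGS environment variables, e.g. running with NCCL_MIN_NRINGS=4 forces duplication up to 4 rings while NCCL_MAX_NRINGS=2 caps the count at 2.
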
diff --git a/src/misc/trees.cu b/src/misc/trees.cu
new file mode 100644
index 0000000..e53ea0b
--- /dev/null
+++ b/src/misc/trees.cu
@@ -0,0 +1,108 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "net.h"
+#include "param.h"
+
+#define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank)
+
+/* Btree which alternates leaves and nodes.
+ * Assumes root is 0, which conveniently builds a tree on powers of two,
+ * (because we have pow2-1 ranks) which lets us manipulate bits.
+ * Find first non-zero bit, then :
+ * Find the parent :
+ *   xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below)
+ *   xx11[0] -> xx10[0] (3,7,11 below)
+ * Find the children :
+ *   xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13)
+ *   xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13)
+ *
+ * Illustration :
+ * 0---------------8
+ *          ______/ \______
+ *         4               12
+ *        / \             /  \
+ *       2   6          10    \
+ *      / \ / \        /  \    \
+ *     1   3 5  7     9    11   13
+ */
+ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
+  int up, down0, down1;
+  int bit;
+  for (bit=1; bit<nranks; bit<<=1) {
+    if (bit & rank) break;
+  }
+
+  if (rank == 0) {
+    *u = -1;
+    *d0 = nranks > 1 ? bit >> 1 : -1;
+    *d1 = -1;
+    return ncclSuccess;
+  }
+
+  up = (rank ^ bit) | (bit << 1);
+  if (up >= nranks) up = (rank ^ bit);
+  *u = up;
+
+  int lowbit = bit >> 1;
+  // down0 is always within bounds
+  down0 = lowbit == 0 ? -1 : rank-lowbit;
+
+  down1 = lowbit == 0 ? -1 : rank+lowbit;
+  // Make sure down1 is within bounds
+  while (down1 >= nranks) {
+    down1 = lowbit == 0 ? -1 : rank+lowbit;
+    lowbit >>= 1;
+  }
+  *d0 = down0; *d1 = down1;
+
+  return ncclSuccess;
+}
+
+/* Build a double binary tree. Take the previous tree for the first tree.
+ * For the second tree, we use a mirror tree (if nranks is odd)
+ *
+ * 8---------0---------5
+ *  ______/ \______       _____/ \______
+ * 4               12    1               9
+ * / \            /      / \            / \
+ * 2   6        10       3   7        10
+ * / \ / \     /  \      / \ / \      /  \
+ * 1  3 5  7  9    11    2  4 6  8   11   12
+ *
+ * or shift it by one rank (if nranks is even)
+ *
+ * 8---------0--------------9
+ *  ______/ \        ______/ \
+ * 4         \      5         \
+ * / \         \    / \         \
+ * 2   6        10  3   7        11
+ * / \ / \     / \  / \ / \     / \
+ * 1  3 5  7  9  11 2  4 6  8  10  1
+ */
+ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* s1, int* d1_0, int* d1_1) {
+  // First tree ... use a btree
+  ncclGetBtree(nranks, rank, s0, d0_0, d0_1);
+  // Second tree ... mirror or shift
+  if (nranks % 2 == 0) {
+    // shift
+    int shiftrank = (rank-1+nranks) % nranks;
+    int u, d0, d1;
+    ncclGetBtree(nranks, shiftrank, &u, &d0, &d1);
+    *s1 = u == -1 ? -1 : (u+1) % nranks;
+    *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
+    *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks;
+  } else {
+    // mirror
+    int u, d0, d1;
+    ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1);
+    *s1 = u == -1 ? -1 : nranks-1-u;
+    *d1_0 = d0 == -1 ? -1 : nranks-1-d0;
+    *d1_1 = d1 == -1 ? -1 : nranks-1-d1;
+  }
+  return ncclSuccess;
+}
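Worked example of ncclGetBtree against the first picture (nranks=14, rank=12, binary 1100): the first set bit gives bit=4, so the parent is (12^4)|(4<<1) = 8. Then lowbit=2: d0 = 12-2 = 10, while d1 = 12+2 = 14 is out of bounds, so lowbit halves and d1 = 12+1 = 13. Rank 12 therefore connects up to 8 and down to 10 and 13, matching the illustration.
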
diff --git a/src/misc/utils.cu b/src/misc/utils.cu
index d8e3aec..c618e71 100644
--- a/src/misc/utils.cu
+++ b/src/misc/utils.cu
@@ -11,6 +11,24 @@
 #include <string.h>
 #include <stdarg.h>

+#include "nvmlwrap.h"
+#include "core.h"
+
+// Convert a logical cudaDev index to the NVML device minor number
+ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
+  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+  nvmlDevice_t nvmlDevice;
+  unsigned int dev;
+  *nvmlDev = -1;
+  CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice));
+  NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev));
+
+  *nvmlDev = dev;
+
+  return ncclSuccess;
+}
+
 ncclResult_t getHostName(char* hostname, int maxlen) {
   if (gethostname(hostname, maxlen) != 0) {
     strncpy(hostname, "unknown", maxlen);
diff --git a/src/nccl.h.in b/src/nccl.h.in
index 7227625..985274e 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -68,14 +68,24 @@ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId
 ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
 ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);

-/* Frees resources associated with communicator object. */
+/* Frees resources associated with communicator object, but waits for any operations
+ * that might still be running on the device. */
 ncclResult_t ncclCommDestroy(ncclComm_t comm);
 ncclResult_t pncclCommDestroy(ncclComm_t comm);

+/* Frees resources associated with communicator object and aborts any operations
+ * that might still be running on the device. */
+ncclResult_t ncclCommAbort(ncclComm_t comm);
+ncclResult_t pncclCommAbort(ncclComm_t comm);
+
 /* Returns a human-readable error message. */
 const char* ncclGetErrorString(ncclResult_t result);
 const char* pncclGetErrorString(ncclResult_t result);

+/* Checks whether the comm has encountered any asynchronous errors */
+ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+
 /* Gets the number of ranks in the communicator clique. */
 ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
 ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
diff --git a/src/ring.cu b/src/ring.cu
deleted file mode 100644
index fede793..0000000
--- a/src/ring.cu
+++ /dev/null
@@ -1,70 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "ring.h"
-#include "param.h"
-
-NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
-
-ncclResult_t initRing(struct ncclComm* comm, int ringid) {
-  struct ncclRing* ring = comm->rings+ringid;
-  ring->id = ringid;
-
-  // Setup intermediate buffering
-  ring->buffSize = ncclParamBuffsize();
-
-  const int sendSize = ring->devMemSendSize = sizeof(struct ncclSendMem);
-  struct ncclSendMem* sendMem;
-  NCCLCHECK(ncclCudaCalloc((char**)&sendMem, sendSize));
-  ring->devMemSend = sendMem;
-
-  const int recvSize = ring->devMemRecvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
-  struct ncclRecvMem* recvMem;
-  NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize));
-  ring->devMemRecv = recvMem;
-
-  TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
-
-  // Pre-configure send/recv pointers. Those are the default, they may change later.
-  ring->recv.conn.buff = recvMem->buff;
-  ring->recv.conn.llBuff = recvMem->llBuff;
-  ring->recv.conn.tail = &recvMem->tail;
-  ring->recv.conn.opCount = &recvMem->opCount;
-  ring->recv.conn.direct = 0;
-  ring->send.conn.head = &sendMem->head;
-  ring->send.conn.llHead = &sendMem->llHead;
-  ring->send.conn.direct = 0;
-  ring->send.conn.llStep = 0;
-  ring->send.conn.llLastCleaning = 0;
-
-  // Ring index to user rank table.
-  NCCLCHECK(ncclCudaCalloc(&ring->devUserRanks, comm->nRanks));
-  NCCLCHECK(ncclCalloc(&ring->userRanks, comm->nRanks));
-
-  // Per-ring operation list.
-  NCCLCHECK(ncclCudaHostAlloc((void**)&ring->collectives, (void**)&ring->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
-  return ncclSuccess;
-}
-
-ncclResult_t freeRing(struct ncclRing* ring) {
-  // Intermediate buffering
-  CUDACHECK(cudaFree(ring->devMemSend));
-  CUDACHECK(cudaFree(ring->devMemRecv));
-
-  // Index to rank table
-  free(ring->userRanks);
-  CUDACHECK(cudaFree(ring->devUserRanks));
-
-  // Operation list
-  NCCLCHECK(ncclCudaHostFree(ring->collectives));
-
-  // Free transport proxy resources
-  if (ring->send.transportResources) NCCLCHECK(ring->send.transport->send.free(ring->send.transportResources));
-  NCCLCHECK(transportDestroyProxy(&ring->send));
-  if (ring->recv.transportResources) NCCLCHECK(ring->recv.transport->recv.free(ring->recv.transportResources));
-  NCCLCHECK(transportDestroyProxy(&ring->recv));
-  return ncclSuccess;
-}
diff --git a/src/transport.cu b/src/transport.cu
index 7c13d5c..1436a5b 100644
--- a/src/transport.cu
+++ b/src/transport.cu
@@ -1,11 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

 #include "core.h"
-#include "common_coll.h"

 extern struct ncclTransport p2pTransport;
 extern struct ncclTransport shmTransport;
@@ -17,74 +16,16 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = {
   netTransport,
 };

-static void FifoPullArgs(struct transportProxyInfo* info, struct ncclProxyArgs *args) {
-  struct ncclProxyArgs *fifoArgs = info->argsFifo + (info->argsFifoHead % TRANSPORT_PROXY_FIFO_SIZE);
-  pthread_mutex_lock(&info->mutex);
-  while (fifoArgs->active == 0)
-    pthread_cond_wait(&info->cond, &info->mutex);
-  __sync_synchronize();
-  memcpy(args, fifoArgs, sizeof(struct ncclProxyArgs));
-  __sync_synchronize();
-  fifoArgs->active = 0;
-  pthread_cond_signal(&info->cond);
-  pthread_mutex_unlock(&info->mutex);
-  info->argsFifoHead++;
-}
-
-static struct ncclProxyArgs* FifoGetNextArgs(struct transportProxyInfo* info) {
-  if (info == NULL) return NULL;
-  struct ncclProxyArgs* fifoArgs = info->argsFifo + (info->argsFifoTail % TRANSPORT_PROXY_FIFO_SIZE);
-  pthread_mutex_lock(&info->mutex);
-  while (fifoArgs->active == 1)
-    pthread_cond_wait(&info->cond, &info->mutex);
-  pthread_mutex_unlock(&info->mutex);
-  info->argsFifoTail++;
-  return fifoArgs;
-}
-
-static void FifoPushArgs(struct transportProxyInfo* info) {
-  if (info == NULL) return;
-
-  struct ncclProxyArgs* fifoArgs = info->argsFifo + ((info->argsFifoTail-1) % TRANSPORT_PROXY_FIFO_SIZE);
-  if (fifoArgs->active == 0) return;
-
-  pthread_mutex_lock(&info->mutex);
-  pthread_cond_signal(&info->cond);
-  pthread_mutex_unlock(&info->mutex);
-}
-
-static void WaitProxyReady(struct transportProxyInfo* info) {
-  pthread_mutex_lock(&info->mutex);
-  while (info->proxyReady == 0)
-    pthread_cond_wait(&info->cond, &info->mutex);
-  pthread_mutex_unlock(&info->mutex);
-}
-
-static void SetProxyReady(struct transportProxyInfo* info) {
-  pthread_mutex_lock(&info->mutex);
-  info->proxyReady = 1;
-  pthread_cond_signal(&info->cond);
-  pthread_mutex_unlock(&info->mutex);
-}
-
-static void StopProxy(struct transportProxyInfo* info) {
-  struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);
-  fifoArgs->active = -1;
-  FifoPushArgs(info);
-}
-
 #define RECV 0
 #define SEND 1

-static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks) {
-  enum proxyMode mode = proxyPatternMode(pattern);
-  if (mode == proxyRing) return true;
+static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
+  if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;

   /* In chains, one rank does not need a proxy. Let's figure out which one it is */
-  int root = proxyPatternRoot(pattern);
   // Which index in the reorganized rings should we compare root against */
   const int myrank = 0, nextrank = 1, prevrank = nranks-1;
-  int index = mode == proxyFrom ?
+  int index = pattern == ncclPatternPipelineFrom ?
       /*                            no recv /  no send    if root = */
       /* bcast  */ (type == RECV ?   myrank : nextrank ):
       /* reduce */ (type == RECV ? prevrank :   myrank );
@@ -92,96 +33,216 @@ static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks)
   return (root != rank);
 }

-static void SaveProxy(struct ncclConnector* connector, struct ncclProxyArgs* args, int needProxy) {
-  struct transportProxyInfo* info = connector->proxyInfo;
-  if (info == NULL) return;
-  struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);
-  args->needProxy = needProxy;
-  __sync_synchronize();
-  memcpy(fifoArgs, args, sizeof(struct ncclProxyArgs));
-  __sync_synchronize();
-  fifoArgs->active = 1;
+enum { proxyRecv=0, proxySend=1 };
+
+#define PROXYARGS_ALLOCATE_SIZE 32
+struct ncclProxyPool {
+  struct ncclProxyPool *next;
+  struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
+};
+
+ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
+  struct ncclProxyState* state = &comm->proxyState;
+  struct ncclProxyArgs* elem;
+  pthread_mutex_lock(&state->mutex);
+  if (state->pool == NULL) {
+    // Allocate a new pool of elements
+    struct ncclProxyPool* newPool;
+    NCCLCHECK(ncclCalloc(&newPool, 1));
+    struct ncclProxyArgs* newElems = newPool->elems;
+    // Chain newly allocated elements
+    for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
+      if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
+    }
+    // Add them all to the pool list
+    state->pool = newElems;
+    // Save the pool memory block for later resource release
+    newPool->next = state->pools;
+    state->pools = newPool;
+  }
+  elem = state->pool;
+  state->pool = state->pool->next;
+  pthread_mutex_unlock(&state->mutex);
+  elem->next = elem->nextPeer = NULL;
+  *argsptr = elem;
+  return ncclSuccess;
 }
-ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t nbytes, int pattern, struct ncclComm* comm) {
-  int llMode, nrings, nthreads;
-  ncclGetCollResource(comm, nbytes, &nrings, &nthreads, &llMode);
-  nbytes = llMode ? nbytes * 2 : nbytes;
-  substeps = llMode ? 1 : substeps;
-  subchunks = llMode ? NCCL_LL_CHUNKS : subchunks;
-  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : comm->rings[0].buffSize;
-
-  int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow
-  int nsteps = nstepsPerRound * nrounds * substeps;
-  TRACE(NCCL_NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm);
-  TRACE(NCCL_NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
-  for (int r=0; r<nrings; r++) {
-    struct ncclRing* ring = comm->rings+((comm->myParams->gridDim.x+r)%comm->nRings);
-    struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 };
-    SaveProxy(&ring->recv, &args, NeedProxy(RECV, pattern, ring, comm->nRanks));
-    SaveProxy(&ring->send, &args, NeedProxy(SEND, pattern, ring, comm->nRanks));
+static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) {
+  struct ncclComm* comm = connector->comm;
+  struct ncclProxyState* state = &comm->proxyState;
+  pthread_mutex_lock(&state->mutex);
+  if (connector->proxyAppend == NULL) {
+    // Nothing running for that peer. Add to the circular list
+    if (state->ops == NULL) {
+      // Create the list
+      args->next = args;
+      state->ops = args;
+    } else {
+      // Insert element in the list
+      args->next = state->ops->next;
+      state->ops->next = args;
+    }
+    connector->proxyAppend = args;
+  } else {
+    // There is an active operation already for that peer.
+    // Add it to the per-peer list
+    connector->proxyAppend->nextPeer = args;
+    connector->proxyAppend = args;
   }
+  pthread_mutex_unlock(&state->mutex);
+}
+
+template <int type>
+static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
+  if (peer < 0) return ncclSuccess;
+
+  struct ncclPeer* peerComm = args->channel->peers+peer;
+  struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
+  if (connector->transportComm->proxy == NULL) return ncclSuccess;
+
+  struct ncclProxyArgs* op;
+  NCCLCHECK(transportAllocateProxyArgs(connector->comm, &op));
+  memcpy(op, args, sizeof(struct ncclProxyArgs));
+  op->connector = connector;
+  op->progress = connector->transportComm->proxy;
+  op->state = ncclProxyOpReady;
+  ProxyAppend(connector, op);
   return ncclSuccess;
 }

-ncclResult_t transportStartProxies(ncclComm* comm) {
-  for (int r=0; r<comm->nRings; r++) {
-    FifoPushArgs(comm->rings[r].send.proxyInfo);
-    FifoPushArgs(comm->rings[r].recv.proxyInfo);
+ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
+  if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
+    struct ncclRing* ring = &args->channel->ring;
+    if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args));
+    if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxySend>(ring->next, args));
+  }
+  if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
+    // Tree up
+    struct ncclTree* tree = &args->channel->tree;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
+    NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+  }
+  if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
+    // Tree down
+    struct ncclTree* tree = &args->channel->tree;
+    for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
+    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
   }
-  pthread_yield(); // Let other threads run
   return ncclSuccess;
 }

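transportAllocateProxyArgs above is a classic intrusive free list refilled one block at a time; the pattern in isolation (names hypothetical):

    #include <stdlib.h>

    struct Elem { struct Elem* next; /* payload fields */ };
    static struct Elem* freeList = NULL;

    struct Elem* elemAlloc(void) {
      if (freeList == NULL) {
        /* one allocation covers 32 elements, chained together */
        struct Elem* block = (struct Elem*)calloc(32, sizeof(struct Elem));
        for (int i = 0; i < 31; i++) block[i].next = &block[i+1];
        freeList = block;
      }
      struct Elem* e = freeList;
      freeList = e->next;
      return e;
    }
    /* release is a push: e->next = freeList; freeList = e; */

Keeping whole blocks on a separate list (state->pools) is what lets transportDestroyProxy release the memory later.
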
-void* persistentThread(void *opaqueInfo) {
-  struct transportProxyInfo* info = (struct transportProxyInfo*)opaqueInfo;
-  // We need to initialize the context before launching any NCCL cuda kernel,
-  // otherwise we would create it during the first cudaMemcpyAsync inside the
-  // proxy function and that would cause a deadlock
-  cudaSetDevice(info->comm->cudaDev);
-  // Signal the main thread the context is created and it can proceed.
-  SetProxyReady(info);
+void* persistentThread(void *comm_) {
+  struct ncclComm* comm = (struct ncclComm*)comm_;
+  struct ncclProxyState* state = &comm->proxyState;
+  struct ncclProxyArgs* op = NULL;
+  ncclResult_t ret = ncclSuccess;
+  int idle = 1;
+  int idleSpin = 0;
   while (1) {
-    struct ncclProxyArgs args;
-    FifoPullArgs(info, &args);
-    if (args.active == -1) {
-      // Main thread asked to stop
+    do {
+      if (*comm->abortFlag) return NULL;
+      if (op == NULL) {
+        pthread_mutex_lock(&state->mutex);
+        op = state->ops;
+        if (op == NULL) {
+          if (state->stop) {
+            // No more commands to process and proxy has been requested to stop
+            pthread_mutex_unlock(&state->mutex);
+            return NULL;
+          }
+          pthread_cond_wait(&state->cond, &state->mutex);
+        }
+        pthread_mutex_unlock(&state->mutex);
+      }
+    } while (op == NULL);
+    op->idle = 0;
+    if (op->state != ncclProxyOpNone) ret = op->progress(op);
+    if (ret != ncclSuccess) {
+      comm->fatalError = ret;
+      INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
       return NULL;
     }
-    ncclResult_t res = info->func(&args);
-    if (res != ncclSuccess) {
-      WARN("%s:%d -> %d [Proxy thread error]", __FILE__, __LINE__, res);
+    idle &= op->idle;
+    pthread_mutex_lock(&state->mutex);
+    if (!idle) idleSpin = 0;
+    struct ncclProxyArgs *next = op->next;
+    if (next->state == ncclProxyOpNone) {
+      struct ncclProxyArgs *freeOp = next;
+      if (next->nextPeer) {
+        // Replace next by its next per-peer element.
+        next = next->nextPeer;
+        if (op != freeOp) {
+          next->next = freeOp->next;
+          op->next = next;
+        } else {
+          next->next = next;
+        }
+      } else {
+        // Remove next from circular list
+        next->connector->proxyAppend = NULL;
+        if (op != freeOp) {
+          next = next->next;
+          op->next = next;
+        } else {
+          next = NULL;
+        }
+      }
+      if (freeOp == state->ops) state->ops = next;
+      freeOp->next = state->pool;
+      state->pool = freeOp;
     }
+    op = next;
+    if (op == state->ops) {
+      if (idle == 1) {
+        if (++idleSpin == 10) {
+          sched_yield();
+          idleSpin = 0;
+        }
+      }
+      idle = 1;
+    }
+    pthread_mutex_unlock(&state->mutex);
   }
 }
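The loop above round-robins over the circular list and only yields the CPU after ten consecutive fully idle sweeps, trading a little CPU for lower latency when work is flowing. A minimal sketch of that backoff pattern, with a generic poll() standing in for op->progress():

    #include <sched.h>

    // Backoff sketch: spin while work arrives, yield after N fully idle sweeps.
    template <typename Poll>
    void progressLoop(Poll poll, volatile bool* stop) {
      int idleSpin = 0;
      while (!*stop) {
        bool idle = poll();            // returns true when nothing advanced
        if (!idle) { idleSpin = 0; continue; }
        if (++idleSpin == 10) {        // same threshold as the proxy thread
          sched_yield();               // let other host threads run
          idleSpin = 0;
        }
      }
    }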
-ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm) {
-  struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send;
-  threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy);
-  if (proxyfunc) {
-    TRACE(NCCL_NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
-    struct transportProxyInfo* info;
-    NCCLCHECK(ncclCalloc(&info, 1));
-    connector->proxyInfo = info;
-    info->comm = comm;
-    info->cond = PTHREAD_COND_INITIALIZER;
-    info->mutex = PTHREAD_MUTEX_INITIALIZER;
-    info->func = proxyfunc;
-    info->argsFifoHead = info->argsFifoTail = 0;
-    info->proxyReady = 0;
-    pthread_create(&connector->proxyInfo->thread, NULL, persistentThread, info);
-    // Wait for thread to initialize its CUDA context.
-    WaitProxyReady(info);
+ncclResult_t transportStartProxy(struct ncclComm* comm) {
+  pthread_mutex_lock(&comm->proxyState.mutex);
+  if (comm->proxyState.ops != NULL)
+    pthread_cond_signal(&comm->proxyState.cond);
+  pthread_mutex_unlock(&comm->proxyState.mutex);
+  return ncclSuccess;
+}
+
+ncclResult_t transportCreateProxy(struct ncclComm* comm) {
+  if (!comm->proxyThread) {
+    comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
+    comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER;
+    comm->proxyState.ops = NULL;
+    pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
   }
   return ncclSuccess;
 }
 
-ncclResult_t transportDestroyProxy(struct ncclConnector* connector) {
-  if (connector->proxyInfo) {
-    StopProxy(connector->proxyInfo);
-    pthread_join(connector->proxyInfo->thread, NULL);
-    free(connector->proxyInfo);
-    connector->proxyInfo = NULL;
+ncclResult_t transportDestroyProxy(struct ncclComm* comm) {
+  struct ncclProxyState* state = &comm->proxyState;
+
+  // Request the proxy to stop and then wake it
+  pthread_mutex_lock(&state->mutex);
+  state->stop = true;
+  pthread_cond_signal(&state->cond);
+  pthread_mutex_unlock(&state->mutex);
+  if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
+
+  // Free any memory allocated for the proxy arg pools
+  pthread_mutex_lock(&state->mutex);
+  struct ncclProxyState* proxyState = &comm->proxyState;
+  while (proxyState->pools != NULL) {
+    struct ncclProxyPool *next = proxyState->pools->next;
+    free(proxyState->pools);
+    proxyState->pools = next;
   }
+  pthread_mutex_unlock(&state->mutex);
+
   return ncclSuccess;
 }
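The shutdown handshake above is the classic condition-variable pattern: set a stop flag under the mutex, signal, then join, so the worker cannot miss the request while blocked in pthread_cond_wait. Sketched in isolation, with a hypothetical Worker type rather than NCCL's proxy state:

    #include <pthread.h>
    #include <stdbool.h>

    // Shutdown handshake sketch: flag + signal under the lock, then join.
    struct Worker {
      pthread_mutex_t mutex;
      pthread_cond_t cond;
      bool stop;
      pthread_t thread;
    };

    void workerStop(struct Worker* w) {
      pthread_mutex_lock(&w->mutex);
      w->stop = true;                  // worker re-checks this after every wait
      pthread_cond_signal(&w->cond);   // wake it if it is blocked in wait
      pthread_mutex_unlock(&w->mutex);
      pthread_join(w->thread, NULL);   // worker exits once its queue drains
    }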
diff --git a/src/transport/net.cu b/src/transport/net.cu
index 9c366b3..06a6e23 100644
--- a/src/transport/net.cu
+++ b/src/transport/net.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,11 +9,17 @@
 #include "nvmlwrap.h"
 #include "net.h"
 #include "param.h"
-#include "nvlink.h"
+#include "topo.h"
 #include <cuda_runtime.h>
 #include <assert.h>
 
 #define NET_MAX_IFS 16
+#define NET_MAX_GPUS 32
+
+// Cache GPU-NIC distances to avoid re-computing them
+#define NET_TVALUE_UNKNOWN 0ULL
+static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN };
+static int ncclNetNDev;
 
 // We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit)
 #define NET_BITS_PER_IF 3
@@ -28,13 +34,9 @@ static ncclTvalue_t getTvalue(short* distances, int ndev) {
   }
   return tvalue;
 }
-
-struct netInfo {
-  int rank;
-  int ndev;
-  ncclTvalue_t tValue;
-  short distances[NET_MAX_IFS];
-};
+static int getScore(ncclTvalue_t tvalue, int dev) {
+  return (tvalue >> (dev*NET_BITS_PER_IF)) & NET_BITS_PER_IF_MASK;
+}
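With NET_BITS_PER_IF = 3, each NIC contributes a 3-bit score at bit offset dev*3, so a 64-bit ncclTvalue_t can rank up to 21 interfaces. A small worked example of the pack/unpack, assuming NET_BITS_PER_IF_MASK is (1<<NET_BITS_PER_IF)-1 = 0x7 to match the shift-and-mask in getScore:

    #include <cstdint>
    #include <cstdio>

    // Worked example of the 3-bit-per-interface encoding used above.
    int main() {
      const int bits = 3, mask = (1 << bits) - 1;     // mask == 0x7
      int scores[3] = {5, 2, 7};                      // hypothetical per-NIC scores
      uint64_t tvalue = 0;
      for (int d = 0; d < 3; d++)
        tvalue |= (uint64_t)(scores[d] & mask) << (d * bits);
      // tvalue == 5 + 2*8 + 7*64 == 469 (binary 111 010 101)
      for (int d = 0; d < 3; d++)
        printf("dev %d score %d\n", d, (int)((tvalue >> (d * bits)) & mask));
      return 0;
    }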
 
 struct netConnectInfo {
   ncclNetHandle_t netHandle;
 };
@@ -46,11 +48,13 @@ struct netSendResources {
   struct ncclRecvMem* hostRecvMem;
   struct ncclSendMem* devHostSendMem;
   struct ncclRecvMem* devHostRecvMem;
-  struct ncclSendMem* hostDevMem;
   int netDev;
   int useGdr;
-  struct ncclRecvMem* devNetMem;
-  uint64_t llStep;
+  int buffSize;
+  void* mhandle;
+  void* llMhandle;
+  struct ncclRecvMem* devRecvMem;
+  uint64_t step;
   uint64_t llLastCleaning;
 };
@@ -61,50 +65,70 @@ struct netRecvResources {
   struct ncclRecvMem* hostRecvMem;
   struct ncclSendMem* devHostSendMem;
   struct ncclRecvMem* devHostRecvMem;
-  struct ncclRecvMem* hostDevMem;
   int netDev;
   int useGdr;
-  uint64_t llStep;
+  int buffSize;
+  void* mhandle;
+  void* llMhandle;
+  struct ncclRecvMem* devRecvMem;
+  uint64_t step;
   uint64_t llLastCleaning;
 };
 
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
-  struct netInfo* info = (struct netInfo*)opaqueInfo;
-  static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large");
-  info->rank = rank;
-  NCCLCHECK(ncclNetDevices(&info->ndev));
-  if (info->ndev == 0) {
+static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
+  char* cudaPath = NULL;
+  char* nicPath = NULL;
+  ncclResult_t err;
+  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+  err = ncclNetPciPath(dev, &nicPath);
+  *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
+  if (nicPath) free(nicPath);
+  if (cudaPath) free(cudaPath);
+  return ncclSuccess;
+}
+
+static ncclResult_t netDevices(int* ndev, short** distances) {
+  NCCLCHECK(ncclNetDevices(ndev));
+  if (*ndev == 0) {
     WARN("Error : Network returned 0 device");
     return ncclSystemError;
   }
-  if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS;
+  if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS;
 
-  // Find distance with current GPU
-  int cudaDev;
-  cudaGetDevice(&cudaDev);
-  char* cudaPath;
-  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+  *distances = (short*)malloc(*ndev*sizeof(short));
+  if (*distances == NULL) return ncclSystemError;
+  // Find distance with current GPU
+  int cudaDev, nvmlDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
   char line[1024];
-  sprintf(line, "CUDA Dev %d, %s NIC distance : ", cudaDev, ncclNetName());
-  for (int d=0; d<info->ndev; d++) {
-    char* nicPath;
-    ncclResult_t err = ncclNetPciPath(d, &nicPath);
-    info->distances[d] = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
-    sprintf(line+strlen(line), " %s", pathDists[info->distances[d]]);
-    if (err == ncclSuccess) free(nicPath);
+  sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName());
+  for (int d=0; d<*ndev; d++) {
+    NCCLCHECK(netDistance(cudaDev, d, *distances+d));
+    sprintf(line+strlen(line), " %s", pathDists[(*distances)[d]]);
   }
   INFO(NCCL_INIT|NCCL_NET, "%s", line);
-  free(cudaPath);
   return ncclSuccess;
 }
 
 /* Determine if we can communicate with the peer */
-ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
-  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  ret[0] = getTvalue(myInfo->distances, myInfo->ndev);
+ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  ret[0] = ncclNetTvalues[cudaDev];
+  if (ret[0] == NET_TVALUE_UNKNOWN) {
+    if (cudaDev >= NET_MAX_GPUS) {
+      WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS);
+      return ncclInternalError;
+    }
+    int nDev;
+    short* distances;
+    NCCLCHECK(netDevices(&nDev, &distances));
+    ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev);
+    ncclNetNDev = nDev;
+    free(distances);
+  }
   return ncclSuccess;
 }
@@ -196,45 +220,51 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
   return ncclSuccess;
 }
 
-int getDev(int ringId, int nDev, short* distances) {
-  int minDistance = PATH_SOC;
-  for (int d=0; d<nDev; d++) if (distances[d] < minDistance) minDistance = distances[d];
+int getDev(int cudaDev, int ringId) {
+  ncclTvalue_t tvalues = ncclNetTvalues[cudaDev];
+
+  int dev = 0;
+  int maxScore = 0;
+  for (int d=0; d<ncclNetNDev; d++) if (getScore(tvalues,d) > maxScore) maxScore = getScore(tvalues,d);
   int skip = ringId+1;
   while (skip) {
-    for (int d=0; d<nDev; d++) {
-      if (distances[d] == minDistance) {
+    for (int d=0; d<ncclNetNDev; d++) {
+      if (getScore(tvalues, d) == maxScore) {
        skip--;
-        if (skip == 0) return d;
+        if (skip == 0) { dev = d; goto end; }
      }
    }
  }
-  return 0;
+end:
+  return dev;
 }
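getDev spreads channels across all NICs that share the best score: channel 0 takes the first best NIC, channel 1 the second, wrapping around when there are more channels than best NICs. A self-contained sketch of the same selection, with hypothetical scores {3, 5, 5}:

    #include <cstdio>

    // Round-robin over best-scoring NICs, mirroring getDev() above.
    static int pickDev(const int* score, int n, int ringId) {
      int maxScore = 0;
      for (int d = 0; d < n; d++) if (score[d] > maxScore) maxScore = score[d];
      int skip = ringId + 1;
      while (true)
        for (int d = 0; d < n; d++)
          if (score[d] == maxScore && --skip == 0) return d;
    }

    int main() {
      int score[3] = {3, 5, 5};          // hypothetical per-NIC scores
      for (int ring = 0; ring < 4; ring++)
        printf("ring %d -> dev %d\n", ring, pickDev(score, 3, ring));
      // ring 0 -> dev 1, ring 1 -> dev 2, ring 2 -> dev 1, ring 3 -> dev 2
      return 0;
    }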
 
 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
 NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
 
-static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGdr) {
+static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
   *useGdr = 0;
 
-  int cudaDev;
+  int cudaDev, nvmlDev;
   CUDACHECK(cudaGetDevice(&cudaDev));
+  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
 
   if (read) { // For reads (sends) only enable under certain conditions
     int gdrReadParam = ncclParamNetGdrRead();
     if (gdrReadParam == 0) return ncclSuccess;
-    else if (gdrReadParam < 0) { // default : enable only on DGX2
-      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-      CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
-      int nvlinks = getNumNvlinks(busId);
-      if (nvlinks < CONNECT_NVSWITCH || ncclCudaCompCap() < 7) return ncclSuccess;
+    if (gdrReadParam < 0) {
+      int nvlink;
+      NCCLCHECK(ncclNvlinkGpu(&nvlink));
+      if (!nvlink) return ncclSuccess;
     }
   }
 
   // Check if we are close enough that it makes sense to enable GDR
   int netGdrLevel = ncclParamNetGdrLevel();
+  short distance;
+  NCCLCHECK(netDistance(cudaDev, dev, &distance));
   if (distance >= netGdrLevel) {
-    INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, dev, distance, netGdrLevel);
+    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel);
     return ncclSuccess;
   }
@@ -243,51 +273,59 @@ static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGd
   NCCLCHECK(ncclNetPtrSupport(dev, &flags));
   if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
   *useGdr = 1;
-  INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d >= %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read);
+  INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read);
   return ncclSuccess;
 }
 
 /* Determine if we will use this transport for this peer and return connect
  * information for this peer */
-ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
   struct netSendResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
-  ring->send.transportResources = resources;
+  send->transportResources = resources;
+
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  resources->netDev = getDev(cudaDev, channelId);
+  NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr));
 
-  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
-  NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 1, &resources->useGdr));
+  int sendSize = sizeof(struct ncclSendMem);
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
 
-  int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
   if (resources->useGdr) {
-    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size));
+    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
   }
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
+  resources->buffSize = buffSize;
 
-  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, size));
-  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, size));
-
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev,
+      resources->useGdr ? "/GDRDMA" : "");
   return ncclSuccess;
 }
 
-ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
   struct netRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
-  ring->recv.transportResources = resources;
+  recv->transportResources = resources;
 
-  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
-  NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 0, &resources->useGdr));
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  resources->netDev = getDev(cudaDev, channelId);
+  NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr));
 
   int sendSize = sizeof(struct ncclSendMem);
   NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
 
-  int recvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+  if (resources->useGdr) {
+    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+  }
   NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
+  resources->buffSize = buffSize;
 
-  struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo;
-  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
-      resources->useGdr ? "/GDRDMA" : "",
-      (resources->hostDevMem != NULL) ? "/GDCopy" : "");
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
+      resources->useGdr ? "/GDRDMA" : "");
 
   struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
   NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
   return ncclSuccess;
@@ -297,27 +335,28 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   // Setup device pointers
   struct netSendResources* resources = (struct netSendResources*)send->transportResources;
 
-  if (resources->useGdr) {
-    send->conn.buff = resources->devNetMem->buff;
-    // We don't use devMem for llMode because the CPU has to read the data
-    send->conn.llBuff = resources->devHostRecvMem->llBuff;
-  } else {
-    send->conn.buff = resources->devHostRecvMem->buff;
-    send->conn.llBuff = resources->devHostRecvMem->llBuff;
-  }
+  // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
+  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+  send->conn.buff = recvMem->buff;
+  send->conn.llBuff = resources->devHostRecvMem->llBuff;
+
+  // Head/Tail/Opcount/Fifos are always on host
   send->conn.tail = &resources->devHostRecvMem->tail;
-  send->conn.opCount = &resources->devHostRecvMem->opCount;
+  send->conn.opCountRem = &resources->devHostRecvMem->opCount;
   send->conn.fifo = resources->devHostRecvMem->sizesFifo;
-  send->conn.llFifo = resources->devHostRecvMem->llSizesFifo;
-
-  if (resources->hostDevMem == NULL) {
-    send->conn.head = &resources->devHostSendMem->head;
-    send->conn.llHead = &resources->devHostSendMem->llHead;
-  }
+  send->conn.head = &resources->devHostSendMem->head;
+  send->conn.opCountLoc = &resources->devHostSendMem->opCount;
+  for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
 
   // Connect to remote peer
   struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
   NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
+
+  NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->buff, resources->buffSize,
+        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
+  NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff,
+        NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle));
+
   return ncclSuccess;
 }
 
@@ -326,32 +365,37 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   // Setup device pointers
   struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
 
-  recv->conn.head = &resources->devHostSendMem->head;
-  recv->conn.llHead = &resources->devHostSendMem->llHead;
-
-  if (resources->useGdr == 0) {
-    recv->conn.buff = resources->devHostRecvMem->buff;
-    recv->conn.llBuff = resources->devHostRecvMem->llBuff;
-  }
+  // Intermediate buffering on GPU for GPU Direct RDMA
+  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+  recv->conn.buff = recvMem->buff;
+  recv->conn.llBuff = recvMem->llBuff;
 
-  if (resources->hostDevMem == NULL) {
-    recv->conn.tail = &resources->devHostRecvMem->tail;
-    recv->conn.opCount = &resources->devHostRecvMem->opCount;
-  }
+  // Head/Tail/Opcount are always on host
+  recv->conn.tail = &resources->devHostRecvMem->tail;
+  recv->conn.opCountLoc = &resources->devHostRecvMem->opCount;
+  recv->conn.head = &resources->devHostSendMem->head;
+  recv->conn.opCountRem = &resources->devHostSendMem->opCount;
 
-  // Finish connection establishment
+  // Finish connection establishment from remote peer
  NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
  NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
 
+  NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->buff, resources->buffSize,
+        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
+  NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE,
+        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle));
+
   return ncclSuccess;
 }
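Buffers are now registered once per connection at connect/accept time and deregistered at teardown, rather than being looked up or re-registered on every Isend/Irecv. A minimal sketch of that pairing, with hypothetical netRegister/netDeregister hooks standing in for the transport's register/deregister calls (not NCCL's API):

    // Hypothetical transport hooks, declared here only for the sketch.
    int netRegister(void* comm, void* data, int size, void** mhandle);
    int netDeregister(void* comm, void* mhandle);

    struct Conn { void* comm; void* mhandle; };

    int connOpen(Conn* c, void* buf, int size) {
      return netRegister(c->comm, buf, size, &c->mhandle);  // pin once per connection
    }
    int connClose(Conn* c) {
      return netDeregister(c->comm, c->mhandle);            // unpin at teardown
    }

Amortizing registration over the connection lifetime matters because pinning and mapping memory for RDMA is expensive relative to posting a send or receive.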
 
 ncclResult_t netSendFree(void* transportResources) {
   struct netSendResources* resources = (struct netSendResources*)transportResources;
   NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle));
+  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
   NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
   if (resources->useGdr)
-    CUDACHECK(cudaFree(resources->devNetMem));
+    CUDACHECK(cudaFree(resources->devRecvMem));
   NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
   free(resources);
   return ncclSuccess;
@@ -360,196 +404,166 @@ ncclResult_t netSendFree(void* transportResources) {
 
 ncclResult_t netRecvFree(void* transportResources) {
   struct netRecvResources* resources = (struct netRecvResources*)transportResources;
   NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle));
+  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
   NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+  if (resources->useGdr)
+    CUDACHECK(cudaFree(resources->devRecvMem));
   NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
   free(resources);
   return ncclSuccess;
 }
 
 ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
-  struct ncclRing* ring = args->ring;
-  struct netSendResources* resources = (struct netSendResources*) (ring->send.transportResources);
-  const int llMode = args->llMode;
-
-  volatile uint64_t* prevTail = &resources->hostRecvMem->tail;
-  struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem;
-  uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head;
-  struct ncclRecvMem* localMem = resources->useGdr ? resources->devNetMem : resources->hostRecvMem;
-  char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff;
-  int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
-  volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo;
-  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
-  int sliceSize = buffSize / args->substeps;
-
-  assert(args->substeps <= SIZES_FIFO_SIZE);
-
-  uint64_t head = llMode ? resources->llStep : 0ULL;
-  uint64_t tail = llMode ? resources->llStep : 0ULL;
-  uint64_t end = head + args->nsteps;
-
-  int idle = 0;
-  void* requests[args->substeps];
-
-  if (!args->needProxy) goto nextColl;
-
-  TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
-  TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
-
-  // Update in case we skipped some collectives
-  if (llMode == 0) resources->hostRecvMem->opCount = args->opCount;
-
-  while (head < end) {
-    idle++;
-    if (llMode) {
-      if (tail < end && tail < head + args->substeps) {
-        int slot = tail%args->substeps;
-        int size = sizesFifo[slot];
-        if (size != 0) {
-          if (size == -1) size = 0;
-          uint32_t flag = tail + 1;
-          int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
-          size = nFifoLines * sizeof(union ncclLLFifoLine);
-          union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+slot*sliceSize);
-          for (int i=0; i<nFifoLines; i++) {
-            volatile uint32_t *f1 = &lines[i].flag1;
-            volatile uint32_t *f2 = &lines[i].flag2;
-            while (f1[0] != flag || f2[0] != flag);
+  struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources);
+  if (args->state == ncclProxyOpReady) {
+    // Update opCount
+    resources->hostRecvMem->opCount = args->opCount;
+
+    // Round to next multiple of sliceSteps
+    resources->step = ROUNDUP(resources->step, args->chunkSteps);
+    args->head = resources->step;
+    args->tail = resources->step;
+    args->end = args->head + args->nsteps;
+    args->state = ncclProxyOpProgress;
+  }
+  if (args->state == ncclProxyOpProgress) {
+    args->idle = 1;
+    if (args->head < args->end) {
+      if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
+        volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
+        if (args->llMode) {
+          int buffSlot = args->tail%NCCL_STEPS;
+          int size = sizesFifo[buffSlot];
+          if (size != -1) {
+            uint32_t flag = args->tail + 1;
+            int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+            size = nFifoLines * sizeof(union ncclLLFifoLine);
+            union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
+            int ready = 1;
+            for (int i=0; i<nFifoLines; i++) {
+              volatile uint32_t *f1 = &lines[i].flag1;
+              volatile uint32_t *f2 = &lines[i].flag2;
+              if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
+            }
+            if (ready) {
+              NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, resources->llMhandle, args->requests+buffSlot));
+              if (args->requests[buffSlot] != NULL) {
+                sizesFifo[buffSlot] = -1;
+                // Make sure size is reset before we update the tail.
+                __sync_synchronize();
+                args->tail += args->sliceSteps;
+                args->idle = 0;
+              }
+            }
          }
-          NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, ptrType, requests+slot));
-          if (requests[slot] != NULL) {
-            sizesFifo[slot] = size;
-            tail++;
-            idle = 0;
+        } else if (args->tail < resources->hostRecvMem->tail) {
+          struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+          int stepSize = args->channel->buffSize/NCCL_STEPS;
+          // Send through network
+          int buffSlot = args->tail%NCCL_STEPS;
+          NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
+          if (args->requests[buffSlot] != NULL) {
+            sizesFifo[buffSlot] = -1;
+            // Make sure size is reset before we update the tail.
+            __sync_synchronize();
+            args->tail += args->sliceSteps;
+            args->idle = 0;
          }
        }
      }
-    } else while (tail < *prevTail) {
-      // Send through network
-      int slot = tail%args->substeps;
-      NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+slot*sliceSize, sizesFifo[slot], ptrType, requests+slot));
-      if (requests[slot] != NULL) {
-        tail++;
-        idle = 0;
-      }
-    }
-    if (head < tail) {
-      int done;
-      int slot = head%args->substeps;
-      NCCLCHECK(ncclNetTest(requests[slot], &done, NULL));
-      if (done) {
-        if (llMode) {
-          sizesFifo[slot] = 0;
-          // Make sure size is reset to zero before we update the head.
-          __sync_synchronize();
+      if (args->head < args->tail) {
+        int done;
+        int buffSlot = args->head%NCCL_STEPS;
+        NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
+        if (done) {
+          args->head += args->sliceSteps;
+          resources->hostSendMem->head = args->head;
+          args->idle = 0;
        }
-        head++;
-        *prevHead = head;
-        idle = 0;
      }
    }
-    if (idle) transportProxyIdle(idle);
+    if (args->head == args->end) {
+      resources->step = args->end;
+      args->idle = 0;
+      args->state = ncclProxyOpDone;
+    }
  }
-
-  // Reset
-  if (llMode == 0) *prevTail = 0;
-
-nextColl:
-  if (llMode) {
-    resources->llStep += args->nsteps;
-    // Don't forget to ack otherwise the GPU won't be able to push data.
-    *prevHead = resources->llStep;
-    if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      memset(localBuff, 0, NCCL_LL_BUFF_SIZE);
-      resources->llStep += NCCL_LL_CHUNKS;
-      *prevHead = resources->llStep;
-      resources->llLastCleaning = resources->llStep;
+  if (args->state == ncclProxyOpDone) {
+    union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff;
+    if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+      for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step;
+      resources->step += NCCL_STEPS;
+      resources->hostSendMem->head = resources->step;
+      resources->llLastCleaning = resources->step;
    }
+    args->state = ncclProxyOpNone;
  }
   return ncclSuccess;
 }
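In LL (low-latency) mode the GPU writes each 16-byte line with two copies of a per-step flag, and the proxy may only hand a slot to the network once every line in it carries flag == tail + 1, which is what the ready loop above checks. A reduced sketch of the consumer-side handshake, with a simplified line layout (NCCL's ncclLLFifoLine also aliases the fields through other views):

    #include <cstdint>

    // One LL line: two 4-byte data words, each paired with a flag word.
    union LLLine { struct { uint32_t data1, flag1, data2, flag2; }; };

    // Consumer-side readiness check, as in the proxy's ready loop above.
    static bool slotReady(volatile LLLine* lines, int nLines, uint32_t flag) {
      for (int i = 0; i < nLines; i++)
        if (lines[i].flag1 != flag || lines[i].flag2 != flag) return false;
      return true;
    }

Because the flag doubles as the synchronization token, no separate tail pointer write from the GPU is needed, which is what makes the LL path cheap for small messages.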
 
 ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
-  struct ncclRing* ring = args->ring;
-  struct netRecvResources* resources = (struct netRecvResources*) (ring->recv.transportResources);
-  int llMode = args->llMode;
-
-  volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head;
-  struct ncclRecvMem* localMem = resources->useGdr ? ring->devMemRecv : resources->hostRecvMem;
-  char* localBuff = llMode ? localMem->llBuff : localMem->buff;
-  char* nextBuff = (resources->useGdr == 0 && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
-  int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
-  uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail;
-
-  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
-  int sliceSize = buffSize / args->substeps;
-
-  uint64_t head = llMode ? resources->llStep : 0ULL;
-  uint64_t tail = llMode ? resources->llStep : 0ULL;
-  uint64_t end = head + args->nsteps;
-
-  int idle = 0;
-  void* requests[args->substeps];
-
-  if (!args->needProxy) goto nextColl;
-
-  TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
-  TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
-
-  if (llMode == 0) {
-    // Waiting for next opCount is only needed before writing nextTail.
-    uint64_t* nextOpCount = resources->hostDevMem ? &resources->hostDevMem->opCount : &resources->hostRecvMem->opCount;
-    transportProxyWait([=] { return *nextOpCount >= args->opCount; });
+  struct netRecvResources* resources = (struct netRecvResources*) (args->connector->transportResources);
+  if (args->state == ncclProxyOpReady) {
+    // Update opCount
+    resources->hostSendMem->opCount = args->opCount;
+
+    // Round to next multiple of sliceSteps
+    resources->step = ROUNDUP(resources->step, args->chunkSteps);
+    args->head = resources->step;
+    args->tail = resources->step;
+    args->end = args->head + args->nsteps;
+    args->state = ncclProxyOpProgress;
  }
-
-  while (head < end) {
-    idle++;
-    if ((tail < head + args->substeps) && (tail < *nextHead + args->substeps) && (tail < end)) {
-      int slot = tail%args->substeps;
-      NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+slot*sliceSize, sliceSize, ptrType, requests+slot));
-      if (requests[slot] != NULL) {
-        tail++;
-        idle = 0;
+  if (args->state == ncclProxyOpProgress) {
+    args->idle = 1;
+    int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
+    if (args->head < args->end) {
+      struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+      char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
+      void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
+      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
+        int buffSlot = args->tail%NCCL_STEPS;
+        int sliceSize = stepSize * args->sliceSteps;
+        NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
+        if (args->requests[buffSlot] != NULL) {
+          args->tail += args->sliceSteps;
+          args->idle = 0;
+        }
      }
-    }
-    if (tail > head) {
-      int done;
-      int slot = head%args->substeps;
-      int size;
-      NCCLCHECK(ncclNetTest(requests[slot], &done, &size));
-      if (done) {
-        if (nextBuff) memcpy(nextBuff+slot*sliceSize, localBuff+slot*sliceSize, size);
-        head++;
-        if (llMode == 0) {
-          if (ptrType == NCCL_PTR_CUDA) ncclNetFlush(resources->netRecvComm, localBuff+slot*sliceSize, size);
-          *nextTail = head;
+      if (args->tail > args->head) {
+        int buffSlot = args->head%NCCL_STEPS;
+        int done, size;
+        NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
+        if (done) {
+          args->head += args->sliceSteps;
+          if (args->llMode == 0) {
+            if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
+            resources->hostRecvMem->tail = args->head;
          }
+          args->idle = 0;
        }
      }
-    if (idle) transportProxyIdle(idle);
-  }
-
-  // Wait for last ack and reset
-  if (llMode == 0) {
-    transportProxyWait([=] { return *nextHead == head; });
-    *nextHead = 0;
+    if (args->head == args->end) {
+      resources->step = args->end;
+      args->idle = 0;
+      args->state = ncclProxyOpDone;
    }
  }
-
-nextColl:
-  if (llMode) {
-    resources->llStep += args->nsteps;
-    if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      resources->llStep += NCCL_LL_CHUNKS;
-      while (*nextHead < resources->llStep);
-      resources->llLastCleaning = resources->llStep;
+  if (args->state == ncclProxyOpDone) {
+    if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+      resources->step += NCCL_STEPS;
+      while (resources->hostSendMem->head < resources->step);
+      resources->llLastCleaning = resources->step;
    }
+    args->state = ncclProxyOpNone;
  }
   return ncclSuccess;
 }
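Each queued operation now advances through a small state machine (Ready, Progress, Done, None) driven by repeated progress() calls from the single proxy thread, instead of running to completion on a per-connector thread. A compact sketch of that pattern, with hypothetical types:

    // Proxy-op state machine sketch: progress() is called repeatedly and does a
    // bounded amount of work per call, as netSendProxy/netRecvProxy do above.
    enum class OpState { Ready, Progress, Done, None };

    struct Op {
      OpState state = OpState::Ready;
      int head = 0, end = 4;
    };

    void progress(Op* op) {
      if (op->state == OpState::Ready) { op->head = 0; op->state = OpState::Progress; }
      if (op->state == OpState::Progress) {
        op->head++;                                  // one bounded unit of work
        if (op->head == op->end) op->state = OpState::Done;
      }
      if (op->state == OpState::Done) op->state = OpState::None;  // recyclable
    }

Bounding the work per call is what allows one thread to multiplex many connections without starving any of them.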
 
 struct ncclTransport netTransport = {
   "NET",
-  netFillInfo,
   netCanConnect,
   netGetRings,
   { netSendSetup, netSendConnect, netSendFree, netSendProxy },
diff --git a/src/transport/net_ib.cu b/src/transport/net_ib.cu
index 18e158d..f7c574b 100644
--- a/src/transport/net_ib.cu
+++ b/src/transport/net_ib.cu
@@ -32,6 +32,7 @@ static int ncclNIbDevs = -1;
 struct ncclIbDev {
   int device;
   uint8_t port;
+  uint8_t link;
   ibv_context* context;
   char devName[MAXNAMESIZE];
 };
@@ -97,7 +98,6 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
       WARN("NET/IB : No IP interface found.");
       return ncclInternalError;
     }
-    INFO(NCCL_INIT|NCCL_NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
 
     // Detect IB cards
     int nIbDevs;
@@ -113,47 +113,59 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
     for (int d=0; d<nIbDevs; d++) {
       struct ibv_context * context;
-      if (ncclSuccess != wrap_ibv_open_device(&context, devices[d])) {
+      if (ncclSuccess != wrap_ibv_open_device(&context, devices[d]) || context == NULL) {
         WARN("NET/IB : Unable to open device %s", devices[d]->name);
         continue;
       }
       int found = 0;
-      if (context) {
-        struct ibv_device_attr devAttr;
-        if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
-          WARN("NET/IB : Unable to query device %s", devices[d]->name);
+      struct ibv_device_attr devAttr;
+      if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
+        WARN("NET/IB : Unable to query device %s", devices[d]->name);
+        if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
+        continue;
+      }
+      for (int port = 1; port <= devAttr.phys_port_cnt; port++) {
+        struct ibv_port_attr portAttr;
+        if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) {
+          WARN("NET/IB : Unable to query port %d", port);
          continue;
        }
-        for (int port = 1; port <= devAttr.phys_port_cnt; port++) {
-          struct ibv_port_attr portAttr;
-          if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) {
-            WARN("NET/IB : Unable to query port %d", port);
-            continue;
-          }
-          if (portAttr.state != IBV_PORT_ACTIVE) continue;
-          if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
-              && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
-
-          // check against user specified HCAs/ports
-          if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
-            continue;
-          }
-          INFO(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
-              portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
-          ncclIbDevs[ncclNIbDevs].device = d;
-          ncclIbDevs[ncclNIbDevs].port = port;
-          ncclIbDevs[ncclNIbDevs].context = context;
-          strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
-          ncclNIbDevs++;
-          found++;
-          pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
-        }
+        if (portAttr.state != IBV_PORT_ACTIVE) continue;
+        if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
+            && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
 
-        if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } }
+        // check against user specified HCAs/ports
+        if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
+          continue;
+        }
+        TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
+            portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+        ncclIbDevs[ncclNIbDevs].device = d;
+        ncclIbDevs[ncclNIbDevs].port = port;
+        ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
+        ncclIbDevs[ncclNIbDevs].context = context;
+        strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
+        ncclNIbDevs++;
+        found++;
+        pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
      }
+      if (found == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
    }
     if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
   }
+  if (ncclNIbDevs == 0) {
+    INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found.");
+  } else {
+    char line[1024];
+    line[0] = '\0';
+    for (int d=0; d<ncclNIbDevs; d++) {
+      snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName,
+          ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+    }
+    line[1023] = '\0';
+    char addrline[1024];
+    INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s ; OOB %s:%s", line, ncclIbIfName, socketToString(&ncclIbIfAddr.sa, addrline));
+  }
   pthread_mutex_unlock(&ncclIbLock);
   }
   return ncclSuccess;
@@ -205,11 +217,12 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
 ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
   *supportedTypes = NCCL_PTR_HOST;
-  int cudaDev;
+  int cudaDev, nvmlDev;
   CUDACHECK(cudaGetDevice(&cudaDev));
+  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
   if (ncclIbGdrSupport(dev) != ncclSuccess) {
-    INFO(NCCL_INIT|NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (no module or not supported by GPU)", cudaDev, ncclIbDevs[dev].devName);
+    INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d '%s' (no module or not supported by GPU)", cudaDev, nvmlDev, dev, ncclIbDevs[dev].devName);
     return ncclSuccess;
   }
   *supportedTypes |= NCCL_PTR_CUDA;
@@ -242,23 +255,15 @@ struct ncclIbHandle {
   union socketAddress connectAddr;
 };
 
-struct ncclIbMr {
-  struct ibv_mr* mr;
-  int refcnt;
-};
-
 struct ncclIbVerbs {
   struct ibv_pd* pd;
   struct ibv_cq* cq;
-  struct ncclIbMr mrPool[MAX_REQUESTS];
-  int mrRotation;
 };
 
 struct ncclIbRequest {
   int used;
   int type;
   struct ncclIbVerbs* verbs;
-  struct ncclIbMr * ibMr;
   int done;
   int size;
   int free;
@@ -278,12 +283,12 @@ struct ncclIbSendFifo {
 };
 
 struct ncclIbSendComm {
+  struct ncclIbVerbs verbs;
   struct ncclIbSendFifo fifo[MAX_REQUESTS];
   struct ncclIbRequest reqs[MAX_REQUESTS];
   uint32_t fifoHead;
   int fd;
   int ready;
-  struct ncclIbVerbs verbs;
   struct ibv_qp* qp;
   struct ibv_mr* fifoMr;
 };
@@ -307,11 +312,11 @@ struct ncclIbRemFifo {
 };
 
 struct ncclIbRecvComm {
+  struct ncclIbVerbs verbs;
   struct ncclIbRemFifo remFifo;
   struct ncclIbRequest reqs[MAX_REQUESTS];
   int fd;
   int ready;
-  struct ncclIbVerbs verbs;
   struct ibv_qp* qp;
   struct ncclIbGpuFlush gpuFlush;
 };
@@ -434,13 +439,13 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
   // RoCE support
   qpInfo.lid = portAttr.lid;
   if (qpInfo.lid) { // IB
-    INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
+    INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
   } else { // RoCE
     union ibv_gid gid;
     NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
     qpInfo.spn = gid.global.subnet_prefix;
     qpInfo.iid = gid.global.interface_id;
-    INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
+    INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
   }
 
   NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo)));
@@ -537,7 +542,6 @@ ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest**
       r->used = 1;
       r->type = 0;
       r->verbs = NULL;
-      r->ibMr = NULL;
       r->done = 0;
       r->size = -1;
       r->free = 0;
@@ -583,57 +587,34 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size);
 
 #define REG_ALIGN (4096)
 
-// Cache previous MRs to avoid registering/unregistering for each Isend/Irecv
-ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct ncclIbMr** mrRet) {
+ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+  struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
   uint64_t addr = (uint64_t)data;
-  int elem = -1;
   assert(size > 0);
 
-  // Look for an already existing MR
-  for (int i=0; i<MAX_REQUESTS; i++) {
-    if (verbs->mrPool[i].mr == NULL) continue;
-    uint64_t regAddr = (uint64_t)verbs->mrPool[i].mr->addr;
-    uint64_t regSize = (uint64_t)verbs->mrPool[i].mr->length;
-    if (regAddr <= addr && addr+size <= regAddr+regSize) {
-      *mrRet = verbs->mrPool+i;
-      verbs->mrPool[i].refcnt++;
-      return ncclSuccess;
-    }
-  }
-
-  // Find an unused element
-  if (elem == -1) {
-    elem = (verbs->mrRotation++);
-    for (int i=0; i<MAX_REQUESTS; i++) {
-      elem %= MAX_REQUESTS;
-      if (verbs->mrPool[elem].refcnt > 0) elem++; else break;
-    }
-    if (verbs->mrPool[elem].refcnt > 0) {
-      WARN("NET/IB : memory register : no MR available");
-      return ncclInternalError;
-    }
-  }
-
-  assert(elem < MAX_REQUESTS);
-  assert(verbs->mrPool[elem].refcnt == 0);
-
   // Deregister / register
   uint64_t regAddr = addr & (~(REG_ALIGN-1));
   uint64_t regSize = addr+size - regAddr;
   regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN;
-  if (verbs->mrPool[elem].mr) NCCLCHECK(wrap_ibv_dereg_mr(verbs->mrPool[elem].mr));
-  NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
-  *mrRet = verbs->mrPool+elem;
-  verbs->mrPool[elem].refcnt++;
-  TRACE(NCCL_INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
+  struct ibv_mr* mr;
+  NCCLCHECK(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+  *mhandle = (void*)mr;
+  TRACE(NCCL_INIT,"regAddr %lx size %ld rkey %x", regAddr, regSize, mr->rkey);
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** request) {
+ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
+  NCCLCHECK(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle));
+  return ncclSuccess;
+}
+
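Registration rounds the pinned region out to REG_ALIGN (4096-byte) boundaries: the start address is rounded down and the length rounded up, so for example registering 100 bytes at address 0x1010 pins the whole page range [0x1000, 0x2000). A worked sketch of the same arithmetic:

    #include <cstdint>
    #include <cstdio>

    // Same rounding as ncclIbRegMr above, with REG_ALIGN = 4096.
    int main() {
      const uint64_t REG_ALIGN = 4096;
      uint64_t addr = 0x1010, size = 100;
      uint64_t regAddr = addr & ~(REG_ALIGN-1);                      // 0x1000
      uint64_t regSize = addr + size - regAddr;                      // 0x74
      regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN) * REG_ALIGN;   // 0x1000
      printf("register [0x%llx, 0x%llx)\n",
             (unsigned long long)regAddr, (unsigned long long)(regAddr+regSize));
      return 0;
    }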
+ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
   struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
   if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm));
   if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
 
+  struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+
   // Wait for the receiver to have posted the corresponding receive
   volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS);
   volatile uint32_t * readyPtr = &slot->ready;
@@ -641,7 +622,6 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void**
 
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->type = type;
   req->verbs = &comm->verbs;
   req->size = size;
@@ -654,8 +634,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void**
     wr.sg_list = NULL;
     wr.num_sge = 0;
   } else {
-    NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
-    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey;
     wr.sg_list = &sge;
     wr.num_sge = 1;
   }
@@ -720,14 +699,15 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void** request) {
+ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
   if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm));
   if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
 
+  struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->type = type;
   req->verbs = &comm->verbs;
   req->size = size;
@@ -739,10 +719,8 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void**
   if (size == 0) {
     wr.sg_list = NULL;
     wr.num_sge = 0;
-    req->ibMr = NULL;
   } else {
-    NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
-    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey;
     wr.sg_list = &sge;
     wr.num_sge = 1;
   }
@@ -752,25 +730,25 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void**
   *request = req;
 
   // Post to FIFO to notify sender
-  NCCLCHECK(ncclIbPostFifo(comm, req->ibMr->mr->rkey, (uint64_t)data, size));
+  NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size));
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbFlush(void* recvComm, void* data, int size) {
+ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
   if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess;
 
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
   req->verbs = &comm->verbs;
-  NCCLCHECK(ncclIbGetMr(&comm->verbs, data, 1, &req->ibMr));
+  struct ibv_mr* mr = (struct ibv_mr*)mhandle;
 
   struct ibv_send_wr wr;
   memset(&wr, 0, sizeof(wr));
   wr.wr_id = (uint64_t)req;
 
   wr.wr.rdma.remote_addr = (uint64_t)data;
-  wr.wr.rdma.rkey = req->ibMr->mr->rkey;
+  wr.wr.rdma.rkey = mr->rkey;
   wr.sg_list = &comm->gpuFlush.sge;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_RDMA_READ;
@@ -800,32 +778,31 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
     }
 
     int wrDone = 0;
-    struct ibv_wc wc;
-    NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 1, &wc, &wrDone));
+    struct ibv_wc wcs[4];
+    NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone));
     if (wrDone == 0) return ncclSuccess;
 
-    if (wc.status != IBV_WC_SUCCESS) {
-      WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc.status, wc.opcode, wc.byte_len, wc.vendor_err);
-      return ncclSystemError;
-    }
+    for (int w=0; w<wrDone; w++) {
+      struct ibv_wc *wc = wcs+w;
+      if (wc->status != IBV_WC_SUCCESS) {
+        WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
+        return ncclSystemError;
+      }
 
-    struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc.wr_id;
-    if (doneReq) {
-      if (wc.opcode == IBV_WC_RECV) {
-        doneReq->size = wc.byte_len;
+      struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc->wr_id;
+      if (doneReq) {
+        if (wc->opcode == IBV_WC_RECV) {
+          doneReq->size = wc->byte_len;
 #if USE_RDMA_WRITE
-      } else if (wc.opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
-        doneReq->size = wc.imm_data;
+        } else if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+          doneReq->size = wc->imm_data;
 #endif
-      }
-      if (doneReq->ibMr != NULL) {
-        doneReq->ibMr->refcnt--;
-        if (doneReq->ibMr->refcnt < 0) WARN("NET/IB : doneReq %p MR %p refcount now %d", doneReq, doneReq->ibMr, doneReq->ibMr->refcnt);
-      }
-      doneReq->done = 1;
-      if (doneReq->free == 1) {
-        // This is an internal (FIFO post) req. Free it immediately.
-        doneReq->used = 0;
+        }
+        doneReq->done = 1;
+        if (doneReq->free == 1) {
+          // This is an internal (FIFO post) req. Free it immediately.
+          doneReq->used = 0;
+        }
      }
    }
  }
@@ -837,12 +814,6 @@ ncclResult_t ncclIbCloseSend(void* sendComm) {
     close(comm->fd);
     if (comm->qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qp));
     if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
-    for (int i=0; i<MAX_REQUESTS; i++) {
-      if (comm->verbs.mrPool[i].mr != NULL) {
-        if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : TX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt);
-        NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr));
-      }
-    }
     NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
     free(comm);
   }
@@ -859,12 +830,6 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) {
       if (comm->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->gpuFlush.hostMr));
     }
     if (comm->remFifo.mr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remFifo.mr));
-    for (int i=0; i<MAX_REQUESTS; i++) {
-      if (comm->verbs.mrPool[i].mr != NULL) {
-        if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : RX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt);
-        NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr));
-      }
-    }
     NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
     free(comm);
   }
@@ -889,6 +854,8 @@ ncclNet_t ncclNetIb = {
   ncclIbListen,
   ncclIbConnect,
   ncclIbAccept,
+  ncclIbRegMr,
+  ncclIbDeregMr,
   ncclIbIsend,
   ncclIbIrecv,
   ncclIbFlush,
diff --git a/src/transport/net_socket.cu b/src/transport/net_socket.cu
index 1efee15..0464b43 100644
--- a/src/transport/net_socket.cu
+++ b/src/transport/net_socket.cu
@@ -27,10 +27,19 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
   pthread_mutex_lock(&ncclSocketLock);
   if (ncclNetIfs == -1) {
     ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
-    INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs);
     if (ncclNetIfs <= 0) {
       WARN("NET/Socket : no interface found");
       return ncclInternalError;
+    } else {
+      char line[1024];
+      char addrline[1024];
+      line[0] = '\0';
+      for (int i=0; i<ncclNetIfs; i++) {
+        snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE,
+            socketToString(&ncclNetIfAddrs[i].sa, addrline));
+      }
+      line[1023] = '\0';
+      INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
     }
   }
   pthread_mutex_unlock(&ncclSocketLock);
@@ -113,7 +122,7 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
   union socketAddress localAddr;
   char ifName[MAX_IF_NAME_SIZE];
   if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
-    WARN("No usable listening interface found");
+    WARN("NET/Socket : No usable listening interface found");
     return ncclSystemError;
   }
   // pass the local address back
@@ -205,21 +214,24 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int type, void** request) {
-  if (type != NCCL_PTR_HOST) return ncclInternalError;
+ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+  return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
+}
+ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
+
+ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
   struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
   NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_SEND, data, size, comm->fd, (struct ncclSocketRequest**)request));
   return ncclSuccess;
 }
 
-ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, int type, void** request) {
-  if (type != NCCL_PTR_HOST) return ncclInternalError;
+ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
   struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
   NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_RECV, data, size, comm->fd, (struct ncclSocketRequest**)request));
   return ncclSuccess;
 }
 
-ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size) {
+ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) {
   // We don't support CUDA pointers, so we don't need a flush operation
   return ncclInternalError;
 }
@@ -243,6 +255,8 @@ ncclNet_t ncclNetSocket = {
   ncclSocketListen,
   ncclSocketConnect,
   ncclSocketAccept,
+  ncclSocketRegMr,
+  ncclSocketDeregMr,
   ncclSocketIsend,
   ncclSocketIrecv,
   ncclSocketFlush,
diff --git a/src/transport/p2p.cu b/src/transport/p2p.cu
index 6c4626a..9f3e0b6 100644
--- a/src/transport/p2p.cu
+++ b/src/transport/p2p.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,18 +11,9 @@
 #include "param.h"
 #include <unistd.h>
 #include <cuda_runtime.h>
-#include "nvmlwrap.h"
 #include <ctype.h>
 #include "nvlink.h"
 
-struct p2pInfo {
-  int rank;
-  int cudaDev;
-  uint64_t hostHash;
-  uint64_t pidHash;
-  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-};
-
 struct p2pConnectInfo {
   int direct;
   union {
@@ -31,36 +22,40 @@
   };
 };
 
-#include <sys/types.h>
+struct p2pSendResources {
+  struct ncclSendMem* devMem;
+  void* ipcPtr;
+};
 
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t p2pFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
-  struct p2pInfo* info = (struct p2pInfo*)opaqueInfo;
-  static_assert(sizeof(struct p2pInfo) <= sizeof(ncclTinfo_t), "p2p Info too large");
-  info->rank = rank;
-  CUDACHECK(cudaGetDevice(&info->cudaDev));
-  info->hostHash=getHostHash();
-  info->pidHash=getPidHash();
-
-  // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
-  // cudaDev is a CUDA runtime dev number which could be different from the
-  // NVML device number. Then we get the busID from NVML to be sure it is
-  // consistent with NVML remote PCI bus Ids.
-  CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
-  nvmlDevice_t nvmlDevice;
-  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
-  nvmlPciInfo_t pciInfo;
-  NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
-  strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
-  return ncclSuccess;
-}
+struct p2pRecvResources {
+  struct ncclRecvMem* devMem;
+  void* ipcPtr;
+};
+
+#include <sys/types.h>
 
 NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
 NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
 
+/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
+static int busIdToCudaDev(const char* busId) {
+  int ndev;
+  if (cudaGetDeviceCount(&ndev) != cudaSuccess)
+    return -1;
+  for (int i = 0; i < ndev; i++) {
+    char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+    if (cudaDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess)
+      return -1;
+    if (strcmp(busId, devBusId) == 0) {
+      return i;
+    }
+  }
+  // BusId was not found in our locally visible CUDA devices
+  return -1;
+}
+
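busIdToCudaDev answers "which local CUDA ordinal, if any, owns this PCI bus ID?", returning -1 when CUDA_VISIBLE_DEVICES hides the peer's GPU from this process. A usage sketch built on the helper defined above (the bus ID string is hypothetical):

    #include <cuda_runtime.h>

    // Returns 1 if P2P with the GPU at peerBusId is worth probing from myDev.
    // busIdToCudaDev() is the helper above; -1 means "not visible here".
    static int canTryP2p(int myDev, const char* peerBusId) {
      int peerDev = busIdToCudaDev(peerBusId);      // e.g. "0000:1b:00.0"
      if (peerDev == -1) return 0;                  // hidden by CUDA_VISIBLE_DEVICES
      int p2p = 0;
      if (cudaDeviceCanAccessPeer(&p2p, myDev, peerDev) != cudaSuccess) return 0;
      return p2p;
    }

Exchanging bus IDs instead of raw device ordinals is what makes the check robust when ranks run with different CUDA_VISIBLE_DEVICES settings.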
 /* Determine if we can communicate with the peer through p2p */
-ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
+ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
   // Do not use P2P across root complexes by default (provided CUDA permits it)
   int p2pLevel = PATH_SOC;
   if (ncclParamP2pDisable() == 1) p2pLevel = 0;
@@ -70,23 +65,26 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
 
   if (p2pLevel == 0) return ncclSuccess;
 
-  struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
-  struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
-
   // Rule out different nodes
   if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess;
 
+  // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+  int peerCudaDev = busIdToCudaDev(peerInfo->busId);
+  if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process
+
+  TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
+
   // Do not detect topology if we're on the same GPU. Note this is not really supported.
-  if (myInfo->cudaDev == peerInfo->cudaDev) {
+  if (myInfo->cudaDev == peerCudaDev) {
     *ret = 1 + PATH_SOC;
     return ncclSuccess;
   }
 
   // See if CUDA can do P2P
   int p2p;
-  if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) {
-    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d and dev %d",
-        myInfo->cudaDev, peerInfo->cudaDev);
+  if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != cudaSuccess) {
+    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)",
+        myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
     return ncclSuccess;
   }
   if (p2p == 0) return ncclSuccess;
@@ -102,7 +100,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
   char* myPath;
   char* peerPath;
   ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath);
-  ncclResult_t err2 = getCudaPath(peerInfo->cudaDev, &peerPath);
+  ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath);
   if (err1 == ncclSuccess && err2 == ncclSuccess) {
     int distance = pciDistance(myPath, peerPath);
     if (distance < p2pLevel) {
@@ -174,8 +172,8 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR
 static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) {
   if (nrings == 0) return 0;
   // Copy rings by dup times
-  if (newNrings > MAXRINGS) {
-    newNrings = MAXRINGS;
+  if (newNrings > MAXCHANNELS) {
+    newNrings = MAXCHANNELS;
   }
   for (int r=nrings; r<newNrings; r++) {
     for (int i=0; i<nranks; i++) rings[r*nranks+i] = rings[(r%nrings)*nranks+i];
@@ -191,7 +189,6 @@ int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nrin
   if (connect) {
     inTheRing[rings[0]] = 1;
     nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, rings[1], nranks-2, connect);
-    nrings = copyRings(nranks, rings, nrings, nringsMax);
   } else {
     rings[0] = 0;
     nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, 0, nranks-1, connect);
@@ -209,9 +206,9 @@ static inline int findConnect(int nranks, int* ranks) {
 int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) {
   if (nrings == 0) return 0;
-  if (nrings > MAXRINGS) {
-    WARN("Max rings reached, limiting to %d", MAXRINGS);
-    nrings = MAXRINGS;
+  if (nrings > MAXCHANNELS) {
+    WARN("Max rings reached, limiting to %d", MAXCHANNELS);
+    nrings = MAXCHANNELS;
   }
   // Find existing constraints / connections
   int connect = 0;
@@ -239,9 +236,9 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
 
   if (compNrings && compNrings < nrings && nranks <= 4) {
     // Try to oversubscribe to get a better result
-    int *rings2 = (int *)malloc(sizeof(int)*MAXRINGS*nranks);
-    if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXRINGS*nranks); return 0; }
-    for (int i=0; i<MAXRINGS*nranks; i++) rings2[i] = -1;
+    int *rings2 = (int *)malloc(sizeof(int)*MAXCHANNELS*nranks);
+    if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXCHANNELS*nranks); return 0; }
+    for (int i=0; i<MAXCHANNELS*nranks; i++) rings2[i] = -1;
     int nThreads = *nthreads;
     int compNrings2 = p2pComputeRingsNvLink(values, nranks, rings2, nrings, prev, next, 1, &nThreads);
     if (compNrings2 > compNrings*2) {
@@ -255,7 +252,6 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
 
   // Duplicate the rings for direct NVLink
   compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
@@ -367,8 +363,8 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings,
 ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
   if (*nringsRet == 0) return ncclSuccess;
   int *rings;
-  NCCLCHECK(ncclCalloc(&rings, MAXRINGS*nranks));
-  for (int i=0; i<MAXRINGS*nranks; i++) rings[i] = -1;
+  NCCLCHECK(ncclCalloc(&rings, MAXCHANNELS*nranks));
+  for (int i=0; i<MAXCHANNELS*nranks; i++) rings[i] = -1;
   int nrings = *nringsRet;
 
   // NVswitch
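The setup functions that follow enable peer access with cudaDeviceEnablePeerAccess() and deliberately tolerate cudaErrorPeerAccessAlreadyEnabled. A minimal sketch of that pattern in isolation (enablePeer is an illustrative name):

  // Sketch of the peer-access pattern used by the setup functions below:
  // query cudaDeviceCanAccessPeer, then treat "already enabled" as success.
  #include <cuda_runtime.h>
  #include <stdio.h>

  static int enablePeer(int myDev, int peerDev) {
    int p2p = 0;
    if (cudaDeviceCanAccessPeer(&p2p, myDev, peerDev) != cudaSuccess || p2p == 0)
      return -1;                      // P2P not possible between these devices
    if (cudaSetDevice(myDev) != cudaSuccess) return -1;
    cudaError_t err = cudaDeviceEnablePeerAccess(peerDev, 0);
    if (err == cudaErrorPeerAccessAlreadyEnabled) {
      cudaGetLastError();             // clear the sticky error and continue
      return 0;
    }
    return (err == cudaSuccess) ? 0 : -1;
  }

  int main() {
    printf("peer access 0->1: %s\n", enablePeer(0, 1) == 0 ? "enabled" : "unavailable");
    return 0;
  }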
@@ -446,39 +442,47 @@ end:
 } while (0)
 
 /* Send: Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
-  struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+    struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+
+  struct p2pSendResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  send->transportResources = resources;
+  const int sendSize = sizeof(struct ncclSendMem);
+  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
+
   struct p2pConnectInfo info;
   if (myInfo->pidHash == peerInfo->pidHash) {
     info.direct = 1;
-    info.directPtr = ring->devMemSend;
+    info.directPtr = resources->devMem;
     if (myInfo->cudaDev == peerInfo->cudaDev) {
-      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
+      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank);
     } else {
       // Enable P2P access
      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
       if (err == cudaErrorPeerAccessAlreadyEnabled) {
         cudaGetLastError();
       } else if (err != cudaSuccess) {
-        WARN("failed to peer with device %d: %d %s",
-            peerInfo->cudaDev, err, cudaGetErrorString(err));
+        WARN("failed to peer with device %d(=%d): %d %s",
+            peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
         return ncclInternalError;
       }
       INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
-        ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+        channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
     }
   } else {
+    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
     info.direct = 0;
     // Map IPC and enable P2P access
-    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemSend);
+    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
     if (err != cudaSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
-          myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+      WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
+          myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
       return ncclInternalError;
     }
     INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
-        ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+        channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
     //TRACE_DUMP_IPC(&info.devIpc);
   }
   static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
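In the non-direct branch above, the exporter calls cudaIpcGetMemHandle() and ships the handle to the peer inside struct ncclConnect; p2pSendConnect() further down reopens it with cudaIpcOpenMemHandle(). A sketch of that round trip, assuming the handle travels out-of-band between two separately launched processes (exportBuf/importBuf are illustrative names):

  // Sketch of the CUDA IPC round trip used by the non-direct path. In real
  // use the two halves run in different processes and the 64-byte handle
  // travels out-of-band (NCCL sends it inside struct ncclConnect).
  #include <cuda_runtime.h>
  #include <stdio.h>

  // Process A: allocate device memory and export a handle for it.
  int exportBuf(void** devBuf, cudaIpcMemHandle_t* handle) {
    if (cudaMalloc(devBuf, 1024) != cudaSuccess) return -1;
    return (cudaIpcGetMemHandle(handle, *devBuf) == cudaSuccess) ? 0 : -1;
  }

  // Process B: map the peer's allocation from the received handle.
  int importBuf(cudaIpcMemHandle_t handle, void** remPtr) {
    cudaError_t err = cudaIpcOpenMemHandle(remPtr, handle, cudaIpcMemLazyEnablePeerAccess);
    if (err != cudaSuccess) {
      fprintf(stderr, "open failed: %s\n", cudaGetErrorString(err));
      return -1;
    }
    return 0;  // later: cudaIpcCloseMemHandle(*remPtr), as p2pSendFree does
  }

  int main() {
    void* devBuf; cudaIpcMemHandle_t handle;
    if (exportBuf(&devBuf, &handle) != 0) return 1;
    printf("exported a 1 KB device buffer; send the handle to the peer\n");
    cudaFree(devBuf);
    return 0;
  }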
@@ -487,13 +491,19 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
 }
 
 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
-  struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+    struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
+
+  struct p2pRecvResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  recv->transportResources = resources;
+  const int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
+
   struct p2pConnectInfo info;
   if (myInfo->pidHash == peerInfo->pidHash) {
     info.direct = 1;
-    info.directPtr = ring->devMemRecv;
+    info.directPtr = resources->devMem;
     if (myInfo->cudaDev == peerInfo->cudaDev) {
       TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
     } else {
@@ -502,22 +512,24 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
       if (err == cudaErrorPeerAccessAlreadyEnabled) {
         cudaGetLastError();
       } else if (err != cudaSuccess) {
-        WARN("failed to peer with device %d: %d %s",
-            peerInfo->cudaDev, err, cudaGetErrorString(err));
+        WARN("failed to peer with device %d(=%d): %d %s",
+            peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
         return ncclInternalError;
       }
-      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
     }
   } else {
+    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
     info.direct = 0;
     // Map IPC and enable P2P access
-    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemRecv);
+    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
     if (err != cudaSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
-          myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+      WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
+          myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
       return ncclInternalError;
     }
-    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
     //TRACE_DUMP_IPC(&info.devIpc);
   }
   static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
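The connect functions below wire each ncclConnector so that the two sides share one head counter (living in the sender's ncclSendMem) and one tail counter (living in the receiver's ncclRecvMem). A host-only sketch of that crossed wiring, with plain memory standing in for device/IPC mappings (Mem and Conn are illustrative types, not NCCL's):

  // Host-only sketch of the crossed head/tail wiring done below: the
  // producer advances a tail the consumer polls, and the consumer advances
  // a head the producer polls. NCCL does this with device memory reached
  // through direct pointers or IPC mappings; plain memory keeps it simple.
  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  struct Mem { uint64_t head, tail; };
  struct Conn { uint64_t *headPtr, *tailPtr; };  // what each side sees

  int main() {
    struct Mem sendMem = {0, 0}, recvMem = {0, 0};
    // Both sides reach the same two counters through their own mappings:
    // head lives in the sender's memory, tail in the receiver's.
    struct Conn send = { &sendMem.head, &recvMem.tail };
    struct Conn recv = { &sendMem.head, &recvMem.tail };

    *send.tailPtr += 1;               // sender publishes one slot
    printf("receiver sees tail=%" PRIu64 "\n", *recv.tailPtr);
    *recv.headPtr += 1;               // receiver frees the slot
    printf("sender sees head=%" PRIu64 "\n", *send.headPtr);
    return 0;
  }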
@@ -527,22 +539,16 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
 
 /* Connect/Send to this peer */
 static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
-  void** resources = &send->transportResources;
+  struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
   struct ncclRecvMem* remDevMem;
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
   if (info->direct) {
     remDevMem = (struct ncclRecvMem*)(info->directPtr);
     send->conn.direct = 1;
-    *resources = NULL;
   } else {
-    void* remPtr = NULL;
     //TRACE_DUMP_IPC(&info->devIpc);
-    cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
-    void** ipcPtrSave;
-    NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
-    *resources = ipcPtrSave;
-    *ipcPtrSave = remPtr;
-    remDevMem = (struct ncclRecvMem*)remPtr;
+    cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+    remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
     if (err != cudaSuccess) {
       WARN("failed to open CUDA IPC handle : %d %s",
           err, cudaGetErrorString(err));
@@ -553,30 +559,26 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
 
   send->conn.buff = remDevMem->buff;
   send->conn.llBuff = remDevMem->llBuff;
   send->conn.tail = &remDevMem->tail;
-  send->conn.opCount = &remDevMem->opCount;
-  // send->conn->head should have been set to devMemSend already
+  send->conn.opCountRem = &remDevMem->opCount;
+  send->conn.head = &resources->devMem->head;
+  send->conn.ptrExchange = &resources->devMem->ptrExchange;
+  send->conn.opCountLoc = &resources->devMem->opCount;
   return ncclSuccess;
 }
 
 /* Connect/Recv from this peer */
 ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
-  void** resources = &recv->transportResources;
+  struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
   struct ncclSendMem* remDevMem;
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
   if (info->direct) {
     remDevMem = (struct ncclSendMem*)(info->directPtr);
     recv->conn.direct = 1;
     recv->conn.ptrExchange = &remDevMem->ptrExchange;
-    *resources = NULL;
   } else {
-    void* remPtr = NULL;
     //TRACE_DUMP_IPC(&info->devIpc);
-    cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
-    void** ipcPtrSave;
-    NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
-    *resources = ipcPtrSave;
-    *ipcPtrSave = remPtr;
-    remDevMem = (struct ncclSendMem*)remPtr;
+    cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+    remDevMem = (struct ncclSendMem*)resources->ipcPtr;
     if (err != cudaSuccess) {
       WARN("failed to open CUDA IPC handle : %d %s",
           err, cudaGetErrorString(err));
@@ -584,28 +586,35 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
     }
   }
 
-  // recv->conn->buff should have been set to devMemRecv already
-  // recv->conn->tail should have been set to devMemRecv already
-  // recv->conn->opCount should have been set to devMemRecv already
+  recv->conn.buff = resources->devMem->buff;
+  recv->conn.llBuff = resources->devMem->llBuff;
+  recv->conn.tail = &resources->devMem->tail;
+  recv->conn.opCountLoc = &resources->devMem->opCount;
   recv->conn.head = &remDevMem->head;
-  recv->conn.llHead = &remDevMem->llHead;
+  recv->conn.opCountRem = &remDevMem->opCount;
   return ncclSuccess;
 }
 
-ncclResult_t p2pFree(void* resources) {
-  if (resources != NULL) {
-    void** ipcPtrSave = (void**) resources;
-    CUDACHECK(cudaIpcCloseMemHandle(*ipcPtrSave));
-    free(resources);
-  }
+ncclResult_t p2pSendFree(void* resources) {
+  struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
+  if (sendRes->ipcPtr)
+    CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
+  CUDACHECK(cudaFree(sendRes->devMem));
+  return ncclSuccess;
+}
+
+ncclResult_t p2pRecvFree(void* resources) {
+  struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
+  if (recvRes->ipcPtr)
+    CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
+  CUDACHECK(cudaFree(recvRes->devMem));
   return ncclSuccess;
 }
 
 struct ncclTransport p2pTransport = {
   "P2P",
-  p2pFillInfo,
   p2pCanConnect,
   p2pGetRings,
-  { p2pSendSetup, p2pSendConnect, p2pFree, NULL },
-  { p2pRecvSetup, p2pRecvConnect, p2pFree, NULL }
+  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
+  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
 };
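p2pTransport above (and shmTransport at the end of this diff) is a table of function pointers: canConnect and getRings callbacks plus per-direction {setup, connect, free, proxy} operations, with NULL marking the unused proxy slot. A stripped-down sketch of the shape (demoTransport and transportOp are illustrative, not NCCL's types):

  // Stripped-down sketch of the transport vtable shape used above: a name,
  // match/ring callbacks, and per-direction {setup, connect, free, proxy}
  // ops where a NULL proxy means no proxy thread is needed.
  #include <stdio.h>

  typedef int (*transportOp)(void);

  struct transportOps { transportOp setup, connect, free_, proxy; };

  struct transport {
    const char* name;
    transportOp canConnect, getRings;
    struct transportOps send, recv;
  };

  static int ok(void) { return 0; }

  static struct transport demoTransport = {
    "DEMO", ok, ok, { ok, ok, ok, NULL }, { ok, ok, ok, NULL }
  };

  int main() {
    printf("%s has proxy: %s\n", demoTransport.name,
           demoTransport.send.proxy ? "yes" : "no");
    return 0;
  }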
diff --git a/src/transport/shm.cu b/src/transport/shm.cu
index 317f652..56e0242 100644
--- a/src/transport/shm.cu
+++ b/src/transport/shm.cu
@@ -12,13 +12,6 @@
 #include <unistd.h>
 #include <cuda_runtime.h>
 
-struct shmInfo {
-  int rank;
-  int cudaDev;
-  uint64_t hostHash;
-  uint64_t pidHash;
-};
-
 struct shmSendConnectInfo {
   uint64_t pidHash;
   int id;
@@ -51,24 +44,10 @@ struct shmRecvResources {
   struct ncclRecvMem* devHostMem;
 };
 
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t shmFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
-  struct shmInfo* info = (struct shmInfo*)opaqueInfo;
-  static_assert(sizeof(struct shmInfo) <= sizeof(ncclTinfo_t), "shm Info too large");
-  info->rank = rank;
-  CUDACHECK(cudaGetDevice(&info->cudaDev));
-  info->hostHash=getHostHash();
-  info->pidHash=getPidHash();
-  return ncclSuccess;
-}
-
 NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
 
 /* Determine if we can communicate with the peer */
-ncclResult_t shmCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
-  struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
-  struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo;
+ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
   *ret = ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) ? 0 : 1;
   return ncclSuccess;
 }
@@ -88,7 +67,7 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid)
 }
 
 ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
-  if (*nringsRet == MAXRINGS) *nringsRet = 1;
+  if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
   int nGroups = groups[nranks-1] + 1;
   int starts[nGroups];
   int ends[nGroups];
@@ -156,43 +135,40 @@ ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
 
 #define MAX_SHM_NAME_LEN 1024
 
 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t shmSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
-  struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo;
+ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
   struct shmSendResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
-  ring->send.transportResources = resources;
+  send->transportResources = resources;
 
   struct shmRecvConnectInfo info;
   char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
+  sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
   info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
   TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
   NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
 
-  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
-  info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
+  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+  info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
   static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
   memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
   return ncclSuccess;
 }
 
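shmOpen() itself is outside this diff. On Linux it plausibly amounts to POSIX shared memory plus a pinned, device-visible mapping; a sketch under that assumption (shmOpenDemo is illustrative and may differ from NCCL's real helper in src/misc):

  // Plausible sketch of a shmOpen-style helper: create a POSIX shm segment,
  // mmap it, pin it for CUDA, and return both the host pointer and its
  // device alias.
  #include <cuda_runtime.h>
  #include <fcntl.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int shmOpenDemo(const char* name, size_t size, void** hostPtr, void** devPtr, int create) {
    int fd = shm_open(name, create ? (O_CREAT|O_RDWR) : O_RDWR, 0600);
    if (fd == -1) return -1;
    if (create && ftruncate(fd, size) != 0) { close(fd); return -1; }
    void* p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
    close(fd);
    if (p == MAP_FAILED) return -1;
    if (create) memset(p, 0, size);
    // Pin the mapping and get a device-visible alias for it.
    if (cudaHostRegister(p, size, cudaHostRegisterMapped) != cudaSuccess) return -1;
    if (cudaHostGetDevicePointer(devPtr, p, 0) != cudaSuccess) return -1;
    *hostPtr = p;
    return 0;
  }

  int main() {
    void *h, *d;
    if (shmOpenDemo("/nccl-demo", 4096, &h, &d, 1) != 0) return 1;
    shm_unlink("/nccl-demo");  // mirrors the shmUnlink() in shmRecvConnect below
    return 0;
  }

Older glibc needs -lrt for shm_open when linking.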
-ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
+ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
   struct shmRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
-  ring->recv.transportResources = resources;
+  recv->transportResources = resources;
 
   struct shmSendConnectInfo info;
 
   char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
-  info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
+  info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize;
   TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
   NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
 
-  info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
+  info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
   static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
   memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo));
   return ncclSuccess;
@@ -216,10 +192,10 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
 
   send->conn.buff = resources->devRemHostMem->buff;
   send->conn.llBuff = resources->devRemHostMem->llBuff;
   send->conn.tail = &resources->devRemHostMem->tail;
-  send->conn.opCount = &resources->devRemHostMem->opCount;
+  send->conn.opCountRem = &resources->devRemHostMem->opCount;
 
   send->conn.head = &resources->devHostMem->head;
-  send->conn.llHead = &resources->devHostMem->llHead;
+  send->conn.opCountLoc = &resources->devHostMem->opCount;
   return ncclSuccess;
 }
 
@@ -235,12 +211,12 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
   NCCLCHECK(shmUnlink(shmName));
   recv->conn.head = &resources->devRemHostMem->head;
-  recv->conn.llHead = &resources->devRemHostMem->llHead;
+  recv->conn.opCountRem = &resources->devRemHostMem->opCount;
 
   recv->conn.buff = resources->devHostMem->buff;
   recv->conn.llBuff = resources->devHostMem->llBuff;
   recv->conn.tail = &resources->devHostMem->tail;
-  recv->conn.opCount = &resources->devHostMem->opCount;
+  recv->conn.opCountLoc = &resources->devHostMem->opCount;
   return ncclSuccess;
 }
 
@@ -262,7 +238,6 @@ ncclResult_t shmRecvFree(void* transportResources) {
 
 struct ncclTransport shmTransport = {
   "SHM",
-  shmFillInfo,
   shmCanConnect,
   shmGetRings,
   { shmSendSetup, shmSendConnect, shmSendFree, NULL },
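The opCount-to-opCountLoc/opCountRem split that runs through these hunks gives each connector a view of both its own operation counter and its peer's. A sketch of the handshake that this enables (an interpretation, not NCCL code; Conn is an illustrative type):

  // Sketch of the opCount handshake implied by the opCountLoc/opCountRem
  // split above: each side bumps a per-connection counter in its own
  // memory, and a peer proceeds once the remote counter reaches the
  // expected operation number.
  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  struct Conn { uint64_t *opCountLoc, *opCountRem; };

  int main() {
    uint64_t sendOp = 0, recvOp = 0;          // one counter per side's memory
    struct Conn send = { &sendOp, &recvOp };  // local = mine, remote = peer's
    struct Conn recv = { &recvOp, &sendOp };

    uint64_t expected = 1;
    *send.opCountLoc = expected;              // sender starts collective #1
    if (*recv.opCountRem == expected)         // receiver sees the sender is ready
      *recv.opCountLoc = expected;
    printf("both sides at op %" PRIu64 "\n", *send.opCountLoc);
    return 0;
  }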