github.com/marian-nmt/nccl.git
author    Sylvain Jeaugey <sjeaugey@nvidia.com>  2018-12-14 02:56:12 +0300
committer Sylvain Jeaugey <sjeaugey@nvidia.com>  2019-01-30 02:19:27 +0300
commit    1450d42675be325cd3b7a684d4b231eedceb22fb (patch)
tree      dc1f88ad03d598c3bb03f20dd81d8ef671fc2bff /src
parent    4861e197fd83f0ac324ac0c21051820f8866e6ea (diff)
2.4.2-1
Add tree algorithms for allreduce to improve performance at scale. Add ncclCommAbort() and ncclCommGetAsyncError() to properly handle network errors and be permitted to recover. Detect initial CPU affinity and no longer escape it.
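
For context on the new error-handling entry points mentioned above, the sketch below shows one way an application could poll them; it is not part of this commit. It assumes the 2.4 public API signatures for ncclCommGetAsyncError() and ncclCommAbort() plus the standard CUDA runtime; the helper name waitForCompletion and the polling loop itself are purely illustrative.

#include <nccl.h>
#include <cuda_runtime.h>
#include <sched.h>

/* Illustrative helper: wait for NCCL work queued on 'stream' to finish,
 * while watching the communicator for asynchronous (e.g. network) errors,
 * and abort the communicator instead of blocking forever. */
static ncclResult_t waitForCompletion(ncclComm_t comm, cudaStream_t stream) {
  cudaError_t cerr;
  while ((cerr = cudaStreamQuery(stream)) == cudaErrorNotReady) {
    ncclResult_t asyncErr;
    ncclResult_t ret = ncclCommGetAsyncError(comm, &asyncErr);
    if (ret != ncclSuccess) return ret;
    if (asyncErr != ncclSuccess) {
      ncclCommAbort(comm);   /* tear down the communicator so pending ops stop */
      return asyncErr;
    }
    sched_yield();           /* keep the polling loop polite */
  }
  return (cerr == cudaSuccess) ? ncclSuccess : ncclUnhandledCudaError;
}

The point of the pair, per the commit message, is that a failed network operation no longer requires killing the process: the host can observe the failure and recover.
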
Diffstat (limited to 'src')
-rw-r--r--  src/Makefile                                   16
-rw-r--r--  src/bootstrap.cu                              242
-rw-r--r--  src/channel.cu                                 51
-rw-r--r--  src/collectives/all_gather.cu                  22
-rw-r--r--  src/collectives/all_reduce.cu                  26
-rw-r--r--  src/collectives/broadcast.cu                   34
-rw-r--r--  src/collectives/collectives.h                  37
-rw-r--r--  src/collectives/device/Makefile                39
-rw-r--r--  src/collectives/device/all_gather.cu            8
-rw-r--r--  src/collectives/device/all_gather.h           218
-rw-r--r--  src/collectives/device/all_reduce.cu           14
-rw-r--r--  src/collectives/device/all_reduce.h           381
-rw-r--r--  src/collectives/device/broadcast.cu             8
-rw-r--r--  src/collectives/device/broadcast.h            200
-rw-r--r--  src/collectives/device/common.h               112
-rw-r--r--  src/collectives/device/common_kernel.h        186
-rw-r--r--  src/collectives/device/functions.cu            10
-rwxr-xr-x  src/collectives/device/gen_rules.sh            28
-rw-r--r--  src/collectives/device/ll_kernel.h            154
-rw-r--r--  src/collectives/device/primitives.h           709
-rw-r--r--  src/collectives/device/reduce.cu               14
-rw-r--r--  src/collectives/device/reduce.h               165
-rw-r--r--  src/collectives/device/reduce_kernel.h         94
-rw-r--r--  src/collectives/device/reduce_scatter.cu       14
-rw-r--r--  src/collectives/device/reduce_scatter.h       158
-rw-r--r--  src/collectives/reduce.cu                      23
-rw-r--r--  src/collectives/reduce_scatter.cu              22
-rw-r--r--  src/enqueue.cu                                442
-rw-r--r--  src/include/bootstrap.h                         2
-rw-r--r--  src/include/channel.h                          14
-rw-r--r--  src/include/checks.h                           10
-rw-r--r--  src/include/common_coll.h                     195
-rw-r--r--  src/include/core.h                            186
-rw-r--r--  src/include/cpuset.h                           61
-rw-r--r--  src/include/debug.h                             1
-rw-r--r--  src/include/enqueue.h                           7
-rw-r--r--  src/include/nccl_net.h                         46
-rw-r--r--  src/include/net.h                               8
-rw-r--r--  src/include/nvlink.h                           74
-rw-r--r--  src/include/nvmlwrap.h                         18
-rw-r--r--  src/include/ring.h                             14
-rw-r--r--  src/include/rings.h                             2
-rw-r--r--  src/include/socket.h                            9
-rw-r--r--  src/include/transport.h                        87
-rw-r--r--  src/include/trees.h                            13
-rw-r--r--  src/init.cu                                   669
-rw-r--r--  src/misc/checks.cu                             69
-rw-r--r--  src/misc/enqueue.cu                           248
-rw-r--r--  src/misc/group.cu                              12
-rw-r--r--  src/misc/nvmlwrap.cu                           49
-rw-r--r--  src/misc/rings.cu                              61
-rw-r--r--  src/misc/trees.cu                             108
-rw-r--r--  src/misc/utils.cu                              18
-rw-r--r--  src/nccl.h.in                                  14
-rw-r--r--  src/ring.cu                                    70
-rw-r--r--  src/transport.cu                              325
-rw-r--r--  src/transport/net.cu                          532
-rw-r--r--  src/transport/net_ib.cu                       223
-rw-r--r--  src/transport/net_socket.cu                    28
-rw-r--r--  src/transport/p2p.cu                          229
-rw-r--r--  src/transport/shm.cu                           57
61 files changed, 3690 insertions, 3196 deletions
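
Before the patch itself, a note on the tree allreduce it introduces (see ncclAllReduceTreeKernel in src/collectives/device/all_reduce.h below): each channel performs a reduce pass up a tree described by tree->up / tree->down, then a broadcast pass back down. The toy program below is only a single-process model of that reduce-then-broadcast pattern; every name in it (reduceUp, broadcastDown, the 7-rank tree layout) is made up for illustration, and the real kernels additionally pipeline chunks across channels using the primitives shown later in this diff.

#include <stdio.h>

#define NRANKS 7
#define ARITY  2                       /* the trees in this commit are (at most) binary */

static int    up[NRANKS];              /* parent rank, -1 at the root     */
static int    down[NRANKS][ARITY];     /* child ranks, -1 when absent     */
static double val[NRANKS];             /* each rank's local contribution  */

/* Reduce phase: leaves send up, interior ranks add their children's sums
 * and forward, the root ends up with the full sum
 * (cf. send / recvReduceSend / recvReduceCopy in the kernel). */
static double reduceUp(int rank) {
  double sum = val[rank];
  for (int c = 0; c < ARITY; c++)
    if (down[rank][c] != -1) sum += reduceUp(down[rank][c]);
  return sum;
}

/* Broadcast phase: the root pushes the result down, interior ranks copy
 * and forward, leaves just receive (cf. send / recvCopySend / recv). */
static void broadcastDown(int rank, double total) {
  val[rank] = total;
  for (int c = 0; c < ARITY; c++)
    if (down[rank][c] != -1) broadcastDown(down[rank][c], total);
}

int main(void) {
  int parent[NRANKS] = { -1, 0, 0, 1, 1, 2, 2 };   /* small binary tree, rank 0 as root */
  for (int r = 0; r < NRANKS; r++) {
    up[r] = parent[r];
    down[r][0] = down[r][1] = -1;
    val[r] = r + 1.0;
  }
  for (int r = 1; r < NRANKS; r++) {
    int p = up[r];
    down[p][ down[p][0] == -1 ? 0 : 1 ] = r;
  }
  double total = reduceUp(0);
  broadcastDown(0, total);
  printf("allreduce(sum) on every rank: %g\n", total);  /* 1+2+...+7 = 28 */
  return 0;
}

Compared with the ring allreduce (also reworked in this diff), the tree pattern keeps the number of hops logarithmic in the rank count, which is what the commit message refers to as improving performance at scale.
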
diff --git a/src/Makefile b/src/Makefile
index 481000a..fe60b11 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -9,8 +9,8 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
-LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
- misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
+LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \
+ misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \
transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
@@ -29,11 +29,10 @@ LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
STATICLIBTARGET := $(STATICLIBNAME)
LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
DEPFILES := $(LIBOBJ:%.o=%.d)
-LDFLAGS += -L${CUDA_LIB} -lcudart_static -lrt
+LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a
-
##### rules
build : lib staticlib
@@ -41,9 +40,12 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
staticlib : $(LIBDIR)/$(STATICLIBTARGET)
-devicelib: $(INCDIR)/nccl.h
+$(DEVICELIB): ALWAYS_REBUILD
$(MAKE) -C collectives/device
+# Empty target to force rebuild
+ALWAYS_REBUILD:
+
-include $(DEPFILES)
$(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
@@ -59,14 +61,14 @@ $(INCDIR)/nccl.h : nccl.h.in
-e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
$< > $@
-$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib
+$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
mkdir -p $(LIBDIR)
$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
-$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
+$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
mkdir -p $(LIBDIR)
$(eval TMP := $(shell mktemp -d))
diff --git a/src/bootstrap.cu b/src/bootstrap.cu
index 13c6e92..6b1d573 100644
--- a/src/bootstrap.cu
+++ b/src/bootstrap.cu
@@ -15,27 +15,31 @@
// Always use sockets for bootstrap
ncclNet_t* ncclBootstrapNet = &ncclNetSocket;
-static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
-static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
// Additional sync functions based on async + test for bootstrap, using host ptrs.
-static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) {
- void* request;
- NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &request));
+static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
+ void* request, *mhandle;
+ NCCLCHECK(ncclBootstrapNet->regMr(sendComm, data, size, NCCL_PTR_HOST, &mhandle));
+ NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, mhandle, &request));
+ NCCLCHECK(ncclBootstrapNet->deregMr(sendComm, mhandle));
int done = 0;
- while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
+ while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL));
return ncclSuccess;
}
-static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) {
- void* request;
- NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &request));
+static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
+ void* request, *mhandle;
+ NCCLCHECK(ncclBootstrapNet->regMr(recvComm, data, size, NCCL_PTR_HOST, &mhandle));
+ NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, mhandle, &request));
+ NCCLCHECK(ncclBootstrapNet->deregMr(recvComm, mhandle));
int done = 0;
- while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
+ while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL));
return ncclSuccess;
}
@@ -51,8 +55,8 @@ struct extId {
struct extInfo {
int rank;
int nranks;
- ncclNetHandle_t extHandleListenFromRoot;
- ncclNetHandle_t extHandleRing;
+ ncclNetHandle_t extHandleListenRoot;
+ ncclNetHandle_t extHandleListen;
};
#include <sys/resource.h>
@@ -68,28 +72,25 @@ static ncclResult_t setFilesLimit() {
static void *bootstrapRoot(void* commId) {
struct extInfo info;
struct extId* id = (struct extId*)commId;
- ncclNetHandle_t *extHandleBstrap = NULL; // for initial rank <-> root information exchange
- ncclNetHandle_t *extHandleRing = NULL; // for bootstrap ring creation
+ ncclNetHandle_t *rankHandles = NULL;
+ ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
ncclNetHandle_t zero = { 0 }; // for sanity checking
void* tmpComm;
ncclResult_t res;
setFilesLimit();
+ TRACE(NCCL_INIT, "BEGIN");
/* Receive addresses from all ranks */
int nranks = 0, c = 0;
do {
- NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpComm), res, out);
- NCCLCHECKGOTO(bootstrapRecv(tmpComm, &info, sizeof(info)), res, out);
- NCCLCHECKGOTO(bootstrapCloseRecv(tmpComm), res, out);
+ NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out);
+ NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
+ NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
if (c == 0) {
- extHandleBstrap = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
- extHandleRing = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
- if (extHandleBstrap == NULL || extHandleRing == NULL) {
- WARN("Bootstrap thread : failed to allocate memory");
- goto out;
- }
nranks = info.nranks;
+ NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
+ NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
}
if (nranks != info.nranks) {
@@ -97,40 +98,43 @@ static void *bootstrapRoot(void* commId) {
goto out;
}
- if (memcmp(&zero, &extHandleBstrap[info.rank], sizeof(ncclNetHandle_t)) != 0) {
+ if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
goto out;
}
- // Save the connection handle for connecting back to the ranks
- memcpy(&extHandleBstrap[info.rank], info.extHandleListenFromRoot, sizeof(ncclNetHandle_t));
- // Save the connection handle for the AllGather ring
- memcpy(&extHandleRing[info.rank], info.extHandleRing, sizeof(ncclNetHandle_t));
+ // Save the connection handle for that rank
+ memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
+ memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
++c;
} while (c < nranks);
+ TRACE(NCCL_INIT, "COLLECTED HANDLES");
// Send the connect handle for the next rank in the AllGather ring
for (int r=0; r<nranks; ++r) {
int next = (r+1) % nranks;
void *tmpSendComm;
- NCCLCHECKGOTO(bootstrapConnect(0, extHandleBstrap[r], &tmpSendComm), res, out);
- NCCLCHECKGOTO(bootstrapSend(tmpSendComm, &extHandleRing[next], sizeof(ncclNetHandle_t)), res, out);
- NCCLCHECKGOTO(bootstrapCloseSend(tmpSendComm), res, out);
+ NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out);
+ NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
+ NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
}
+ TRACE(NCCL_INIT, "SENT OUT HANDLES");
out:
- bootstrapCloseListen(id->extListenComm);
+ bootstrapNetCloseListen(id->extListenComm);
free(commId);
- free(extHandleBstrap);
- free(extHandleRing);
+ if (rankHandles) free(rankHandles);
+ if (rankHandlesRoot) free(rankHandlesRoot);
+
+ TRACE(NCCL_INIT, "DONE");
return NULL;
}
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
struct extId* id = (struct extId*)commId;
id->hostHash = getHostHash();
- NCCLCHECK(bootstrapListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
+ NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
ncclUniqueId* threadIdCopy;
NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
@@ -157,10 +161,18 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
return ncclSuccess;
}
+struct unexConn {
+ int peer;
+ void* comm;
+ struct unexConn* next;
+};
+
struct extState {
+ void* extBstrapListenComm;
void* extBstrapRingRecvComm;
void* extBstrapRingSendComm;
- ncclNetHandle_t extBstrapRootHandle;
+ ncclNetHandle_t* peerBstrapHandles;
+ struct unexConn* unexpectedConnections;
int rank;
int nranks;
int dev;
@@ -174,39 +186,56 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
state->rank = rank;
state->nranks = nranks;
*commState = state;
- void* extBstrapRootListenComm; // comm on which we accept root's connections
+
+ TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
struct extInfo info = { 0 };
info.rank = rank;
info.nranks = nranks;
- void *tmpSendComm, *extBstrapRingListenComm, *tmpRecvComm;
+ void *tmpSendComm, *tmpRecvComm;
// Pass the remote address to listen via info
if (idFromEnv) {
- memcpy(&info.extHandleListenFromRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
- memcpy(&info.extHandleRing, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+ memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+ memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
}
// listen will return the local address via info (specify interface type 'findSubnetIf')
state->dev = idFromEnv ? findSubnetIf : 0;
- NCCLCHECK(bootstrapListen(state->dev, &info.extHandleListenFromRoot, &extBstrapRootListenComm));
- NCCLCHECK(bootstrapListen(state->dev, &info.extHandleRing, &extBstrapRingListenComm)); // AllGather Ring
+ void* extBstrapListenCommRoot;
+ NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm));
+ NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot));
+
+ // stagger connection times to avoid an overload of the root at very high rank counts
+ if (nranks > 128) {
+ long msec = rank;
+ struct timespec tv;
+ tv.tv_sec = msec / 1000;
+ tv.tv_nsec = 1000000 * (msec % 1000);
+ TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec);
+ (void) nanosleep(&tv, NULL);
+ }
- memcpy(&state->extBstrapRootHandle, &id->extHandleRoot, sizeof(ncclNetHandle_t));
- // send info on my listening sockets to root
- NCCLCHECK(bootstrapConnect(state->dev, id->extHandleRoot, &tmpSendComm));
- NCCLCHECK(bootstrapSend(tmpSendComm, &info, sizeof(info)));
- NCCLCHECK(bootstrapCloseSend(tmpSendComm));
+ // send info on my listening socket to root
+ NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm));
+ NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
+ NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
// get info on my "next" rank in the bootstrap ring from root
ncclNetHandle_t extHandleNext;
- NCCLCHECK(bootstrapAccept(extBstrapRootListenComm, &tmpRecvComm));
- NCCLCHECK(bootstrapRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
- NCCLCHECK(bootstrapCloseRecv(tmpRecvComm));
+ NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
+ NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
+ NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+ NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
- NCCLCHECK(bootstrapConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
+ NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
// Accept the connect request from the previous rank in the AllGather ring
- NCCLCHECK(bootstrapAccept(extBstrapRingListenComm, &state->extBstrapRingRecvComm));
- NCCLCHECK(bootstrapCloseListen(extBstrapRingListenComm));
- NCCLCHECK(bootstrapCloseListen(extBstrapRootListenComm));
+ NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
+
+ // AllGather all listen handlers
+ NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks));
+ memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t));
+ NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t)));
+
+ TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
return ncclSuccess;
}
@@ -224,25 +253,106 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
* and send previous step's data from (rank-i) to right
*/
for (int i=0; i<nranks-1; i++) {
- int rslice = (rank - i - 1 + nranks) % nranks;
- int sslice = (rank - i + nranks) % nranks;
+ size_t rslice = (rank - i - 1 + nranks) % nranks;
+ size_t sslice = (rank - i + nranks) % nranks;
// Send slice to the right
- NCCLCHECK(bootstrapSend(state->extBstrapRingSendComm, data+sslice*size, size));
+ NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, data+sslice*size, size));
// Recv slice from the left
- NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
+ NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
}
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
return ncclSuccess;
}
-ncclResult_t bootstrapClose(void* commState) {
+ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
struct extState* state = (struct extState*)commState;
+ void* tmpSendComm;
+ NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm));
+ NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
+ NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
+ NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
+ return ncclSuccess;
+}
+
+ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
+ // New unex
+ struct unexConn* unex;
+ NCCLCHECK(ncclCalloc(&unex, 1));
+ unex->peer = peer;
+ unex->comm = comm;
+
+ // Enqueue
+ struct unexConn* list = state->unexpectedConnections;
+ if (list == NULL) {
+ state->unexpectedConnections = unex;
+ return ncclSuccess;
+ }
+ while (list->next) list = list->next;
+ list->next = unex;
+ return ncclSuccess;
+}
- NCCLCHECK(bootstrapCloseSend(state->extBstrapRingSendComm));
- NCCLCHECK(bootstrapCloseRecv(state->extBstrapRingRecvComm));
+void* unexpectedDequeue(struct extState* state, int peer) {
+ struct unexConn* elem = state->unexpectedConnections;
+ struct unexConn* prev = NULL;
+ while (elem) {
+ if (elem->peer == peer) {
+ if (prev == NULL) {
+ state->unexpectedConnections = elem->next;
+ } else {
+ prev->next = elem->next;
+ }
+ void* comm = elem->comm;
+ free(elem);
+ return comm;
+ }
+ prev = elem;
+ elem = elem->next;
+ }
+ return NULL;
+}
+
+// We can't know who we'll receive from, so we need to receive everything at once
+ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
+ struct extState* state = (struct extState*)commState;
+
+ void* tmpRecvComm;
+
+ // Search unexpected connections first
+ if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) {
+ NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
+ NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+ return ncclSuccess;
+ }
+
+ // Then look for new connections
+ while (1) {
+ NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm));
+ int newPeer;
+ NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int)));
+ if (newPeer == peer) {
+ NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
+ NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+ return ncclSuccess;
+ }
+ // Unexpected connection. Save for later.
+ NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm));
+ }
+}
+
+ncclResult_t bootstrapClose(void* commState) {
+ struct extState* state = (struct extState*)commState;
+ if (state->unexpectedConnections != NULL) {
+ WARN("Unexpected connections are not empty.\n");
+ return ncclInternalError;
+ }
+ NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm));
+ NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm));
+ NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm));
+ free(state->peerBstrapHandles);
free(state);
return ncclSuccess;
diff --git a/src/channel.cu b/src/channel.cu
new file mode 100644
index 0000000..937e84e
--- /dev/null
+++ b/src/channel.cu
@@ -0,0 +1,51 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "channel.h"
+#include "param.h"
+
+NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
+
+ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
+ struct ncclChannel* channel = comm->channels+channelid;
+ channel->id = channelid;
+
+ // Setup intermediate buffering
+ channel->buffSize = ncclParamBuffsize();
+
+ // Ring index to user rank table.
+ NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
+ NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
+
+ // Communication structures with peers.
+ NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
+ NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
+ for (size_t i=0; i<comm->nRanks; ++i) {
+ channel->peers[i].send.comm = comm;
+ channel->peers[i].recv.comm = comm;
+ }
+
+ // Per-channel operation list.
+ NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
+ return ncclSuccess;
+}
+
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
+ // Operation list
+ NCCLCHECK(ncclCudaHostFree(channel->collectives));
+
+ // Free Ring index to rank tables
+ free(channel->ring.userRanks);
+ CUDACHECK(cudaFree(channel->ring.devUserRanks));
+
+ // Free transport proxy resources
+ for (int r=0; r<nRanks; r++) {
+ struct ncclPeer* peer = channel->peers+r;
+ if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
+ if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
+ }
+ return ncclSuccess;
+}
diff --git a/src/collectives/all_gather.cu b/src/collectives/all_gather.cu
index 8dec28e..db21dee 100644
--- a/src/collectives/all_gather.cu
+++ b/src/collectives/all_gather.cu
@@ -4,29 +4,15 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
-#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
-ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- size_t nbytes = count*ncclTypeSize(datatype);
- INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
- } else {
- NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
- NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes*comm->nRanks, 1));
- }
- return ncclSuccess;
-}
-
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
- return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype,
- ncclSum, 0, comm, stream);
+ struct ncclInfo info = { ncclCollAllGather, "AllGather",
+ sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
+ ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
+ return ncclEnqueueCheck(&info);
}
diff --git a/src/collectives/all_reduce.cu b/src/collectives/all_reduce.cu
index cc14083..1492c90 100644
--- a/src/collectives/all_reduce.cu
+++ b/src/collectives/all_reduce.cu
@@ -4,29 +4,15 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
-#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
-ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- size_t nbytes = count*ncclTypeSize(datatype);
- INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
- } else {
- NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, nbytes, proxyPatternRing, comm));
- NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, comm->nRanks));
- }
- return ncclSuccess;
-}
-
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
- return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype,
- op, 0, comm, stream);
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+ struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
+ sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
+ ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
+ return ncclEnqueueCheck(&info);
}
diff --git a/src/collectives/broadcast.cu b/src/collectives/broadcast.cu
index 91ce905..6a3d0a8 100644
--- a/src/collectives/broadcast.cu
+++ b/src/collectives/broadcast.cu
@@ -4,39 +4,23 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
-#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
-ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- size_t nbytes = count*ncclTypeSize(datatype);
- INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
- } else {
- NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, nbytes, proxyPatternFrom(root), comm));
- NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes, 1));
- }
-
- return ncclSuccess;
+NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream) {
+ struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
+ sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
+ BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
+ return ncclEnqueueCheck(&info);
}
-
/* Deprecated original "in place" function, similar to MPI */
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
- return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype,
- ncclSum, root, comm, stream);
+ return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
}
-NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
- ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
- ncclComm_t comm, cudaStream_t stream) {
- return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype,
- ncclSum, root, comm, stream);
-}
diff --git a/src/collectives/collectives.h b/src/collectives/collectives.h
index 4a5cb7a..e6b19cb 100644
--- a/src/collectives/collectives.h
+++ b/src/collectives/collectives.h
@@ -7,9 +7,7 @@
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
-
-#define FUNC_INDEX(coll, redop, dtype, ll) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
+#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll))
#define NCCL_COLL_NAME(coll, op, dtype) \
coll##_##op##_##dtype
@@ -18,13 +16,17 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
coll##Kernel_##op##_##dtype
/* Declare all collective operations */
-#define DECL_COLL4(coll, op, dtype) \
+#define DECL_COLL5(coll, op, dtype) \
extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
- extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll); \
+ extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \
+
+#define DECL_COLL4(coll, op, dtype) \
+ DECL_COLL5(coll, op, dtype) \
+ DECL_COLL5(coll##LL, op, dtype)
#define DECL_COLL3(coll, op, dtype) \
- DECL_COLL4(coll##LL, op, dtype) \
- DECL_COLL4(coll, op, dtype)
+ DECL_COLL4(coll##Ring, op, dtype) \
+ DECL_COLL4(coll##Tree, op, dtype)
#define DECL_COLL2(coll, op) \
DECL_COLL3(coll, op, i8) \
@@ -52,15 +54,16 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
DECL_ALL_COLLS
-#define ALLREDUCE_SUBSTEPS 2
-#define ALLREDUCE_BUFCHUNKS 2
-#define ALLGATHER_SUBSTEPS 2
-#define ALLGATHER_BUFCHUNKS 2
-#define REDUCESCATTER_SUBSTEPS 2
-#define REDUCESCATTER_BUFCHUNKS 2
-#define BROADCAST_SUBSTEPS 8
-#define BROADCAST_BUFCHUNKS 2
-#define REDUCE_SUBSTEPS 8
-#define REDUCE_BUFCHUNKS 2
+// CHUNKSIZE must be a multiple of SLICESIZE
+#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
+#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
+#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
+#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
+#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
+#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
+#define BROADCAST_SLICESTEPS 1
+#define BROADCAST_CHUNKSTEPS 1
+#define REDUCE_SLICESTEPS 1
+#define REDUCE_CHUNKSTEPS 1
#endif
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
index e2bcd49..8e92596 100644
--- a/src/collectives/device/Makefile
+++ b/src/collectives/device/Makefile
@@ -12,18 +12,13 @@ OBJDIR := $(BUILDDIR)/obj/collectives/device
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
-LIBOBJ := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \
- $(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \
- $(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \
- $(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \
- $(OBJDIR)/functions.o
-
LIBSRCFILES += functions.cu
DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
-DEPENDFILES := $(DEPFILES:%.d=%.dep)
+DEPENDFILES:= $(DEPFILES:%.d=%.dep)
STATICLIB := $(OBJDIR)/colldevice.a
DEVOBJ := $(OBJDIR)/devlink.o
+RULESFILE := $(OBJDIR)/Makefile.rules
NVCUFLAGS += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden"
@@ -33,6 +28,16 @@ all: $(STATICLIB)
# Dummy rule so that the extra dependency (%.dep) files are preserved by make
all_deps: $(DEPENDFILES)
+# Auto-generating the rules per op/reduction/datatype/algorithm
+$(RULESFILE) :
+ @printf "Generating %-35s > %s\n" rules $@
+ @mkdir -p $(OBJDIR)
+ @./gen_rules.sh $(OBJDIR) > $@
+
+-include $(RULESFILE)
+
+LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o
+
-include $(DEPFILES)
$(STATICLIB): $(LIBOBJ) $(DEVOBJ)
@@ -58,26 +63,6 @@ $(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
mkdir -p `dirname $@`
$(NVCC) $(NVCUFLAGS) -dc $< -o $@
-$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep
- @printf "Compiling %-35s > %s\n" $< $@
- mkdir -p `dirname $@`
- $(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep
- @printf "Compiling %-35s > %s\n" $< $@
- mkdir -p `dirname $@`
- $(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep
- @printf "Compiling %-35s > %s\n" $< $@
- mkdir -p `dirname $@`
- $(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep
- @printf "Compiling %-35s > %s\n" $< $@
- mkdir -p `dirname $@`
- $(NVCC) -DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@
-
# ... and create the device-side linked object with all those.
$(DEVOBJ) : $(LIBOBJ)
$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu
index 0f572ce..530bf14 100644
--- a/src/collectives/device/all_gather.cu
+++ b/src/collectives/device/all_gather.cu
@@ -4,12 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "common.h"
#include "all_gather.h"
+#include "common.h"
#include "collectives.h"
-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
-#endif
+IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index a30e575..36809c9 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -8,72 +8,35 @@
#include "primitives.h"
#include "collectives.h"
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
- step++; \
- poffset = noffset; \
- noffset += sliceSize; \
- if (noffset == buffSize) noffset = 0;
-
template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- __shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- int prevdirect = ring->recv.conn.direct;
- int nextdirect = ring->send.conn.direct;
-
- WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
- PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
- PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
-
- typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
-
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
- const int buffSize = ring->buffSize / sizeof(T);
- const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS;
- const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
- if (tid == 0) {
- // Update in case we skipped some collectives
- *ring->recv.conn.opCount = args->opCount;
- // Wait for next to be ready
- WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
- waitOpCountNext.wait(args->opCount);
- if (prevdirect) {
- *ring->recv.conn.ptrExchange = args->ThisOutput;
- }
- if (nextdirect) {
- void* volatile* ptr = &(ring->devMemSend->ptrExchange);
- while (*ptr == nullptr);
- sharedNextOutput = (T*)*ptr;
- *ptr = nullptr;
- }
- }
- __syncthreads();
-
- uint64_t step = 0ULL;
- int poffset, noffset = 0;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+ const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
- T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
- ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t chunkOffset = gridOffset + bid*chunkSize;
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t chunkOffset = gridOffset + bid*realChunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
- int maxOffset = min(chunkSize, size-chunkOffset);
+ int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
@@ -81,129 +44,51 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
- Prims::Copy(tid, nthreads,
- thisInput + chunkOffset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
+ prims.directSend(thisInput+chunkOffset, offset, nelem);
} else {
- Prims::DoubleCopy(tid, nthreads,
- thisInput + chunkOffset,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
+ prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
}
- NEXT_STEP; // Increases step, poffset, noffset
-
// k-2 steps: copy to next GPU
- if (prevdirect) {
- for (int j=1; j<nranks-1; ++j) {
- rankDest = ring->devUserRanks[nranks-j];
- offset = chunkOffset + rankDest * size;
-
- Prims::Copy(tid, nthreads,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
- Prims::Copy(tid, nthreads,
- NULL,
- NULL,
- 0, 0,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
- } else {
- for (int j=1; j<nranks-1; ++j) {
- rankDest = ring->devUserRanks[nranks-j];
- offset = chunkOffset + rankDest * size;
-
- Prims::DoubleCopy(tid, nthreads,
- prevInput + poffset,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
-
- // Make final copy from buffer to dest.
- rankDest = ring->devUserRanks[1];
+ for (int j=1; j<nranks-1; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
- // Here we need to copy from buffer to this output.
- Prims::Copy(tid, nthreads,
- prevInput + poffset,
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
+ prims.directRecvCopySend(thisOutput+offset, offset, nelem);
}
- }
- if (tid == 0) {
- waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS));
- *ring->send.conn.head = 0ULL;
- *ring->recv.conn.tail = 0ULL;
- __threadfence_system();
- *ring->recv.conn.opCount = args->opCount+1;
+ // Make final copy from buffer to dest.
+ rankDest = ring->devUserRanks[1];
+ offset = chunkOffset + rankDest * size;
+
+ // Final wait/copy.
+ prims.directRecv(thisOutput+offset, offset, nelem);
}
}
-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
- poffset = noffset; \
- pflag = nflag; \
- noffset += NCCL_LL_SLICE_LINES; \
- if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
- nflag++; \
- step++;
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
- const int llNthreads = args->nThreads;
+ const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
- volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
- volatile int * sizesFifo = ring->send.conn.llFifo;
- uint64_t sendHead = sendHeadPtr[0];
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
- typedef LLPrimitives<T, FUNC> LL;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nRings*chunkSize;
-
- uint64_t step = ring->send.conn.llStep;
- uint32_t pflag, nflag = step + 1;
- int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+ const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
- union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -213,57 +98,34 @@ __device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
/////////////// begin AllGather steps ///////////////
ssize_t offset;
- int maxOffset = min(chunkSize, size-chunkOffset);
+ int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
- WAIT_NEXT;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
- LL::ReduceCopy(
- thisInput + chunkOffset,
- nextOutput + noffset,
- maxOffset, nflag, llNthreads);
+ LLprims.send(thisInput+chunkOffset, nelem);
} else {
- LL::ReduceCopy(
- thisInput + chunkOffset,
- thisOutput + offset,
- nextOutput + noffset,
- maxOffset, nflag, llNthreads);
+ LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
}
- POST_SIZE;
-
- NEXT_STEP_LL;
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
- WAIT_NEXT;
- LL::ReduceCopy(
- prevInput + poffset,
- thisOutput + offset,
- nextOutput + noffset,
- maxOffset, pflag, nflag, llNthreads);
- POST_SIZE;
- ACK_PREV;
-
- NEXT_STEP_LL;
+ LLprims.recvCopySend(thisOutput+offset, nelem);
}
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
- LL::ReduceCopy(
- prevInput + poffset,
- thisOutput + offset,
- maxOffset, pflag, llNthreads);
- ACK_PREV;
+ LLprims.recv(thisOutput+offset, nelem);
}
-
- FIFO_CLEANING_AND_SAVE_STEP(nflag);
}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
index caa1479..aaa96b4 100644
--- a/src/collectives/device/all_reduce.cu
+++ b/src/collectives/device/all_reduce.cu
@@ -4,18 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "common.h"
#include "all_reduce.h"
+#include "common.h"
#include "collectives.h"
-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum);
-#elif NCCL_OP == 1
-IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
-#elif NCCL_OP == 2
-IMPL_COLL2(ncclAllReduce, min, FuncMin, ncclCollAllReduce, ncclMin);
-#elif NCCL_OP == 3
-IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax);
-#endif
+IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index d7abc64..ea89a71 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -8,233 +8,152 @@
#include "primitives.h"
#include "collectives.h"
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
- step++; \
- poffset = noffset; \
- noffset += sliceSize; \
- if (noffset == buffSize) noffset = 0;
-
template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- __shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- int prevdirect = ring->recv.conn.direct;
- int nextdirect = ring->send.conn.direct;
-
- WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
- PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
- PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
-
- typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
-
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
- //const int rank = comm->rank;
const int nranks = comm->nRanks;
- const int buffSize = ring->buffSize / sizeof(T);
- const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS;
- const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
- if (tid == 0) {
- // Update in case we skipped some collectives
- *ring->recv.conn.opCount = args->opCount;
- // Wait for next to be ready
- WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
- waitOpCountNext.wait(args->opCount);
- if (prevdirect) {
- *ring->recv.conn.ptrExchange = args->ThisOutput;
- }
- if (nextdirect) {
- void* volatile* ptr = &(ring->devMemSend->ptrExchange);
- while (*ptr == nullptr);
- sharedNextOutput = (T*)*ptr;
- *ptr = nullptr;
- }
- }
- __syncthreads();
-
- uint64_t step = 0ULL;
- int poffset, noffset = 0;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+ const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
- T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
- int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings));
- ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
- int maxOffset;
+ int nelem;
int slice;
// step 0: push data to next GPU
slice = ring->devUserRanks[nranks-1];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
+ offset = chunkOffset + slice * realChunkSize;
+ nelem = min(realChunkSize, size-offset);
- Prims::Copy(tid, nthreads,
- thisInput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
-
- NEXT_STEP; // Increases step, poffset, noffset
+ prims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring->devUserRanks[nranks-j];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- Prims::Reduce(tid, nthreads,
- prevInput + poffset,
- thisInput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
+ offset = chunkOffset + slice * realChunkSize;
+ nelem = min(realChunkSize, size-offset);
+
+ prims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring->devUserRanks[0];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
+ offset = chunkOffset + slice * realChunkSize;
+ nelem = min(realChunkSize, size-offset);
- Prims::ReduceCopy(tid, nthreads,
- prevInput + poffset,
- thisInput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
+ prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
// k-2 steps: copy to next GPU
- if (prevdirect) {
- for (int j=1; j<nranks-1; ++j) {
- slice = ring->devUserRanks[nranks - j];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- Prims::Copy(tid, nthreads,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
- Prims::Copy(tid, nthreads,
- NULL,
- NULL,
- 0, 0,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
- } else {
- for (int j=1; j<nranks-1; ++j) {
- slice = ring->devUserRanks[nranks - j];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- Prims::DoubleCopy(tid, nthreads,
- prevInput + poffset,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
+ for (int j=1; j<nranks-1; ++j) {
+ slice = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + slice * realChunkSize;
+ nelem = min(realChunkSize, size-offset);
- // Make final copy from buffer to dest.
- slice = ring->devUserRanks[1];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- // Here we need to copy from buffer to this output.
- Prims::Copy(tid, nthreads,
- prevInput + poffset,
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
+ prims.directRecvCopySend(thisOutput+offset, offset, nelem);
}
- }
- if (tid == 0) {
- // Wait for next to have consumed all data before we reset the flag
- waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS));
- *ring->send.conn.head = 0ULL;
- *ring->recv.conn.tail = 0ULL;
- __threadfence_system();
- *ring->recv.conn.opCount = args->opCount+1;
+ // Make final copy from buffer to dest.
+ slice = ring->devUserRanks[1];
+ offset = chunkOffset + slice * realChunkSize;
+ nelem = min(realChunkSize, size-offset);
+
+ // Final wait/copy.
+ prims.directRecv(thisOutput+offset, offset, nelem);
}
}
-#include "ll_kernel.h"
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = blockDim.x - 1;
+ const int bid = args->bid;
+ struct ncclComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclTree* tree = &channel->tree;
+ const ssize_t size = args->N;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = args->lastChunkSize;
+ const ssize_t loopSize = args->nChannels*chunkSize;
-#define NEXT_STEP_LL \
- poffset = noffset; \
- pflag = nflag; \
- noffset += NCCL_LL_SLICE_LINES; \
- if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
- nflag++; \
- step++;
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ do {
+ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+ ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.send(thisInput+offset, nelem);
+ } else {
+ prims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ } while(0);
+
+ do {
+ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+ ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.send(thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.recv(thisOutput+offset, nelem);
+ } else {
+ prims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
+ } while(0);
+}
template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
- const int llNthreads = args->nThreads;
+ const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
- volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
- volatile int * sizesFifo = ring->send.conn.llFifo;
- uint64_t sendHead = sendHeadPtr[0];
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
- typedef LLPrimitives<T, FUNC> LL;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nRings*nranks*chunkSize;
-
- uint64_t step = ring->send.conn.llStep;
- uint32_t pflag, nflag = step + 1;
- int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+ const ssize_t loopSize = args->nChannels*nranks*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
- union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -244,89 +163,99 @@ __device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
- int maxOffset;
+ int nelem;
int slice;
// step 0: push data to next GPU
slice = ring->devUserRanks[nranks-1];
offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
+ nelem = min(chunkSize, size-offset);
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- nextOutput + noffset,
- maxOffset, nflag, llNthreads);
- POST_SIZE;
-
- NEXT_STEP_LL;
+ LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- prevInput + poffset,
- nextOutput + noffset,
- maxOffset, pflag, nflag, llNthreads);
- POST_SIZE;
- ACK_PREV;
-
- NEXT_STEP_LL;
+ nelem = min(chunkSize, size-offset);
+
+ LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring->devUserRanks[0];
offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
+ nelem = min(chunkSize, size-offset);
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- prevInput + poffset,
- thisOutput + offset,
- nextOutput + noffset,
- maxOffset, pflag, nflag, llNthreads);
- POST_SIZE;
- ACK_PREV;
-
- NEXT_STEP_LL;
+ LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
- slice = ring->devUserRanks[nranks - j];
+ slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- WAIT_NEXT;
- LL::ReduceCopy(
- prevInput + poffset,
- thisOutput + offset,
- nextOutput + noffset,
- maxOffset, pflag, nflag, llNthreads);
- POST_SIZE;
- ACK_PREV;
-
- NEXT_STEP_LL;
+ nelem = min(chunkSize, size-offset);
+
+ LLprims.recvCopySend(thisOutput+offset, nelem);
}
// Make final copy from buffer to dest.
slice = ring->devUserRanks[1];
offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
+ nelem = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
- LL::ReduceCopy(
- prevInput + poffset,
- thisOutput + offset,
- maxOffset, pflag, llNthreads);
- ACK_PREV;
+ LLprims.recv(thisOutput+offset, nelem);
}
+}
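For reference, the loop structure above (shared with the non-LL ring kernel) is the usual 2*(nranks-1)-step ring allreduce. A standalone sketch (plain C, not part of the source) that prints the per-chunk sequence of primitives a rank issues for a 4-rank ring, where slice[k] stands for ring->devUserRanks[k]:

#include <stdio.h>

int main(void) {
  const int nranks = 4;
  printf("send               slice[%d]\n", nranks-1);   /* step 0: push own slice */
  for (int j = 2; j < nranks; j++)                       /* k-2 reduce steps */
    printf("recvReduceSend     slice[%d]\n", nranks-j);
  printf("recvReduceCopySend slice[0]\n");               /* final reduce, start of broadcast */
  for (int j = 1; j < nranks-1; j++)                     /* k-2 copy steps */
    printf("recvCopySend       slice[%d]\n", nranks-j);
  printf("recv               slice[1]\n");               /* final copy into dest */
  return 0;
}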
- FIFO_CLEANING_AND_SAVE_STEP(nflag);
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->bid;
+ struct ncclComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclTree* tree = &channel->tree;
+ const ssize_t size = args->N;
+ ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nChannels*chunkSize;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ do {
+ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+ ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ LLprims.send(thisInput+offset, nelem);
+ } else {
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ } while(0);
+
+ do {
+ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+ ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ LLprims.send(thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ LLprims.recv(thisOutput+offset, nelem);
+ } else {
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
+ } while(0);
}
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu
index 4125de4..b83ee70 100644
--- a/src/collectives/device/broadcast.cu
+++ b/src/collectives/device/broadcast.cu
@@ -4,12 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "common.h"
#include "broadcast.h"
+#include "common.h"
#include "collectives.h"
-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
-#endif
+IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast);
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index c2f6d00..fb18312 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -8,174 +8,74 @@
#include "primitives.h"
#include "collectives.h"
-// Increase Step and boffset for buffer sync
-#define NEXT_STEP \
- step++; \
- boffset += sliceSize; \
- if (boffset == buffSize) boffset = 0;
-
template<int UNROLL, class FUNC, typename T>
-__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
+__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- __shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- int prevdirect = ring->recv.conn.direct;
- int nextdirect = ring->send.conn.direct;
-
- WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
- PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
- PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS);
-
- typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;
-
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
- const int buffSize = ring->buffSize / sizeof(T);
- const int sliceSize = buffSize / BROADCAST_BUFCHUNKS;
- const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
+ const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->root;
- if (tid == 0) {
- // Update in case we skipped some collectives
- *ring->recv.conn.opCount = args->opCount;
- if (nextRank != root) {
- // Wait for next to be ready
- WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
- waitOpCountNext.wait(args->opCount);
- }
- if (rank != root && prevdirect) {
- *ring->recv.conn.ptrExchange = args->ThisOutput;
- }
- if (nextRank != root && nextdirect) {
- void* volatile* ptr = &(ring->devMemSend->ptrExchange);
- while (*ptr == nullptr);
- sharedNextOutput = (T*)*ptr;
- *ptr = nullptr;
- }
- }
- __syncthreads();
-
- uint64_t step = 0ULL;
- int boffset = 0;
-
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
- T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
- ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t offset = gridOffset + bid*chunkSize;
- int maxOffset = min(chunkSize, size-offset);
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t offset = gridOffset + bid*realChunkSize;
+ int nelem = min(realChunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
- Prims::Copy(tid, nthreads,
- thisInput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
+ prims.send(thisInput+offset, nelem);
} else {
- Prims::DoubleCopy(tid, nthreads,
- thisInput + offset,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
+ prims.copySend(thisInput+offset, thisOutput+offset, nelem);
}
} else if (nextRank == root) {
- if (prevdirect) maxOffset = 0; // Only wait for signals
- Prims::Copy(tid, nthreads,
- prevInput + boffset,
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
+ prims.recv(thisOutput+offset, nelem);
} else {
- if (prevdirect) {
- Prims::Copy(tid, nthreads,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
- } else {
- Prims::DoubleCopy(tid, nthreads,
- prevInput + boffset,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
- }
- }
- NEXT_STEP; // Increases step, boffset
- }
-
- if (tid == 0) {
- if (nextRank != root) {
- // Wait for next to have consumed data before resetting the flag
- waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1));
- *ring->send.conn.head = 0ULL;
+ prims.recvCopySend(thisOutput+offset, nelem);
}
- *ring->recv.conn.tail = 0ULL;
- __threadfence_system();
- *ring->recv.conn.opCount = args->opCount+1;
}
}
-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
- boffset += NCCL_LL_SLICE_LINES; \
- if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
- flag++; \
- step++;
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
-__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
- const int llNthreads = args->nThreads;
+ const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
- volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
- volatile int * sizesFifo = ring->send.conn.llFifo;
- uint64_t sendHead = sendHeadPtr[0];
- const int rank = comm->rank;
- const int nextRank = ring->devUserRanks[1];
- const int root = args->root;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
- typedef LLPrimitives<T, FUNC> LL;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nRings*chunkSize;
+ const int rank = ring->devUserRanks[0];
+ const int nextRank = ring->devUserRanks[1];
+ const int root = args->root;
- uint64_t step = ring->send.conn.llStep;
- uint32_t flag = step + 1;
- int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+ ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
- union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -183,46 +83,20 @@ __device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
}
ssize_t offset = gridOffset + bid*chunkSize;
- int maxOffset = min(chunkSize, size-offset);
+ int nelem = min(chunkSize, size-offset);
if (rank == root) {
- WAIT_NEXT;
if (thisInput == thisOutput) {
- LL::ReduceCopy(
- thisInput + offset,
- nextOutput + boffset,
- maxOffset, flag, llNthreads);
+ LLprims.send(thisInput+offset, nelem);
} else {
- LL::ReduceCopy(
- thisInput + offset,
- thisOutput + offset,
- nextOutput + boffset,
- maxOffset, flag, llNthreads);
+ LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
}
- POST_SIZE;
- NEXT_STEP_LL;
} else if (nextRank == root) {
- LL::ReduceCopy(
- prevInput + boffset,
- thisOutput + offset,
- maxOffset, flag, llNthreads);
- NEXT_STEP_LL;
- ACK_PREV;
+ LLprims.recv(thisOutput + offset, nelem);
} else {
- WAIT_NEXT;
- LL::ReduceCopy(
- prevInput + boffset,
- thisOutput + offset,
- nextOutput + boffset,
- maxOffset, flag, flag, llNthreads);
- POST_SIZE;
- NEXT_STEP_LL;
- ACK_PREV;
+ LLprims.recvCopySend(thisOutput + offset, nelem);
}
}
-
- // We need everyone to acknowledge data even if they didn't receive anything
- // so that the next collective can start right away.
- ACK_PREV;
-
- FIFO_CLEANING_AND_SAVE_STEP(flag);
}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index c988913..e4aecbd 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -11,13 +11,29 @@
#include "core.h"
#include "nccl.h"
+// Exit If Abort Barrier across CTA: make sure all threads exit consistently
+// Each thread sets a predicate to true if abort == 1
+// All threads in the CTA then enter the barrier and do a popc over their predicates
+// If any thread's predicate was true, all threads call exit()
+static inline __device__ void exitIfAbortBarrier(int abort) {
+ uint32_t popc;
+ asm ("{");
+ asm volatile (" .reg .pred barr_pred;");
+ asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
+ asm volatile (" bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc));
+ asm ("}");
+ if (popc) { asm volatile ("exit;"); }
+}
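A minimal usage sketch of the barrier above (hypothetical kernel, illustration only): every thread passes its local view of the flag, and if any one of them passes 1 the popc result is non-zero for the whole CTA, so all threads exit together.

__global__ void pollAbortExample(const volatile uint32_t* abortFlag) {
  int abort = (*abortFlag != 0) ? 1 : 0;
  exitIfAbortBarrier(abort);
  // Only reached if no thread in the CTA observed the abort flag set.
}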
+
typedef void(*ncclKern_t)(struct CollectiveArgs* args);
extern __device__ ncclKern_t ncclFuncs[];
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
int* d = (int*)dst;
int* s = (int*)src;
- __syncthreads();
+  // When aggregation is in use, if some threads have aborted inside the LL kernel,
+  // make sure the remaining threads abort as well
+ exitIfAbortBarrier(0);
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
__syncthreads();
}
@@ -27,12 +43,14 @@ static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* ho
}
/* Functions for aggregation case */
-#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
+#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
- coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(args); \
+ coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \
}
+
+#if NCCL_OP == 0
/* Kernels with the first operation inlined */
-#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \
+#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
int tid = threadIdx.x; \
@@ -40,25 +58,25 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
__shared__ struct ncclColl localColl; \
\
struct ncclComm* comm = firstColl.args.comm; \
- struct ncclRing* ring = comm->rings+bid; \
+ struct ncclChannel* channel = comm->channels+bid; \
struct ncclColl* c; \
if (bid == 0) { \
/* To optimize for latency, (only) the first operation is passed as argument.*/ \
c = &firstColl; \
} else { \
c = &localColl; \
- load_coll(c, ring->devCollectives+ring->collFifoHead, tid); \
+ load_coll(c, channel->devCollectives+channel->collFifoHead, tid); \
} \
while (1) { \
- if (tid < c->nThreads) { \
+ if (tid < c->args.nThreads) { \
if (c->funcIndex == fIndex) { \
- coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
+ coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
} else { \
ncclFuncs[c->funcIndex](&c->args); \
} \
} \
int nextIndex = c->nextIndex; \
- if (tid == 0) ring->collFifoHead = nextIndex; \
+ if (tid == 0) channel->collFifoHead = nextIndex; \
\
if (c->active == 2) { \
return; \
@@ -66,25 +84,75 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
\
/* Load next collective operation*/ \
c = &localColl; /* for bid 0 */ \
- load_coll(c, ring->devCollectives+nextIndex, tid); \
+ load_coll(c, channel->devCollectives+nextIndex, tid); \
} \
}
+#else
+#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
+#endif
+
+// Only generate inline kernels for LL
+#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
+ IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
+ IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
+ IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
- IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \
- IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \
- IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
- IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0)) \
+ IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \
+ IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1)
+#if NCCL_TYPE == 0
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8)
+#elif NCCL_TYPE == 1
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8)
+#elif NCCL_TYPE == 2
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32)
+#elif NCCL_TYPE == 3
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32)
+#elif NCCL_TYPE == 4
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64)
+#elif NCCL_TYPE == 5
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64)
+#elif NCCL_TYPE == 6
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16)
+#elif NCCL_TYPE == 7
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32)
+#elif NCCL_TYPE == 8
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
- IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
- IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) \
- IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) \
- IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \
- IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) \
- IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \
- IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) \
- IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \
IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64)
+#endif
+
+// Reduction define all functions
+#if NCCL_OP == 0
+#define IMPL_COLL_R(collf, colln) \
+ IMPL_COLL2(collf, sum, FuncSum, colln, ncclSum);
+#elif NCCL_OP == 1
+#define IMPL_COLL_R(collf, colln) \
+ IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd);
+#elif NCCL_OP == 2
+#define IMPL_COLL_R(collf, colln) \
+ IMPL_COLL2(collf, min, FuncMin, colln, ncclMin);
+#elif NCCL_OP == 3
+#define IMPL_COLL_R(collf, colln) \
+ IMPL_COLL2(collf, max, FuncMax, colln, ncclMax);
+#endif
+
+// Copy primitives only define one
+#if NCCL_OP == 0 && NCCL_TYPE == 0
+#define IMPL_COLL_C(collf, colln) \
+ IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8);
+#else
+#define IMPL_COLL_C(collf, colln)
+#endif
+
+#define COLL_UNROLL 4
#endif
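To make the macro chain concrete, here is the expansion for one op/type combination, sketched as a comment (the actual emitted symbol names come from NCCL_COLL_NAME/NCCL_KERN_NAME, which are defined elsewhere):

/* In a translation unit compiled with -DNCCL_OP=0 -DNCCL_TYPE=7 (sum, float):
 *
 *   IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce)
 *     -> IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum)
 *     -> IMPL_COLL3(ncclAllReduce, sum, FuncSum, f32, float, ncclCollAllReduce, ncclSum, ncclFloat32)
 *     -> IMPL_COLL4(ncclAllReduceRing, ..., 0)  and  IMPL_COLL4(ncclAllReduceTree, ..., 1)
 *
 * Each IMPL_COLL4 then emits IMPL_COLL_FUNC for the regular and LL variants and,
 * only because NCCL_OP == 0, the inline __global__ kernel (IMPL_COLL_KERN) for the
 * LL variant; for the other ops IMPL_COLL_KERN expands to nothing.
 */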
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index 0eaa061..e1fb096 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -192,14 +192,6 @@ struct MULTI<FUNC, int64_t> {
}
};
-#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a))
-
-template<typename T>
-__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
- size_t ptrval = reinterpret_cast<size_t>(ptr);
- return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
-}
-
template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
return *ptr;
@@ -236,25 +228,6 @@ void vStore<half>(volatile half* ptr, const half val) {
}
#endif
-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
-__device__ inline void ReduceCopy(
- const int tid, const int nthreads,
- const volatile T * __restrict__ const src0,
- const volatile T * __restrict__ const src1,
- volatile T * __restrict__ const dest0,
- volatile T * __restrict__ const dest1, const int N) {
- for (int idx = tid; idx < N; idx += nthreads) {
- T val = vFetch(src0+idx);
- if (TWO_INPUTS) {
- val = FUNC()(val, vFetch(src1+idx));
- }
- vStore(dest0+idx, val);
- if (TWO_OUTPUTS) {
- vStore(dest1+idx, val);
- }
- }
-}
-
typedef ulong2 Pack128;
template<class FUNC, typename T>
@@ -265,72 +238,111 @@ struct MULTI128 {
}
};
-inline __device__ void Fetch128(Pack128& v, Pack128* p) {
+inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
}
inline __device__ void Store128(Pack128* p, Pack128& v) {
asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory");
}
+template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthreads,
+ int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
+ const int offset, const int N) {
+ for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
+ T val = vFetch(srcs[0]+idx);
+ #pragma unroll
+ for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+ #pragma unroll 1
+ for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+
+ #pragma unroll
+ for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
+ #pragma unroll 1
+ for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
+ }
+}
+
#define WARP_SIZE 32
-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL>
-__device__ inline void ReduceCopy128b( const int w, const int nw, const int t,
- Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1,
- const int N) {
- Pack128 t0[UNROLL];
- Pack128 t1[UNROLL];
- const Pack128* src0_end = src0 + N;
+
+template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
+ int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
+ const int elemOffset, const int Npack) {
const int inc = nw * UNROLL * WARP_SIZE;
- const int offset = w * UNROLL * WARP_SIZE + t;
- src0 += offset; if (TWO_INPUTS) src1 += offset;
- dest0 += offset; if (TWO_OUTPUTS) dest1 += offset;
-
- while (src0 < src0_end) {
-#pragma unroll
- for (int u = 0; u < UNROLL; ++u) {
- Fetch128(t0[u], src0+u*WARP_SIZE);
- if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE);
+ int offset = w * UNROLL * WARP_SIZE + t;
+
+ const Pack128* srcs[MAXSRCS];
+ for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset;
+ Pack128* dsts[MAXDSTS];
+ for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset;
+
+ while (offset < Npack) {
+ Pack128 vals[UNROLL];
+ // Load and reduce
+ for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
+
+ for (int i=1; i<MINSRCS; i++) {
+ Pack128 vals2[UNROLL];
+ for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+ for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
}
-#pragma unroll
- for (int u = 0; u < UNROLL; ++u) {
- if (TWO_INPUTS) MULTI128<FUNC, T>()(t0[u], t1[u]);
- Store128(dest0+u*WARP_SIZE, t0[u]);
- if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]);
+ #pragma unroll 1
+ for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
+ Pack128 vals2[UNROLL];
+ for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+ for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
}
- src0 += inc; if (TWO_INPUTS) src1 += inc;
- dest0 += inc; if (TWO_OUTPUTS) dest1 += inc;
+
+ // Store
+ for (int i = 0; i < MINDSTS; i++) {
+ for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+ }
+ #pragma unroll 1
+ for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
+ for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+ }
+ for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
+ for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
+ offset += inc;
}
}
-template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1>
-__device__ inline void ReduceOrCopy(const int tid, const int nthreads,
- volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
- const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
+template <typename T>
+__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
+
+// Try to limit consecutive loads/stores to 8.
+// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
+#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
+
+template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads,
+ int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
int N) {
int Nrem = N;
if (Nrem <= 0) return;
- int Npreamble = (Nrem<alignof(Pack128)) ? Nrem : AlignUp(dest0, alignof(Pack128)) - dest0;
+ int alignDiff = 0;
+ int align = ptrAlign128(srcs[0]);
+ #pragma unroll
+ for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+ for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+ #pragma unroll
+ for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
+ for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
- // stage 0: check if we'll be able to use the fast, 128-bit aligned path.
- // If not, we'll just use the slow preamble path for the whole operation
- bool alignable = (((AlignUp(src0, alignof(Pack128)) == src0 + Npreamble)) &&
- (!HAS_DEST1 || (AlignUp(dest1, alignof(Pack128)) == dest1 + Npreamble)) &&
- (!HAS_SRC1 || (AlignUp(src1, alignof(Pack128)) == src1 + Npreamble)));
-
- if (!alignable) {
- Npreamble = Nrem;
- }
+ int Npreamble = alignDiff ? Nrem :
+ N < alignof(Pack128) ? N :
+ (alignof(Pack128) - align) % alignof(Pack128);
// stage 1: preamble: handle any elements up to the point of everything coming
// into alignment
- ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Npreamble);
-
- Nrem -= Npreamble;
- if (Nrem == 0) return;
-
- dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
- src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; }
+ if (Npreamble) {
+ ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
+ Nrem -= Npreamble;
+ if (Nrem == 0) return;
+ }
+ int offset = Npreamble;
// stage 2: fast path: use 128b loads/stores to do the bulk of the work,
// assuming the pointers we have are all 128-bit alignable.
@@ -338,35 +350,33 @@ __device__ inline void ReduceOrCopy(const int tid, const int nthreads,
int nw = nthreads / WARP_SIZE; // Number of warps
int t = tid % WARP_SIZE; // Thread (inside the warp)
- const int PackFactor = sizeof(Pack128) / sizeof(T);
+ const int packFactor = sizeof(Pack128) / sizeof(T);
// stage 2a: main loop
- int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads))
- * (UNROLL * nthreads); // round down
+ int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
+ * (AUTOUNROLL * WARP_SIZE); // round down
+ int Nelem2a = Npack2a * packFactor;
- ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a);
+ ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
- int Ndone2a = Nalign2a * PackFactor;
- Nrem -= Ndone2a;
+ Nrem -= Nelem2a;
if (Nrem == 0) return;
- dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
- src0 += Ndone2a; if (HAS_SRC1) { src1 += Ndone2a; }
+ offset += Nelem2a;
// stage 2b: slightly less optimized for section when we don't have full
- // UNROLLs
+ // unrolling
- int Nalign2b = Nrem / PackFactor;
+ int Npack2b = Nrem / packFactor;
+ int Nelem2b = Npack2b * packFactor;
- ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, 1>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b);
+ ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
- int Ndone2b = Nalign2b * PackFactor;
- Nrem -= Ndone2b;
+ Nrem -= Nelem2b;
if (Nrem == 0) return;
- dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
- src0 += Ndone2b; if (HAS_SRC1) { src1 += Ndone2b; }
+ offset += Nelem2b;
// stage 2c: tail
- ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Nrem);
+ ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
}
#endif // COMMON_KERNEL_H_
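A hypothetical usage sketch of ReduceOrCopyMulti (not part of NCCL): summing two device buffers into a third from inside a kernel. With MINSRCS==MAXSRCS==2 and MINDSTS==MAXDSTS==1 both pointer arrays are fully used; launch the block with a multiple of 32 threads, since the 128-bit path divides its work per warp.

template<typename T>
__global__ void sumTwoBuffers(const T* a, const T* b, T* out, int count) {
  const T* srcs[2] = { a, b };
  T* dsts[1] = { out };
  // UNROLL=4, FUNC=FuncSum<T> (from reduce_kernel.h), 2 sources, 1 destination.
  ReduceOrCopyMulti<4, FuncSum<T>, T, 2, 2, 1, 1>(
      threadIdx.x, blockDim.x, 2, srcs, 1, dsts, count);
}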
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index 1fb8108..ea06b68 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -8,9 +8,13 @@
#include "collectives.h"
#include "common.h"
-#define NCCL_FUNC4(coll, op, dtype) \
+#define NCCL_FUNC5(coll, op, dtype) \
NCCL_COLL_NAME(coll, op, dtype), \
- NCCL_COLL_NAME(coll##LL, op, dtype) \
+ NCCL_COLL_NAME(coll##LL, op, dtype)
+
+#define NCCL_FUNC4(coll, op, dtype) \
+ NCCL_FUNC5(coll##Ring, op, dtype), \
+ NCCL_FUNC5(coll##Tree, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
@@ -55,7 +59,7 @@
NCCL_FUNCS2A(ncclAllReduce) }
// Must be consistent with the ncclFuncSet enum
-__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
+__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
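For scale, assuming the enum counts match the 5 collectives, 4 reduction ops and 9 data types enumerated in gen_rules.sh below (an assumption, since those enums are defined elsewhere), the table now holds:

// ncclCollCount * ncclNumOps * ncclNumTypes * 2 (LL vs regular) * 2 (ring vs tree)
//       5       *     4      *      9       * 2                 * 2  = 720 entries,
// versus 360 before the tree variants were added.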
diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh
new file mode 100755
index 0000000..3942c8c
--- /dev/null
+++ b/src/collectives/device/gen_rules.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+dir=$1
+
+targets="GENOBJS := \\\\\n"
+
+for base in all_reduce all_gather broadcast reduce reduce_scatter; do
+ opn=0
+ for op in sum prod min max; do
+ dtn=0
+ for dt in i8 u8 i32 u32 i64 u64 f16 f32 f64; do
+ echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep"
+ echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"
+ echo " mkdir -p ${dir}"
+ echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o"
+ echo ""
+ targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
+ dtn=$(($dtn + 1))
+ done
+ opn=$(($opn + 1))
+ done
+done
+echo -e "$targets"
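For illustration, if the script is invoked as gen_rules.sh build/obj/collectives/device (the directory name is an assumption here; it is whatever the Makefile passes as $1), the rule it prints for the (sum, f32) variant of all_reduce, i.e. opn=0 and dtn=7, looks like:

build/obj/collectives/device/all_reduce_sum_f32.o : all_reduce.cu build/obj/collectives/device/all_reduce.dep
  @printf "Compiling %-35s > %s\\n" all_reduce.cu build/obj/collectives/device/all_reduce_sum_f32.o
  mkdir -p build/obj/collectives/device
  ${NVCC} -DNCCL_OP=0 -DNCCL_TYPE=7 ${NVCUFLAGS} -dc all_reduce.cu -o build/obj/collectives/device/all_reduce_sum_f32.o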
diff --git a/src/collectives/device/ll_kernel.h b/src/collectives/device/ll_kernel.h
deleted file mode 100644
index 5ec3c9a..0000000
--- a/src/collectives/device/ll_kernel.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_LL_KERNEL_H_
-#define NCCL_LL_KERNEL_H_
-
-static __device__ uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) {
- uint32_t data1, flag1, data2, flag2;
- do {
- asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
- } while ((flag1 != flag) || (flag2 != flag));
- uint64_t val64 = data1 + (((uint64_t)data2) << 32);
- return val64;
-}
-
-static __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
- asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
-}
-
-// Using memcpy handles misaligned pointers.
-static __device__ uint64_t readAL(uint64_t* src) {
- uint64_t val;
- memcpy((char*)&val, (char*)src, sizeof(uint64_t));
- return val;
-}
-static __device__ void storeAL(uint64_t* dst, uint64_t val) {
- memcpy((char*)dst, (char*)&val, sizeof(uint64_t));
-}
-
-template <typename T, class FUNC>
-class LLPrimitives {
- private:
- template <int HAS_SRC1, int HAS_SRC2, int HAS_DST1, int HAS_DST2>
- static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
- if (size <= 0) return;
- size_t size64 = size * sizeof(T) / sizeof(uint64_t);
- uint64_t* src1A = (uint64_t*)src1;
- uint64_t* dst1A = (uint64_t*)dst1;
- int offset = threadIdx.x;
- // Do multiples of 64 bits
-#pragma unroll 1
- for (; offset < size64; offset += nthreads) {
- uint64_t val;
- if (HAS_SRC1) {
- val = readAL(src1A+offset);
- if (HAS_SRC2) val = MULTI<FUNC, T>()(readLL(src2+offset, iflag), val);
- } else if (HAS_SRC2) {
- val = readLL(src2+offset, iflag);
- }
- if (HAS_DST1) storeAL(dst1A+offset, val);
- if (HAS_DST2) storeLL(dst2+offset, val, oflag);
- }
- // Finish last word
- int sizeDone = size64*(sizeof(uint64_t)/sizeof(T));
- int sizeRem = size - sizeDone;
- if (threadIdx.x == 0 && sizeRem) {
- const T* src1B = src1 + sizeDone;
- T* dst1B = dst1 + sizeDone;
-
- uint64_t lastVal;
- T* vals = (T*)&lastVal;
-
- if (HAS_SRC2) {
- uint64_t lastVal2 = readLL(src2+size64, iflag);
- T* src2B = (T*)&lastVal2;
- for (int offset = 0; offset < sizeRem; offset++) {
- vals[offset] = HAS_SRC1 ? FUNC()(src2B[offset], src1B[offset]) : src2B[offset];
- }
- } else if (HAS_SRC1) {
- for (int offset = 0; offset < sizeRem; offset++) {
- vals[offset] = src1B[offset];
- }
- }
- if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag);
- if (HAS_DST1) {
- for (int offset = 0; offset < sizeRem; offset++) {
- dst1B[offset] = vals[offset];
- }
- }
- }
- }
- public:
- static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) {
- return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads);
- }
-
- static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) {
- return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads);
- }
-
- static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
- return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads);
- }
-
- static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) {
- return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads);
- }
-
- static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) {
- return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads);
- }
-
- static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
- return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads);
- }
-
- static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
- return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, oflag, nthreads);
- }
-};
-
-// Common macros
-
-#define STEP_TO_SLOT(step) \
- (step % NCCL_LL_CHUNKS)
-
-#define WAIT_NEXT \
- if (tid == 0) { \
- while (sendHead + NCCL_LL_CHUNKS <= step) { \
- sendHead = sendHeadPtr[0]; \
- } \
- } \
- asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads));
-
-#define POST_SIZE \
- if (tid == 0 && sizesFifo) sizesFifo[step % NCCL_LL_CHUNKS] = (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T));
-
-#define ACK_PREV \
- asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)); \
- if (tid == 0) recvHeadPtr[0] = step;
-
-#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \
- if (step > ring->send.conn.llLastCleaning + NCCL_LL_CLEAN_FREQ) { \
- /* Reset all flags */ \
- static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \
- static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \
- const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \
- for (int i=0; i<NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*llNthreads); i++) { \
- prevInput[tid+i*llNthreads].i4 = resetLine.i4; \
- } \
- __threadfence_system(); \
- /* Restart from the same slot, only make sure sender waits for data to be reset */ \
- step += NCCL_LL_CHUNKS; \
- ACK_PREV; \
- while (sendHeadPtr[0] < step); \
- if (tid == 0) ring->send.conn.llLastCleaning = step; \
- } \
- ring->send.conn.llStep = step; \
-} while (0);
-
-#endif
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index e2baa4b..c5aaf54 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,218 +9,579 @@
#include <type_traits>
#include "reduce_kernel.h" // for reduction funcs
+#include "common.h"
+
+#define SPINS_BEFORE_CHECK_ABORT 1000000
+
+// Unconditionally unroll the first send/recv since nsend/nrecv should be at
+// least 1 if SEND/RECV is set.
+#define FOR_SEND(func, ...) do { \
+ if (SEND) { \
+ /* Send to far first, then close */ \
+ for (int i=1; i<NSEND && i<nsend; i++) func(i, ##__VA_ARGS__); \
+ func(0, ##__VA_ARGS__); \
+ } \
+} while (0)
+
+#define FOR_RECV(func, ...) do { \
+ if (RECV) { \
+ /* Recv from close first, then far */ \
+ func(0, ##__VA_ARGS__); \
+ for (int i=1; i<NRECV && i<nrecv; i++) func(i, ##__VA_ARGS__); \
+ } \
+} while (0)
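As a concrete reading of the macro, inside a primitive instantiated with RECV=1, FOR_RECV(postRecv) expands to:

// do {
//   if (1) {
//     postRecv(0);
//     for (int i=1; i<NRECV && i<nrecv; i++) postRecv(i);
//   }
// } while (0);

so peer 0 is always handled with a compile-time index and only the optional extra peers go through the runtime-bounded loop.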
+// Implementation of primitive types
+template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, class FUNC>
+class ncclPrimitives {
+ private:
+ const int tid;
+ const int nthreads;
+ int nrecv = 0;
+ int nsend = 0;
+ const int stepSize;
+ struct ncclConnInfo* recvConn[NRECV];
+ struct ncclConnInfo* sendConn[NSEND];
+ volatile uint64_t* waitPtr;
+ uint64_t recvStep[NRECV];
+ uint64_t sendStep[NSEND];
+ uint64_t sendConnHead[NSEND];
+ const T* recvDirectBuff[NRECV];
+ T* sendDirectBuff[NSEND];
+ const T* recvBuff[NRECV];
+ T* sendBuff[NSEND];
+ struct ncclComm* comm;
+
+ inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
+ inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
+ inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
+ inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
+
+ inline __device__ void barrier() {
+ asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+ }
-/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy.
- *
- * In order to reduce the reptetion of template arguments, the operations
- * are bundled as static methods of the Primitives class.
- *
- * Each primitive operation copies/reduces a contiguous buffer and syncs
- * an optional set of flags against a sub-step counter. The sync value is
- * based on the step parameter. Sync flags must be of type WaitFlag or
- * PostFlag. The primitive routines wait for all WaitFlag args to attain
- * at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of
- * corresponding substep by previous step) before executing the transfer.
- * After each substep is transfered, all PostFlag arguments get updated to
- * the value SUBSTEPS*step+substep+1.
- */
-
-
-class WaitFlag {
- volatile uint64_t * const flag;
- const int shift;
- public:
- __device__ __forceinline__
- WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { }
- __device__ __forceinline__
- void wait(uint64_t val) { while ((*flag + shift) < val) /*SPIN*/; }
-};
+ uint32_t mismatch = 0;
+ const uint64_t opCount;
+ inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
+ if (mismatch) {
+      // In the non-LL path we call __threadfence_system before incrementing opCount,
+      // yet we are still waiting for credits here, so there must be a size mismatch
+ *(comm->fatalDevError) = ncclDevAssertedMismatch;
+ } else if (remoteOpCount && *remoteOpCount > opCount) {
+ mismatch += 1;
+ }
+ }
+
+ uint32_t spins = 0;
+ uint32_t abort = 0;
+
+ inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
+ spins++;
+ if (spins == SPINS_BEFORE_CHECK_ABORT) {
+ abort = *(comm->abortFlag);
+ checkMismatch(remoteOpCount);
+ spins = 0;
+ }
+ return abort;
+ }
+
+ inline __device__ void waitRecv(int i) {
+ spins = 0;
+ mismatch = 0;
+ recvStep[i] += SLICESTEPS;
+ if (tid == i) {
+ while (*(waitPtr) < recvStep[i]) {
+ if (checkAbort(recvConn[i]->opCountRem)) break;
+ }
+ }
+ }
+
+ inline __device__ void waitSend(int i) {
+ spins = 0;
+ mismatch = 0;
+ sendStep[i] += SLICESTEPS;
+ if (tid == WARP_SIZE+i) {
+ while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) {
+ sendConnHead[i] = *waitPtr;
+ if (checkAbort(sendConn[i]->opCountRem)) break;
+ }
+ }
+ }
+
+ inline __device__ void postRecv(int i) {
+ *(recvConn[i]->head) = recvStep[i] += SLICESTEPS;
+ }
+
+ inline __device__ void postSend(int i) {
+ *(sendConn[i]->tail) = sendStep[i] += SLICESTEPS;
+ }
+
+ inline __device__ void postSendSize(int i, int size) {
+ if (sendConn[i]->fifo) sendConn[i]->fifo[sendStep[i]%NCCL_STEPS] = size;
+ }
+
+ template <int DIRECTRECV>
+ inline __device__ const T* directRecvPtr(int i, int directOffset) {
+ return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
+ }
+
+ template <int DIRECTSEND>
+ inline __device__ T* directSendPtr(int i, int directOffset) {
+ return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
+ }
+
+ template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
+ inline __device__ void
+ GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
+ int offset = 0;
+ int sliceSize = stepSize * SLICESTEPS;
+
+ const T* srcs[RECV*NRECV+SRC];
+ srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
+ if (RECV) {
+ if (SRC) srcs[1] = recvPtr(0);
+ for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i);
+ }
+
+ T* dsts[SEND*NSEND+DST];
+ dsts[0] = DST ? dstPtr : directSendPtr<DIRECTSEND>(0, directOffset);
+ if (SEND) {
+ if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset);
+ for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
+ }
+
+ #pragma unroll 1
+ for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
+ int realSize = max(0, min(sliceSize, nelem-offset));
+ if (tid < nthreads) {
+ FOR_SEND(waitSend);
+ FOR_RECV(waitRecv);
+ if (realSize > 0) {
+ barrier();
+ if (DIRECTRECV && recvDirectBuff[0]) {
+ // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
+ if (SEND) {
+ ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
+ }
+ } else {
+ ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
+ }
+ }
+ exitIfAbortBarrier(abort);
+ } else {
+ exitIfAbortBarrier(abort);
+ FOR_SEND(postSendSize, realSize*sizeof(T));
+ if (SEND) __threadfence_system();
+ FOR_SEND(postSend);
+ FOR_RECV(postRecv);
+ }
+ for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
+ for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
+ offset += sliceSize;
+ }
+ }
+
+ __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
+ recvConn[i] = conn;
+ recvBuff[i] = (const T*)recvConn[i]->buff;
+ recvStep[i] = recvConn[i]->step;
+ recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
+ // Return credits in case we rounded up.
+ if (tid == nthreads) *recvConn[i]->head = recvStep[i];
+ if (tid == i) {
+ waitPtr = recvConn[i]->tail;
+ *(recvConn[i]->opCountLoc) = opCount;
+ }
+ recvDirectBuff[i] = NULL;
+ if (directBuff && recvConn[i]->direct) {
+ recvDirectBuff[i] = directBuff;
+ if (tid == 0) *recvConn[i]->ptrExchange = directBuff;
+ }
+ nrecv++;
+ }
+
+ __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
+ sendConn[i] = conn;
+ sendBuff[i] = (T*)sendConn[i]->buff;
+ sendStep[i] = sendConn[i]->step;
+ sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
+ if (tid == WARP_SIZE+i) {
+ waitPtr = sendConn[i]->head;
+ sendConnHead[i] = *waitPtr;
+ *(sendConn[i]->opCountLoc) = opCount;
+ }
+ sendDirectBuff[i] = NULL;
+ if (directBuff && sendConn[i]->direct) {
+ void* volatile* ptr = sendConn[i]->ptrExchange;
+ while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
+ __syncthreads();
+ if (tid == 0) *ptr = NULL;
+ }
+ nsend++;
+ }
+
+ __device__ __forceinline__ void saveRecvConn(int i) {
+ if (tid == i) {
+ recvConn[i]->step = recvStep[i];
+ __threadfence_system();
+ *(recvConn[i]->opCountLoc) += 1;
+ }
+ }
+
+ __device__ __forceinline__ void saveSendConn(int i) {
+ if (tid == WARP_SIZE+i) {
+ sendConn[i]->step = sendStep[i];
+ __threadfence_system();
+ *(sendConn[i]->opCountLoc) += 1;
+ }
+ }
-class PostFlag {
- volatile uint64_t * const flag;
- const int shift;
- volatile int * const fifo;
- const int fifo_size;
public:
__device__ __forceinline__
- PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { }
- __device__ __forceinline__
- void post(uint64_t val) { *flag = (val - shift); }
- __device__ __forceinline__
- void postSize(uint64_t step, int size) { if (fifo != NULL) fifo[step%fifo_size] = size; };
-};
+ ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
+ : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
+ // Make sure step is updated before we read it
+ __syncthreads();
+ for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff);
+ for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff);
+ }
-// Helper to check if any argument is of type T.
-// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
-template<typename T> __device__ __forceinline__
-bool AnyAre() { return false; }
+ __device__ __forceinline__ void
+ send(const T* src, int nelem) {
+ GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0);
+ }
+ __device__ __forceinline__ void
+ directSend(const T* src, int directOffset, int nelem) {
+ GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset);
+ }
-template<typename T, typename FIRST_T, typename... TAIL_Ts>
-__device__ __forceinline__
-bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
- return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...);
-}
+ __device__ __forceinline__ void
+ recv(T* dst, int nelem) {
+ GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0);
+ }
+ __device__ __forceinline__ void
+ directRecv(T* dst, int directOffset, int nelem) {
+ GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset);
+ }
+ __device__ __forceinline__ void
+ copySend(const T* src, T* dst, int nelem) {
+ GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0);
+ }
+ __device__ __forceinline__ void
+ directCopySend(const T* src, T* dst, int directOffset, int nelem) {
+ GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset);
+ }
-// Wait on all WaitFlags, ignore PostFlags
-__device__ __forceinline__
-void WaitOnFlags(uint64_t val) { }
+ __device__ __forceinline__ void
+ recvCopySend(T* dst, int nelem) {
+ GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0);
+ }
+ __device__ __forceinline__ void
+ directRecvCopySend(T* dst, int directOffset, int nelem) {
+ GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset);
+ }
-template <typename... TAIL_Ts> __device__ __forceinline__
-void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
- flag.wait(val);
- WaitOnFlags(val, tail...);
-}
+ __device__ __forceinline__ void
+ recvReduceCopy(const T* src, T* dst, int nelem) {
+ GenericOp<0, 0, 1, 0, 1, 1>(src, dst, nelem, 0);
+ }
-template <typename... TAIL_Ts> __device__ __forceinline__
-void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) {
- WaitOnFlags(val, tail...);
-}
+ __device__ __forceinline__ void
+ recvReduceSend(const T* src, int nelem) {
+ GenericOp<0, 0, 1, 1, 1, 0>(src, NULL, nelem, 0);
+ }
+ __device__ __forceinline__ void
+ recvReduceCopySend(const T* src, T* dst, int nelem) {
+ GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0);
+ }
+ __device__ __forceinline__ void
+ directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) {
+ // Direct is only for the send part
+ GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset);
+ }
-// Post all PostFlags, ignore WaitFlags
-__device__ __forceinline__
-void PostToFlags(uint64_t val) { }
+ __device__ __forceinline__ ~ncclPrimitives() {
+ // Save steps for next collective. Have thread 0 do it to be compatible
+ // with the way LL works.
+ for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
+ for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
+ }
+};
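Read directly from the method definitions above, the public primitives map onto the GenericOp<DIRECTRECV, DIRECTSEND, RECV, SEND, SRC, DST> flags as follows (SRC/DST select the user buffers, RECV/SEND select the connection FIFOs):

//   send               GenericOp<0,0,0,1,1,0>  user src            -> send FIFO
//   recv               GenericOp<0,0,1,0,0,1>  recv FIFO           -> user dst
//   copySend           GenericOp<0,0,0,1,1,1>  user src            -> user dst + send FIFO
//   recvCopySend       GenericOp<0,0,1,1,0,1>  recv FIFO           -> user dst + send FIFO
//   recvReduceSend     GenericOp<0,0,1,1,1,0>  user src (+) recv   -> send FIFO
//   recvReduceCopy     GenericOp<0,0,1,0,1,1>  user src (+) recv   -> user dst
//   recvReduceCopySend GenericOp<0,0,1,1,1,1>  user src (+) recv   -> user dst + send FIFO
// The direct* variants set DIRECTRECV/DIRECTSEND to bypass the FIFO when a peer
// has exchanged a direct buffer pointer.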
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
- PostToFlags(val, tail...);
-}
+template <typename T, class FUNC, int NRECV, int NSEND>
+class ncclLLPrimitives {
+ private:
+ const int tid;
+ const int nthreads;
+ int nrecv = 0;
+ int nsend = 0;
+ struct ncclConnInfo* recvConn[NRECV];
+ struct ncclConnInfo* sendConn[NSEND];
+ volatile uint64_t* waitPtr;
+ volatile uint64_t* postPtr;
+ volatile int* fifoPtr;
+ uint64_t recvStep[NRECV];
+ uint64_t sendStep[NSEND];
+ uint64_t sendConnHead;
+ union ncclLLFifoLine* recvBuff[NRECV];
+ union ncclLLFifoLine* sendBuff[NSEND];
+ struct ncclComm* comm;
+
+ inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+ inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+ inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
+ inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
+ inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; }
+ inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; }
+
+ // Exit If Abort Barrier : make sure all threads exit consistently
+  // Each thread sets a predicate to true if abort == 1
+  // All threads in the CTA then enter the barrier and do a popc over their predicates
+  // If any thread's predicate was true, all threads call exit()
+ inline __device__ void exitIfAbortLocalBarrier() {
+ uint32_t popc;
+ asm ("{");
+ asm volatile (" .reg .pred barr_pred;");
+ asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
+ asm volatile (" bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads));
+ asm ("}");
+ if (popc) {
+ // Make sure threads not participating in the operation get the abort and all threads exit
+ exitIfAbortBarrier(1);
+ }
+ }
+
+ inline __device__ void barrier() {
+ asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+ }
+
+ uint32_t mismatch = 0;
+ const uint64_t opCount;
+
+ inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
+ if (mismatch > 20) {
+      // The peer has advanced its opCount that many times while we are still waiting for
+      // credits for the current op, so this is _most likely_ a size mismatch
+      // Note that LL does not use __threadfence_system, so the error cannot be asserted with certainty
+ *(comm->fatalDevError) = ncclDevSuspectedMismatch;
+ } else if (remoteOpCount && *remoteOpCount > opCount) {
+ mismatch += 1;
+ }
+ }
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) {
- flag.post(val);
- PostToFlags(val, tail...);
-}
+ uint32_t spins = 0;
+ uint32_t abort = 0;
+ inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
+ spins++;
+ if (spins == SPINS_BEFORE_CHECK_ABORT) {
+ abort = *(comm->abortFlag);
+ checkMismatch(remoteOpCount);
+ spins = 0;
+ }
+ return abort;
+ }
-// Post sizes for PostFlags, ignore WaitFlags
-__device__ __forceinline__
-void PostSizeToFlags(uint64_t step, int size) { }
+ inline __device__ void waitSend(int i, int nbytes) {
+ spins = 0;
+ mismatch = 0;
+ if (tid == WARP_SIZE+i) {
+ while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) {
+ sendConnHead = *waitPtr;
+ if (checkAbort(sendConn[i]->opCountRem)) break;
+ }
+ if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes;
+ }
+ }
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) {
- PostSizeToFlags(step, size, tail...);
-}
+ inline __device__ void postRecv(int i) {
+ recvStep[i]++;
+ if (tid == i) *postPtr = recvStep[i];
+ }
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) {
- flag.postSize(step, size);
- PostSizeToFlags(step, size, tail...);
-}
+ inline __device__ void postSend(int i) {
+ sendStep[i]++;
+ }
+ __device__ uint64_t readLL(int i, int offset) {
+ union ncclLLFifoLine* src = recvPtr(i) + offset;
+ uint32_t flag = recvFlag(i);
+ uint32_t data1, flag1, data2, flag2;
+ spins = 0;
+ mismatch = 0;
+ do {
+ asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
+ if (checkAbort(recvConn[i]->opCountRem)) break;
+ } while ((flag1 != flag) || (flag2 != flag));
+ uint64_t val64 = data1 + (((uint64_t)data2) << 32);
+ return val64;
+ }
-// Create pointer arithmetic syntax that doesn't break for std::nullptr_t
-template <typename Tptr> __device__ __forceinline__
-Tptr ptradd(Tptr ptr, int i) {
- return ptr + i;
-}
+ __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
+ asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
+ }
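
readLL and storeLL implement the LL (low-latency) protocol: each 16-byte fifo line carries two 32-bit data words interleaved with two copies of the step flag, so a single 128-bit volatile load tells the reader whether the full 64-bit payload has arrived, without any memory fence. A host-side model of that encoding (single-threaded sketch; only the field layout matches the union above, the rest is illustrative):

#include <cassert>
#include <cstdint>

// Host-side model of one LL fifo line: {data1, flag1, data2, flag2}.
struct LLLine { uint32_t data1, flag1, data2, flag2; };

static void storeLine(LLLine* dst, uint64_t val, uint32_t flag) {
  *dst = { (uint32_t)val, flag, (uint32_t)(val >> 32), flag };
}

// A reader polls until both flags match the expected step flag; only then is
// the 64-bit payload known to be complete.
static bool readLine(const LLLine* src, uint32_t flag, uint64_t* out) {
  if (src->flag1 != flag || src->flag2 != flag) return false;  // not ready yet
  *out = (uint64_t)src->data1 | ((uint64_t)src->data2 << 32);
  return true;
}

int main() {
  LLLine line = {0, 0, 0, 0};
  uint64_t v;
  assert(!readLine(&line, /*flag=*/1, &v));       // nothing written for step 1
  storeLine(&line, 0x1122334455667788ull, 1);
  assert(readLine(&line, 1, &v) && v == 0x1122334455667788ull);
  return 0;
}
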
-__device__ __forceinline__
-std::nullptr_t ptradd(std::nullptr_t ptr, int i) {
- return nullptr;
-}
+ // Using memcpy handles misaligned pointers.
+ __device__ uint64_t readAL(uint64_t* src) {
+ uint64_t val;
+ memcpy((char*)&val, (char*)src, sizeof(uint64_t));
+ return val;
+ }
+ __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
+ memcpy((char*)dst, (char*)&val, nbytes);
+ }
-// Implementation of primitive types
-template <int UNROLL, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> >
-class Primitives {
- private:
- template <typename SRC2_T, // either T* or std::nullptr_t
- typename DST2_T, // either T* or std::nullptr_t
- typename... SYNC_Ts> // either WaitFunc or PostFunc
- static __device__ __forceinline__ void
- GenericOp(const int tid, const int nthreads,
- const T* src1,
- const SRC2_T src2,
- T* dst1,
- DST2_T dst2,
- int len, int maxoffset, uint64_t step, SYNC_Ts... flags) {
-
- enum { noSrc2 = std::is_same<SRC2_T, std::nullptr_t>::value };
- enum { noDst2 = std::is_same<DST2_T, std::nullptr_t>::value };
- static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value,
- "src2 must be of type T* or std::nullptr_t");
- static_assert(noDst2 || std::is_same<DST2_T, T*>::value,
- "dst2 must be of type T* or std::nullptr_t");
-
- using OpType = typename std::conditional<noSrc2, FuncSum<T>, REDOP>::type;
-
- int sliceSize = len / SUBSTEPS;
- int sliceOffset = 0;
-
-#pragma unroll 1
- for (int sub=0; sub<SUBSTEPS; ++sub) {
- int realSize = max(0, min(sliceSize, maxoffset-sliceOffset));
- if (tid < nthreads) {
- if (AnyAre<WaitFlag>(flags...)) {
- if (tid == 0) {
- WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
- }
- asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+ template <int RECV, int SEND, int SRC, int DST>
+ __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
+ uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
+ FOR_SEND(waitSend, nbytes*2);
+ barrier();
+ uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
+ uint64_t* srcPack = (uint64_t*)srcPtr;
+ uint64_t* dstPack = (uint64_t*)dstPtr;
+ // Do multiples of 64 bits
+ #pragma unroll 2
+ for (int offset=tid; offset<npack; offset+=nthreads) {
+ // Recv : local, then intra-node, then inter-node
+ uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
+ if (RECV) {
+ if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
+ for (int i=1; i<NRECV && i<nrecv; i++) {
+ val = MULTI<FUNC, T>()(readLL(i, offset), val);
}
- ReduceOrCopy
- <
- UNROLL,
- OpType,
- T,
- !std::is_same<DST2_T, std::nullptr_t>::value, // HAS_DEST1
- !std::is_same<SRC2_T, std::nullptr_t>::value // HAS_SRC1
- >
- (
- tid, nthreads,
- ptradd(dst1, sliceOffset),
- ptradd(dst2, sliceOffset),
- ptradd(src1, sliceOffset),
- ptradd(src2, sliceOffset),
- realSize
- );
- if (AnyAre<PostFlag>(flags...)) {
- __syncthreads();
+ }
+
+ // Send : inter-node, then intra-node, then local
+ if (SEND) {
+ for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
+ storeLL(sendPtr(0)+offset, val, sendFlag(0));
+ }
+ if (DST) {
+ if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
+ // Last incomplete word
+ storeAL(dstPack+offset, val, nbytes & 0x7);
+ } else {
+ storeAL(dstPack+offset, val, sizeof(uint64_t));
}
- } else {
- if (AnyAre<PostFlag>(flags...)) {
- __syncthreads();
- PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...);
- __threadfence_system();
- PostToFlags(SUBSTEPS*step + sub + 1, flags...);
+ }
+ }
+ exitIfAbortLocalBarrier();
+ FOR_RECV(postRecv);
+ FOR_SEND(postSend);
+ }
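
The ((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t) test above is a compact way of asking whether this 8-byte pack is the one that overlaps the tail of the buffer, in which case only nbytes & 0x7 bytes are stored. A host-side check with an illustrative size:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t nbytes = 20;                       // 2 full packs + 4 tail bytes
  const uint32_t npack  = (nbytes + 7) / 8;         // DIVUP(nbytes, 8) == 3
  for (uint32_t offset = 0; offset < npack; offset++) {
    bool isTail = ((offset * sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t);
    // Only the pack starting at byte 16 overlaps the tail; it stores
    // nbytes & 7 == 4 bytes, the others store a full 8 bytes.
    assert(isTail == (offset == nbytes / 8));
  }
  return 0;
}
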
+
+ __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
+ recvConn[i] = conn;
+ recvBuff[i] = recvConn[i]->llBuff;
+ recvStep[i] = recvConn[i]->step;
+ if (tid == i) {
+ postPtr = recvConn[i]->head;
+ *(recvConn[i]->opCountLoc) = opCount;
+ }
+ nrecv++;
+ }
+
+ __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
+ sendConn[i] = conn;
+ sendBuff[i] = sendConn[i]->llBuff;
+ sendStep[i] = sendConn[i]->step;
+ if (tid == WARP_SIZE+i) {
+ waitPtr = sendConn[i]->head;
+ fifoPtr = sendConn[i]->fifo;
+ sendConnHead = *waitPtr;
+ *(sendConn[i]->opCountLoc) = opCount;
+ }
+ nsend++;
+ }
+
+ __device__ __forceinline__ void saveRecvConn(int i) {
+ if (tid == i) {
+ recvConn[i]->step = recvStep[i];
+ *(recvConn[i]->opCountLoc) += 1;
+ __threadfence_block();
+ }
+ }
+
+ __device__ __forceinline__ void saveSendConn(int i) {
+ if (tid == WARP_SIZE+i) {
+ sendConn[i]->step = sendStep[i];
+ *(sendConn[i]->opCountLoc) += 1;
+ __threadfence_block();
+ }
+ }
+
+ __device__ __forceinline__ void llSendCleaning(int i) {
+ if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+ /* Reset all flags */
+      static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of NCCL_LL_MAX_NTHREADS");
+      static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE must be at least 16 bytes * NCCL_LL_MAX_NTHREADS");
+ for (int s=0; s<NCCL_STEPS; s++) {
+ waitSend(i, 0);
+ for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) {
+ const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) };
+ sendPtr(i)[o].i4 = resetLine.i4;
}
}
- sliceOffset += sliceSize;
+ if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i];
+ }
+ }
+
+ __device__ __forceinline__ void llRecvCleaning(int i) {
+ if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+ recvStep[i] += NCCL_STEPS;
+ if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i];
}
}
public:
- template <typename... SYNC_Ts>
- static __device__ __forceinline__ void
- Copy(const int tid, const int nthreads, const T* src, T* dst,
- int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
- GenericOp(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
+ __device__ __forceinline__
+ ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
+ : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
+ // Make sure step is updated before we read it.
+ barrier();
+
+ for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
+ for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
}
- template <typename... SYNC_Ts>
- static __device__ __forceinline__ void
- DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2,
- int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
- GenericOp(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
+ __device__ void send(const T* src, int nelem) {
+ return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
}
- template <typename... SYNC_Ts>
- static __device__ __forceinline__ void
- Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst,
- int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
- GenericOp(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...);
+ __device__ void recv(T* dst, int nelem) {
+ return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
}
- template <typename... SYNC_Ts>
- static __device__ __forceinline__ void
- ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2,
- int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
- GenericOp(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...);
+ __device__ void recvReduceSend(const T* src, int nelem) {
+ return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
+ }
+
+ __device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
+ return LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
}
-};
-#endif // end include guard
+ __device__ void copySend(const T* src, T* dst, int nelem) {
+ return LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
+ }
+
+ __device__ void recvCopySend(T* dst, int nelem) {
+ return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
+ }
+
+ __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
+ return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
+ }
+
+ __device__ __forceinline__ ~ncclLLPrimitives() {
+ for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i);
+ for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i);
+ // Save steps for the next operation
+ for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
+ for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
+ }
+};
+#endif
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu
index bd1d23c..1ef66d4 100644
--- a/src/collectives/device/reduce.cu
+++ b/src/collectives/device/reduce.cu
@@ -4,18 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "common.h"
#include "reduce.h"
+#include "common.h"
#include "collectives.h"
-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL2(ncclReduce, sum, FuncSum, ncclCollReduce, ncclSum);
-#elif NCCL_OP == 1
-IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd);
-#elif NCCL_OP == 2
-IMPL_COLL2(ncclReduce, min, FuncMin, ncclCollReduce, ncclMin);
-#elif NCCL_OP == 3
-IMPL_COLL2(ncclReduce, max, FuncMax, ncclCollReduce, ncclMax);
-#endif
+IMPL_COLL_R(ncclReduce, ncclCollReduce);
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index f5694b1..302d053 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -8,143 +8,71 @@
#include "primitives.h"
#include "collectives.h"
-// Increase Step and boffset for buffer sync
-#define NEXT_STEP \
- step++; \
- boffset += sliceSize; \
- if (boffset == buffSize) boffset = 0;
-
template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
-
- WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
- PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
- PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS);
-
- typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;
-
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
- const int buffSize = ring->buffSize / sizeof(T);
- const int sliceSize = buffSize / REDUCE_BUFCHUNKS;
- const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
+ const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
const int rank = ring->devUserRanks[0];
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->root;
- if (tid == 0) {
- // Update in case we skipped some collectives
- *ring->recv.conn.opCount = args->opCount;
-
- if (rank != root) {
- // Wait for next to be ready
- WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
- waitOpCountNext.wait(args->opCount);
- }
- }
- __syncthreads();
-
- uint64_t step = 0ULL;
- int boffset = 0;
-
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
- T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
- ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t offset = gridOffset + bid*chunkSize;
- int maxOffset = min(chunkSize, size-offset);
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t offset = gridOffset + bid*realChunkSize;
+ int nelem = min(realChunkSize, size-offset);
if (prevRank == root) {
- Prims::Copy(tid, nthreads,
- thisInput + offset,
- nextOutput + boffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
+ prims.send(thisInput+offset, nelem);
} else if (rank == root) {
- Prims::Reduce(tid, nthreads,
- prevInput + boffset,
- thisInput + offset,
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
+ prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
- Prims::Reduce(tid, nthreads,
- prevInput + boffset,
- thisInput + offset,
- nextOutput + boffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
- }
- NEXT_STEP; // Increases step, boffset
- }
-
- if (tid == 0) {
- if (rank != root) {
- // Wait for next to have consumed data before resetting the flag
- waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1));
- *ring->send.conn.head = 0ULL;
+ prims.recvReduceSend(thisInput+offset, nelem);
}
- *ring->recv.conn.tail = 0ULL;
- __threadfence_system();
- *ring->recv.conn.opCount = args->opCount+1;
}
}
-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
- boffset += NCCL_LL_SLICE_LINES; \
- if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
- flag++; \
- step++;
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
- const int llNthreads = args->nThreads;
+ const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
- volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
- volatile int * sizesFifo = ring->send.conn.llFifo;
- uint64_t sendHead = sendHeadPtr[0];
- const int nranks = comm->nRanks;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+ const ssize_t size = args->N;
const int rank = comm->rank;
+ const int nranks = comm->nRanks;
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->root;
- typedef LLPrimitives<T, FUNC> LL;
-
- const ssize_t size = args->N;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nRings*chunkSize;
-
- uint64_t step = ring->send.conn.llStep;
- uint32_t flag = step + 1;
- int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+ const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
- union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -152,39 +80,16 @@ __device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
}
ssize_t offset = gridOffset + bid*chunkSize;
- int maxOffset = min(chunkSize, size-offset);
+ int nelem = min(chunkSize, size-offset);
if (prevRank == root) {
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- nextOutput + boffset,
- maxOffset, flag, llNthreads);
- POST_SIZE;
- NEXT_STEP_LL;
+ LLprims.send(thisInput+offset, nelem);
} else if (rank == root) {
- LL::ReduceCopy(
- thisInput + offset,
- prevInput + boffset,
- thisOutput + offset,
- maxOffset, flag, llNthreads);
- NEXT_STEP_LL;
- ACK_PREV;
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- prevInput + boffset,
- nextOutput + boffset,
- maxOffset, flag, flag, llNthreads);
- POST_SIZE;
- NEXT_STEP_LL;
- ACK_PREV;
+ LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
-
- // We need everyone to acknowledge data even if they didn't receive anything
- // so that the next collective can start right away.
- ACK_PREV;
-
- FIFO_CLEANING_AND_SAVE_STEP(flag);
}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/reduce_kernel.h b/src/collectives/device/reduce_kernel.h
index 0cb8f13..0e90793 100644
--- a/src/collectives/device/reduce_kernel.h
+++ b/src/collectives/device/reduce_kernel.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -46,30 +46,28 @@ struct FuncMin {
}
};
+#define MASK0 0x00ff00ff
+#define MASK1 0xff00ff00
+static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) {
+ /* This can be used both for signed and unsigned 8-bit addition */
+ const uint32_t x0 = x & MASK0;
+ const uint32_t x1 = x & MASK1;
+ const uint32_t y0 = y & MASK0;
+ const uint32_t y1 = y & MASK1;
+ const uint32_t r0 = (x0+y0);
+ const uint32_t r1 = (x1+y1);
+ return (r0 & MASK0) | (r1 & MASK1);
+}
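
A quick host-side check of the addChar4 trick (illustrative, not part of the patch): the two masks split the word into even and odd bytes, so a carry out of one byte lands in a masked-off bit position and can never reach the neighbouring byte.

#include <cassert>
#include <cstdint>

static uint32_t addChar4Host(uint32_t x, uint32_t y) {   // same math as above
  const uint32_t m0 = 0x00ff00ff, m1 = 0xff00ff00;
  return (((x & m0) + (y & m0)) & m0) | (((x & m1) + (y & m1)) & m1);
}

int main() {
  assert(addChar4Host(0x01020304, 0x10203040) == 0x11223344);  // no carries
  assert(addChar4Host(0x000000ff, 0x00000001) == 0x00000000);  // byte 0 wraps, byte 1 untouched
  assert(addChar4Host(0x7f808081, 0x01808001) == 0x80000082);  // works for signed bytes too
  return 0;
}
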
+
template<>
struct FuncSum<int8_t> {
- union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
- int32_t rv;
- asm("vadd.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
- "vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
#else
- converter cx, cy, cr;
- cx.storage = x;
- cy.storage = y;
- cr.a.x = cx.a.x + cy.a.x;
- cr.a.y = cx.a.y + cy.a.y;
- cr.a.z = cx.a.z + cy.a.z;
- cr.a.w = cx.a.w + cy.a.w;
- return cr.storage;
+ return addChar4(x, y);
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -78,28 +76,13 @@ struct FuncSum<int8_t> {
};
template<>
struct FuncSum<uint8_t> {
- union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
- int32_t rv;
- asm("vadd.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
- "vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
#else
- converter cx, cy, cr;
- cx.storage = x;
- cy.storage = y;
- cr.a.x = cx.a.x + cy.a.x;
- cr.a.y = cx.a.y + cy.a.y;
- cr.a.z = cx.a.z + cy.a.z;
- cr.a.w = cx.a.w + cy.a.w;
- return cr.storage;
+ return addChar4(x, y);
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -109,22 +92,6 @@ struct FuncSum<uint8_t> {
static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
/* This can be used both for signed and unsigned 8-bit multiplication */
-#if (__CUDA_ARCH__ >= 300)
- uint32_t rv;
- asm("{ .reg .u32 t0, t1, t2, t3;\n\t"
- " vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t"
- " vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t"
- " shl.b32 t3, t3, 16;\n\t"
- " shl.b32 t2, t2, 16;\n\t"
- " vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t"
- " shl.b32 t1, t1, 8;\n\t"
- " vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t"
- " and.b32 t1, t1, 0xff00ff00;\n\t"
- " and.b32 t0, t0, 0x00ff00ff;\n\t"
- " or.b32 %0, t0, t1;\n\t"
- "}" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
-#else
union converter { uint32_t storage; char4 a; };
converter cx, cy, cr;
cx.storage = x;
@@ -134,7 +101,6 @@ static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
cr.a.z = cx.a.z * cy.a.z;
cr.a.w = cx.a.w * cy.a.w;
return cr.storage;
-#endif
}
template<>
@@ -164,13 +130,6 @@ struct FuncMax<int8_t> {
int32_t rv, z=0;
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
- int32_t rv;
- asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
- "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
#else
converter cx, cy, cr;
cx.storage = x;
@@ -194,13 +153,6 @@ struct FuncMax<uint8_t> {
int32_t rv, z=0;
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
- int32_t rv;
- asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
- "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
#else
converter cx, cy, cr;
cx.storage = x;
@@ -225,13 +177,6 @@ struct FuncMin<int8_t> {
int32_t rv, z=0;
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
- int32_t rv;
- asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
- "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
#else
converter cx, cy, cr;
cx.storage = x;
@@ -255,13 +200,6 @@ struct FuncMin<uint8_t> {
int32_t rv, z=0;
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
- int32_t rv;
- asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
- "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
#else
converter cx, cy, cr;
cx.storage = x;
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu
index b16053c..10857ed 100644
--- a/src/collectives/device/reduce_scatter.cu
+++ b/src/collectives/device/reduce_scatter.cu
@@ -4,18 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "common.h"
#include "reduce_scatter.h"
+#include "common.h"
#include "collectives.h"
-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL2(ncclReduceScatter, sum, FuncSum, ncclCollReduceScatter, ncclSum);
-#elif NCCL_OP == 1
-IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd);
-#elif NCCL_OP == 2
-IMPL_COLL2(ncclReduceScatter, min, FuncMin, ncclCollReduceScatter, ncclMin);
-#elif NCCL_OP == 3
-IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax);
-#endif
+IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter);
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index cad011b..c70c845 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -8,156 +8,82 @@
#include "primitives.h"
#include "collectives.h"
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
- step++; \
- poffset = noffset; \
- noffset += sliceSize; \
- if (noffset == buffSize) noffset = 0;
-
template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
-
- WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
- PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
- PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
-
- typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;
-
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
- const int buffSize = ring->buffSize / sizeof(T);
- const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS;
- const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
- if (tid == 0) {
- // Update in case we skipped some collectives
- *ring->recv.conn.opCount = args->opCount;
- // Wait for next to be ready
- WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
- waitOpCountNext.wait(args->opCount);
- }
- __syncthreads();
-
- uint64_t step = 0ULL;
- int poffset, noffset = 0;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+ const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
- T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
- ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t chunkOffset = gridOffset + bid*chunkSize;
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t chunkOffset = gridOffset + bid*realChunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
- int maxOffset = min(chunkSize, size-chunkOffset);
+ int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
- Prims::Copy(tid, nthreads,
- thisInput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
-
- NEXT_STEP; // Increases step, poffset, noffset
+ prims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
- Prims::Reduce(tid, nthreads,
- prevInput + poffset,
- thisInput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
+ prims.recvReduceSend(thisInput+offset, nelem);
}
- // step k-1: reduce this buffer and data, which will produce the final
- // result that we store in this data and push to the next GPU
+ // step k-1: reduce this buffer and data, which will produce the final result
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
- Prims::Reduce(tid, nthreads,
- prevInput + poffset,
- thisInput + offset,
- thisOutput + chunkOffset,
- sliceSize, maxOffset,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
- }
-
- if (tid == 0) {
- waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS));
- *ring->send.conn.head = 0ULL;
- *ring->recv.conn.tail = 0ULL;
- __threadfence_system();
- *ring->recv.conn.opCount = args->opCount+1;
+ prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
- poffset = noffset; \
- pflag = nflag; \
- noffset += NCCL_LL_SLICE_LINES; \
- if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
- nflag++; \
- step++;
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
- const int llNthreads = args->nThreads;
+ const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
- volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
- volatile int * sizesFifo = ring->send.conn.llFifo;
- uint64_t sendHead = sendHeadPtr[0];
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
- typedef LLPrimitives<T, FUNC> LL;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nRings*chunkSize;
-
- uint64_t step = ring->send.conn.llStep;
- uint32_t pflag, nflag = step + 1;
- int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+ const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
- union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -167,37 +93,21 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
- int maxOffset = min(chunkSize, size-chunkOffset);
+ int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- nextOutput + noffset,
- maxOffset, nflag, llNthreads);
- POST_SIZE;
-
- NEXT_STEP_LL;
+ LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- prevInput + poffset,
- nextOutput + noffset,
- maxOffset, pflag, nflag, llNthreads);
- POST_SIZE;
- ACK_PREV;
-
- NEXT_STEP_LL;
+ LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
@@ -205,13 +115,9 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
- LL::ReduceCopy(
- thisInput + offset,
- prevInput + poffset,
- thisOutput + chunkOffset,
- maxOffset, pflag, llNthreads);
- ACK_PREV;
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
-
- FIFO_CLEANING_AND_SAVE_STEP(nflag);
}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/reduce.cu b/src/collectives/reduce.cu
index d8fde80..302d4bc 100644
--- a/src/collectives/reduce.cu
+++ b/src/collectives/reduce.cu
@@ -4,30 +4,15 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
-#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
-ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- size_t nbytes = count*ncclTypeSize(datatype);
- INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
- } else {
- NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm));
- NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1));
- }
-
- return ncclSuccess;
-}
-
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype,
- op, root, comm, stream);
+ struct ncclInfo info = { ncclCollReduce, "Reduce",
+ sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
+ REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
+ return ncclEnqueueCheck(&info);
}
diff --git a/src/collectives/reduce_scatter.cu b/src/collectives/reduce_scatter.cu
index 1447d4a..4ee77ef 100644
--- a/src/collectives/reduce_scatter.cu
+++ b/src/collectives/reduce_scatter.cu
@@ -4,29 +4,15 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
-#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
-ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- size_t nbytes = count*ncclTypeSize(datatype);
- INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
- } else {
- NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
- NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1));
- }
- return ncclSuccess;
-}
-
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
- return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype,
- op, 0, comm, stream);
+ struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter",
+ sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
+ REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
+ return ncclEnqueueCheck(&info);
}
diff --git a/src/enqueue.cu b/src/enqueue.cu
new file mode 100644
index 0000000..d283223
--- /dev/null
+++ b/src/enqueue.cu
@@ -0,0 +1,442 @@
+/*************************************************************************
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "checks.h"
+#include "param.h"
+
+#include "collectives/collectives.h"
+
+// Only generate inline kernels for LL
+#define NCCL_FUNC5(coll, op, dtype) \
+ (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
+ (void*)NCCL_KERN_NAME(coll##LL, op, dtype)
+
+#define NCCL_FUNC4(coll, op, dtype) \
+ (void*)NCCL_FUNC5(coll##Ring, op, dtype), \
+ (void*)NCCL_FUNC5(coll##Tree, op, dtype)
+
+// Must be consistent with ncclDataType_t
+#define NCCL_FUNCS3A(coll, op) \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, u8), \
+ (void*)NCCL_FUNC4(coll, op, i32), \
+ (void*)NCCL_FUNC4(coll, op, u32), \
+ (void*)NCCL_FUNC4(coll, op, i64), \
+ (void*)NCCL_FUNC4(coll, op, u64), \
+ (void*)NCCL_FUNC4(coll, op, f16), \
+ (void*)NCCL_FUNC4(coll, op, f32), \
+ (void*)NCCL_FUNC4(coll, op, f64)
+#define NCCL_FUNCS3B(coll, op) \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8)
+
+// Must be consistent with ncclRedOp_t -- but we only generate kernels for sum.
+#define NCCL_FUNCS2A(coll) \
+ NCCL_FUNCS3A(coll, sum), \
+ NCCL_FUNCS3A(coll, sum), \
+ NCCL_FUNCS3A(coll, sum), \
+ NCCL_FUNCS3A(coll, sum)
+#define NCCL_FUNCS2B(coll) \
+ NCCL_FUNCS3B(coll, copy), \
+ NCCL_FUNCS3B(coll, copy), \
+ NCCL_FUNCS3B(coll, copy), \
+ NCCL_FUNCS3B(coll, copy)
+
+// Must be consistent with the ncclFuncSet enum
+static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
+ NCCL_FUNCS2B(ncclBroadcast),
+ NCCL_FUNCS2A(ncclReduce),
+ NCCL_FUNCS2B(ncclAllGather),
+ NCCL_FUNCS2A(ncclReduceScatter),
+ NCCL_FUNCS2A(ncclAllReduce)
+};
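
The table above is laid out by the macro nesting: collective, then op, then datatype, then ring/tree (NCCL_FUNC4), then the two slots emitted by NCCL_FUNC5. The real FUNC_INDEX macro lives in collectives.h and is not part of this hunk, so the helper below is only an assumption that mirrors that nesting order; the counts used in the example (4 ops, 9 types) match the macros above.

#include <cstdio>

// Index helper consistent with the macro nesting above (a reading aid, not the
// FUNC_INDEX macro itself). Innermost are the two NCCL_FUNC5 slots, then
// ring/tree, then datatype, op and collective.
static int kernIndex(int coll, int op, int dtype, int treeMode, int llMode,
                     int nOps, int nTypes) {
  return (((coll * nOps + op) * nTypes + dtype) * 2 + treeMode) * 2 + llMode;
}

int main() {
  // With the table order above (Broadcast=0 ... AllReduce=4), 4 ops, 9 types:
  std::printf("AllReduce/sum/f32 ring LL -> entry %d of %d\n",
              kernIndex(4, 0, 7, 0, 1, 4, 9), 5 * 4 * 9 * 2 * 2);
  return 0;
}
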
+
+/*****************************************************************************/
+/* Launch system : synchronization and CUDA kernel launch */
+/*****************************************************************************/
+
+ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
+#if CUDART_VERSION >= 9000
+ if (cgMode & 0x01) {
+ CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
+ // These flags are to reduce the latency of using this API
+ cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync));
+ return ncclSuccess;
+ }
+#endif
+ int savedDev;
+ CUDACHECK(cudaGetDevice(&savedDev));
+ for (int i = 0; i < numDevices; i++) {
+ struct cudaLaunchParams* params = paramsList+i;
+ CUDACHECK(cudaSetDevice(cudaDevs[i]));
+ CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
+ }
+ CUDACHECK(cudaSetDevice(savedDev));
+ return ncclSuccess;
+}
+
+ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
+ params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels);
+
+ // Set active = 2 for the last operation
+ for (int r=0; r<params->gridDim.x; r++) {
+ struct ncclChannel* channel = comm->channels+r;
+ channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active = 2;
+ }
+
+ // Find the first operation, choose the kernel accordingly and pass it
+ // as the first argument.
+ struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
+ memcpy(&comm->args, coll, sizeof(struct ncclColl));
+ // As we pass that coll directly, we can free it immediately.
+ coll->active = 0;
+
+ params->func = ncclKerns[coll->funcIndex];
+ return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
+ volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+ int val = *ptr;
+ bool done = false;
+ while (done == false) {
+ if (val >= comm->intraRanks) {
+ WARN("Trying to launch too many collectives");
+ return ncclInvalidUsage;
+ }
+ if (val+1 == comm->intraRanks) {
+ // Reset the barrier.
+ comm->intraBarrier[comm->intraPhase^1] = 0;
+ *isLast = 1;
+ return ncclSuccess;
+ }
+ done = __sync_bool_compare_and_swap(ptr, val, val+1);
+ val++;
+ }
+ *isLast = 0;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
+ volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+ int val = *ptr;
+ if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
+ WARN("Trying to launch too many collectives");
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
+ volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+ while (*ptr < comm->intraRanks) pthread_yield();
+ comm->intraPhase ^= 1;
+ return ncclSuccess;
+}
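
ncclCpuBarrierIn/Last/Out form a two-phase intra-process barrier: every local rank counts itself in, the last arriver performs the grouped launch before adding the final increment, and everyone spins in ncclCpuBarrierOut until the counter reaches intraRanks, then flips the phase for the next operation. A hedged std::thread/std::atomic analogue of that protocol (structure and names are illustrative, not NCCL's; error handling omitted):

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

struct TwoPhaseBarrier {
  std::atomic<int> count[2];
  int nranks;
  explicit TwoPhaseBarrier(int n) : nranks(n) { count[0] = 0; count[1] = 0; }

  // Mirrors ncclCpuBarrierIn: returns true for the last arriver, which does
  // not increment yet; it also resets the other phase for the next round.
  bool arrive(int phase) {
    int val = count[phase].load();
    while (true) {
      if (val + 1 == nranks) { count[phase ^ 1].store(0); return true; }
      if (count[phase].compare_exchange_weak(val, val + 1)) return false;
    }
  }
  // Mirrors ncclCpuBarrierLast / ncclCpuBarrierOut.
  void lastDone(int phase) { count[phase].fetch_add(1); }
  void waitOut(int phase)  { while (count[phase].load() < nranks) std::this_thread::yield(); }
};

int main() {
  const int nranks = 4;
  TwoPhaseBarrier barrier(nranks);
  std::vector<std::thread> ranks;
  for (int r = 0; r < nranks; r++) ranks.emplace_back([&, r] {
    int phase = 0;
    for (int op = 0; op < 3; op++) {
      if (barrier.arrive(phase)) {              // last rank: "launch" the group...
        std::printf("op %d launched by rank %d\n", op, r);
        barrier.lastDone(phase);                // ...then release everyone
      }
      barrier.waitOut(phase);
      phase ^= 1;                               // next operation uses the other phase
    }
  });
  for (auto& t : ranks) t.join();
  return 0;
}
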
+
+ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
+ if (comm->nRanks == 1) return ncclSuccess;
+ struct cudaLaunchParams* params = comm->myParams;
+
+ NCCLCHECK(setupLaunch(comm, params));
+
+ // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
+ if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
+ // Enqueue event in user stream
+ CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream));
+ // Create dependency between user stream and internal NCCL stream
+ CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
+ params->stream = comm->groupStream;
+ } else {
+ if (comm->userStream != params->stream) {
+ // Stream changed from last call, create dependency against last NCCL kernel launch
+ CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+ }
+ params->stream = comm->userStream;
+ }
+
+ int isLast = 0;
+ NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
+
+ if (isLast) {
+ if (comm->launchMode == ncclComm::GROUP) {
+ // I'm the last. Launch all operations.
+ NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
+ }
+ NCCLCHECK(ncclCpuBarrierLast(comm));
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
+ if (comm->nRanks == 1) return ncclSuccess;
+ // We can't print the CG mode before the first barrier happened.
+ if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
+ *comm->intraCGMode ^= 0x10;
+ INFO(NCCL_INIT,"Launch mode %s%s%s",
+ comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
+ *comm->intraCGMode ? "/CGMD" : "",
+ (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
+ }
+
+ NCCLCHECK(ncclCpuBarrierOut(comm));
+
+ struct cudaLaunchParams *params = comm->myParams;
+ if (comm->launchMode == ncclComm::PARALLEL) {
+ CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
+ }
+  // Start the network proxies as soon as the kernel has been launched. We can't
+  // perform any CUDA call between the two: a cudaFree issued between the CUDA
+  // launch and the transportStartProxy call could cause a deadlock.
+ // Also, starting the proxies after the CUDA launch seems to be better for
+ // performance (latency).
+ for (int r=0; r<params->gridDim.x; r++) {
+ struct ncclChannel* channel = comm->channels+r;
+ channel->collStart = channel->collFifoTail;
+ channel->collCount = 0;
+ }
+ params->gridDim.x = params->blockDim.x = 0;
+ NCCLCHECK(transportStartProxy(comm));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
+ struct cudaLaunchParams *params = comm->myParams;
+ // Enqueue event after NCCL kernel
+ CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
+ // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
+ if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
+ // Create dependency between NCCL internal stream and user stream
+ CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+ }
+ comm->userStreamSet = false;
+ return ncclSuccess;
+}
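
The launch path above chains the user stream and the internal group stream purely through events: record on the user stream, cudaStreamWaitEvent on the group stream, launch there, then record and wait in the opposite direction so the user stream only resumes after the NCCL kernel. A minimal standalone sketch of that handoff (dummyKernel and the variable names are illustrative):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void dummyKernel() {}

#define CHECK(cmd) do { cudaError_t e = (cmd); if (e != cudaSuccess) { \
  std::printf("CUDA error: %s\n", cudaGetErrorString(e)); return 1; } } while (0)

int main() {
  cudaStream_t userStream, groupStream;
  cudaEvent_t doneEvent;
  CHECK(cudaStreamCreate(&userStream));
  CHECK(cudaStreamCreate(&groupStream));
  CHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));

  // Hand off: the internal stream waits for everything queued on the user stream.
  CHECK(cudaEventRecord(doneEvent, userStream));
  CHECK(cudaStreamWaitEvent(groupStream, doneEvent, 0));
  dummyKernel<<<1, 1, 0, groupStream>>>();          // stands in for the NCCL kernel
  // Hand back: the user stream resumes only after the kernel completes.
  CHECK(cudaEventRecord(doneEvent, groupStream));
  CHECK(cudaStreamWaitEvent(userStream, doneEvent, 0));

  CHECK(cudaStreamSynchronize(userStream));
  CHECK(cudaStreamDestroy(userStream));
  CHECK(cudaStreamDestroy(groupStream));
  CHECK(cudaEventDestroy(doneEvent));
  return 0;
}
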
+
+/*****************************************************************************/
+/* Enqueueing system : computation of kernel and proxy operations parameters */
+/*****************************************************************************/
+
+static ncclResult_t getPatternInfo(struct ncclInfo* info) {
+ if (info->coll == ncclCollBroadcast) info->pattern = ncclPatternPipelineFrom;
+ else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo;
+ else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing;
+ else if (info->coll == ncclCollAllReduce) {
+ if (info->nBytes <= info->comm->treeThreshold)
+ info->pattern = ncclPatternTreeUpDown;
+ else
+ info->pattern = ncclPatternRingTwice;
+ }
+ else {
+ WARN("Unknown collective %d", info->coll);
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t getLoopInfo(struct ncclInfo* info) {
+ switch (info->pattern) {
+ case ncclPatternTreeUp:
+ case ncclPatternTreeDown:
+ case ncclPatternTreeUpDown:
+ case ncclPatternPipelineFrom:
+ case ncclPatternPipelineTo:
+      info->nstepsPerLoop = info->nchunksPerLoop = 1; break;
+ case ncclPatternRing:
+ info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
+ case ncclPatternRingTwice:
+ info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
+ default:
+ WARN("Unknown pattern %d\n", info->pattern);
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+}
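
Concretely: a ring moves nRanks chunks per loop in nRanks-1 steps, the allreduce ring-twice pattern doubles the steps, and the tree and pipeline patterns advance one chunk per step. A small check mirroring the switch above, with illustrative enum names and nRanks = 8:

#include <cassert>

enum Pattern { Ring, RingTwice, TreeUpDown };

static void loopInfo(Pattern p, int nRanks, int* nsteps, int* nchunks) {
  switch (p) {
    case Ring:      *nsteps = nRanks - 1;       *nchunks = nRanks; break;
    case RingTwice: *nsteps = 2 * (nRanks - 1); *nchunks = nRanks; break;
    default:        *nsteps = 1;                *nchunks = 1;      break;  // tree/pipeline
  }
}

int main() {
  int s, c;
  loopInfo(Ring, 8, &s, &c);       assert(s == 7  && c == 8);
  loopInfo(RingTwice, 8, &s, &c);  assert(s == 14 && c == 8);
  loopInfo(TreeUpDown, 8, &s, &c); assert(s == 1  && c == 1);
  return 0;
}
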
+
+static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
+ // Compute thresholds and limits that users can override
+ int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD);
+ int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
+
+ // First compute nThreads
+ int nt = NCCL_LL_MIN_NTHREADS;
+ while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2;
+
+ // Then compute nChannels
+ int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold);
+ if (nc == 0) nc = 1;
+ if (nc > info->comm->nChannels) nc = info->comm->nChannels;
+
+ // Check if we have a fixed LL threshold, otherwise compute it.
+ int perThreadThreshold = info->comm->threadThreshold;
+ if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4;
+ ssize_t llThreshold = info->comm->llThreshold >= 0 ?
+ info->comm->llThreshold :
+ nc*nt*info->nchunksPerLoop*perThreadThreshold;
+
+ if (info->nBytes <= llThreshold) {
+ *llMode = 1;
+ *nChannels = nc;
+ *nThreads = nt;
+ } else {
+ *llMode = 0;
+ *nChannels = info->comm->nChannels;
+ *nThreads = info->comm->nThreads+1;
+ }
+}
+
+static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
+ // Set nstepsPerLoop and nchunksPerLoop
+ NCCLCHECK(getPatternInfo(info));
+ NCCLCHECK(getLoopInfo(info));
+
+ coll->args.root = info->root;
+ coll->args.N = info->count;
+ coll->args.ThisInput = info->sendbuff;
+ coll->args.ThisOutput = info->recvbuff;
+ coll->args.comm = info->comm->devComm;
+ coll->args.opCount = info->comm->opCount;
+
+ // Compute llMode, nChannels, nThreads
+ int llMode;
+ getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode);
+
+ int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0;
+ coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode);
+
+ int stepSize = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
+ int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps;
+ int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps;
+ int chunkSize = stepSize*chunkSteps;
+
+ // Compute lastChunkSize
+ if (treeMode == 1 && llMode == 0) {
+ if (info->pattern == ncclPatternTreeUpDown) {
+ // Optimize chunkSize / nSteps
+ while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
+ while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
+ while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
+ }
+ // Use lastChunkSize as chunkSize
+ coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+ } else if (llMode == 1) {
+ int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
+ const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
+ coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop);
+ ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t));
+ coll->args.lastChunkSize /= ncclTypeSize(info->datatype);
+ }
+
+ // Compute nSteps for proxies
+ size_t nBytes = llMode ? info->nBytes*2 : info->nBytes;
+
+ int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize)));
+ proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
+ proxyArgs->sliceSteps = sliceSteps;
+ proxyArgs->chunkSteps = chunkSteps;
+ proxyArgs->llMode = llMode;
+ proxyArgs->opCount = info->comm->opCount;
+ TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
+ coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads,
+ nLoops, proxyArgs->nsteps, info->comm);
+ return ncclSuccess;
+}
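
Putting the proxy arithmetic together: one loop moves nChannels * nchunksPerLoop * chunkSize bytes, so nLoops is the round-up of nBytes over that, and each loop costs nstepsPerLoop * chunkSteps buffer steps. A worked example with assumed values (non-LL ring-twice allreduce of 16 MB on 4 ranks, 2 channels, 128 KB chunks, chunkSteps = 4 -- all chosen for illustration only):

#include <cassert>
#include <cstddef>

static int divup(size_t a, size_t b) { return (int)((a + b - 1) / b); }

int main() {
  const size_t nBytes = 16u << 20;
  const int nRanks = 4, nChannels = 2, chunkSteps = 4;
  const int chunkSize = 128 << 10;
  const int nstepsPerLoop = 2 * (nRanks - 1);     // ncclPatternRingTwice
  const int nchunksPerLoop = nRanks;

  int nLoops = divup(nBytes, (size_t)nChannels * nchunksPerLoop * chunkSize);
  int nsteps = nstepsPerLoop * nLoops * chunkSteps;
  assert(nLoops == 16 && nsteps == 384);          // what the proxy will post
  return 0;
}
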
+
+static ncclResult_t saveKernel(struct ncclInfo* info) {
+ if (info->comm->nRanks == 1) {
+ if (info->sendbuff != info->recvbuff)
+ CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream));
+ return ncclSuccess;
+ }
+
+ struct ncclColl coll;
+ struct ncclProxyArgs proxyArgs;
+ memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
+ NCCLCHECK(computeColl(info, &coll, &proxyArgs));
+
+ info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads);
+ if (info->comm->userStreamSet == false) {
+ info->comm->userStream = info->stream;
+ info->comm->userStreamSet = true;
+ } else if (info->stream != info->comm->userStream) {
+ WARN("Error : mixing different streams within a group call is not supported.");
+ return ncclInvalidUsage;
+ }
+ for (int bid=0; bid<coll.args.nChannels; bid++) {
+ struct ncclChannel* channel = info->comm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels);
+
+ if (channel->collCount == NCCL_MAX_OPS) {
+ WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
+ return ncclInvalidUsage;
+ }
+
+ // Proxy
+ proxyArgs.channel = channel;
+ NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
+
+ info->comm->myParams->gridDim.x++;
+
+ int opIndex = channel->collFifoTail;
+ struct ncclColl* c = channel->collectives+opIndex;
+ volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
+ while (activePtr[0] != 0) sched_yield();
+
+ memcpy(c, &coll, sizeof(struct ncclColl));
+
+ c->args.bid = bid;
+ c->active = 1;
+ opIndex = (opIndex+1)%NCCL_MAX_OPS;
+ c->nextIndex = opIndex;
+ channel->collFifoTail = opIndex;
+ channel->collCount++;
+ }
+ /*if (llMode == 0)*/ info->comm->opCount++;
+ return ncclSuccess;
+}
+
+
+ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
+ if (info->comm == NULL) return ncclInvalidArgument;
+
+ INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
+ info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
+ info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
+
+ // Launch asynchronously if needed
+ if (ncclAsyncMode()) {
+ ncclResult_t ret = ncclSuccess;
+ int savedDev = -1;
+ if (info->comm->checkPointers) {
+ CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
+ CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end);
+ }
+ // Check arguments
+ NCCLCHECKGOTO(ArgsCheck(info), ret, end);
+ // Always register comm even in case of error to make sure ncclGroupEnd
+ // cleans it up.
+ NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
+ NCCLCHECKGOTO(saveKernel(info), ret, end);
+end:
+ if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
+ ncclAsyncErrCheck(ret);
+ return ret;
+ } else {
+ NCCLCHECK(ArgsCheck(info));
+ NCCLCHECK(saveKernel(info));
+ NCCLCHECK(ncclBarrierEnqueue(info->comm));
+ NCCLCHECK(ncclBarrierEnqueueWait(info->comm));
+ NCCLCHECK(ncclEnqueueEvents(info->comm));
+ return ncclSuccess;
+ }
+}
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index 278593c..a1aaf50 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -13,5 +13,7 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
+ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
+ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapClose(void* commState);
#endif
diff --git a/src/include/channel.h b/src/include/channel.h
new file mode 100644
index 0000000..76c5e8a
--- /dev/null
+++ b/src/include/channel.h
@@ -0,0 +1,14 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CHANNEL_H_
+#define NCCL_CHANNEL_H_
+#include "core.h"
+
+ncclResult_t initChannel(struct ncclComm* comm, int channelid);
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
+
+#endif
diff --git a/src/include/checks.h b/src/include/checks.h
new file mode 100644
index 0000000..bf7750e
--- /dev/null
+++ b/src/include/checks.h
@@ -0,0 +1,10 @@
+/*************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
+ncclResult_t ArgsCheck(struct ncclInfo* info);
diff --git a/src/include/common_coll.h b/src/include/common_coll.h
deleted file mode 100644
index 3ec7354..0000000
--- a/src/include/common_coll.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef COMMON_COLL_H_
-#define COMMON_COLL_H_
-
-#include "core.h"
-#include "enqueue.h"
-#include "collectives/collectives.h"
-
-static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
- cudaPointerAttributes attr;
- cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
- if (err != cudaSuccess || attr.devicePointer == NULL) {
- WARN("%s : %s is not a valid pointer", opname, ptrname);
- return ncclInvalidArgument;
- }
-#if CUDART_VERSION >= 10000
- if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
-#else
- if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
-#endif
- WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
- return ncclInvalidArgument;
- }
- return ncclSuccess;
-}
-
-static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
- if (ptr == NULL) {
- WARN("%s : %s argument is NULL", opname, ptrname);
- return ncclInvalidArgument;
- }
- return ncclSuccess;
-}
-
-static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
- NCCLCHECK(PtrCheck(comm, opname, "comm"));
- // First, the easy ones
- if (root < 0 || root >= comm->nRanks) {
- WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks);
- return ncclInvalidArgument;
- }
- if (type < 0 || type >= ncclNumTypes) {
- WARN("%s : invalid type %d", opname, type);
- return ncclInvalidArgument;
- }
- if (op < 0 || op >= ncclNumOps) {
- WARN("%s : invalid reduction operation %d", opname, op);
- return ncclInvalidArgument;
- }
-
- if (comm->checkPointers) {
- // Check CUDA device pointers
- if (strcmp(opname, "Broadcast") != 0 || comm->rank == root) {
- NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname));
- }
- if (strcmp(opname, "Reduce") != 0 || comm->rank == root) {
- NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname));
- }
- }
- return ncclSuccess;
-}
-
-static __inline__ int ncclTypeSize(ncclDataType_t type) {
- switch (type) {
- case ncclInt8:
- case ncclUint8:
- return 1;
- case ncclFloat16:
- return 2;
- case ncclInt32:
- case ncclUint32:
- case ncclFloat32:
- return 4;
- case ncclInt64:
- case ncclUint64:
- case ncclFloat64:
- return 8;
- default:
- return -1;
- }
-}
-
-// In : comm, nbytes ; Out : nrings, nthreads, ll
-// - We start with the minimum number of threads possible (64) and see if the size fits in LL;
-// If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default)
-// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads
-// This ensures we don't use a large number of rings with a small number of threads
-// - We use the NCCL_LL_RING_THRESHOLD as the per-thread threshold before we reach the max number of threads
-// we use NCCL_THREAD_THRESHOLD when we reach the max
-// - If by the max number of LL threads, the size still cannot fit in LL, then we use non-LL setting
-// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too
-static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) {
- *ll = 0;
- int llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */
- if (comm->llThreshold >= 0) { /* user sets total LL threshold */
- if (nbytes > comm->llThreshold) { /* non-LL */
- *nthreads = comm->nThreads+1;
- *nrings = comm->nRings;
- return;
- } else {
- llEnforced = 1; /* user wants to use LL */
- }
- }
- int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */
- size_t nr;
- int ll_max_nthreads = std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* respect user's setting or platform's default setting */
- int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS;
- ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD);
- while (nt < ll_max_nthreads && *ll == 0) {
- nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks));
- if (nr <= maxRings) { /* avoid using few threads but many rings */
- nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
- *ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1;
- }
- if (*ll == 0) {
- nt = nt << 1;
- }
- }
- if (*ll == 1) {
- *nthreads = nt;
- *nrings = (int)nr;
- return; /* we can use smaller number of threads to make LL work, stop here */
- }
- nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */
- nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
- *ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1;
- *nthreads = *ll ? ll_max_nthreads : comm->nThreads+1;
- *nrings = *ll ? (int)nr : comm->nRings;
-}
-
-static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, size_t nbytes, int loopFactor) {
- int llMode, nBlocks, nThreads;
- ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode);
- comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads);
- if (comm->userStreamSet == false) {
- comm->userStream = stream;
- comm->userStreamSet = true;
- } else if (stream != comm->userStream) {
- WARN("Error : mixing different streams within a group call is not supported.");
- return ncclInvalidUsage;
- }
- int lastChunkSize = 0;
- if (llMode == 1) {
- int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype);
- const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize;
- lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor);
- ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype));
- }
- for (int bid=0; bid<nBlocks; bid++) {
- struct ncclRing* ring = comm->rings+(comm->myParams->gridDim.x % comm->nRings);
- if (ring->collCount == NCCL_MAX_OPS) {
- WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
- return ncclInvalidUsage;
- }
-
- comm->myParams->gridDim.x++;
-
- int opIndex = ring->collFifoTail;
- struct ncclColl* c = ring->collectives+opIndex;
- volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
- while (activePtr[0] != 0) sched_yield();
-
- struct CollectiveArgs* args = &c->args;
- args->root = root;
- args->N = count;
- args->ThisInput = sendbuff;
- args->ThisOutput = recvbuff;
- args->comm = comm->devComm;
- args->opCount = comm->opCount;
- args->bid = bid;
- args->nRings = nBlocks;
- args->nThreads = nThreads;
- args->lastChunkSize = lastChunkSize;
-
- c->nThreads = nThreads;
- c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode);
- c->active = 1;
- opIndex = (opIndex+1)%NCCL_MAX_OPS;
- c->nextIndex = opIndex;
- ring->collFifoTail = opIndex;
- ring->collCount++;
- }
- /*if (llMode == 0)*/ comm->opCount++;
- return ncclSuccess;
-}
-
-extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl);
-
-#endif
diff --git a/src/include/core.h b/src/include/core.h
index 8285df5..d57d271 100644
--- a/src/include/core.h
+++ b/src/include/core.h
@@ -8,6 +8,7 @@
#define NCCL_CORE_H_
#define NCCL_MAX_OPS 2048
+#define NCCL_STEPS 8
#include "nccl.h"
#include "transport.h"
@@ -29,15 +30,15 @@ struct cudaLaunchParams {
};
#endif
-#define MAXRINGS 16
+#define MAXCHANNELS 16
#define MAXTHREADS 256
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-// Rings / LL tuning
-#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL for Volta and above
+// Channels / LL tuning
+#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nchannels
+#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
-#define NCCL_LL_MAX_NTHREADS 256
+#define NCCL_LL_MAX_NTHREADS MAXTHREADS
#define NCCL_LL_MIN_NTHREADS 64
#define DIVUP(x, y) \
@@ -63,43 +64,84 @@ union ncclLLFifoLine {
int4 i4;
};
+typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
+
+typedef enum {
+ ncclPatternRing,
+ ncclPatternRingTwice,
+ ncclPatternPipelineFrom,
+ ncclPatternPipelineTo,
+ ncclPatternTreeUp,
+ ncclPatternTreeDown,
+ ncclPatternTreeUpDown
+} ncclPattern_t;
+
+typedef enum {
+ ncclDevSuccess,
+ ncclDevAssertedMismatch,
+ ncclDevSuspectedMismatch
+} ncclDevError_t;
+
+// Used to pass NCCL call information between functions
+struct ncclInfo {
+ ncclColl_t coll;
+ const char* opName;
+ // NCCL Coll Args
+ const void* sendbuff;
+ void* recvbuff;
+ size_t count;
+ ncclDataType_t datatype;
+ ncclRedOp_t op;
+ int root;
+ ncclComm_t comm;
+ cudaStream_t stream;
+ // Algorithm details
+ int chunkSteps;
+ int sliceSteps;
+ // Computed later
+ ncclPattern_t pattern;
+ size_t nBytes;
+ int nstepsPerLoop;
+ int nchunksPerLoop;
+};
+
struct ncclConnInfo {
// Regular comm mechanism
char *buff; // Local for recv, remote for send
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
- uint64_t *opCount; // Local for recv, remote for send
+ uint64_t *opCountLoc; // opCount of local rank
+ uint64_t *opCountRem; // opCount of remote rank
int direct; // Direct communication
void **ptrExchange; // Pointer exchange for direct communication
int *fifo; // Size fifo for proxy
+ uint64_t step; // Keep where we are
+
// Low latency mechanism
- char *llBuff; // Local for recv, remote for send
- uint64_t *llHead; // Local for send, remote for recv
- int *llFifo; // LL Size fifo for proxy
- uint64_t llStep; // Keep where we are
+ union ncclLLFifoLine *llBuff; // Local for recv, remote for send
uint64_t llLastCleaning;
};
struct ncclConnector {
- struct transportProxyInfo* proxyInfo;
- struct ncclTransport* transport;
+ int connected;
+ struct ncclProxyArgs *proxyAppend;
+ struct ncclTransportComm* transportComm;
void* transportResources; // Host-side resources
struct ncclConnInfo conn;
+ struct ncclComm *comm;
};
#define CACHE_LINE_SIZE 128
#define MEM_ALIGN 4096
-#define SIZES_FIFO_SIZE 32
#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
-#define NCCL_LL_CHUNKS 8
#define NUM_LINES_PER_THREAD 8
-#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 256K
-#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t)))
-#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS)
+#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
+#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
+#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
#define NCCL_LL_CLEAN_FREQ 0x10000000
struct ncclSendMem {
@@ -109,7 +151,7 @@ struct ncclSendMem {
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
void* ptrExchange;
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
- uint64_t llHead;
+ uint64_t opCount;
};
char pad3[MEM_ALIGN];
};
@@ -119,37 +161,54 @@ struct ncclRecvMem {
union {
struct {
uint64_t tail;
- char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
+ char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
uint64_t opCount;
- char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)];
- int sizesFifo[SIZES_FIFO_SIZE];
- int llSizesFifo[SIZES_FIFO_SIZE];
+ char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
+ int sizesFifo[NCCL_STEPS];
};
- char pad5[MEM_ALIGN];
+ char pad4[MEM_ALIGN];
};
- char llBuff[NCCL_LL_BUFF_SIZE];
+ ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
char buff[1]; // Actually larger than that
};
struct ncclRing {
+ // Shortcuts for userRanks[1] and userRanks[n-1]
+ int prev;
+ int next;
+
+ // Maps an internal nccl index to user-specified rank order. This is necessary
+ // since we need to know how the user expects data to be ordered across
+ // devices. Ordered from current device.
+ int* userRanks;
+ int* devUserRanks;
+};
+
+#define NCCL_MAX_TREE_ARITY 3
+struct ncclTree {
+ int depth;
+ int up;
+ int down[NCCL_MAX_TREE_ARITY];
+};
+
+struct ncclPeer {
+ struct ncclConnector send;
+ struct ncclConnector recv;
+};
+
+struct ncclChannel {
union {
struct {
+ struct ncclRing ring;
+ struct ncclTree tree;
+
int id;
int nthreads;
- // Per ring resources
- struct ncclSendMem* devMemSend; // CUDA-size resources
- struct ncclRecvMem* devMemRecv; // CUDA-size resources
int buffSize;
- int devMemSendSize; // Keep the size for IPCs
- int devMemRecvSize; // Keep the size for IPCs
- struct ncclConnector send;
- struct ncclConnector recv;
- // Maps an internal nccl index to user-specified rank order. This is necessary
- // since we need to know how the user expects data to be ordered across
- // devices. Ordered from current device.
- int* userRanks;
- int* devUserRanks;
+ // Communication structures
+ struct ncclPeer* peers;
+ struct ncclPeer* devPeers;
// Operation list for aggregation
struct ncclColl* collectives;
@@ -162,7 +221,7 @@ struct ncclRing {
int data[0x80];
};
};
-static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size");
+static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
@@ -179,7 +238,7 @@ struct CollectiveArgs {
size_t N;
uint32_t root;
uint8_t bid;
- uint8_t nRings;
+ uint8_t nChannels;
uint16_t nThreads;
int lastChunkSize;
@@ -188,7 +247,6 @@ struct ncclColl {
union {
struct {
struct CollectiveArgs args;
- uint16_t nThreads;
uint16_t funcIndex;
uint16_t nextIndex;
uint8_t active;
@@ -199,11 +257,16 @@ struct ncclColl {
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
struct ncclComm {
- struct ncclRing rings[MAXRINGS];
+ struct ncclChannel channels[MAXCHANNELS];
+
+ struct ncclPeerInfo* peerInfo;
+
+ void* bootstrap;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
+ int nvmlDev; // my NVML device number
enum { GROUP, PARALLEL } launchMode;
cudaStream_t userStream;
@@ -215,18 +278,31 @@ struct ncclComm {
// where syncs are not symmetric).
uint64_t opCount;
- // Rings for collectives
- int nRings;
+ // Channels for collectives
+ int nChannels;
int nThreads;
// Low-latency algorithm threshold
ssize_t llThreshold;
ssize_t threadThreshold;
+ // Tree algorithm threshold
+ ssize_t treeThreshold;
+
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
cudaStream_t groupStream;
+ // Whether there has been a fatal error in this communicator.
+ ncclResult_t fatalError;
+
+ // Error reported by GPU
+ volatile ncclDevError_t* fatalDevError;
+
+ // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped)
+ // On device: this pointer has been obtained from cudaHostGetDevicePointer()
+ volatile uint32_t *abortFlag;
+
// Device copy of the communicator
struct ncclComm *devComm;
@@ -244,6 +320,10 @@ struct ncclComm {
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclColl args;
void* argsptr;
+
+ // Global proxy thread
+ pthread_t proxyThread;
+ struct ncclProxyState proxyState;
};
// Check CUDA calls
@@ -324,6 +404,28 @@ struct ncclComm {
#endif // end PROFAPI
int ncclCudaCompCap();
+ncclResult_t ncclNvlinkGpu(int* nvlink);
+int64_t ncclTreeThreshold();
+
+static __inline__ int ncclTypeSize(ncclDataType_t type) {
+ switch (type) {
+ case ncclInt8:
+ case ncclUint8:
+ return 1;
+ case ncclFloat16:
+ return 2;
+ case ncclInt32:
+ case ncclUint32:
+ case ncclFloat32:
+ return 4;
+ case ncclInt64:
+ case ncclUint64:
+ case ncclFloat64:
+ return 8;
+ default:
+ return -1;
+ }
+}
#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
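
As a reading aid for the reworked LL buffer sizing above, assuming sizeof(union ncclLLFifoLine) is 16 bytes (it wraps an int4), the new constants evaluate as follows and keep the total at the same 256K as the old layout:

// With NUM_LINES_PER_THREAD = 8, NCCL_LL_MAX_NTHREADS = MAXTHREADS = 256, NCCL_STEPS = 8:
//   NCCL_LL_SLICE_LINES = 8 * 256          = 2048 lines   (32 KiB per step)
//   NCCL_LL_BUFF_LINES  = 2048 * 8         = 16384 lines
//   NCCL_LL_BUFF_SIZE   = 16384 * 16 bytes = 262144 bytes (256 KiB per connection)
static_assert(NCCL_LL_BUFF_SIZE == 256*1024, "LL buffer size unchanged at 256KiB");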
diff --git a/src/include/cpuset.h b/src/include/cpuset.h
new file mode 100644
index 0000000..f70d1d8
--- /dev/null
+++ b/src/include/cpuset.h
@@ -0,0 +1,61 @@
+/*************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CPUSET_H_
+#define NCCL_CPUSET_H_
+
+// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
+
+static int hexToInt(char c) {
+ int v = c - '0';
+ if (v < 0) return -1;
+ if (v > 9) v = 10 + c - 'a';
+ if ((v < 0) || (v > 15)) return -1;
+ return v;
+}
+
+#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
+
+ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) {
+ uint32_t cpumasks[CPU_SET_N_U32];
+ int m = CPU_SET_N_U32-1;
+ cpumasks[m] = 0;
+ for (int o=0; o<strlen(str); o++) {
+ char c = str[o];
+ if (c == ',') {
+ m--;
+ cpumasks[m] = 0;
+ } else {
+ int v = hexToInt(c);
+ if (v == -1) break;
+ cpumasks[m] <<= 4;
+ cpumasks[m] += v;
+ }
+ }
+ // Copy cpumasks to mask
+ for (int a=0; m<CPU_SET_N_U32; a++,m++) {
+ memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t));
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
+ int c = 0;
+ uint8_t* m8 = (uint8_t*)mask;
+ for (int o=sizeof(cpu_set_t)-1; o>=0; o--) {
+ if (c == 0 && m8[o] == 0) continue;
+ sprintf(str+c, "%02x", m8[o]);
+ c+=2;
+ if (o && o%4 == 0) {
+ sprintf(str+c, ",");
+ c++;
+ }
+ }
+ str[c] = '\0';
+ return ncclSuccess;
+}
+
+#endif
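
A small standalone sketch of the helpers above, assuming it is built against nccl.h so ncclResult_t resolves; the input string follows the sysfs local_cpus format mentioned in the header comment. Note that ncclCpusetToStr() skips leading zero bytes, so the round-tripped string can be shorter than the input.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include "nccl.h"
#include "cpuset.h"

int main() {
  char in[] = "0003ff,f0003fff";   // CPUs 0-13, 28-31 and 32-41
  cpu_set_t mask;
  CPU_ZERO(&mask);                 // ncclStrToCpuset only writes the words it parsed
  ncclStrToCpuset(in, &mask);
  printf("cpu0=%d cpu13=%d cpu14=%d cpu41=%d cpu42=%d\n",
         CPU_ISSET(0, &mask), CPU_ISSET(13, &mask), CPU_ISSET(14, &mask),
         CPU_ISSET(41, &mask), CPU_ISSET(42, &mask));   // expected: 1 1 0 1 0
  char out[1024];
  ncclCpusetToStr(&mask, out);
  printf("back to string: %s\n", out);                  // e.g. "03ff,f0003fff"
  return 0;
}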
diff --git a/src/include/debug.h b/src/include/debug.h
index 55dee18..3acdf8c 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -25,6 +25,7 @@ extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen);
+extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
diff --git a/src/include/enqueue.h b/src/include/enqueue.h
index 69d0463..4db7094 100644
--- a/src/include/enqueue.h
+++ b/src/include/enqueue.h
@@ -10,12 +10,7 @@
#include "core.h"
#include "group.h"
-typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
-
-ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
- void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
- ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
ncclResult_t ncclCpuBarrierOut(ncclComm_t comm);
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index ce3f6ca..89edbf5 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -58,8 +58,50 @@ typedef struct {
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v1_t;
-typedef ncclNet_v1_t ncclNet_t;
+typedef struct {
+ // Name of the network (mainly for logs)
+ const char* name;
+ // Initialize the network.
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+ // Return the number of adapters.
+ ncclResult_t (*devices)(int* ndev);
+ // Return the device path in /sys. NCCL will call free on this path.
+ ncclResult_t (*pciPath)(int dev, char** path);
+ // Return whether this device supports host pointers and/or CUDA pointers
+ // as data from the current GPU. Supported types should be composed with
+ // NCCL_PTR_HOST and NCCL_PTR_CUDA.
+ ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
+ // Create a receiving object and provide a handle to connect to it. The
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+ // between ranks to create a connection.
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+ // Connect to a handle and return a sending comm object for that peer.
+ ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+ // Finalize connection establishment after remote peer has called connectHandle
+ ncclResult_t (*accept)(void* listenComm, void** recvComm);
+ // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+ ncclResult_t (*deregMr)(void* comm, void* mhandle);
+ // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ // May return request == NULL if the call cannot be performed (or would block)
+ ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
+ // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ // May return request == NULL if the call cannot be performed (or would block)
+ ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+ // visible to the GPU
+ ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
+ // Test whether a request is complete. If size is not NULL, it returns the
+ // number of bytes sent/received.
+ ncclResult_t (*test)(void* request, int* done, int* size);
+ // Close and free send/recv comm objects
+ ncclResult_t (*closeSend)(void* sendComm);
+ ncclResult_t (*closeRecv)(void* recvComm);
+ ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v2_t;
+
+typedef ncclNet_v2_t ncclNet_t;
-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2
#endif // end include guard
diff --git a/src/include/net.h b/src/include/net.h
index ebc9677..e75e6bb 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -26,9 +26,11 @@ static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; }
-static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; }
+static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
+static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
+static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
+static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
+static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
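
The net API now registers memory once and passes an opaque mhandle to isend/irecv/flush instead of a pointer type per call. A hedged sketch of the sender-side flow using the wrappers above (illustrative control flow only, not the actual net transport proxy code):

// Post one registered host buffer and poll the request to completion.
static ncclResult_t netSendOne(void* sendComm, void* data, int size) {
  void* mhandle;
  NCCLCHECK(ncclNetRegMr(sendComm, data, size, NCCL_PTR_HOST, &mhandle));
  void* request = NULL;
  // Per nccl_net.h, isend may return request == NULL if it cannot post yet; retry.
  while (request == NULL) NCCLCHECK(ncclNetIsend(sendComm, data, size, mhandle, &request));
  int done = 0, bytes = 0;
  while (!done) NCCLCHECK(ncclNetTest(request, &done, &bytes));
  NCCLCHECK(ncclNetDeregMr(sendComm, mhandle));
  return ncclSuccess;
}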
diff --git a/src/include/nvlink.h b/src/include/nvlink.h
index 7eb74c9..1baf9e5 100644
--- a/src/include/nvlink.h
+++ b/src/include/nvlink.h
@@ -67,18 +67,15 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
if (res != ncclSuccess) return 0;
for(int l=0; l<maxNvLinks; ++l) {
- // nvmlDeviceGetNvLinkCapability(NVML_NVLINK_CAP_P2P_SUPPORTED) would seem to
- // report whether the NVLink connects to a peer GPU (versus a POWER CPU?). I
- // don't know whether nvmlDeviceGetNvLinkRemotePciInfo() would succeed in
- // the POWER CPU case, so it seems best to check this as well.
+ // Check whether we can use this NVLink for P2P
unsigned canP2P;
if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
- // nvmlDeviceGetNvLinkRemotePciInfo() will return NVML_ERROR_NOT_SUPPORTED
- // if the links don't exist, or are disabled. So checking for that return
- // here would probably make the nvmlDeviceGetNvLinkCapability check above
- // redundant. Presumably, we still need to check the P2P capability above,
- // since even non-GPUs would possess PCI info.
+ // Make sure the NVLink is up. The previous call should have trained the link.
+ nvmlEnableState_t isActive;
+ if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+
+ // Try to figure out what's on the other side of the NVLink
nvmlPciInfo_t remoteProc;
if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
@@ -89,7 +86,7 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
p[c] = toupper(p[c]);
}
- if (strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
+ if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
links++;
} else {
// Make a lower case copy of the bus ID for calling ncclDeviceType
@@ -101,11 +98,21 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
lowerId[c] = tolower(p[c]);
}
- // Determine if the remote side is NVswitch
+ // Determine if the remote side is NVswitch or a GPU
enum ncclNvLinkDeviceType type;
- if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
- //TODO: we are making an assumption that all GPUs are connected to this switch
- //This assumption may change for future architectures
+ ncclResult_t ret = ncclDeviceType(lowerId, &type);
+ if (ret == ncclSuccess) {
+ if (type == ncclNvLinkDeviceSwitch) {
+ //TODO: we are making an assumption that all GPUs are connected to this switch
+ //This assumption may change for future architectures
+ nvswitch_links++;
+ } else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) {
+ links++;
+ }
+ } else {
+ // The NVLink is up but we couldn't find the PCI device on the other
+ // side. Assume it's an NVswitch outside a VM.
+ if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch");
nvswitch_links++;
}
}
@@ -113,43 +120,4 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
}
-static int getNumNvlinks(const char* busId) {
- nvmlDevice_t nvmlDev;
- ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev);
- if (res != ncclSuccess) return 0;
-
- int nvlinks = 0, nvswitch_links = 0;
- int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
- for(int l=0; l<maxNvLinks; ++l) {
- unsigned canP2P;
- nvmlEnableState_t isActive;
- if (wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) == ncclSuccess && canP2P &&
- wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) == ncclSuccess && isActive == NVML_FEATURE_ENABLED) {
- nvlinks++;
- } else {
- continue;
- }
-
- nvmlPciInfo_t remoteProc;
- if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
-
- // Make a lower case copy of the bus ID for calling ncclDeviceType
- // PCI system path is in lower case
- char* p = remoteProc.busId;
- char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
- if (p[c] == 0) break;
- lowerId[c] = tolower(p[c]);
- }
-
- // Determine if the remote side is NVswitch
- enum ncclNvLinkDeviceType type;
- if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
- //TODO: we are making an assumption that all GPUs are connected to this switch
- //This assumption may change for future architectures
- nvswitch_links++;
- }
- }
- return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*nvlinks;
-}
#endif
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
index ddfd233..0b6198a 100644
--- a/src/include/nvmlwrap.h
+++ b/src/include/nvmlwrap.h
@@ -7,7 +7,7 @@
#ifndef NCCL_NVMLWRAP_H_
#define NCCL_NVMLWRAP_H_
-#include "core.h"
+#include "nccl.h"
//#define NVML_DIRECT 1
#ifdef NVML_DIRECT
@@ -32,14 +32,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index)
NVMLCHECK(nvmlDeviceGetIndex(device, index));
return ncclSuccess;
}
-static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
- NVMLCHECK(nvmlDeviceSetCpuAffinity(device));
- return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
- NVMLCHECK(nvmlDeviceClearCpuAffinity(device));
- return ncclSuccess;
-}
static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
return ncclSuccess;
@@ -61,6 +53,10 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig
NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
return ncclSuccess;
}
+static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
+ NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
+ return ncclSuccess;
+}
#else
// Dynamically handle dependencies on NVML
@@ -136,14 +132,14 @@ ncclResult_t wrapNvmlInit(void);
ncclResult_t wrapNvmlShutdown(void);
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
+ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
+
#endif // NVML_DIRECT
#endif // End include guard
diff --git a/src/include/ring.h b/src/include/ring.h
deleted file mode 100644
index fa5e099..0000000
--- a/src/include/ring.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_RING_H_
-#define NCCL_RING_H_
-#include "core.h"
-
-ncclResult_t initRing(struct ncclComm* comm, int ringid);
-ncclResult_t freeRing(struct ncclRing* ring);
-
-#endif
diff --git a/src/include/rings.h b/src/include/rings.h
index 751846c..43fc595 100644
--- a/src/include/rings.h
+++ b/src/include/rings.h
@@ -12,6 +12,6 @@ static int getDefaultThreads() {
return ncclCudaCompCap() == 3 ? 128 : 256;
}
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next);
+ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
#endif
diff --git a/src/include/socket.h b/src/include/socket.h
index 624af40..fb5cfc0 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -60,7 +60,9 @@ static inline int envSocketFamily(void) {
}
static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
+#ifdef ENABLE_TRACE
char line[1024];
+#endif
struct netIf userIfs[MAX_IFS];
bool searchNot = prefixList && prefixList[0] == '^';
int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
@@ -106,7 +108,6 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
// Store the IP address
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
memcpy(addrs+found, interface->ifa_addr, salen);
- INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
found++;
}
}
@@ -336,8 +337,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
#endif
- /* Put the socket in listen mode */
- SYSCHECK(listen(sockfd, 128), "listen");
+ /* Put the socket in listen mode
+ * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
+ */
+ SYSCHECK(listen(sockfd, 16384), "listen");
*fd = sockfd;
return ncclSuccess;
}
diff --git a/src/include/transport.h b/src/include/transport.h
index 59f83c9..6231a71 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,6 +9,7 @@
#include "nccl.h"
#include <stdint.h>
+#include "nvmlwrap.h"
#define NTRANSPORTS 3
@@ -19,11 +20,13 @@ struct ncclRing;
struct ncclConnector;
struct ncclComm;
-#define RANK_INFO_SIZE 64
-typedef char ncclTinfo_t[RANK_INFO_SIZE];
-
-struct ncclInfo {
- ncclTinfo_t tinfo[NTRANSPORTS];
+struct ncclPeerInfo {
+ int rank;
+ int cudaDev;
+ int nvmlDev;
+ uint64_t hostHash;
+ uint64_t pidHash;
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
};
// Used to hold the transport connection values
@@ -34,18 +37,47 @@ struct ncclConnect {
char data[CONNECT_SIZE];
};
+enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone };
+
+struct ncclProxyArgs;
+typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
+
struct ncclProxyArgs {
- struct ncclRing* ring;
- int substeps;
+ proxyProgressFunc_t progress;
+ struct ncclChannel* channel;
+ struct ncclConnector* connector;
+ int sliceSteps;
+ int chunkSteps;
int nsteps;
uint64_t opCount;
int llMode;
- bool needProxy;
- int active; // add component before this line -- it is left out during initialization
+ int state; // add component before this line -- it is left out during initialization
+
+ // Internal state
+ uint64_t head;
+ uint64_t tail;
+ uint64_t end;
+ void* requests[NCCL_STEPS];
+ int idle;
+
+ // Element linking
+ pthread_mutex_t mutex;
+ struct ncclProxyArgs* next;
+ struct ncclProxyArgs* nextPeer;
+};
+
+struct ncclProxyPool;
+struct ncclProxyState {
+ pthread_cond_t cond;
+ pthread_mutex_t mutex;
+ bool stop;
+ struct ncclProxyArgs* ops;
+ struct ncclProxyArgs* pool;
+ struct ncclProxyPool* pools;
};
struct ncclTransportComm {
- ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*);
+ ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
@@ -53,8 +85,7 @@ struct ncclTransportComm {
struct ncclTransport {
const char name[4];
- ncclResult_t (*fillInfo)(ncclTinfo_t*, int);
- ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*);
+ ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*);
ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
struct ncclTransportComm send;
struct ncclTransportComm recv;
@@ -64,37 +95,17 @@ struct ncclTransport {
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
-#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS
-
-struct transportProxyInfo {
- struct ncclComm* comm;
- pthread_t thread;
- threadFunc_t func;
- volatile int proxyReady;
- struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE];
- volatile uint64_t argsFifoHead;
- volatile uint64_t argsFifoTail;
- pthread_cond_t cond;
- pthread_mutex_t mutex;
-};
-
-ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm);
-ncclResult_t transportDestroyProxy(struct ncclConnector* connector);
-
enum proxyMode {
proxyRing = 0,
proxyFrom = 1,
proxyTo = 2
};
-static int proxyPatternRing = proxyRing;
-static inline int proxyPatternFrom(int root) { return 1+root; }
-static inline int proxyPatternTo(int root) { return -1-root; }
-static inline enum proxyMode proxyPatternMode(int pattern) { return (pattern == 0) ? proxyRing : ((pattern > 0) ? proxyFrom : proxyTo); }
-static inline int proxyPatternRoot(int pattern) { return (pattern > 0) ? pattern-1 : -pattern-1; }
-
-ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm);
-ncclResult_t transportStartProxies(struct ncclComm* comm);
+ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr);
+ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks);
+ncclResult_t transportStartProxy(struct ncclComm* comm);
+ncclResult_t transportCreateProxy(struct ncclComm* comm);
+ncclResult_t transportDestroyProxy(struct ncclComm* comm);
#include <unistd.h>
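
The per-connection proxy threads are gone: there is now a single proxy thread per communicator, driving a linked list of ncclProxyArgs that each carry their own progress callback and state. A rough sketch of what advancing that list could look like, based only on the declarations above; the real loop elsewhere in this patch differs in detail, and we assume here that the progress callback marks an op ncclProxyOpDone when it finishes.

// Illustrative only: advance every queued op once.
static void progressOps(struct ncclProxyState* state) {
  for (struct ncclProxyArgs* args = state->ops; args; args = args->next) {
    if (args->state == ncclProxyOpReady) args->state = ncclProxyOpProgress;
    if (args->state == ncclProxyOpProgress) {
      (void)args->progress(args);   // transport-specific step; error handling omitted
    }
    // Done ops would be unlinked from state->ops and returned to state->pool (omitted here).
  }
}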
diff --git a/src/include/trees.h b/src/include/trees.h
new file mode 100644
index 0000000..1a151d1
--- /dev/null
+++ b/src/include/trees.h
@@ -0,0 +1,13 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TREES_H_
+#define NCCL_TREES_H_
+
+ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0);
+ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1);
+
+#endif
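
ncclGetBtree()/ncclGetDtree() compute the static binary tree and double binary tree used by the new tree algorithm; setupChannel() in init.cu below calls ncclGetDtree() on the per-node masters and splits the two trees across the channel halves. A small, hypothetical dump of the double tree (output values are not asserted here):

// Illustrative: print both trees for every rank; -1 means no parent/child.
#include <stdio.h>
#include "nccl.h"
#include "trees.h"

int main() {
  int nranks = 8;
  for (int rank = 0; rank < nranks; rank++) {
    int u0, d0_0, d0_1, u1, d1_0, d1_1;
    ncclGetDtree(nranks, rank, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1);
    printf("rank %d: tree0 up %2d down %2d,%2d | tree1 up %2d down %2d,%2d\n",
           rank, u0, d0_0, d0_1, u1, d1_0, d1_1);
  }
  return 0;
}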
diff --git a/src/init.cu b/src/init.cu
index 9d0188e..75822e6 100644
--- a/src/init.cu
+++ b/src/init.cu
@@ -1,21 +1,26 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "core.h"
-#include "ring.h"
+#include "channel.h"
#include "param.h"
#include "nvmlwrap.h"
#include "rings.h"
+#include "trees.h"
#include "bootstrap.h"
#include "transport.h"
-#include "common_coll.h"
#include "group.h"
#include "utils.h"
#include "net.h"
+#include "checks.h"
+#include "enqueue.h"
+#include "topo.h"
+#include "nvlink.h"
+#include "cpuset.h"
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
@@ -55,6 +60,16 @@ NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
ncclNet_t* ncclNet = NULL;
// We define this as weak to let tests redefine their own
+#pragma weak ncclNvlinkGpu
+ncclResult_t ncclNvlinkGpu(int* nvlink) {
+ int cudaDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+ *nvlink = getNvlinkGpu(busId, NULL);
+ return ncclSuccess;
+}
+// We define this as weak to let tests redefine their own
#pragma weak ncclCudaCompCap
int ncclCudaCompCap() {
int cudaDev;
@@ -77,10 +92,7 @@ ncclResult_t initNet(ncclNet_t* net) {
int ndev;
if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
if (net->devices(&ndev) != ncclSuccess) return ncclInternalError;
- if (ndev <= 0) {
- INFO(NCCL_INIT|NCCL_NET, "Net/%s: call to devices() returned 0 devices.", net->name);
- return ncclSystemError;
- }
+ if (ndev <= 0) return ncclSystemError;
return ncclSuccess;
}
@@ -91,15 +103,15 @@ ncclResult_t initNetPlugin(ncclNet_t** net) {
// string, so checking errno doesn't hurt to try to provide a better
// error message
if (errno == ENOENT) {
- INFO(NCCL_INIT|NCCL_NET, "No network plugin found.");
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so).");
} else {
- INFO(NCCL_INIT|NCCL_NET, "Unable to load libnccl-net.so : %s", dlerror());
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
}
return ncclSuccess;
}
ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
if (extNet == NULL) {
- INFO(NCCL_INIT|NCCL_NET, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol");
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
goto cleanup;
}
if (initNet(extNet) == ncclSuccess) {
@@ -116,21 +128,18 @@ ncclResult_t initNet() {
NCCLCHECK(initNet(&ncclNetSocket));
NCCLCHECK(initNetPlugin(&ncclNet));
- if (ncclNet != NULL) {
- INFO(NCCL_INIT|NCCL_NET, "Using network plugin %s", ncclNetName());
- return ncclSuccess;
- }
+ if (ncclNet != NULL) return ncclSuccess;
if (initNet(&ncclNetIb) == ncclSuccess) {
ncclNet = &ncclNetIb;
} else {
ncclNet = &ncclNetSocket;
}
- INFO(NCCL_INIT|NCCL_NET,"Using network %s", ncclNetName());
return ncclSuccess;
}
NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2);
+NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", -2);
int ncclThreadThreshold(int minCompCap, int multiNode) {
int threshold = ncclParamThreadThreshold();
@@ -177,10 +186,15 @@ static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
+ free(comm->peerInfo);
+
+ if (comm->bootstrap)
+ NCCLCHECK(bootstrapClose(comm->bootstrap));
+
CUDACHECK(cudaFree(comm->devComm));
- for (int ring=0; ring<comm->nRings; ring++)
- NCCLCHECK(freeRing(comm->rings+ring));
+ for (int channel=0; channel<comm->nChannels; channel++)
+ NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
if (comm->doneEvent != NULL)
CUDACHECK(cudaEventDestroy(comm->doneEvent));
@@ -199,6 +213,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
free(comm->intraCGMode);
free(comm->intraCC);
}
+ CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
+ CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
free(comm);
return ncclSuccess;
@@ -222,12 +238,15 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
struct ncclComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
- INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
comm->rank = rank;
comm->nRanks = ndev;
cudaGetDevice(&comm->cudaDev);
+ getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
+ INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
+
comm->doneEvent = doneEvent;
comm->llThreshold = ncclParamLlThreshold();
+ comm->treeThreshold = ncclParamTreeThreshold();
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
#if CUDART_VERSION >= 9200
comm->groupCudaStream = ncclParamGroupCudaStream();
@@ -235,6 +254,13 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
// Don't allow the user to overload the default setting in older CUDA builds
comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
#endif
+ comm->fatalError = ncclSuccess;
+
+ CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped));
+ *comm->fatalDevError = ncclDevSuccess;
+
+ CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped));
+ *comm->abortFlag = 0;
comm->argsptr = &comm->args;
@@ -248,9 +274,18 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
// Copy the comm on the device
NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
// Copy userRanks
- for (int r=0; r<comm->nRings; r++) {
- NCCLCHECK(ncclCudaMemcpy(comm->rings[r].devUserRanks, comm->rings[r].userRanks, comm->nRanks));
+ for (int r=0; r<comm->nChannels; r++) {
+ NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
+ NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
}
+ // Copy the device-accessible pointer to comm->abortFlag
+ void *devAbortFlag;
+ CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0));
+ CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice));
+ // Copy the device-accessible pointer to comm->fatalDevError
+ void *devFatalError;
+ CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0));
+ CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice));
return ncclSuccess;
}
@@ -267,35 +302,81 @@ static void showVersion() {
}
}
-static ncclResult_t fillInfo(struct ncclInfo* info, int rank) {
- for (int t=0; t<NTRANSPORTS; t++) {
- NCCLCHECK(ncclTransports[t].fillInfo(info->tinfo+t, rank));
- }
+static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) {
+ info->rank = rank;
+ CUDACHECK(cudaGetDevice(&info->cudaDev));
+ NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev))
+ info->hostHash=getHostHash();
+ info->pidHash=getPidHash();
+
+ // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
+ // cudaDev is a CUDA runtime dev number which could be different from the
+ // NVML device number. Then we get the busID from NVML to be sure it is
+ // consistent with NVML remote PCI bus Ids.
+ CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
+ nvmlDevice_t nvmlDevice;
+ NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
+ nvmlPciInfo_t pciInfo;
+ NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
+ strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
return ncclSuccess;
}
template <int type>
-static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) {
+static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
for (int t=0; t<NTRANSPORTS; t++) {
struct ncclTransport *transport = ncclTransports+t;
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
ncclTvalue_t ret = 0;
- NCCLCHECK(transport->canConnect(&ret, myInfo->tinfo+t, peerInfo->tinfo+t));
+ NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo));
if (ret > 0) {
- NCCLCHECK(transportComm->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring));
- *transportRet = transport;
+ connector->transportComm = transportComm;
+ NCCLCHECK(transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId));
return ncclSuccess;
}
}
WARN("No transport found !");
- *transportRet = NULL;
return ncclInternalError;
}
-static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) {
- NCCLCHECK(initRing(comm, ringid));
+static int log2(int n) {
+ int l = 0;
+ while (n>>=1) l++;
+ return l;
+}
+
+static ncclResult_t ncclTreeThreshold(int nnodes, int nranks, int nChannels, ssize_t *treeThreshold) {
+ int nvlink;
+ NCCLCHECK(ncclNvlinkGpu(&nvlink));
+ float ringbw = nvlink ? 5000*nChannels : 5000; // approx, in MB/s or B/us
+ float ringlatinter = 6;
+ float treelatintra = 4;
+ float treelatinter = 15;
+ float treebw;
+ if (!nvlink) {
+ treebw = ringbw * 2 / 3;
+ } else {
+ treebw = ringbw * 3 / 4;
+ if (nnodes == 2) treebw *= 2;
+ }
+ float ringlat = ringlatinter*(nranks-1);
+ float treelat = treelatinter*log2(nnodes)+treelatintra*(nranks/nnodes-1);
+ if (nnodes < 2 || ringlat <= treelat)
+ *treeThreshold = 0;
+ else if (treebw > ringbw)
+ *treeThreshold = 0x7fffffffffffffff;
+ else
+ *treeThreshold = (ssize_t)(((ringbw*treebw/(ringbw-treebw)))*(ringlat-treelat));
+ return ncclSuccess;
+}
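
A quick sanity check on the model above, using only the constants in this function:

//   time_ring = size/ringbw + ringlat   vs   time_tree = size/treebw + treelat
//   => crossover at size = ringbw*treebw/(ringbw-treebw) * (ringlat-treelat)
//   e.g. NVLink, 4 channels, 16 nodes, 64 ranks:
//        ringbw = 5000*4 = 20000 B/us, treebw = 3/4 * 20000 = 15000 B/us,
//        ringlat = 6*63 = 378 us, treelat = 15*log2(16) + 4*(64/16-1) = 72 us
//        => treeThreshold = 60000 * 306 B, roughly 18 MB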
+
+static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks, int* treeMasters) {
+ TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
+ NCCLCHECK(initChannel(comm, channelId));
+
+ struct ncclChannel* channel = comm->channels+channelId;
+ struct ncclRing* ring = &channel->ring;
- struct ncclRing* ring = comm->rings+ringid;
// Reorganize ranks to start with rank.
int shift;
for (shift = 0; shift<nranks; shift++) {
@@ -306,21 +387,85 @@ static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int n
for (int i=0; i<nranks; i++) {
ring->userRanks[i] = ringRanks[(i+shift)%nranks];
}
- int prev = ring->userRanks[nranks-1];
- int next = ring->userRanks[1];
+ int prev = ring->prev = ring->userRanks[nranks-1];
+ int next = ring->next = ring->userRanks[1];
+
+ struct ncclTree* tree = &channel->tree;
+ tree->up = -1;
+ tree->down[0] = tree->down[1] = tree->down[2] = -1;
+
+ //
+ // Find per-node masters and connect them via a binary tree
+ //
+
+ int nMasters = 0;
+ for (int r=0; r<nranks; r++) nMasters += treeMasters[r];
+ if (nMasters == 0) {
+ nMasters = 1;
+ treeMasters[0] = 1;
+ }
+
+ if (comm->treeThreshold == -2)
+ NCCLCHECK(ncclTreeThreshold(nMasters, comm->nRanks, comm->nChannels, &comm->treeThreshold));
+
+ if (comm->treeThreshold > 0) {
+ // Compute tree depth. Not an exact value but a good approximation in most
+ // cases and consistent across nodes
+ tree->depth = nranks/nMasters + log2(nMasters);
+
+ // Find my master: go backwards in the ring to find my root
+ int master = 0;
+ for (int i = 0; i<nranks; i++) {
+ int r = ring->userRanks[(nranks-i)%nranks];
+ if (treeMasters[r]) {
+ master = r;
+ break;
+ }
+ }
- NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+0, &ring->recv.transport, ring));
- NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+1, &ring->send.transport, ring));
- NCCLCHECK(transportCreateProxy(0, ring, comm));
- NCCLCHECK(transportCreateProxy(1, ring, comm));
+ int ranks[nMasters];
+ int i = 0, masterIndex = -1;
+ // Build binary tree
+ for (int r=0; r<nranks; r++) {
+ // Create index table
+ if (r == master) masterIndex = i;
+ if (treeMasters[r]) ranks[i++] = r;
+ }
+ int btreeUp, btreeDown0, btreeDown1;
+ int u0, d0_0, d0_1, u1, d1_0, d1_1;
+ NCCLCHECK(ncclGetDtree(nMasters, masterIndex, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
+ if (channelId < DIVUP(comm->nChannels, 2)) {
+ btreeUp = u0; btreeDown0 = d0_0; btreeDown1 = d0_1;
+ } else {
+ btreeUp = u1; btreeDown0 = d1_0; btreeDown1 = d1_1;
+ }
+
+ //
+ // Now build the full tree, combining the intra-node ring and the
+ // inter-node binary tree.
+ //
+
+ if (rank == master) {
+ int nDown = 0;
+ if (btreeUp != -1) tree->up = ranks[btreeUp];
+ if (treeMasters[next] == 0) tree->down[nDown++] = next;
+ if (btreeDown0 != -1) tree->down[nDown++] = ranks[btreeDown0];
+ if (btreeDown1 != -1) tree->down[nDown++] = ranks[btreeDown1];
+ } else {
+ tree->up = prev;
+ if (treeMasters[next] == 0) tree->down[0] = next;
+ }
+ }
+
+ TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
return ncclSuccess;
}
-static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
+static ncclResult_t fillConnect(struct ncclPeerInfo* peerInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
for (int r=0; r<nranks; r++) {
connectTransport[r] = -1;
for (int t=0; t<NTRANSPORTS; t++) {
- NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, allInfo[rank].tinfo+t, allInfo[r].tinfo+t));
+ NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, peerInfo+rank, peerInfo+r));
if (connectValue[r] > 0) {
connectTransport[r] = t;
break;
@@ -330,11 +475,6 @@ static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank,
return ncclSuccess;
}
-static void swap(void* mem1, void* mem2, int size) {
- char tmp[size];
- memcpy(tmp, mem1, size); memcpy(mem1, mem2, size); memcpy(mem2, tmp, size);
-}
-
#define MAXWIDTH 20
#define PREFIXLEN 15
#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
@@ -380,9 +520,9 @@ void dumpLine(int* values, int nranks, const char* prefix) {
static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
for (int r=0; r<nrings; r++) {
char prefix[30];
- /*sprintf(prefix, "[%d] Ring %d Prev : ", rank, r);
+ /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
dumpLine(prev+r*nranks, nranks, prefix);
- sprintf(prefix, "[%d] Ring %d Next : ", rank, r);
+ sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
dumpLine(next+r*nranks, nranks, prefix);*/
int current = rank;
@@ -390,7 +530,7 @@ static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int
rings[r*nranks+i] = current;
current = next[r*nranks+current];
}
- sprintf(prefix, "Ring %02d : ", r);
+ sprintf(prefix, "Channel %02d : ", r);
if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
if (current != rank) {
WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
@@ -488,140 +628,274 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
return ncclSuccess;
}
+static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
+ TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
+ uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
+ struct ncclConnect connect;
+ struct ncclConnector* conn;
+ for (int i=0; i<nrecv; i++) {
+ int peer = peerRecv[i];
+ if (peer == -1) continue;
+ conn = &channel->peers[peer].recv;
+ if (conn->connected) { ++nSkippedRecv; continue; }
+ NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
+ NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ }
+ for (int i=0; i<nsend; i++) {
+ int peer = peerSend[i];
+ if (peer == -1) continue;
+ conn = &channel->peers[peer].send;
+ if (conn->connected) { ++nSkippedSend; continue; }
+ NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
+ NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ }
+ for (int i=0; i<nsend; i++) {
+ int peer = peerSend[i];
+ if (peer == -1) continue;
+ conn = &channel->peers[peer].send;
+ if (conn->connected) {++nSkippedSend; continue; }
+ NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ NCCLCHECK(conn->transportComm->connect(&connect, conn));
+ conn->connected = 1;
+ }
+ for (int i=0; i<nrecv; i++) {
+ int peer = peerRecv[i];
+ if (peer == -1) continue;
+ conn = &channel->peers[peer].recv;
+ if (conn->connected) {++nSkippedRecv; continue; }
+ NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ NCCLCHECK(conn->transportComm->connect(&connect, conn));
+ conn->connected = 1;
+ }
+ TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
+ return ncclSuccess;
+}
+
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
+ // We use 3 AllGathers
+ // 1. { peerInfo, comm }
+ // 2. ConnectTransport[nranks], ConnectValue[nranks]
+ // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
+
int rank = comm->rank;
int nranks = comm->nRanks;
- void* commState;
- NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState));
+ TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
+ NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
- struct ncclInfo* allInfo;
- NCCLCHECK(ncclCalloc(&allInfo, nranks));
- NCCLCHECK(fillInfo(allInfo+rank, rank));
- NCCLCHECK(bootstrapAllGather(commState, allInfo, sizeof(struct ncclInfo)));
+ // AllGather1 - begin
+ struct {
+ struct ncclPeerInfo peerInfo;
+ struct ncclComm* comm;
+ } *allGather1Data;
+
+ NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
+ allGather1Data[rank].comm = comm;
+ NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
+ NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
+
+ NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
+ for (int i = 0; i < nranks; i++) {
+ memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
+ }
+ // AllGather1 data is used again below
+ // AllGather1 - end
+
+ // AllGather2 - begin
+ size_t allGather2DataRowSize = sizeof(int)*nranks + sizeof(ncclTvalue_t)*nranks;
+ void *allGather2Data;
+ NCCLCHECK(ncclCalloc((char **)&allGather2Data, allGather2DataRowSize*nranks));
+ int *myTransportRow = (int *)((char *)allGather2Data + allGather2DataRowSize*rank);
+ ncclTvalue_t *myValueRow = (ncclTvalue_t *)(myTransportRow + nranks);
+
+ NCCLCHECK(fillConnect(comm->peerInfo, nranks, rank, myTransportRow, myValueRow));
+ NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather2Data, allGather2DataRowSize));
int* connectTransport;
ncclTvalue_t* connectValue;
NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
+ for (int i = 0; i < nranks; i++) {
+ memcpy(connectTransport + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize, sizeof(int)*nranks);
+ memcpy(connectValue + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize + nranks*sizeof(int), sizeof(ncclTvalue_t)*nranks);
+ }
+ free(allGather2Data);
+ // AllGather2 - end
- NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
- NCCLCHECK(bootstrapAllGather(commState, connectTransport, nranks*(sizeof(int))));
- NCCLCHECK(bootstrapAllGather(commState, connectValue, nranks*(sizeof(ncclTvalue_t))));
//if (rank == 0) dumpMatrix(connectTransport, nranks);
//if (rank == 0) dumpMatrixTvalue(connectValue, nranks);
// Get my rings
int nrings;
- int* prev, *next;
- NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
- NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
+ int* prev, *next, *treeIn, *treeOut;
+ NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
comm->nThreads = getDefaultThreads();
- NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next));
+ NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
+ TRACE(NCCL_INIT, "rank %d nranks %d - BUILD %d RINGS", rank, nranks, nrings);
+ assert(nrings <= MAXCHANNELS);
free(connectTransport);
free(connectValue);
+ // AllGather3 - begin
+ struct {
+ int nThreads;
+ int nrings;
+ int cudaCompCap;
+ int prev[MAXCHANNELS];
+ int next[MAXCHANNELS];
+ } *allGather3Data;
+
+ NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
+ allGather3Data[rank].nThreads = comm->nThreads;
+ allGather3Data[rank].nrings = nrings;
+ allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
+ for (int r=0; r<nrings; r++) {
+ allGather3Data[rank].prev[r] = *(prev+r*nranks+rank);
+ allGather3Data[rank].next[r] = *(next+r*nranks+rank);
+ }
+ NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
+
// Find max nThreads
- int allData[nranks];
- allData[rank] = comm->nThreads;
- NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++)
- comm->nThreads = std::max(allData[i], comm->nThreads);
- if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads);
+ comm->nThreads = std::max(allGather3Data[i].nThreads, comm->nThreads);
// Determine the minimum CUDA Compute capability of all GPUs
- int myCompCap = ncclCudaCompCap();
+ int myCompCap = allGather3Data[rank].cudaCompCap;
int minCompCap = myCompCap;
- allData[rank] = myCompCap;
- NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
- for (int i=0; i<nranks; i++)
- minCompCap = std::min(allData[i], minCompCap);
- if (rank == 0) INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
+ for (int i = 0; i < nranks; i++)
+ minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
+
+ // Determine thread threshold across all GPUs
+ int nnodes = 0;
+ for (int r=0; r<nranks; r++) nnodes += treeIn[r];
+ comm->threadThreshold = ncclThreadThreshold(minCompCap, nnodes);
// Find min nrings across ranks
- allData[rank] = nrings;
- NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++)
- nrings = std::min(allData[i], nrings);
-
- // Exchange data with others to build complete rings
- comm->nRings = nrings;
- for (int r=0; r<nrings; r++) {
- NCCLCHECK(bootstrapAllGather(commState, prev+r*nranks, sizeof(int)));
- NCCLCHECK(bootstrapAllGather(commState, next+r*nranks, sizeof(int)));
+ nrings = std::min(allGather3Data[i].nrings, nrings);
+ comm->nChannels = nrings;
+
+ // Unpack the per ring prev/next arrays
+ for (int i = 0; i < nranks; i++) {
+ for (int r = 0; r < nrings; r++) {
+ prev[r*nranks+i] = allGather3Data[i].prev[r];
+ next[r*nranks+i] = allGather3Data[i].next[r];
+ }
}
+ free(allGather3Data);
+ // AllGather3 - end
+
int *rings;
- NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
+ NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
free(prev);
free(next);
+ TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d RINGS", rank, nranks, nrings);
// Connect with prev/next for each ring
- struct ncclConnect *connectData;
- NCCLCHECK(ncclCalloc(&connectData, 2*nranks));
+ struct ncclConnect *connect;
+ NCCLCHECK(ncclCalloc(&connect, 2));
for (int r=0; r<nrings; r++) {
- int* ringRanks = rings+r*nranks;
- struct ncclRing *ring = comm->rings+r;
- NCCLCHECK(setupRing(comm, r, rank, nranks, ringRanks, allInfo, connectData+2*rank));
- int prev_offset = ring->userRanks[nranks-1]*2+1;
- int next_offset = ring->userRanks[1]*2;
- NCCLCHECK(bootstrapAllGather(commState, connectData, sizeof(struct ncclConnect)*2));
- NCCLCHECK(ring->send.transport->send.connect(connectData+next_offset, &ring->send));
- NCCLCHECK(ring->recv.transport->recv.connect(connectData+prev_offset, &ring->recv));
- }
- free(connectData);
- free(rings);
- free(allInfo);
+ struct ncclChannel* channel = comm->channels+r;
+ NCCLCHECK(setupChannel(comm, r, rank, nranks, rings+r*nranks, treeIn+r*nranks));
+ NCCLCHECK(p2pSetup(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
+ NCCLCHECK(p2pSetup(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up));
+ NCCLCHECK(p2pSetup(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down));
+ }
+ if (comm->treeThreshold > 0) {
+ char line[1024];
+ line[0]='\0';
+ for (int c=0; c<nrings; c++) {
+ struct ncclTree* tree = &comm->channels[c].tree;
+ snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d/%d/%d",
+ c, tree->up, rank, tree->down[0], tree->down[1], tree->down[2]);
+ }
+ line[1023] = '\0';
+ INFO(NCCL_INIT, "Trees%s", line);
+ }
+ if (rank == 0) {
+ char treeline[64];
+ snprintf(treeline, 64, "enabled up to size %ld", comm->treeThreshold);
+ INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees %s", comm->nThreads, minCompCap,
+ comm->treeThreshold == 0 ? "disabled" :
+ comm->treeThreshold == 0x7fffffffffffffff ? "enabled for all sizes" :
+ treeline);
+ }
- // Intra-process barrier setup
- struct rankInfo {
- uint64_t hostHash;
- uint64_t pidHash;
- struct ncclComm* comm;
- } rankInfos[nranks];
- rankInfos[rank].hostHash = getHostHash();
- rankInfos[rank].pidHash = getPidHash();
- rankInfos[rank].comm = comm;
- NCCLCHECK(bootstrapAllGather(commState, rankInfos, sizeof(struct rankInfo)));
+ TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, nrings);
+ free(connect);
+ free(rings);
+ free(treeIn);
+ free(treeOut);
- // Compute intra ranks
+ // Compute intra ranks (using AllGather1 data)
int intraRank0 = -1, intraRank = -1, intraRanks = 0;
- int multiNode = 0;
- for (int r=0; r<nranks; r++) {
- if ((rankInfos[r].hostHash == rankInfos[rank].hostHash) &&
- (rankInfos[r].pidHash == rankInfos[rank].pidHash)) {
- if (intraRanks == 0) intraRank0 = r;
- if (r == rank) intraRank = intraRanks;
+ for (int i = 0; i < nranks; i++) {
+ if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
+ (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
+ if (intraRanks == 0) intraRank0 = i;
+ if (i == rank) intraRank = intraRanks;
intraRanks++;
- } else if (rankInfos[r].hostHash != rankInfos[rank].hostHash) {
- multiNode = 1;
}
}
TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
- rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
- if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
+ rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+ if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
- rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
+ rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
return ncclInternalError;
}
- NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, rankInfos[intraRank0].comm));
+ NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
- // Determine thread threshold across all GPUs
- comm->threadThreshold = ncclThreadThreshold(minCompCap, multiNode);
+ // Done with AllGather1 data
+ free(allGather1Data);
- // Barrier
- bootstrapClose(commState);
+ if (nnodes) NCCLCHECK(transportCreateProxy(comm));
+
+ TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
return ncclSuccess;
}
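The rewritten init path above consolidates the exchanges into three fixed-layout bootstrapAllGather calls. The third one packs only each rank's own column of the per-channel prev/next tables into its struct before gathering, then scatters the gathered structs back into full row-major tables. A minimal, self-contained check of that pack/unpack indexing follows; the gather itself is implicit because everything lives in one process, and the table values are made up.

/* Round-trip check of the AllGather3 pack/unpack indexing used above. */
#include <stdio.h>

#define NRANKS 4
#define NRINGS 2

struct ag { int prev[NRINGS]; int next[NRINGS]; };

int main(void) {
  /* Full tables, as every rank would compute them locally. */
  int prev[NRINGS*NRANKS] = { 3,0,1,2,  1,2,3,0 };
  int next[NRINGS*NRANKS] = { 1,2,3,0,  3,0,1,2 };
  struct ag data[NRANKS];
  for (int rank = 0; rank < NRANKS; rank++)        /* pack: my column only   */
    for (int r = 0; r < NRINGS; r++) {
      data[rank].prev[r] = prev[r*NRANKS + rank];
      data[rank].next[r] = next[r*NRANKS + rank];
    }
  int uprev[NRINGS*NRANKS], unext[NRINGS*NRANKS];
  for (int i = 0; i < NRANKS; i++)                 /* unpack after the gather */
    for (int r = 0; r < NRINGS; r++) {
      uprev[r*NRANKS + i] = data[i].prev[r];
      unext[r*NRANKS + i] = data[i].next[r];
    }
  int same = 1;
  for (int k = 0; k < NRINGS*NRANKS; k++)
    if (uprev[k] != prev[k] || unext[k] != next[k]) same = 0;
  printf("round trip %s\n", same ? "matches" : "differs");
  return 0;
}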
-bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) {
- char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- if (cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != cudaSuccess) return false;
- if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false;
- if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) {
- WARN("Failed to set CPU affinity");
- return false;
+static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
+ CPU_ZERO_S(sizeof(cpu_set_t), mask);
+ char* cudaPath;
+ NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+ char path[PATH_MAX];
+ strncpy(path, cudaPath, PATH_MAX-1);
+ snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus");
+ path[PATH_MAX-1] = '\0';
+ int fd;
+ SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
+ char affinityStr[sizeof(cpu_set_t)*2];
+ int r = read(fd, affinityStr, sizeof(cpu_set_t)*2);
+ if (r > 0)
+ NCCLCHECK(ncclStrToCpuset(affinityStr, mask));
+ close(fd);
+ free(cudaPath);
+ return ncclSuccess;
+}
+
+static ncclResult_t setCpuAffinity(int cudaDev) {
+  // Work within the envelope we were provided
+ cpu_set_t mask;
+ SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
+
+ // Find the subpart that is local to our GPU
+ cpu_set_t gpuMask;
+ NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
+ cpu_set_t finalMask;
+ CPU_AND(&finalMask, &mask, &gpuMask);
+
+ // If those are not disjoint, try to stay local
+ if (CPU_COUNT(&finalMask)) {
+ char affinityStr[sizeof(cpu_set_t)*2];
+ NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
+ INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr);
+ SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
}
- return true;
+ return ncclSuccess;
}
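getCpuGpuAffinity and setCpuAffinity above read the GPU's local_cpus mask from sysfs, intersect it with the affinity the process already has, and re-pin only when the intersection is non-empty, so an externally imposed CPU mask is never widened. A minimal sketch of that intersection using glibc's CPU_* macros, with the two masks hard-coded instead of coming from sched_getaffinity and sysfs:

/* Sketch only: intersect an existing affinity mask with a GPU-local mask
 * and keep the original affinity when they are disjoint. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void) {
  cpu_set_t process, gpuLocal, final;
  CPU_ZERO(&process);
  CPU_ZERO(&gpuLocal);
  for (int c = 0; c < 8; c++)  CPU_SET(c, &process);   /* what sched_getaffinity returned  */
  for (int c = 4; c < 12; c++) CPU_SET(c, &gpuLocal);  /* what <busid>/local_cpus describes */
  CPU_AND(&final, &process, &gpuLocal);
  if (CPU_COUNT(&final))                               /* stay local only if non-empty */
    printf("would pin to %d GPU-local CPUs\n", CPU_COUNT(&final));
  else
    printf("keeping the original affinity\n");
  return 0;
}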
ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
@@ -633,9 +907,8 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
// Make sure all host memory allocation are close to the GPU
int cudaDev;
- nvmlDevice_t nvmlDevice;
CUDACHECK(cudaGetDevice(&cudaDev));
- SetCpuAffinity(cudaDev, &nvmlDevice);
+ NCCLCHECK(setCpuAffinity(cudaDev));
ncclResult_t res;
NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
@@ -645,7 +918,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
- INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
+ INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->nvmlDev);
return ncclSuccess;
cleanup:
@@ -664,8 +937,6 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
NCCLCHECK(ncclInit());
if (myrank == 0) showVersion();
- INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks);
-
// Make sure the CUDA runtime is initialized.
CUDACHECK(cudaFree(NULL));
@@ -685,7 +956,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
}
static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
- struct ncclInfo* allInfo;
+ struct ncclPeerInfo* allInfo;
NCCLCHECK(ncclCalloc(&allInfo, nranks));
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
@@ -699,12 +970,14 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
for (int rank=0; rank<nranks; rank++)
NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
- int* prev, *prevFinal, *next, *nextFinal;
- NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
- NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXRINGS));
- NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
- NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXRINGS));
- int nrings = MAXRINGS;
+ int* prev, *prevFinal, *next, *nextFinal, *treeIn, *treeOut;
+ NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
+ int nrings = MAXCHANNELS;
int nthreads=0;
int myCompCap = ncclCudaCompCap();
int minCompCap = myCompCap;
@@ -713,7 +986,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
int nringsRank;
int nthreadsRank = getDefaultThreads();
myCompCap = ncclCudaCompCap();
- NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next));
+ NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
nrings = std::min(nrings, nringsRank);
nthreads = std::max(nthreads, nthreadsRank);
minCompCap = std::min(minCompCap, myCompCap);
@@ -728,11 +1001,10 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
free(prev);
free(next);
- INFO(NCCL_INIT,"Using %d threads", nthreads);
- INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
+ INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees disabled", nthreads, minCompCap);
int* rings;
- NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
+ NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
free(prevFinal);
free(nextFinal);
@@ -741,7 +1013,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
int threadThreshold = ncclThreadThreshold(minCompCap, 0);
for (int rank=0; rank<nranks; rank++) {
- comms[rank]->nRings = nrings;
+ comms[rank]->nChannels = nrings;
comms[rank]->nThreads = nthreads;
comms[rank]->threadThreshold = threadThreshold;
}
@@ -751,26 +1023,32 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
int* ringRanks = rings+r*nranks;
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
- NCCLCHECK(setupRing(comms[rank], r, rank, nranks, ringRanks, allInfo, connect+2*rank));
- }
- // RingExchange connect information
- for (int rank=0; rank<nranks; rank++) {
- // Swap rank->prev and prevRank->next
- struct ncclRing *ring = comms[rank]->rings+r;
- int prevRank = ring->userRanks[nranks-1];
- struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1;
- struct ncclConnect* rankPrevConnect = connect+2*rank;
- swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect));
+ struct ncclChannel* channel = comms[rank]->channels+r;
+ struct ncclRing *ring = &channel->ring;
+ NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn));
+ // Make sure we don't use trees, we cannot use them with initAll
+ comms[rank]->treeThreshold = 0;
+ int prev = channel->ring.prev = ring->userRanks[nranks-1];
+ int next = channel->ring.next = ring->userRanks[1];
+ struct ncclConnector* recv = &channel->peers[prev].recv;
+ struct ncclConnector* send = &channel->peers[next].send;
+ NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+rank*2+0, recv, channel->buffSize, channel->id));
+ NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id));
}
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
- struct ncclRing *ring = comms[rank]->rings+r;
- NCCLCHECK(ring->send.transport->send.connect(connect+2*rank+1, &ring->send));
- NCCLCHECK(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv));
+ struct ncclChannel* channel = comms[rank]->channels+r;
+ struct ncclRing *ring = &channel->ring;
+ struct ncclConnector* recv = &channel->peers[ring->prev].recv;
+ struct ncclConnector* send = &channel->peers[ring->next].send;
+ NCCLCHECK(recv->transportComm->connect(connect+ring->prev*2+1, recv));
+ NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
}
}
- free(rings);
free(allInfo);
+ free(rings);
+ free(treeIn);
+ free(treeOut);
return ncclSuccess;
}
@@ -794,7 +1072,6 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
int savedDevice;
int rank, cudaDev;
ncclComm_t comm = NULL;
- nvmlDevice_t nvmlDevice;
int ncclDevList[ndev];
for (int i=0; i<ndev; i++) {
ncclDevList[i] = devlist ? devlist[i] : i;
@@ -812,7 +1089,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
cudaDev = ncclDevList[rank];
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
- SetCpuAffinity(cudaDev, &nvmlDevice);
+ NCCLCHECK(setCpuAffinity(cudaDev));
NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
comms[rank] = comm;
@@ -848,27 +1125,50 @@ final:
return res;
}
-NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
-ncclResult_t ncclCommDestroy(ncclComm_t comm) {
- if (comm == NULL)
- return ncclSuccess;
+static ncclResult_t commDestroy(ncclComm_t comm) {
int savedDevice;
CUDACHECK(cudaGetDevice(&savedDevice));
int commDevice = comm->cudaDev;
+ int rank = comm->rank;
if (savedDevice != commDevice) {
CUDACHECK(cudaSetDevice(commDevice));
}
+ TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError);
+
+ CUDACHECK(cudaStreamSynchronize(comm->groupStream));
+ NCCLCHECK(transportDestroyProxy(comm));
NCCLCHECK(commFree(comm));
if (savedDevice != commDevice)
CUDACHECK(cudaSetDevice(savedDevice));
+ INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
+
return ncclSuccess;
}
+NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
+ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+ if (comm == NULL)
+ return ncclSuccess;
+
+ return commDestroy(comm);
+}
+
+NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
+ncclResult_t ncclCommAbort(ncclComm_t comm) {
+ if (comm == NULL)
+ return ncclSuccess;
+
+ // Ask anything that might still be running on the device to quit
+ *comm->abortFlag = 1;
+
+ return commDestroy(comm);
+}
+
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
const char* ncclGetErrorString(ncclResult_t code) {
switch (code) {
@@ -882,6 +1182,39 @@ const char* ncclGetErrorString(ncclResult_t code) {
}
}
+NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
+ NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
+ NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
+
+ // Check device reported error
+ static ncclDevError_t printedDevErr = ncclDevSuccess;
+ switch(*comm->fatalDevError) {
+ case ncclDevSuccess :
+ break;
+ case ncclDevAssertedMismatch :
+ if (printedDevErr != ncclDevAssertedMismatch) {
+ WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+ printedDevErr = ncclDevAssertedMismatch;
+ }
+ if (comm->fatalError == ncclSuccess) {
+ comm->fatalError = ncclInvalidUsage;
+ }
+ break;
+ case ncclDevSuspectedMismatch :
+ if (printedDevErr != ncclDevSuspectedMismatch) {
+ WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+ printedDevErr = ncclDevSuspectedMismatch;
+ }
+ break;
+ default:
+ WARN("Unknown device error %d", *comm->fatalDevError);
+ return ncclInternalError;
+ }
+ *asyncError = comm->fatalError;
+ return ncclSuccess;
+}
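Together with ncclCommAbort above, ncclCommGetAsyncError gives callers a way to notice errors detected asynchronously (the device-side mismatch checks above, or anything recorded in comm->fatalError) without blocking forever on the stream. One plausible caller-side pattern, shown here purely as an illustration rather than as the prescribed usage:

/* Illustrative helper: poll the communicator while waiting on the stream and
 * abort it if an asynchronous error has been reported. Error handling is
 * abbreviated; a real caller would likely also back off or yield in the loop. */
#include <nccl.h>
#include <cuda_runtime.h>

ncclResult_t waitOrAbort(ncclComm_t comm, cudaStream_t stream) {
  while (cudaStreamQuery(stream) == cudaErrorNotReady) {
    ncclResult_t asyncErr;
    ncclResult_t ret = ncclCommGetAsyncError(comm, &asyncErr);
    if (ret != ncclSuccess) return ret;
    if (asyncErr != ncclSuccess) {
      ncclCommAbort(comm);           /* stop whatever is still running on the device */
      return asyncErr;
    }
  }
  return ncclSuccess;
}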
+
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
diff --git a/src/misc/checks.cu b/src/misc/checks.cu
new file mode 100644
index 0000000..a07e577
--- /dev/null
+++ b/src/misc/checks.cu
@@ -0,0 +1,69 @@
+/*************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "checks.h"
+
+static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
+ cudaPointerAttributes attr;
+ cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
+ if (err != cudaSuccess || attr.devicePointer == NULL) {
+ WARN("%s : %s is not a valid pointer", opname, ptrname);
+ return ncclInvalidArgument;
+ }
+#if CUDART_VERSION >= 10000
+ if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+#else
+ if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+#endif
+ WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
+ return ncclInvalidArgument;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
+ if (ptr == NULL) {
+ WARN("%s : %s argument is NULL", opname, ptrname);
+ return ncclInvalidArgument;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ArgsCheck(struct ncclInfo* info) {
+ NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
+ // First, the easy ones
+ if (info->root < 0 || info->root >= info->comm->nRanks) {
+ WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks);
+ return ncclInvalidArgument;
+ }
+ if (info->datatype < 0 || info->datatype >= ncclNumTypes) {
+ WARN("%s : invalid type %d", info->opName, info->datatype);
+ return ncclInvalidArgument;
+ }
+ // Type is OK, compute nbytes. Convert Allgather/Broadcast calls to chars.
+ info->nBytes = info->count * ncclTypeSize(info->datatype);
+ if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) {
+ info->count = info->nBytes;
+ info->datatype = ncclInt8;
+ }
+ if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
+
+ if (info->op < 0 || info->op >= ncclNumOps) {
+ WARN("%s : invalid reduction operation %d", info->opName, info->op);
+ return ncclInvalidArgument;
+ }
+
+ if (info->comm->checkPointers) {
+ // Check CUDA device pointers
+ if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
+ NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
+ }
+ if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
+ NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
+ }
+ }
+ return ncclSuccess;
+}
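The size normalization in ArgsCheck is easy to check by hand: an AllGather of 1024 floats on 8 ranks first becomes 4096 bytes contributed by the caller, the call is retyped as 4096 ncclInt8 elements, and nBytes is then scaled by nranks to the 32768 bytes that land in each receive buffer. The same arithmetic as a tiny standalone program (the values are illustrative):

#include <stdio.h>

int main(void) {
  size_t count    = 1024;              /* elements passed by the caller            */
  size_t typeSize = 4;                 /* sizeof(float)                            */
  size_t nranks   = 8;
  size_t nBytes   = count * typeSize;  /* 4096: bytes this rank contributes        */
  count = nBytes;                      /* AllGather/Broadcast are retyped as       */
                                       /* ncclInt8, so count becomes a byte count  */
  size_t totalPerRank = nBytes * nranks;  /* 32768 bytes land in each recv buffer  */
  printf("count=%zu int8 elements, %zu bytes per rank after the gather\n",
         count, totalPerRank);
  return 0;
}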
diff --git a/src/misc/enqueue.cu b/src/misc/enqueue.cu
deleted file mode 100644
index 80846dd..0000000
--- a/src/misc/enqueue.cu
+++ /dev/null
@@ -1,248 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "common_coll.h"
-#include "param.h"
-
-#include "collectives/collectives.h"
-
-#define NCCL_FUNC4(coll, op, dtype) \
- (void*)NCCL_KERN_NAME(coll, op, dtype), \
- (void*)NCCL_KERN_NAME(coll##LL, op, dtype)
-
-// Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(coll, op) \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, u8), \
- (void*)NCCL_FUNC4(coll, op, i32), \
- (void*)NCCL_FUNC4(coll, op, u32), \
- (void*)NCCL_FUNC4(coll, op, i64), \
- (void*)NCCL_FUNC4(coll, op, u64), \
- (void*)NCCL_FUNC4(coll, op, f16), \
- (void*)NCCL_FUNC4(coll, op, f32), \
- (void*)NCCL_FUNC4(coll, op, f64)
-#define NCCL_FUNCS3B(coll, op) \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8)
-
-// Must be consistent with ncclRedOp_t
-#define NCCL_FUNCS2A(coll) \
- NCCL_FUNCS3A(coll, sum ), \
- NCCL_FUNCS3A(coll, prod), \
- NCCL_FUNCS3A(coll, max ), \
- NCCL_FUNCS3A(coll, min )
-#define NCCL_FUNCS2B(coll) \
- NCCL_FUNCS3B(coll, copy), \
- NCCL_FUNCS3B(coll, copy), \
- NCCL_FUNCS3B(coll, copy), \
- NCCL_FUNCS3B(coll, copy)
-
-// Must be consistent with the ncclFuncSet enum
-static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
- NCCL_FUNCS2B(ncclBroadcast),
- NCCL_FUNCS2A(ncclReduce),
- NCCL_FUNCS2B(ncclAllGather),
- NCCL_FUNCS2A(ncclReduceScatter),
- NCCL_FUNCS2A(ncclAllReduce)
-};
-
-ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
-#if CUDART_VERSION >= 9000
- if (cgMode & 0x01) {
- CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
- // These flags are to reduce the latency of using this API
- cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync));
- return ncclSuccess;
- }
-#endif
- int savedDev;
- CUDACHECK(cudaGetDevice(&savedDev));
- for (int i = 0; i < numDevices; i++) {
- struct cudaLaunchParams* params = paramsList+i;
- CUDACHECK(cudaSetDevice(cudaDevs[i]));
- CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
- }
- CUDACHECK(cudaSetDevice(savedDev));
- return ncclSuccess;
-}
-
-ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
- params->gridDim.x = std::min((int) params->gridDim.x, comm->nRings);
-
- // Set active = 2 for the last operation
- for (int r=0; r<params->gridDim.x; r++) {
- struct ncclRing* ring = comm->rings+r;
- ring->collectives[(ring->collStart+ring->collCount-1)%NCCL_MAX_OPS].active = 2;
- }
-
- // Find the first operation, choose the kernel accordingly and pass it
- // as the first argument.
- struct ncclColl* coll = comm->rings[0].collectives+comm->rings[0].collStart;
- memcpy(&comm->args, coll, sizeof(struct ncclColl));
- // As we pass that coll directly, we can free it immediately.
- coll->active = 0;
-
- params->func = ncclKerns[coll->funcIndex];
- return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
- volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
- int val = *ptr;
- bool done = false;
- while (done == false) {
- if (val >= comm->intraRanks) {
- WARN("Trying to launch too many collectives");
- return ncclInvalidUsage;
- }
- if (val+1 == comm->intraRanks) {
- // Reset the barrier.
- comm->intraBarrier[comm->intraPhase^1] = 0;
- *isLast = 1;
- return ncclSuccess;
- }
- done = __sync_bool_compare_and_swap(ptr, val, val+1);
- val++;
- }
- *isLast = 0;
- return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
- volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
- int val = *ptr;
- if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
- WARN("Trying to launch too many collectives");
- return ncclInternalError;
- }
- return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
- volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
- while (*ptr < comm->intraRanks) pthread_yield();
- comm->intraPhase ^= 1;
- return ncclSuccess;
-}
-
-ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
- if (comm->nRanks == 1) return ncclSuccess;
- struct cudaLaunchParams* params = comm->myParams;
-
- NCCLCHECK(setupLaunch(comm, params));
-
- // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
- if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
- // Enqueue event in user stream
- CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream));
- // Create dependency between user stream and internal NCCL stream
- CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
- params->stream = comm->groupStream;
- } else {
- if (comm->userStream != params->stream) {
- // Stream changed from last call, create dependency against last NCCL kernel launch
- CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
- }
- params->stream = comm->userStream;
- }
-
- int isLast = 0;
- NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
-
- if (isLast) {
- if (comm->launchMode == ncclComm::GROUP) {
- // I'm the last. Launch all operations.
- NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
- }
- NCCLCHECK(ncclCpuBarrierLast(comm));
- }
- return ncclSuccess;
-}
-
-ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
- if (comm->nRanks == 1) return ncclSuccess;
- // We can't print the CG mode before the first barrier happened.
- if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
- *comm->intraCGMode ^= 0x10;
- INFO(NCCL_INIT,"Launch mode %s%s%s",
- comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
- *comm->intraCGMode ? "/CGMD" : "",
- (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
- }
-
- NCCLCHECK(ncclCpuBarrierOut(comm));
-
- struct cudaLaunchParams *params = comm->myParams;
- if (comm->launchMode == ncclComm::PARALLEL) {
- CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
- }
- // Start the network proxies as soon as the kernel has been launched. We can't
- // perform any CUDA call between the two or having a cudaFree between the CUDA
- // launch and the transportStartProxies call could cause a deadlock.
- // Also, starting the proxies after the CUDA launch seems to be better for
- // performance (latency).
- for (int r=0; r<params->gridDim.x; r++) {
- struct ncclRing* ring = comm->rings+r;
- ring->collStart = ring->collFifoTail;
- ring->collCount = 0;
- }
- params->gridDim.x = params->blockDim.x = 0;
- NCCLCHECK(transportStartProxies(comm));
- return ncclSuccess;
-}
-
-ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
- struct cudaLaunchParams *params = comm->myParams;
- // Enqueue event after NCCL kernel
- CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
- // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
- if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
- // Create dependency between NCCL internal stream and user stream
- CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
- }
- comm->userStreamSet = false;
- return ncclSuccess;
-}
-
-ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
- void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
- ncclComm_t comm, cudaStream_t stream) {
- if (comm == NULL) return ncclInvalidArgument;
- // Launch asynchronously if needed
- if (ncclAsyncMode()) {
- ncclResult_t ret = ncclSuccess;
- int savedDev = -1;
- if (comm->checkPointers) {
- CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
- CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, end);
- }
- // Check arguments
- NCCLCHECKGOTO(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName), ret, end);
- // Always register comm even in case of error to make sure ncclGroupEnd
- // cleans it up.
- NCCLCHECK(ncclAsyncColl(comm));
- NCCLCHECKGOTO(func(sendbuff, recvbuff, count, type, op, root, comm, stream), ret, end);
-end:
- if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
- ncclAsyncErrCheck(ret);
- return ret;
- } else {
- NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName));
- NCCLCHECK(func(sendbuff, recvbuff, count, type, op, root, comm, stream));
- NCCLCHECK(ncclBarrierEnqueue(comm));
- NCCLCHECK(ncclBarrierEnqueueWait(comm));
- NCCLCHECK(ncclEnqueueEvents(comm));
- return ncclSuccess;
- }
-}
diff --git a/src/misc/group.cu b/src/misc/group.cu
index 1716a75..c428a22 100644
--- a/src/misc/group.cu
+++ b/src/misc/group.cu
@@ -179,13 +179,13 @@ group_cleanup:
// an atomic operation, we need to cancel all operations.
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclComm* comm = ncclGroupArgs[i].coll.comm;
- for (int r=0; r<comm->nRings; r++) {
- struct ncclRing* ring = comm->rings+r;
- for (int i=0; i<ring->collCount; i++) {
- ring->collectives[(ring->collStart + i)%NCCL_MAX_OPS].active = 0;
+ for (int c=0; c<comm->nChannels; c++) {
+ struct ncclChannel* channel = comm->channels+c;
+ for (int i=0; i<channel->collCount; i++) {
+ channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
}
- ring->collFifoTail = ring->collStart;
- ring->collCount = 0;
+ channel->collFifoTail = channel->collStart;
+ channel->collCount = 0;
}
comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
comm->userStreamSet = false;
diff --git a/src/misc/nvmlwrap.cu b/src/misc/nvmlwrap.cu
index d9407f4..635f332 100644
--- a/src/misc/nvmlwrap.cu
+++ b/src/misc/nvmlwrap.cu
@@ -16,14 +16,14 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
static nvmlReturn_t (*nvmlInternalShutdown)(void);
static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
-static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
+static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
+
ncclResult_t wrapNvmlSymbols(void) {
if (nvmlState == nvmlInitialized)
@@ -70,10 +70,9 @@ ncclResult_t wrapNvmlSymbols(void) {
LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
- LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity);
- LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
+ LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
@@ -86,9 +85,8 @@ teardown:
nvmlInternalShutdown = NULL;
nvmlInternalDeviceGetHandleByPciBusId = NULL;
nvmlInternalDeviceGetIndex = NULL;
- nvmlInternalDeviceSetCpuAffinity = NULL;
- nvmlInternalDeviceClearCpuAffinity = NULL;
nvmlInternalDeviceGetPciInfo = NULL;
+ nvmlInternalDeviceGetMinorNumber = NULL;
nvmlInternalDeviceGetNvLinkState = NULL;
nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
nvmlInternalDeviceGetNvLinkCapability = NULL;
@@ -155,46 +153,28 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
- if (nvmlInternalDeviceSetCpuAffinity == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclInternalError;
- }
- // Workaround : it seems SetCpuAffinity is not thread safe.
- static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
- pthread_mutex_lock(&lock);
- nvmlReturn_t ret = nvmlInternalDeviceSetCpuAffinity(device);
- pthread_mutex_unlock(&lock);
- if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceSetCpuAffinity() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
- if (nvmlInternalInit == NULL) {
+ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
+ if (nvmlInternalDeviceGetPciInfo == NULL) {
WARN("lib wrapper not initialized.");
return ncclInternalError;
}
- nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device);
+ nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceClearCpuAffinity() failed: %s ",
+ WARN("nvmlDeviceGetPciInfo() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
- if (nvmlInternalDeviceGetPciInfo == NULL) {
+ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
+ if (nvmlInternalDeviceGetMinorNumber == NULL) {
WARN("lib wrapper not initialized.");
return ncclInternalError;
}
- nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
+ nvmlReturn_t ret = nvmlInternalDeviceGetMinorNumber(device, minorNumber);
if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceGetPciInfo() failed: %s ",
+ WARN("nvmlDeviceGetMinorNumber() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
@@ -208,8 +188,9 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
}
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
if (ret != NVML_SUCCESS) {
- INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
- nvmlInternalErrorString(ret));
+ if (ret != NVML_ERROR_NOT_SUPPORTED)
+ INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
+ nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
diff --git a/src/misc/rings.cu b/src/misc/rings.cu
index a5d4616..a7b122c 100644
--- a/src/misc/rings.cu
+++ b/src/misc/rings.cu
@@ -160,7 +160,10 @@ static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankTo
while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) {
current[transport] = 0;
transport++;
- if (transport == NTRANSPORTS) { free(p2pConnected); return ncclInternalError; }
+ if (transport == NTRANSPORTS) {
+ WARN("Error : Could not find transport to connect next group\n");
+ free(p2pConnected);
+ return ncclInternalError; }
}
curRank = rank;
current[transport]++;
@@ -179,8 +182,20 @@ ncclResult_t getEnvThreads(int* nthreads) {
return ncclSuccess;
}
+static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) {
+ if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS;
+ for (int r=nrings; r<newNrings; r++) {
+ for (int i=0; i<nranks; i++) {
+ a[r*nranks+i] = a[(r-nrings)*nranks+i];
+ b[r*nranks+i] = b[(r-nrings)*nranks+i];
+ c[r*nranks+i] = c[(r-nrings)*nranks+i];
+ d[r*nranks+i] = d[(r-nrings)*nranks+i];
+ }
+ }
+ return newNrings;
+}
/* Main ring creation function */
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next) {
+ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) {
*nrings = 0;
if (nranks == 1) return ncclSuccess;
@@ -191,6 +206,12 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
if (ret == ncclSuccess && *nrings > 0) {
if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
NCCLCHECK(getEnvThreads(nthreads));
+ for (int r = 0; r<*nrings; r++) {
+ for (int i = 0; i<nranks; i++) {
+ if (transports[i*nranks+prev[i]] == 2) treeIn[i] = 1;
+ if (transports[i*nranks+next[i]] == 2) treeOut[i] = 1;
+ }
+ }
return ncclSuccess;
}
if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
@@ -210,8 +231,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
int minScore = NCCL_MAX_SCORE;
int nringsTmp;
int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
- NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXRINGS));
- NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXRINGS));
+ NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&idxToRank, nranks));
NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
NCCLCHECK(ncclCalloc(&groups, nranks));
@@ -220,8 +241,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
int nThreads;
do {
nThreads = *nthreads;
- for (int i=0; i<nranks*MAXRINGS; i++) prevTmp[i] = nextTmp[i] = -1;
- nringsTmp = MAXRINGS;
+ for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1;
+ nringsTmp = MAXCHANNELS;
// Loop over transports to connect groups
for (int t=NTRANSPORTS-1; t>=0; t--) {
for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;
@@ -282,6 +303,11 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
for (int i=0; i<nidx; i++) {
if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
+ if (t == NTRANSPORTS-1) {
+ // Save node-level masters for trees
+ treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
+ treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
+ }
}
}
//for (int r=0; r<nringsTmp; r++) {
@@ -316,6 +342,15 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
*nthreads = nThreads;
+ /* Duplicate the rings in case of multinode+NVLink */
+ int nnodes = 0;
+ for (int r=0; r<nranks; r++) nnodes += treeIn[r];
+ int nvlink;
+ NCCLCHECK(ncclNvlinkGpu(&nvlink));
+ if (nnodes > 1 && nvlink) {
+ *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut);
+ }
+
if (*nrings == 0) {
WARN("Could not create rings, falling back on simple ring");
*nrings = 1;
@@ -329,9 +364,9 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
minNrings = 0;
}
- if (minNrings > MAXRINGS) {
- if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXRINGS, MAXRINGS);
- minNrings = MAXRINGS;
+ if (minNrings > MAXCHANNELS) {
+ if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS);
+ minNrings = MAXCHANNELS;
}
if (maxNrings > 0 && maxNrings <= *nrings) {
if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
@@ -341,13 +376,7 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
if (minNrings > 0 && minNrings > *nrings) {
if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
- for (int r=*nrings; r<MAXRINGS && r <minNrings; r++) {
- for (int i=0; i<nranks; i++) {
- prev[r*nranks+i] = prev[(r-*nrings)*nranks+i];
- next[r*nranks+i] = next[(r-*nrings)*nranks+i];
- }
- }
- *nrings = minNrings;
+ *nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut);
}
}
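copyRings above (also used for the multi-node NVLink doubling earlier in this file) fills channel slots nrings..newNrings-1 by copying slot r-nrings, clamped to MAXCHANNELS, so both user-requested and automatic duplication always repeat the layout that was already built. A self-contained copy of that loop, trimmed to just the prev/next tables and run on two 4-rank rings so that channels 2 and 3 come out as copies of channels 0 and 1:

/* Standalone copy of the duplication loop above, with only two tables. */
#include <stdio.h>
#define MAXCHANNELS 16

static int copyRings(int nrings, int newNrings, int nranks, int* prev, int* next) {
  if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS;
  for (int r = nrings; r < newNrings; r++)
    for (int i = 0; i < nranks; i++) {
      prev[r*nranks+i] = prev[(r-nrings)*nranks+i];
      next[r*nranks+i] = next[(r-nrings)*nranks+i];
    }
  return newNrings;
}

int main(void) {
  enum { NRANKS = 4 };
  int prev[MAXCHANNELS*NRANKS] = { 3,0,1,2,  1,2,3,0 };  /* two rings, opposite directions */
  int next[MAXCHANNELS*NRANKS] = { 1,2,3,0,  3,0,1,2 };
  int n = copyRings(2, 4, NRANKS, prev, next);
  for (int r = 0; r < n; r++) {
    printf("channel %d next:", r);
    for (int i = 0; i < NRANKS; i++) printf(" %d", next[r*NRANKS+i]);
    printf("\n");
  }
  return 0;
}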
diff --git a/src/misc/trees.cu b/src/misc/trees.cu
new file mode 100644
index 0000000..e53ea0b
--- /dev/null
+++ b/src/misc/trees.cu
@@ -0,0 +1,108 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "net.h"
+#include "param.h"
+
+#define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank)
+
+/* Btree which alternates leaves and nodes.
+ * Assumes root is 0, which conveniently builds a tree on powers of two,
+ * (because we have pow2-1 ranks) which lets us manipulate bits.
+ * Find first non-zero bit, then :
+ * Find the parent :
+ * xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below)
+ * xx11[0] -> xx10[0] (3,7,11 below)
+ * Find the children :
+ * xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13)
+ * xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13)
+ *
+ * Illustration :
+ * 0---------------8
+ * ______/ \______
+ * 4 12
+ * / \ / \
+ * 2 6 10 \
+ * / \ / \ / \ \
+ * 1 3 5 7 9 11 13
+ */
+ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
+ int up, down0, down1;
+ int bit;
+ for (bit=1; bit<nranks; bit<<=1) {
+ if (bit & rank) break;
+ }
+
+ if (rank == 0) {
+ *u = -1;
+ *d0 = nranks > 1 ? bit >> 1 : -1;
+ *d1 = -1;
+ return ncclSuccess;
+ }
+
+ up = (rank ^ bit) | (bit << 1);
+ if (up >= nranks) up = (rank ^ bit);
+ *u = up;
+
+ int lowbit = bit >> 1;
+ // down0 is always within bounds
+ down0 = lowbit == 0 ? -1 : rank-lowbit;
+
+ down1 = lowbit == 0 ? -1 : rank+lowbit;
+ // Make sure down1 is within bounds
+ while (down1 >= nranks) {
+ down1 = lowbit == 0 ? -1 : rank+lowbit;
+ lowbit >>= 1;
+ }
+ *d0 = down0; *d1 = down1;
+
+ return ncclSuccess;
+}
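Working the bit rules above by hand for nranks 14: rank 6 (binary 110) has lowest set bit 2, so its parent is (6^2)|(2<<1) = 4 and its children are 6-1 = 5 and 6+1 = 7; rank 12 (binary 1100, lowest bit 4) gets parent (12^4)|8 = 8 and first child 12-2 = 10, while its second child starts at the out-of-range 14 and shrinks to 13. Both match the illustration; the standalone replica given after ncclGetDtree below can print the full table.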
+
+/* Build a double binary tree. Take the previous tree for the first tree.
+ * For the second tree, we use a mirror tree (if nranks is odd)
+ *
+ * 8---------0---------5
+ * ______/ \______ _____/ \______
+ * 4 12 1 9
+ * / \ / \ / \
+ * 2 6 10 3 7 10
+ * / \ / \ / \ / \ / \ / \
+ * 1 3 5 7 9 11 2 4 6 8 11 12
+ *
+ * or shift it by one rank (if nranks is even)
+ *
+ * 8---------0--------------9
+ * ______/ \ ______/ \
+ * 4 \ 5 \
+ * / \ \ / \ \
+ * 2 6 10 3 7 11
+ * / \ / \ / \ / \ / \ / \
+ * 1 3 5 7 9 11 2 4 6 8 10 1
+ */
+ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* s1, int* d1_0, int* d1_1) {
+ // First tree ... use a btree
+ ncclGetBtree(nranks, rank, s0, d0_0, d0_1);
+ // Second tree ... mirror or shift
+ if (nranks % 2 == 0) {
+ // shift
+ int shiftrank = (rank-1+nranks) % nranks;
+ int u, d0, d1;
+ ncclGetBtree(nranks, shiftrank, &u, &d0, &d1);
+ *s1 = u == -1 ? -1 : (u+1) % nranks;
+ *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
+ *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks;
+ } else {
+ // mirror
+ int u, d0, d1;
+ ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1);
+ *s1 = u == -1 ? -1 : nranks-1-u;
+ *d1_0 = d0 == -1 ? -1 : nranks-1-d0;
+ *d1_1 = d1 == -1 ? -1 : nranks-1-d1;
+ }
+ return ncclSuccess;
+}
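For reference, here is a standalone replica of the two routines above with no NCCL dependencies; it prints both trees of the double binary tree so the output can be checked against the drawings. nranks = 14 exercises the even, shifted case; an odd value takes the mirrored branch.

/* Standalone replica of ncclGetBtree/ncclGetDtree for inspection only. */
#include <stdio.h>

static void btree(int nranks, int rank, int* u, int* d0, int* d1) {
  int bit;
  for (bit = 1; bit < nranks; bit <<= 1)
    if (bit & rank) break;                       /* lowest set bit of rank */
  if (rank == 0) { *u = -1; *d0 = nranks > 1 ? bit >> 1 : -1; *d1 = -1; return; }
  int up = (rank ^ bit) | (bit << 1);            /* flip the bit, set the next one */
  if (up >= nranks) up = rank ^ bit;             /* parent out of range: fall back */
  int lowbit = bit >> 1;
  int down0 = lowbit == 0 ? -1 : rank - lowbit;  /* always in range */
  int down1 = lowbit == 0 ? -1 : rank + lowbit;
  while (down1 >= nranks) {                      /* shrink until in range (or -1) */
    down1 = lowbit == 0 ? -1 : rank + lowbit;
    lowbit >>= 1;
  }
  *u = up; *d0 = down0; *d1 = down1;
}

static void dtree(int nranks, int rank,
                  int* s0, int* d0_0, int* d0_1,
                  int* s1, int* d1_0, int* d1_1) {
  btree(nranks, rank, s0, d0_0, d0_1);
  int u, d0, d1;
  if (nranks % 2 == 0) {                         /* even: shift by one rank */
    btree(nranks, (rank - 1 + nranks) % nranks, &u, &d0, &d1);
    *s1   = u  == -1 ? -1 : (u  + 1) % nranks;
    *d1_0 = d0 == -1 ? -1 : (d0 + 1) % nranks;
    *d1_1 = d1 == -1 ? -1 : (d1 + 1) % nranks;
  } else {                                       /* odd: mirror the ranks */
    btree(nranks, nranks - 1 - rank, &u, &d0, &d1);
    *s1   = u  == -1 ? -1 : nranks - 1 - u;
    *d1_0 = d0 == -1 ? -1 : nranks - 1 - d0;
    *d1_1 = d1 == -1 ? -1 : nranks - 1 - d1;
  }
}

int main(void) {
  int nranks = 14;
  for (int r = 0; r < nranks; r++) {
    int s0, a0, a1, s1, b0, b1;
    dtree(nranks, r, &s0, &a0, &a1, &s1, &b0, &b1);
    printf("rank %2d  tree0: up %2d down %2d %2d   tree1: up %2d down %2d %2d\n",
           r, s0, a0, a1, s1, b0, b1);
  }
  return 0;
}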
diff --git a/src/misc/utils.cu b/src/misc/utils.cu
index d8e3aec..c618e71 100644
--- a/src/misc/utils.cu
+++ b/src/misc/utils.cu
@@ -11,6 +11,24 @@
#include <string.h>
#include <stdarg.h>
+#include "nvmlwrap.h"
+#include "core.h"
+
+// Convert a logical cudaDev index to the NVML device minor number
+ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ nvmlDevice_t nvmlDevice;
+ unsigned int dev;
+ *nvmlDev = -1;
+ CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+ NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice));
+ NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev));
+
+ *nvmlDev = dev;
+
+ return ncclSuccess;
+}
+
ncclResult_t getHostName(char* hostname, int maxlen) {
if (gethostname(hostname, maxlen) != 0) {
strncpy(hostname, "unknown", maxlen);
diff --git a/src/nccl.h.in b/src/nccl.h.in
index 7227625..985274e 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -68,14 +68,24 @@ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
-/* Frees resources associated with communicator object. */
+/* Frees resources associated with communicator object, but waits for any operations
+ * that might still be running on the device. */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
ncclResult_t pncclCommDestroy(ncclComm_t comm);
+/* Frees resources associated with communicator object and aborts any operations
+ * that might still be running on the device. */
+ncclResult_t ncclCommAbort(ncclComm_t comm);
+ncclResult_t pncclCommAbort(ncclComm_t comm);
+
/* Returns a human-readable error message. */
const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);
+/* Checks whether the comm has encountered any asynchronous errors */
+ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+
/* Gets the number of ranks in the communicator clique. */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
diff --git a/src/ring.cu b/src/ring.cu
deleted file mode 100644
index fede793..0000000
--- a/src/ring.cu
+++ /dev/null
@@ -1,70 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "ring.h"
-#include "param.h"
-
-NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
-
-ncclResult_t initRing(struct ncclComm* comm, int ringid) {
- struct ncclRing* ring = comm->rings+ringid;
- ring->id = ringid;
-
- // Setup intermediate buffering
- ring->buffSize = ncclParamBuffsize();
-
- const int sendSize = ring->devMemSendSize = sizeof(struct ncclSendMem);
- struct ncclSendMem* sendMem;
- NCCLCHECK(ncclCudaCalloc((char**)&sendMem, sendSize));
- ring->devMemSend = sendMem;
-
- const int recvSize = ring->devMemRecvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
- struct ncclRecvMem* recvMem;
- NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize));
- ring->devMemRecv = recvMem;
-
- TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
-
- // Pre-configure send/recv pointers. Those are the default, they may change later.
- ring->recv.conn.buff = recvMem->buff;
- ring->recv.conn.llBuff = recvMem->llBuff;
- ring->recv.conn.tail = &recvMem->tail;
- ring->recv.conn.opCount = &recvMem->opCount;
- ring->recv.conn.direct = 0;
- ring->send.conn.head = &sendMem->head;
- ring->send.conn.llHead = &sendMem->llHead;
- ring->send.conn.direct = 0;
- ring->send.conn.llStep = 0;
- ring->send.conn.llLastCleaning = 0;
-
- // Ring index to user rank table.
- NCCLCHECK(ncclCudaCalloc(&ring->devUserRanks, comm->nRanks));
- NCCLCHECK(ncclCalloc(&ring->userRanks, comm->nRanks));
-
- // Per-ring operation list.
- NCCLCHECK(ncclCudaHostAlloc((void**)&ring->collectives, (void**)&ring->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
- return ncclSuccess;
-}
-
-ncclResult_t freeRing(struct ncclRing* ring) {
- // Intermediate buffering
- CUDACHECK(cudaFree(ring->devMemSend));
- CUDACHECK(cudaFree(ring->devMemRecv));
-
- // Index to rank table
- free(ring->userRanks);
- CUDACHECK(cudaFree(ring->devUserRanks));
-
- // Operation list
- NCCLCHECK(ncclCudaHostFree(ring->collectives));
-
- // Free transport proxy resources
- if (ring->send.transportResources) NCCLCHECK(ring->send.transport->send.free(ring->send.transportResources));
- NCCLCHECK(transportDestroyProxy(&ring->send));
- if (ring->recv.transportResources) NCCLCHECK(ring->recv.transport->recv.free(ring->recv.transportResources));
- NCCLCHECK(transportDestroyProxy(&ring->recv));
- return ncclSuccess;
-}
diff --git a/src/transport.cu b/src/transport.cu
index 7c13d5c..1436a5b 100644
--- a/src/transport.cu
+++ b/src/transport.cu
@@ -1,11 +1,10 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
-#include "common_coll.h"
extern struct ncclTransport p2pTransport;
extern struct ncclTransport shmTransport;
@@ -17,74 +16,16 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = {
netTransport,
};
-static void FifoPullArgs(struct transportProxyInfo* info, struct ncclProxyArgs *args) {
- struct ncclProxyArgs *fifoArgs = info->argsFifo + (info->argsFifoHead % TRANSPORT_PROXY_FIFO_SIZE);
- pthread_mutex_lock(&info->mutex);
- while (fifoArgs->active == 0)
- pthread_cond_wait(&info->cond, &info->mutex);
- __sync_synchronize();
- memcpy(args, fifoArgs, sizeof(struct ncclProxyArgs));
- __sync_synchronize();
- fifoArgs->active = 0;
- pthread_cond_signal(&info->cond);
- pthread_mutex_unlock(&info->mutex);
- info->argsFifoHead++;
-}
-
-static struct ncclProxyArgs* FifoGetNextArgs(struct transportProxyInfo* info) {
- if (info == NULL) return NULL;
- struct ncclProxyArgs* fifoArgs = info->argsFifo + (info->argsFifoTail % TRANSPORT_PROXY_FIFO_SIZE);
- pthread_mutex_lock(&info->mutex);
- while (fifoArgs->active == 1)
- pthread_cond_wait(&info->cond, &info->mutex);
- pthread_mutex_unlock(&info->mutex);
- info->argsFifoTail++;
- return fifoArgs;
-}
-
-static void FifoPushArgs(struct transportProxyInfo* info) {
- if (info == NULL) return;
-
- struct ncclProxyArgs* fifoArgs = info->argsFifo + ((info->argsFifoTail-1) % TRANSPORT_PROXY_FIFO_SIZE);
- if (fifoArgs->active == 0) return;
-
- pthread_mutex_lock(&info->mutex);
- pthread_cond_signal(&info->cond);
- pthread_mutex_unlock(&info->mutex);
-}
-
-static void WaitProxyReady(struct transportProxyInfo* info) {
- pthread_mutex_lock(&info->mutex);
- while (info->proxyReady == 0)
- pthread_cond_wait(&info->cond, &info->mutex);
- pthread_mutex_unlock(&info->mutex);
-}
-
-static void SetProxyReady(struct transportProxyInfo* info) {
- pthread_mutex_lock(&info->mutex);
- info->proxyReady = 1;
- pthread_cond_signal(&info->cond);
- pthread_mutex_unlock(&info->mutex);
-}
-
-static void StopProxy(struct transportProxyInfo* info) {
- struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);
- fifoArgs->active = -1;
- FifoPushArgs(info);
-}
-
#define RECV 0
#define SEND 1
-static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks) {
- enum proxyMode mode = proxyPatternMode(pattern);
- if (mode == proxyRing) return true;
+static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
+ if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
/* In chains, one rank does not need a proxy. Let's figure out which one it is */
- int root = proxyPatternRoot(pattern);
// Which index in the reorganized rings should we compare root against */
const int myrank = 0, nextrank = 1, prevrank = nranks-1;
- int index = mode == proxyFrom ?
+ int index = pattern == ncclPatternPipelineFrom ?
/* no recv / no send if root = */
/* bcast */ (type == RECV ? myrank : nextrank ):
/* reduce */ (type == RECV ? prevrank : myrank );
@@ -92,96 +33,216 @@ static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks)
return (root != rank);
}
-static void SaveProxy(struct ncclConnector* connector, struct ncclProxyArgs* args, int needProxy) {
- struct transportProxyInfo* info = connector->proxyInfo;
- if (info == NULL) return;
- struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);
- args->needProxy = needProxy;
- __sync_synchronize();
- memcpy(fifoArgs, args, sizeof(struct ncclProxyArgs));
- __sync_synchronize();
- fifoArgs->active = 1;
+enum { proxyRecv=0, proxySend=1 };
+
+#define PROXYARGS_ALLOCATE_SIZE 32
+struct ncclProxyPool {
+ struct ncclProxyPool *next;
+ struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
+};
+
+ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
+ struct ncclProxyState* state = &comm->proxyState;
+ struct ncclProxyArgs* elem;
+ pthread_mutex_lock(&state->mutex);
+ if (state->pool == NULL) {
+ // Allocate a new pool of elements
+ struct ncclProxyPool* newPool;
+ NCCLCHECK(ncclCalloc(&newPool, 1));
+ struct ncclProxyArgs* newElems = newPool->elems;
+ // Chain newly allocated elements
+ for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
+ if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
+ }
+ // Add them all to the pool list
+ state->pool = newElems;
+ // Save the pool memory block for later resource release
+ newPool->next = state->pools;
+ state->pools = newPool;
+ }
+ elem = state->pool;
+ state->pool = state->pool->next;
+ pthread_mutex_unlock(&state->mutex);
+ elem->next = elem->nextPeer = NULL;
+ *argsptr = elem;
+ return ncclSuccess;
}
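transportAllocateProxyArgs above is a block allocator: when the free list runs dry it calloc's one ncclProxyPool of PROXYARGS_ALLOCATE_SIZE elements, chains them onto state->pool, and records the block on state->pools so teardown can release whole blocks at once. A reduced, self-contained sketch of the same pattern (simplified types, not the NCCL structs):

  #include <pthread.h>
  #include <stdlib.h>

  #define BLOCK_ELEMS 32

  struct elem  { struct elem* next; };
  struct block { struct block* next; struct elem elems[BLOCK_ELEMS]; };

  struct pool {
    pthread_mutex_t mutex;
    struct elem*  freeList;  // elements ready to hand out
    struct block* blocks;    // owning allocations, released at teardown
  };

  static struct elem* poolGet(struct pool* p) {
    pthread_mutex_lock(&p->mutex);
    if (p->freeList == NULL) {
      struct block* b = (struct block*)calloc(1, sizeof(struct block));
      if (b == NULL) { pthread_mutex_unlock(&p->mutex); return NULL; }
      for (int i = 0; i + 1 < BLOCK_ELEMS; i++) b->elems[i].next = b->elems + i + 1;
      p->freeList = b->elems;
      b->next = p->blocks;   // remember the block so poolDestroy() can free it
      p->blocks = b;
    }
    struct elem* e = p->freeList;
    p->freeList = e->next;
    pthread_mutex_unlock(&p->mutex);
    e->next = NULL;
    return e;
  }

  static void poolDestroy(struct pool* p) {
    while (p->blocks) { struct block* b = p->blocks; p->blocks = b->next; free(b); }
  }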
-ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t nbytes, int pattern, struct ncclComm* comm) {
- int llMode, nrings, nthreads;
- ncclGetCollResource(comm, nbytes, &nrings, &nthreads, &llMode);
- nbytes = llMode ? nbytes * 2 : nbytes;
- substeps = llMode ? 1 : substeps;
- subchunks = llMode ? NCCL_LL_CHUNKS : subchunks;
- int buffSize = llMode ? NCCL_LL_BUFF_SIZE : comm->rings[0].buffSize;
-
- int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow
- int nsteps = nstepsPerRound * nrounds * substeps;
- TRACE(NCCL_NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm);
- TRACE(NCCL_NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
- for (int r=0; r<nrings; r++) {
- struct ncclRing* ring = comm->rings+((comm->myParams->gridDim.x+r)%comm->nRings);
- struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 };
- SaveProxy(&ring->recv, &args, NeedProxy(RECV, pattern, ring, comm->nRanks));
- SaveProxy(&ring->send, &args, NeedProxy(SEND, pattern, ring, comm->nRanks));
+static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) {
+ struct ncclComm* comm = connector->comm;
+ struct ncclProxyState* state = &comm->proxyState;
+ pthread_mutex_lock(&state->mutex);
+ if (connector->proxyAppend == NULL) {
+ // Nothing running for that peer. Add to the circular list
+ if (state->ops == NULL) {
+ // Create the list
+ args->next = args;
+ state->ops = args;
+ } else {
+ // Insert element in the list
+ args->next = state->ops->next;
+ state->ops->next = args;
+ }
+ connector->proxyAppend = args;
+ } else {
+ // There is an active operation already for that peer.
+ // Add it to the per-peer list
+ connector->proxyAppend->nextPeer = args;
+ connector->proxyAppend = args;
}
+ pthread_mutex_unlock(&state->mutex);
+}
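ProxyAppend above maintains two levels of queuing: connectors with work form a circular singly-linked list that the proxy thread walks round-robin, and further operations for a connector that already has one in flight are chained off nextPeer, so each peer contributes at most one element to the circle. A compact sketch of that append logic under the same assumptions (simplified types, locking omitted):

  struct op {
    struct op* next;      // circular list of active connectors
    struct op* nextPeer;  // FIFO of pending ops for the same connector
  };

  // 'ring' points at some element of the circular list (NULL when empty);
  // 'peerTail' is the last op queued for this connector (NULL when idle).
  static void append(struct op** ring, struct op** peerTail, struct op* o) {
    o->next = o->nextPeer = NULL;
    if (*peerTail == NULL) {                 // peer idle: insert into the circle
      if (*ring == NULL) { o->next = o; *ring = o; }
      else { o->next = (*ring)->next; (*ring)->next = o; }
    } else {                                 // peer busy: queue behind its last op
      (*peerTail)->nextPeer = o;
    }
    *peerTail = o;
  }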
+
+template <int type>
+static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
+ if (peer < 0) return ncclSuccess;
+
+ struct ncclPeer* peerComm = args->channel->peers+peer;
+ struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
+ if (connector->transportComm->proxy == NULL) return ncclSuccess;
+
+ struct ncclProxyArgs* op;
+ NCCLCHECK(transportAllocateProxyArgs(connector->comm, &op));
+ memcpy(op, args, sizeof(struct ncclProxyArgs));
+ op->connector = connector;
+ op->progress = connector->transportComm->proxy;
+ op->state = ncclProxyOpReady;
+ ProxyAppend(connector, op);
return ncclSuccess;
}
-ncclResult_t transportStartProxies(ncclComm* comm) {
- for (int r=0; r<comm->nRings; r++) {
- FifoPushArgs(comm->rings[r].send.proxyInfo);
- FifoPushArgs(comm->rings[r].recv.proxyInfo);
+ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
+ if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
+ struct ncclRing* ring = &args->channel->ring;
+ if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args));
+ if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxySend>(ring->next, args));
+ }
+ if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
+ // Tree up
+ struct ncclTree* tree = &args->channel->tree;
+ for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
+ NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+ }
+ if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
+ // Tree down
+ struct ncclTree* tree = &args->channel->tree;
+ for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
+ NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
}
- pthread_yield(); // Let other threads run
return ncclSuccess;
}
-void* persistentThread(void *opaqueInfo) {
- struct transportProxyInfo* info = (struct transportProxyInfo*)opaqueInfo;
- // We need to initialize the context before launching any NCCL cuda kernel,
- // otherwise we would create it during the first cudaMemcpyAsync inside the
- // proxy function and that would cause a deadlock
- cudaSetDevice(info->comm->cudaDev);
- // Signal the main thread the context is created and it can proceed.
- SetProxyReady(info);
+void* persistentThread(void *comm_) {
+ struct ncclComm* comm = (struct ncclComm*)comm_;
+ struct ncclProxyState* state = &comm->proxyState;
+ struct ncclProxyArgs* op = NULL;
+ ncclResult_t ret = ncclSuccess;
+ int idle = 1;
+ int idleSpin = 0;
while (1) {
- struct ncclProxyArgs args;
- FifoPullArgs(info, &args);
- if (args.active == -1) {
- // Main thread asked to stop
+ do {
+ if (*comm->abortFlag) return NULL;
+ if (op == NULL) {
+ pthread_mutex_lock(&state->mutex);
+ op = state->ops;
+ if (op == NULL) {
+ if (state->stop) {
+ // No more commands to process and proxy has been requested to stop
+ pthread_mutex_unlock(&state->mutex);
+ return NULL;
+ }
+ pthread_cond_wait(&state->cond, &state->mutex);
+ }
+ pthread_mutex_unlock(&state->mutex);
+ }
+ } while (op == NULL);
+ op->idle = 0;
+ if (op->state != ncclProxyOpNone) ret = op->progress(op);
+ if (ret != ncclSuccess) {
+ comm->fatalError = ret;
+ INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
return NULL;
}
- ncclResult_t res = info->func(&args);
- if (res != ncclSuccess) {
- WARN("%s:%d -> %d [Proxy thread error]", __FILE__, __LINE__, res);
+ idle &= op->idle;
+ pthread_mutex_lock(&state->mutex);
+ if (!idle) idleSpin = 0;
+ struct ncclProxyArgs *next = op->next;
+ if (next->state == ncclProxyOpNone) {
+ struct ncclProxyArgs *freeOp = next;
+ if (next->nextPeer) {
+ // Replace next by its next per-peer element.
+ next = next->nextPeer;
+ if (op != freeOp) {
+ next->next = freeOp->next;
+ op->next = next;
+ } else {
+ next->next = next;
+ }
+ } else {
+ // Remove next from circular list
+ next->connector->proxyAppend = NULL;
+ if (op != freeOp) {
+ next = next->next;
+ op->next = next;
+ } else {
+ next = NULL;
+ }
+ }
+ if (freeOp == state->ops) state->ops = next;
+ freeOp->next = state->pool;
+ state->pool = freeOp;
}
+ op = next;
+ if (op == state->ops) {
+ if (idle == 1) {
+ if (++idleSpin == 10) {
+ sched_yield();
+ idleSpin = 0;
+ }
+ }
+ idle = 1;
+ }
+ pthread_mutex_unlock(&state->mutex);
}
}
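persistentThread above replaces the old per-connector proxy threads with a single worker per communicator: it spins over the circular list, backs off with sched_yield() after repeated idle passes, and blocks on the condition variable when the list is empty until transportStartProxy posts work or transportDestroyProxy sets state->stop. A minimal skeleton of that wake-up/stop handshake, independent of the NCCL structures (the canonical while-wait form is used here rather than the exact code above):

  #include <pthread.h>

  struct workState {
    pthread_mutex_t mutex;
    pthread_cond_t  cond;
    void* work;   // stand-in for the ops list
    int   stop;
  };

  static void* worker(void* arg) {
    struct workState* s = (struct workState*)arg;
    while (1) {
      pthread_mutex_lock(&s->mutex);
      while (s->work == NULL && !s->stop) pthread_cond_wait(&s->cond, &s->mutex);
      if (s->work == NULL && s->stop) { pthread_mutex_unlock(&s->mutex); return NULL; }
      void* w = s->work; s->work = NULL;
      pthread_mutex_unlock(&s->mutex);
      (void)w;  // ... make progress on the operation outside the lock ...
    }
  }

  static void post(struct workState* s, void* w) {  // transportStartProxy analogue
    pthread_mutex_lock(&s->mutex);
    s->work = w;
    pthread_cond_signal(&s->cond);
    pthread_mutex_unlock(&s->mutex);
  }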
-ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm) {
- struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send;
- threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy);
- if (proxyfunc) {
- TRACE(NCCL_NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
- struct transportProxyInfo* info;
- NCCLCHECK(ncclCalloc(&info, 1));
- connector->proxyInfo = info;
- info->comm = comm;
- info->cond = PTHREAD_COND_INITIALIZER;
- info->mutex = PTHREAD_MUTEX_INITIALIZER;
- info->func = proxyfunc;
- info->argsFifoHead = info->argsFifoTail = 0;
- info->proxyReady = 0;
- pthread_create(&connector->proxyInfo->thread, NULL, persistentThread, info);
- // Wait for thread to initialize its CUDA context.
- WaitProxyReady(info);
+ncclResult_t transportStartProxy(struct ncclComm* comm) {
+ pthread_mutex_lock(&comm->proxyState.mutex);
+ if (comm->proxyState.ops != NULL)
+ pthread_cond_signal(&comm->proxyState.cond);
+ pthread_mutex_unlock(&comm->proxyState.mutex);
+ return ncclSuccess;
+}
+
+ncclResult_t transportCreateProxy(struct ncclComm* comm) {
+ if (!comm->proxyThread) {
+ comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
+ comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER;
+ comm->proxyState.ops = NULL;
+ pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
}
return ncclSuccess;
}
-ncclResult_t transportDestroyProxy(struct ncclConnector* connector) {
- if (connector->proxyInfo) {
- StopProxy(connector->proxyInfo);
- pthread_join(connector->proxyInfo->thread, NULL);
- free(connector->proxyInfo);
- connector->proxyInfo = NULL;
+ncclResult_t transportDestroyProxy(struct ncclComm* comm) {
+ struct ncclProxyState* state = &comm->proxyState;
+
+ // Request the proxy to stop and then wake it
+ pthread_mutex_lock(&state->mutex);
+ state->stop = true;
+ pthread_cond_signal(&state->cond);
+ pthread_mutex_unlock(&state->mutex);
+ if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
+
+ // Free off any memory allocated for the proxy arg pools
+ pthread_mutex_lock(&state->mutex);
+ struct ncclProxyState* proxyState = &comm->proxyState;
+ while (proxyState->pools != NULL) {
+ struct ncclProxyPool *next = proxyState->pools->next;
+ free(proxyState->pools);
+ proxyState->pools = next;
}
+ pthread_mutex_unlock(&state->mutex);
+
return ncclSuccess;
}
diff --git a/src/transport/net.cu b/src/transport/net.cu
index 9c366b3..06a6e23 100644
--- a/src/transport/net.cu
+++ b/src/transport/net.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,11 +9,17 @@
#include "nvmlwrap.h"
#include "net.h"
#include "param.h"
-#include "nvlink.h"
+#include "topo.h"
#include <cuda_runtime.h>
#include <assert.h>
#define NET_MAX_IFS 16
+#define NET_MAX_GPUS 32
+
+// Cache GPU-NIC distances to avoid re-computing them
+#define NET_TVALUE_UNKNOWN 0ULL
+static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN };
+static int ncclNetNDev;
// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit)
#define NET_BITS_PER_IF 3
@@ -28,13 +34,9 @@ static ncclTvalue_t getTvalue(short* distances, int ndev) {
}
return tvalue;
}
-
-struct netInfo {
- int rank;
- int ndev;
- ncclTvalue_t tValue;
- short distances[NET_MAX_IFS];
-};
+static int getScore(ncclTvalue_t tvalue, int dev) {
+ return (tvalue >> (dev*NET_BITS_PER_IF)) & NET_BITS_PER_IF_MASK;
+}
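The cache added above stores one small distance score per NIC, packed NET_BITS_PER_IF (3) bits apiece into a 64-bit ncclTvalue_t; getTvalue encodes it and getScore decodes it. A small worked sketch of that bit packing, with local constants standing in for the defines above:

  #include <stdint.h>
  #include <stdio.h>

  #define BITS_PER_IF 3
  #define IF_MASK ((1 << BITS_PER_IF) - 1)   // 0x7

  static uint64_t packScores(const int* scores, int ndev) {
    uint64_t v = 0;
    for (int d = 0; d < ndev; d++) v |= ((uint64_t)(scores[d] & IF_MASK)) << (d * BITS_PER_IF);
    return v;
  }

  static int unpackScore(uint64_t v, int dev) {
    return (int)((v >> (dev * BITS_PER_IF)) & IF_MASK);
  }

  int main() {
    int scores[3] = { 5, 2, 7 };   // higher score = closer NIC, at most 7 with 3 bits
    uint64_t v = packScores(scores, 3);
    printf("dev 1 -> %d\n", unpackScore(v, 1));   // prints 2
    return 0;
  }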
struct netConnectInfo {
ncclNetHandle_t netHandle;
@@ -46,11 +48,13 @@ struct netSendResources {
struct ncclRecvMem* hostRecvMem;
struct ncclSendMem* devHostSendMem;
struct ncclRecvMem* devHostRecvMem;
- struct ncclSendMem* hostDevMem;
int netDev;
int useGdr;
- struct ncclRecvMem* devNetMem;
- uint64_t llStep;
+ int buffSize;
+ void* mhandle;
+ void* llMhandle;
+ struct ncclRecvMem* devRecvMem;
+ uint64_t step;
uint64_t llLastCleaning;
};
@@ -61,50 +65,70 @@ struct netRecvResources {
struct ncclRecvMem* hostRecvMem;
struct ncclSendMem* devHostSendMem;
struct ncclRecvMem* devHostRecvMem;
- struct ncclRecvMem* hostDevMem;
int netDev;
int useGdr;
- uint64_t llStep;
+ int buffSize;
+ void* mhandle;
+ void* llMhandle;
+ struct ncclRecvMem* devRecvMem;
+ uint64_t step;
uint64_t llLastCleaning;
};
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
- struct netInfo* info = (struct netInfo*)opaqueInfo;
- static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large");
- info->rank = rank;
- NCCLCHECK(ncclNetDevices(&info->ndev));
- if (info->ndev == 0) {
+static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
+ char* cudaPath = NULL;
+ char* nicPath = NULL;
+ ncclResult_t err;
+ NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+ err = ncclNetPciPath(dev, &nicPath);
+ *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
+ if (nicPath) free(nicPath);
+ if (cudaPath) free(cudaPath);
+ return ncclSuccess;
+}
+
+static ncclResult_t netDevices(int* ndev, short** distances) {
+ NCCLCHECK(ncclNetDevices(ndev));
+ if (*ndev == 0) {
WARN("Error : Network returned 0 device");
return ncclSystemError;
}
- if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS;
+ if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS;
- // Find distance with current GPU
- int cudaDev;
- cudaGetDevice(&cudaDev);
- char* cudaPath;
- NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+ *distances = (short*)malloc(*ndev*sizeof(short));
+ if (*distances == NULL) return ncclSystemError;
+ // Find distance with current GPU
+ int cudaDev, nvmlDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
char line[1024];
- sprintf(line, "CUDA Dev %d, %s NIC distance : ", cudaDev, ncclNetName());
- for (int d=0; d<info->ndev; d++) {
- char* nicPath;
- ncclResult_t err = ncclNetPciPath(d, &nicPath);
- info->distances[d] = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
- sprintf(line+strlen(line), " %s", pathDists[info->distances[d]]);
- if (err == ncclSuccess) free(nicPath);
+ sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName());
+ for (int d=0; d<*ndev; d++) {
+ NCCLCHECK(netDistance(cudaDev, d, *distances+d));
+ sprintf(line+strlen(line), " %s", pathDists[(*distances)[d]]);
}
INFO(NCCL_INIT|NCCL_NET, "%s", line);
- free(cudaPath);
return ncclSuccess;
}
/* Determine if we can communicate with the peer */
-ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
- struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
- ret[0] = getTvalue(myInfo->distances, myInfo->ndev);
+ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
+ int cudaDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ ret[0] = ncclNetTvalues[cudaDev];
+ if (ret[0] == NET_TVALUE_UNKNOWN) {
+ if (cudaDev >= NET_MAX_GPUS) {
+ WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS);
+ return ncclInternalError;
+ }
+ int nDev;
+ short* distances;
+ NCCLCHECK(netDevices(&nDev, &distances));
+ ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev);
+ ncclNetNDev = nDev;
+ free(distances);
+ }
return ncclSuccess;
}
@@ -196,45 +220,51 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
return ncclSuccess;
}
-int getDev(int ringId, int nDev, short* distances) {
- int minDistance = PATH_SOC;
- for (int d=0; d<nDev; d++) if (distances[d] < minDistance) minDistance = distances[d];
+int getDev(int cudaDev, int ringId) {
+ ncclTvalue_t tvalues = ncclNetTvalues[cudaDev];
+
+ int dev = 0;
+ int maxScore = 0;
+ for (int d=0; d<ncclNetNDev; d++) if (getScore(tvalues,d) > maxScore) maxScore = getScore(tvalues,d);
int skip = ringId+1;
while (skip) {
- for (int d=0; d<nDev; d++) {
- if (distances[d] == minDistance) {
+ for (int d=0; d<ncclNetNDev; d++) {
+ if (getScore(tvalues, d) == maxScore) {
skip--;
- if (skip == 0) return d;
+ if (skip == 0) { dev = d; goto end; }
}
}
}
- return 0;
+end:
+ return dev;
}
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
-static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGdr) {
+static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
*useGdr = 0;
- int cudaDev;
+ int cudaDev, nvmlDev;
CUDACHECK(cudaGetDevice(&cudaDev));
+ NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
if (read) { // For reads (sends) only enable under certain conditions
int gdrReadParam = ncclParamNetGdrRead();
if (gdrReadParam == 0) return ncclSuccess;
- else if (gdrReadParam < 0) { // default : enable only on DGX2
- char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
- int nvlinks = getNumNvlinks(busId);
- if (nvlinks < CONNECT_NVSWITCH || ncclCudaCompCap() < 7) return ncclSuccess;
+ if (gdrReadParam < 0) {
+ int nvlink;
+ NCCLCHECK(ncclNvlinkGpu(&nvlink));
+ if (!nvlink) return ncclSuccess;
}
}
// Check if we are close enough that it makes sense to enable GDR
int netGdrLevel = ncclParamNetGdrLevel();
+ short distance;
+ NCCLCHECK(netDistance(cudaDev, dev, &distance));
if (distance >= netGdrLevel) {
- INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, dev, distance, netGdrLevel);
+ INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel);
return ncclSuccess;
}
@@ -243,51 +273,59 @@ static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGd
NCCLCHECK(ncclNetPtrSupport(dev, &flags));
if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
*useGdr = 1;
- INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d >= %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read);
+ INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read);
return ncclSuccess;
}
/* Determine if we will use this transport for this peer and return connect
* information for this peer */
-ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
struct netSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
- ring->send.transportResources = resources;
+ send->transportResources = resources;
+
+ int cudaDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ resources->netDev = getDev(cudaDev, channelId);
+ NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr));
- struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
- resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
- NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 1, &resources->useGdr));
+ int sendSize = sizeof(struct ncclSendMem);
+ NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
- int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+ int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
if (resources->useGdr) {
- NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size));
+ NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
}
+ NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
+ resources->buffSize = buffSize;
- NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, size));
- NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, size));
-
+ INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev,
+ resources->useGdr ? "/GDRDMA" : "");
return ncclSuccess;
}
-ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
struct netRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
- ring->recv.transportResources = resources;
+ recv->transportResources = resources;
- struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
- resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
- NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 0, &resources->useGdr));
+ int cudaDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ resources->netDev = getDev(cudaDev, channelId);
+ NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr));
int sendSize = sizeof(struct ncclSendMem);
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
- int recvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+ int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+ if (resources->useGdr) {
+ NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+ }
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
+ resources->buffSize = buffSize;
- struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo;
- INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
- resources->useGdr ? "/GDRDMA" : "",
- (resources->hostDevMem != NULL) ? "/GDCopy" : "");
+ INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
+ resources->useGdr ? "/GDRDMA" : "");
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
return ncclSuccess;
@@ -297,27 +335,28 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
// Setup device pointers
struct netSendResources* resources = (struct netSendResources*)send->transportResources;
- if (resources->useGdr) {
- send->conn.buff = resources->devNetMem->buff;
- // We don't use devMem for llMode because the CPU has to read the data
- send->conn.llBuff = resources->devHostRecvMem->llBuff;
- } else {
- send->conn.buff = resources->devHostRecvMem->buff;
- send->conn.llBuff = resources->devHostRecvMem->llBuff;
- }
+ // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
+ struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+ send->conn.buff = recvMem->buff;
+ send->conn.llBuff = resources->devHostRecvMem->llBuff;
+
+ // Head/Tail/Opcount/Fifos are always on host
send->conn.tail = &resources->devHostRecvMem->tail;
- send->conn.opCount = &resources->devHostRecvMem->opCount;
+ send->conn.opCountRem = &resources->devHostRecvMem->opCount;
send->conn.fifo = resources->devHostRecvMem->sizesFifo;
- send->conn.llFifo = resources->devHostRecvMem->llSizesFifo;
-
- if (resources->hostDevMem == NULL) {
- send->conn.head = &resources->devHostSendMem->head;
- send->conn.llHead = &resources->devHostSendMem->llHead;
- }
+ send->conn.head = &resources->devHostSendMem->head;
+ send->conn.opCountLoc = &resources->devHostSendMem->opCount;
+ for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
// Connect to remote peer
struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
+
+ NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->buff, resources->buffSize,
+ resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
+ NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff,
+ NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle));
+
return ncclSuccess;
}
@@ -326,32 +365,37 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
// Setup device pointers
struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
- recv->conn.head = &resources->devHostSendMem->head;
- recv->conn.llHead = &resources->devHostSendMem->llHead;
-
- if (resources->useGdr == 0) {
- recv->conn.buff = resources->devHostRecvMem->buff;
- recv->conn.llBuff = resources->devHostRecvMem->llBuff;
- }
+ // Intermediate buffering on GPU for GPU Direct RDMA
+ struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+ recv->conn.buff = recvMem->buff;
+ recv->conn.llBuff = recvMem->llBuff;
- if (resources->hostDevMem == NULL) {
- recv->conn.tail = &resources->devHostRecvMem->tail;
- recv->conn.opCount = &resources->devHostRecvMem->opCount;
- }
+ // Head/Tail/Opcount are always on host
+ recv->conn.tail = &resources->devHostRecvMem->tail;
+ recv->conn.opCountLoc = &resources->devHostRecvMem->opCount;
+ recv->conn.head = &resources->devHostSendMem->head;
+ recv->conn.opCountRem = &resources->devHostSendMem->opCount;
- // Finish connection establishment
+ // Finish connection establishment from remote peer
NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
+ NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->buff, resources->buffSize,
+ resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
+ NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE,
+ resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle));
+
return ncclSuccess;
}
ncclResult_t netSendFree(void* transportResources) {
struct netSendResources* resources = (struct netSendResources*)transportResources;
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+ NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle));
+ NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
if (resources->useGdr)
- CUDACHECK(cudaFree(resources->devNetMem));
+ CUDACHECK(cudaFree(resources->devRecvMem));
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
free(resources);
return ncclSuccess;
@@ -360,196 +404,166 @@ ncclResult_t netSendFree(void* transportResources) {
ncclResult_t netRecvFree(void* transportResources) {
struct netRecvResources* resources = (struct netRecvResources*)transportResources;
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+ NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle));
+ NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+ if (resources->useGdr)
+ CUDACHECK(cudaFree(resources->devRecvMem));
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
free(resources);
return ncclSuccess;
}
ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
- struct ncclRing* ring = args->ring;
- struct netSendResources* resources = (struct netSendResources*) (ring->send.transportResources);
- const int llMode = args->llMode;
-
- volatile uint64_t* prevTail = &resources->hostRecvMem->tail;
- struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem;
- uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head;
- struct ncclRecvMem* localMem = resources->useGdr ? resources->devNetMem : resources->hostRecvMem;
- char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff;
- int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
- volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo;
- int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
- int sliceSize = buffSize / args->substeps;
-
- assert(args->substeps <= SIZES_FIFO_SIZE);
-
- uint64_t head = llMode ? resources->llStep : 0ULL;
- uint64_t tail = llMode ? resources->llStep : 0ULL;
- uint64_t end = head + args->nsteps;
-
- int idle = 0;
- void* requests[args->substeps];
-
- if (!args->needProxy) goto nextColl;
-
- TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
- TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
-
- // Update in case we skipped some collectives
- if (llMode == 0) resources->hostRecvMem->opCount = args->opCount;
-
- while (head < end) {
- idle++;
- if (llMode) {
- if (tail < end && tail < head + args->substeps) {
- int slot = tail%args->substeps;
- int size = sizesFifo[slot];
- if (size != 0) {
- if (size == -1) size = 0;
- uint32_t flag = tail + 1;
- int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
- size = nFifoLines * sizeof(union ncclLLFifoLine);
- union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+slot*sliceSize);
- for (int i=0; i<nFifoLines; i++) {
- volatile uint32_t *f1 = &lines[i].flag1;
- volatile uint32_t *f2 = &lines[i].flag2;
- while (f1[0] != flag || f2[0] != flag);
+ struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources);
+ if (args->state == ncclProxyOpReady) {
+ // Update opCount
+ resources->hostRecvMem->opCount = args->opCount;
+
+ // Round to next multiple of sliceSteps
+ resources->step = ROUNDUP(resources->step, args->chunkSteps);
+ args->head = resources->step;
+ args->tail = resources->step;
+ args->end = args->head + args->nsteps;
+ args->state = ncclProxyOpProgress;
+ }
+ if (args->state == ncclProxyOpProgress) {
+ args->idle = 1;
+ if (args->head < args->end) {
+ if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
+ volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
+ if (args->llMode) {
+ int buffSlot = args->tail%NCCL_STEPS;
+ int size = sizesFifo[buffSlot];
+ if (size != -1) {
+ uint32_t flag = args->tail + 1;
+ int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+ size = nFifoLines * sizeof(union ncclLLFifoLine);
+ union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
+ int ready = 1;
+ for (int i=0; i<nFifoLines; i++) {
+ volatile uint32_t *f1 = &lines[i].flag1;
+ volatile uint32_t *f2 = &lines[i].flag2;
+ if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
+ }
+ if (ready) {
+ NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, resources->llMhandle, args->requests+buffSlot));
+ if (args->requests[buffSlot] != NULL) {
+ sizesFifo[buffSlot] = -1;
+ // Make sure size is reset to -1 before we update the head.
+ __sync_synchronize();
+ args->tail += args->sliceSteps;
+ args->idle = 0;
+ }
+ }
}
- NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, ptrType, requests+slot));
- if (requests[slot] != NULL) {
- sizesFifo[slot] = size;
- tail++;
- idle = 0;
+ } else if (args->tail < resources->hostRecvMem->tail) {
+ struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+ int stepSize = args->channel->buffSize/NCCL_STEPS;
+ // Send through network
+ int buffSlot = args->tail%NCCL_STEPS;
+ NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
+ if (args->requests[buffSlot] != NULL) {
+ sizesFifo[buffSlot] = -1;
+ // Make sure size is reset to -1 before we update the head.
+ __sync_synchronize();
+ args->tail += args->sliceSteps;
+ args->idle = 0;
}
}
}
- } else while (tail < *prevTail) {
- // Send through network
- int slot = tail%args->substeps;
- NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+slot*sliceSize, sizesFifo[slot], ptrType, requests+slot));
- if (requests[slot] != NULL) {
- tail++;
- idle = 0;
- }
- }
- if (head < tail) {
- int done;
- int slot = head%args->substeps;
- NCCLCHECK(ncclNetTest(requests[slot], &done, NULL));
- if (done) {
- if (llMode) {
- sizesFifo[slot] = 0;
- // Make sure size is reset to zero before we update the head.
- __sync_synchronize();
+ if (args->head < args->tail) {
+ int done;
+ int buffSlot = args->head%NCCL_STEPS;
+ NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
+ if (done) {
+ args->head += args->sliceSteps;
+ resources->hostSendMem->head = args->head;
+ args->idle = 0;
}
- head++;
- *prevHead = head;
- idle = 0;
}
}
- if (idle) transportProxyIdle(idle);
+ if (args->head == args->end) {
+ resources->step = args->end;
+ args->idle = 0;
+ args->state = ncclProxyOpDone;
+ }
}
-
- // Reset
- if (llMode == 0) *prevTail = 0;
-
-nextColl:
- if (llMode) {
- resources->llStep += args->nsteps;
- // Don't forget to ack otherwise the GPU won't be able to push data.
- *prevHead = resources->llStep;
- if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- memset(localBuff, 0, NCCL_LL_BUFF_SIZE);
- resources->llStep += NCCL_LL_CHUNKS;
- *prevHead = resources->llStep;
- resources->llLastCleaning = resources->llStep;
+ if (args->state == ncclProxyOpDone) {
+ union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff;
+ if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+ for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step;
+ resources->step += NCCL_STEPS;
+ resources->hostSendMem->head = resources->step;
+ resources->llLastCleaning = resources->step;
}
+ args->state = ncclProxyOpNone;
}
return ncclSuccess;
}
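netSendProxy above and netRecvProxy below now track each operation with three step counters (head, tail, end) over a window of NCCL_STEPS buffer slots: tail advances when a slot is handed to the network, head advances when the corresponding request completes, and tail never runs more than NCCL_STEPS ahead of head. A toy model of that sliding window, with a trivially-completing stand-in for ncclNetIsend/ncclNetTest:

  #include <stdint.h>
  #include <stdio.h>

  #define STEPS 8   // stand-in for NCCL_STEPS

  int main() {
    uint64_t head = 0, tail = 0, end = 20;   // 20 steps of data to move
    int inflight[STEPS] = { 0 };
    while (head < end) {
      if (tail < end && tail < head + STEPS) {      // post into a free slot
        inflight[tail % STEPS] = 1;
        tail++;
      }
      if (head < tail && inflight[head % STEPS]) {  // oldest request "completes"
        inflight[head % STEPS] = 0;
        head++;
      }
    }
    printf("done: head=%llu tail=%llu\n", (unsigned long long)head, (unsigned long long)tail);
    return 0;
  }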
ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
- struct ncclRing* ring = args->ring;
- struct netRecvResources* resources = (struct netRecvResources*) (ring->recv.transportResources);
- int llMode = args->llMode;
-
- volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head;
- struct ncclRecvMem* localMem = resources->useGdr ? ring->devMemRecv : resources->hostRecvMem;
- char* localBuff = llMode ? localMem->llBuff : localMem->buff;
- char* nextBuff = (resources->useGdr == 0 && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
- int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
- uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail;
-
- int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
- int sliceSize = buffSize / args->substeps;
-
- uint64_t head = llMode ? resources->llStep : 0ULL;
- uint64_t tail = llMode ? resources->llStep : 0ULL;
- uint64_t end = head + args->nsteps;
-
- int idle = 0;
- void* requests[args->substeps];
-
- if (!args->needProxy) goto nextColl;
-
- TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
- TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
-
- if (llMode == 0) {
- // Waiting for next opCount is only needed before writing nextTail.
- uint64_t* nextOpCount = resources->hostDevMem ? &resources->hostDevMem->opCount : &resources->hostRecvMem->opCount;
- transportProxyWait([=] { return *nextOpCount >= args->opCount; });
+ struct netRecvResources* resources = (struct netRecvResources*) (args->connector->transportResources);
+ if (args->state == ncclProxyOpReady) {
+ // Update opCount
+ resources->hostSendMem->opCount = args->opCount;
+
+ // Round to next multiple of sliceSteps
+ resources->step = ROUNDUP(resources->step, args->chunkSteps);
+ args->head = resources->step;
+ args->tail = resources->step;
+ args->end = args->head + args->nsteps;
+ args->state = ncclProxyOpProgress;
}
-
- while (head < end) {
- idle++;
- if ((tail < head + args->substeps) && (tail < *nextHead + args->substeps) && (tail < end)) {
- int slot = tail%args->substeps;
- NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+slot*sliceSize, sliceSize, ptrType, requests+slot));
- if (requests[slot] != NULL) {
- tail++;
- idle = 0;
+ if (args->state == ncclProxyOpProgress) {
+ args->idle = 1;
+ int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
+ if (args->head < args->end) {
+ struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+ char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
+ void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
+ if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
+ int buffSlot = args->tail%NCCL_STEPS;
+ int sliceSize = stepSize * args->sliceSteps;
+ NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
+ if (args->requests[buffSlot] != NULL) {
+ args->tail += args->sliceSteps;
+ args->idle = 0;
+ }
}
- }
- if (tail > head) {
- int done;
- int slot = head%args->substeps;
- int size;
- NCCLCHECK(ncclNetTest(requests[slot], &done, &size));
- if (done) {
- if (nextBuff) memcpy(nextBuff+slot*sliceSize, localBuff+slot*sliceSize, size);
- head++;
- if (llMode == 0) {
- if (ptrType == NCCL_PTR_CUDA) ncclNetFlush(resources->netRecvComm, localBuff+slot*sliceSize, size);
- *nextTail = head;
+ if (args->tail > args->head) {
+ int buffSlot = args->head%NCCL_STEPS;
+ int done, size;
+ NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
+ if (done) {
+ args->head += args->sliceSteps;
+ if (args->llMode == 0) {
+ if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
+ resources->hostRecvMem->tail = args->head;
+ }
+ args->idle = 0;
}
- idle = 0;
}
}
- if (idle) transportProxyIdle(idle);
- }
-
- // Wait for last ack and reset
- if (llMode == 0) {
- transportProxyWait([=] { return *nextHead == head; });
- *nextHead = 0;
+ if (args->head == args->end) {
+ resources->step = args->end;
+ args->idle = 0;
+ args->state = ncclProxyOpDone;
+ }
}
-
-nextColl:
- if (llMode) {
- resources->llStep += args->nsteps;
- if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- resources->llStep += NCCL_LL_CHUNKS;
- while (*nextHead < resources->llStep);
- resources->llLastCleaning = resources->llStep;
+ if (args->state == ncclProxyOpDone) {
+ if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+ resources->step += NCCL_STEPS;
+ while (resources->hostSendMem->head < resources->step);
+ resources->llLastCleaning = resources->step;
}
+ args->state = ncclProxyOpNone;
}
return ncclSuccess;
}
struct ncclTransport netTransport = {
"NET",
- netFillInfo,
netCanConnect,
netGetRings,
{ netSendSetup, netSendConnect, netSendFree, netSendProxy },
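The other structural change in net.cu above is that memory registration leaves the per-message path: buffers are registered once with ncclNetRegMr at connect time (NCCL_PTR_CUDA when GPU Direct RDMA is in use, NCCL_PTR_HOST otherwise), the returned mhandle is reused by every ncclNetIsend/ncclNetIrecv/ncclNetFlush, and it is deregistered in the free path. A hedged outline of that register-once lifecycle against a dummy transport (all names below are placeholders, not the NCCL net API):

  #include <stdio.h>
  #include <stdlib.h>

  struct mhandle { char* base; int size; };   // dummy registration record

  static int xportRegMr(void* buf, int size, struct mhandle** out) {
    struct mhandle* h = (struct mhandle*)malloc(sizeof(*h));
    if (h == NULL) return 1;
    h->base = (char*)buf; h->size = size;
    *out = h;
    return 0;
  }
  static int xportDeregMr(struct mhandle* h) { free(h); return 0; }
  static int xportIsend(void* buf, int size, struct mhandle* h) {
    // A real transport would use the registration; here we only sanity-check it.
    if ((char*)buf < h->base || (char*)buf + size > h->base + h->size) return 1;
    printf("send %d bytes with the cached handle\n", size);
    return 0;
  }

  int main() {
    char* buff = (char*)calloc(1, 1 << 16);
    struct mhandle* h = NULL;
    if (xportRegMr(buff, 1 << 16, &h)) return 1;                      // once, at connect time
    for (int i = 0; i < 3; i++) xportIsend(buff + i * 1024, 1024, h); // reuse the handle
    xportDeregMr(h);                                                  // once, at free time
    free(buff);
    return 0;
  }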
diff --git a/src/transport/net_ib.cu b/src/transport/net_ib.cu
index 18e158d..f7c574b 100644
--- a/src/transport/net_ib.cu
+++ b/src/transport/net_ib.cu
@@ -32,6 +32,7 @@ static int ncclNIbDevs = -1;
struct ncclIbDev {
int device;
uint8_t port;
+ uint8_t link;
ibv_context* context;
char devName[MAXNAMESIZE];
};
@@ -97,7 +98,6 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
WARN("NET/IB : No IP interface found.");
return ncclInternalError;
}
- INFO(NCCL_INIT|NCCL_NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
// Detect IB cards
int nIbDevs;
@@ -113,47 +113,59 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
for (int d=0; d<nIbDevs; d++) {
struct ibv_context * context;
- if (ncclSuccess != wrap_ibv_open_device(&context, devices[d])) {
+ if (ncclSuccess != wrap_ibv_open_device(&context, devices[d]) || context == NULL) {
WARN("NET/IB : Unable to open device %s", devices[d]->name);
continue;
}
int found = 0;
- if (context) {
- struct ibv_device_attr devAttr;
- if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
- WARN("NET/IB : Unable to query device %s", devices[d]->name);
+ struct ibv_device_attr devAttr;
+ if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
+ WARN("NET/IB : Unable to query device %s", devices[d]->name);
+ if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
+ continue;
+ }
+ for (int port = 1; port <= devAttr.phys_port_cnt; port++) {
+ struct ibv_port_attr portAttr;
+ if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) {
+ WARN("NET/IB : Unable to query port %d", port);
continue;
}
- for (int port = 1; port <= devAttr.phys_port_cnt; port++) {
- struct ibv_port_attr portAttr;
- if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) {
- WARN("NET/IB : Unable to query port %d", port);
- continue;
- }
- if (portAttr.state != IBV_PORT_ACTIVE) continue;
- if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
- && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
-
- // check against user specified HCAs/ports
- if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
- continue;
- }
- INFO(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
- portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
- ncclIbDevs[ncclNIbDevs].device = d;
- ncclIbDevs[ncclNIbDevs].port = port;
- ncclIbDevs[ncclNIbDevs].context = context;
- strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
- ncclNIbDevs++;
- found++;
- pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
- }
+ if (portAttr.state != IBV_PORT_ACTIVE) continue;
+ if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
+ && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
- if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } }
+ // check against user specified HCAs/ports
+ if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
+ continue;
+ }
+ TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
+ portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+ ncclIbDevs[ncclNIbDevs].device = d;
+ ncclIbDevs[ncclNIbDevs].port = port;
+ ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
+ ncclIbDevs[ncclNIbDevs].context = context;
+ strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
+ ncclNIbDevs++;
+ found++;
+ pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
}
+ if (found == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
}
if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
}
+ if (ncclNIbDevs == 0) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found.");
+ } else {
+ char line[1024];
+ line[0] = '\0';
+ for (int d=0; d<ncclNIbDevs; d++) {
+ snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName,
+ ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+ }
+ line[1023] = '\0';
+ char addrline[1024];
+ INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s ; OOB %s:%s", line, ncclIbIfName, socketToString(&ncclIbIfAddr.sa, addrline));
+ }
pthread_mutex_unlock(&ncclIbLock);
}
return ncclSuccess;
@@ -205,11 +217,12 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
*supportedTypes = NCCL_PTR_HOST;
- int cudaDev;
+ int cudaDev, nvmlDev;
CUDACHECK(cudaGetDevice(&cudaDev));
+ NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
if (ncclIbGdrSupport(dev) != ncclSuccess) {
- INFO(NCCL_INIT|NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (no module or not supported by GPU)", cudaDev, ncclIbDevs[dev].devName);
+ INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d '%s' (no module or not supported by GPU)", cudaDev, nvmlDev, dev, ncclIbDevs[dev].devName);
return ncclSuccess;
}
*supportedTypes |= NCCL_PTR_CUDA;
@@ -242,23 +255,15 @@ struct ncclIbHandle {
union socketAddress connectAddr;
};
-struct ncclIbMr {
- struct ibv_mr* mr;
- int refcnt;
-};
-
struct ncclIbVerbs {
struct ibv_pd* pd;
struct ibv_cq* cq;
- struct ncclIbMr mrPool[MAX_REQUESTS];
- int mrRotation;
};
struct ncclIbRequest {
int used;
int type;
struct ncclIbVerbs* verbs;
- struct ncclIbMr * ibMr;
int done;
int size;
int free;
@@ -278,12 +283,12 @@ struct ncclIbSendFifo {
};
struct ncclIbSendComm {
+ struct ncclIbVerbs verbs;
struct ncclIbSendFifo fifo[MAX_REQUESTS];
struct ncclIbRequest reqs[MAX_REQUESTS];
uint32_t fifoHead;
int fd;
int ready;
- struct ncclIbVerbs verbs;
struct ibv_qp* qp;
struct ibv_mr* fifoMr;
};
@@ -307,11 +312,11 @@ struct ncclIbRemFifo {
};
struct ncclIbRecvComm {
+ struct ncclIbVerbs verbs;
struct ncclIbRemFifo remFifo;
struct ncclIbRequest reqs[MAX_REQUESTS];
int fd;
int ready;
- struct ncclIbVerbs verbs;
struct ibv_qp* qp;
struct ncclIbGpuFlush gpuFlush;
};
@@ -434,13 +439,13 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
// RoCE support
qpInfo.lid = portAttr.lid;
if (qpInfo.lid) { // IB
- INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
+ INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
} else { // RoCE
union ibv_gid gid;
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
qpInfo.spn = gid.global.subnet_prefix;
qpInfo.iid = gid.global.interface_id;
- INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
+ INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
}
NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo)));
@@ -537,7 +542,6 @@ ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest**
r->used = 1;
r->type = 0;
r->verbs = NULL;
- r->ibMr = NULL;
r->done = 0;
r->size = -1;
r->free = 0;
@@ -583,57 +587,34 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size);
#define REG_ALIGN (4096)
-// Cache previous MRs to avoid registering/unregistering for each Isend/Irecv
-ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct ncclIbMr** mrRet) {
+ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+ struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
uint64_t addr = (uint64_t)data;
- int elem = -1;
assert(size > 0);
- // Look for an already existing MR
- for (int i=0; i<MAX_REQUESTS; i++) {
- if (verbs->mrPool[i].mr == NULL) continue;
- uint64_t regAddr = (uint64_t)verbs->mrPool[i].mr->addr;
- uint64_t regSize = (uint64_t)verbs->mrPool[i].mr->length;
- if (regAddr <= addr && addr+size <= regAddr+regSize) {
- *mrRet = verbs->mrPool+i;
- verbs->mrPool[i].refcnt++;
- return ncclSuccess;
- }
- }
-
- // Find an unused element
- if (elem == -1) {
- elem = (verbs->mrRotation++);
- for (int i=0; i<MAX_REQUESTS; i++) {
- elem %= MAX_REQUESTS;
- if (verbs->mrPool[elem].refcnt > 0) elem++; else break;
- }
- if (verbs->mrPool[elem].refcnt > 0) {
- WARN("NET/IB : memory register : no MR available");
- return ncclInternalError;
- }
- }
-
- assert(elem < MAX_REQUESTS);
- assert(verbs->mrPool[elem].refcnt == 0);
-
// Deregister / register
uint64_t regAddr = addr & (~(REG_ALIGN-1));
uint64_t regSize = addr+size - regAddr;
regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN;
- if (verbs->mrPool[elem].mr) NCCLCHECK(wrap_ibv_dereg_mr(verbs->mrPool[elem].mr));
- NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
- *mrRet = verbs->mrPool+elem;
- verbs->mrPool[elem].refcnt++;
- TRACE(NCCL_INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
+ struct ibv_mr* mr;
+ NCCLCHECK(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+ *mhandle = (void*)mr;
+ TRACE(NCCL_INIT,"regAddr %lx size %ld rkey %x", regAddr, regSize, mr->rkey);
return ncclSuccess;
}
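ncclIbRegMr above widens the requested range to whole REG_ALIGN (4096-byte) pages: the start address is rounded down and the length rounded up before the region is registered with wrap_ibv_reg_mr. A short check of that alignment arithmetic:

  #include <stdint.h>
  #include <stdio.h>

  #define REG_ALIGN 4096ULL

  int main() {
    uint64_t addr = 0x12345678ULL, size = 10000;
    uint64_t regAddr = addr & ~(REG_ALIGN - 1);                                               // round start down
    uint64_t regSize = ((addr + size - regAddr) + REG_ALIGN - 1) / REG_ALIGN * REG_ALIGN;     // round length up
    // 0x12345678 -> 0x12345000; offset 0x678 (1656) + 10000 = 11656 -> 12288 (3 pages)
    printf("regAddr=0x%llx regSize=%llu\n", (unsigned long long)regAddr, (unsigned long long)regSize);
    return 0;
  }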
-ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** request) {
+ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
+ NCCLCHECK(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm));
if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
+ struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+
// Wait for the receiver to have posted the corresponding receive
volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS);
volatile uint32_t * readyPtr = &slot->ready;
@@ -641,7 +622,6 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void**
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
- req->type = type;
req->verbs = &comm->verbs;
req->size = size;
@@ -654,8 +634,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void**
wr.sg_list = NULL;
wr.num_sge = 0;
} else {
- NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
- sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+ sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey;
wr.sg_list = &sge;
wr.num_sge = 1;
}
@@ -720,14 +699,15 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t
return ncclSuccess;
}
-ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void** request) {
+ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm));
if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
+ struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
- req->type = type;
req->verbs = &comm->verbs;
req->size = size;
@@ -739,10 +719,8 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void**
if (size == 0) {
wr.sg_list = NULL;
wr.num_sge = 0;
- req->ibMr = NULL;
} else {
- NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
- sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+ sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey;
wr.sg_list = &sge;
wr.num_sge = 1;
}
@@ -752,25 +730,25 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void**
*request = req;
// Post to FIFO to notify sender
- NCCLCHECK(ncclIbPostFifo(comm, req->ibMr->mr->rkey, (uint64_t)data, size));
+ NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size));
return ncclSuccess;
}
-ncclResult_t ncclIbFlush(void* recvComm, void* data, int size) {
+ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess;
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
req->verbs = &comm->verbs;
- NCCLCHECK(ncclIbGetMr(&comm->verbs, data, 1, &req->ibMr));
+ struct ibv_mr* mr = (struct ibv_mr*)mhandle;
struct ibv_send_wr wr;
memset(&wr, 0, sizeof(wr));
wr.wr_id = (uint64_t)req;
wr.wr.rdma.remote_addr = (uint64_t)data;
- wr.wr.rdma.rkey = req->ibMr->mr->rkey;
+ wr.wr.rdma.rkey = mr->rkey;
wr.sg_list = &comm->gpuFlush.sge;
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_READ;
@@ -800,32 +778,31 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
}
int wrDone = 0;
- struct ibv_wc wc;
- NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 1, &wc, &wrDone));
+ struct ibv_wc wcs[4];
+ NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone));
if (wrDone == 0) return ncclSuccess;
- if (wc.status != IBV_WC_SUCCESS) {
- WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc.status, wc.opcode, wc.byte_len, wc.vendor_err);
- return ncclSystemError;
- }
+ for (int w=0; w<wrDone; w++) {
+ struct ibv_wc *wc = wcs+w;
+ if (wc->status != IBV_WC_SUCCESS) {
+ WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
+ return ncclSystemError;
+ }
- struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc.wr_id;
- if (doneReq) {
- if (wc.opcode == IBV_WC_RECV) {
- doneReq->size = wc.byte_len;
+ struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc->wr_id;
+ if (doneReq) {
+ if (wc->opcode == IBV_WC_RECV) {
+ doneReq->size = wc->byte_len;
#if USE_RDMA_WRITE
- } else if (wc.opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
- doneReq->size = wc.imm_data;
+ } else if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+ doneReq->size = wc->imm_data;
#endif
- }
- if (doneReq->ibMr != NULL) {
- doneReq->ibMr->refcnt--;
- if (doneReq->ibMr->refcnt < 0) WARN("NET/IB : doneReq %p MR %p refcount now %d", doneReq, doneReq->ibMr, doneReq->ibMr->refcnt);
- }
- doneReq->done = 1;
- if (doneReq->free == 1) {
- // This is an internal (FIFO post) req. Free it immediately.
- doneReq->used = 0;
+ }
+ doneReq->done = 1;
+ if (doneReq->free == 1) {
+ // This is an internal (FIFO post) req. Free it immediately.
+ doneReq->used = 0;
+ }
}
}
}
@@ -837,12 +814,6 @@ ncclResult_t ncclIbCloseSend(void* sendComm) {
close(comm->fd);
if (comm->qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qp));
if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
- for (int i=0; i<MAX_REQUESTS; i++) {
- if (comm->verbs.mrPool[i].mr != NULL) {
- if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : TX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt);
- NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr));
- }
- }
NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
free(comm);
}
@@ -859,12 +830,6 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) {
if (comm->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->gpuFlush.hostMr));
}
if (comm->remFifo.mr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remFifo.mr));
- for (int i=0; i<MAX_REQUESTS; i++) {
- if (comm->verbs.mrPool[i].mr != NULL) {
- if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : RX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt);
- NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr));
- }
- }
NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
free(comm);
}
@@ -889,6 +854,8 @@ ncclNet_t ncclNetIb = {
ncclIbListen,
ncclIbConnect,
ncclIbAccept,
+ ncclIbRegMr,
+ ncclIbDeregMr,
ncclIbIsend,
ncclIbIrecv,
ncclIbFlush,
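
The ncclIbTest() hunk above switches from polling a single work completion per call to draining up to four at a time, checking each completion's status before mapping wr_id back to its request. Below is a minimal stand-alone sketch of that batched ibv_poll_cq() pattern; the function name and the drain-until-empty loop are choices made for the sketch (ncclIbTest() itself polls once per call), while the batch of 4 mirrors the wcs[4] array in the patch.

// Illustrative sketch only: batched completion-queue polling in the spirit of
// the ncclIbTest() change. Assumes a valid ibv_cq* created elsewhere; unlike
// ncclIbTest(), this helper keeps polling until the queue is empty.
#include <infiniband/verbs.h>
#include <cstdio>

static int drainCqBatched(struct ibv_cq* cq) {
  struct ibv_wc wcs[4];                          // batch size mirrors the patch
  while (true) {
    int n = ibv_poll_cq(cq, 4, wcs);             // up to 4 completions per poll
    if (n < 0) return -1;                        // polling error
    if (n == 0) return 0;                        // queue drained
    for (int w = 0; w < n; w++) {
      struct ibv_wc* wc = wcs + w;
      if (wc->status != IBV_WC_SUCCESS) {        // check every completion
        std::fprintf(stderr, "completion error %d, opcode %d, len %d, vendor err %d\n",
                     (int)wc->status, (int)wc->opcode, (int)wc->byte_len, (int)wc->vendor_err);
        return -1;
      }
      // wc->wr_id carries back the pointer posted with the work request,
      // which is how ncclIbTest() recovers the matching ncclIbRequest.
      (void)wc->wr_id;
    }
  }
}
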
diff --git a/src/transport/net_socket.cu b/src/transport/net_socket.cu
index 1efee15..0464b43 100644
--- a/src/transport/net_socket.cu
+++ b/src/transport/net_socket.cu
@@ -27,10 +27,19 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
pthread_mutex_lock(&ncclSocketLock);
if (ncclNetIfs == -1) {
ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
- INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs);
if (ncclNetIfs <= 0) {
WARN("NET/Socket : no interface found");
return ncclInternalError;
+ } else {
+ char line[1024];
+ char addrline[1024];
+ line[0] = '\0';
+ for (int i=0; i<ncclNetIfs; i++) {
+ snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE,
+ socketToString(&ncclNetIfAddrs[i].sa, addrline));
+ }
+ line[1023] = '\0';
+ INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
}
}
pthread_mutex_unlock(&ncclSocketLock);
@@ -113,7 +122,7 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
union socketAddress localAddr;
char ifName[MAX_IF_NAME_SIZE];
if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
- WARN("No usable listening interface found");
+ WARN("NET/Socket : No usable listening interface found");
return ncclSystemError;
}
// pass the local address back
@@ -205,21 +214,24 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
return ncclSuccess;
}
-ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int type, void** request) {
- if (type != NCCL_PTR_HOST) return ncclInternalError;
+ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+ return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
+}
+ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
+
+ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_SEND, data, size, comm->fd, (struct ncclSocketRequest**)request));
return ncclSuccess;
}
-ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, int type, void** request) {
- if (type != NCCL_PTR_HOST) return ncclInternalError;
+ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_RECV, data, size, comm->fd, (struct ncclSocketRequest**)request));
return ncclSuccess;
}
-ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size) {
+ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) {
// We don't support CUDA pointers, so we don't need a flush operation
return ncclInternalError;
}
@@ -243,6 +255,8 @@ ncclNet_t ncclNetSocket = {
ncclSocketListen,
ncclSocketConnect,
ncclSocketAccept,
+ ncclSocketRegMr,
+ ncclSocketDeregMr,
ncclSocketIsend,
ncclSocketIrecv,
ncclSocketFlush,
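
Both the IB and socket transports above now implement the regMr()/deregMr() entry points added to ncclNet_t, and isend()/irecv()/flush() take the returned mhandle instead of a per-call pointer-type flag. The sketch below illustrates that register-once, reuse-handle lifecycle against the parameter lists visible in this patch; the netDemo* names, return codes, and stub bodies are illustrative stand-ins (modeled loosely on the no-op ncclSocketRegMr/ncclSocketDeregMr), not NCCL code.

// Illustrative sketch only: register once, pass the mhandle to every transfer,
// deregister at teardown. Only the parameter lists mirror the patch.
#include <cstdio>

typedef int demoResult_t;            // stands in for ncclResult_t (0 = success)
#define DEMO_PTR_HOST 0x1            // mirrors NCCL_PTR_HOST from nccl_net.h

// Host-memory-only transport: registration is a no-op (cf. ncclSocketRegMr).
demoResult_t netDemoRegMr(void* comm, void* data, int size, int type, void** mhandle) {
  if (type != DEMO_PTR_HOST) return 1;   // reject non-host pointers
  *mhandle = nullptr;                    // nothing to track for host memory
  return 0;
}
demoResult_t netDemoDeregMr(void* comm, void* mhandle) { return 0; }

demoResult_t netDemoIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
  // A real transport would post the send here, using mhandle (e.g. an ibv_mr*)
  // instead of re-validating or re-registering the buffer on every call.
  *request = data;
  return 0;
}

int main() {
  char buf[1024];
  void* mhandle = nullptr;
  void* request = nullptr;
  void* comm = nullptr;                                    // stand-in for a connected comm
  if (netDemoRegMr(comm, buf, (int)sizeof(buf), DEMO_PTR_HOST, &mhandle)) return 1;
  netDemoIsend(comm, buf, (int)sizeof(buf), mhandle, &request);  // reuse the handle
  netDemoDeregMr(comm, mhandle);                           // release at teardown
  std::printf("registered, sent, deregistered\n");
  return 0;
}
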
diff --git a/src/transport/p2p.cu b/src/transport/p2p.cu
index 6c4626a..9f3e0b6 100644
--- a/src/transport/p2p.cu
+++ b/src/transport/p2p.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,18 +11,9 @@
#include "param.h"
#include <unistd.h>
#include <cuda_runtime.h>
-#include "nvmlwrap.h"
#include <ctype.h>
#include "nvlink.h"
-struct p2pInfo {
- int rank;
- int cudaDev;
- uint64_t hostHash;
- uint64_t pidHash;
- char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-};
-
struct p2pConnectInfo {
int direct;
union {
@@ -31,36 +22,40 @@ struct p2pConnectInfo {
};
};
-#include <sys/types.h>
+struct p2pSendResources {
+ struct ncclSendMem* devMem;
+ void* ipcPtr;
+};
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t p2pFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
- struct p2pInfo* info = (struct p2pInfo*)opaqueInfo;
- static_assert(sizeof(struct p2pInfo) <= sizeof(ncclTinfo_t), "p2p Info too large");
- info->rank = rank;
- CUDACHECK(cudaGetDevice(&info->cudaDev));
- info->hostHash=getHostHash();
- info->pidHash=getPidHash();
-
- // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
- // cudaDev is a CUDA runtime dev number which could be different from the
- // NVML device number. Then we get the busID from NVML to be sure it is
- // consistent with NVML remote PCI bus Ids.
- CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
- nvmlDevice_t nvmlDevice;
- NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
- nvmlPciInfo_t pciInfo;
- NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
- strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
- return ncclSuccess;
-}
+struct p2pRecvResources {
+ struct ncclRecvMem* devMem;
+ void* ipcPtr;
+};
+
+#include <sys/types.h>
NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
+/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
+static int busIdToCudaDev(const char* busId) {
+ int ndev;
+ if (cudaGetDeviceCount(&ndev) != cudaSuccess)
+ return -1;
+ for (int i = 0; i < ndev; i++) {
+ char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ if (cudaDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess)
+ return -1;
+ if (strcmp(busId, devBusId) == 0) {
+ return i;
+ }
+ }
+ // BusId was not found in our locally visible CUDA devices
+ return -1;
+}
+
/* Determine if we can communicate with the peer through p2p */
-ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
+ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
// Do not use P2P across root complexes by default (provided CUDA permits it)
int p2pLevel = PATH_SOC;
if (ncclParamP2pDisable() == 1) p2pLevel = 0;
@@ -70,23 +65,26 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
if (p2pLevel == 0) return ncclSuccess;
- struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
- struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
-
// Rule out different nodes
if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess;
+ // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+ int peerCudaDev = busIdToCudaDev(peerInfo->busId);
+ if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process
+
+ TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
+
// Do not detect topology if we're on the same GPU. Note this is not really supported.
- if (myInfo->cudaDev == peerInfo->cudaDev) {
+ if (myInfo->cudaDev == peerCudaDev) {
*ret = 1 + PATH_SOC;
return ncclSuccess;
}
// See if CUDA can do P2P
int p2p;
- if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) {
- INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d and dev %d",
- myInfo->cudaDev, peerInfo->cudaDev);
+ if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != cudaSuccess) {
+ INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)",
+ myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
return ncclSuccess;
}
if (p2p == 0) return ncclSuccess;
@@ -102,7 +100,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
char* myPath;
char* peerPath;
ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath);
- ncclResult_t err2 = getCudaPath(peerInfo->cudaDev, &peerPath);
+ ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath);
if (err1 == ncclSuccess && err2 == ncclSuccess) {
int distance = pciDistance(myPath, peerPath);
if (distance < p2pLevel) {
@@ -174,8 +172,8 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR
static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) {
if (nrings == 0) return 0;
// Copy rings by dup times
- if (newNrings > MAXRINGS) {
- newNrings = MAXRINGS;
+ if (newNrings > MAXCHANNELS) {
+ newNrings = MAXCHANNELS;
}
for (int r=nrings; r<newNrings; r++) {
for (int i=0; i<nranks; i++) rings[r*nranks+i] = rings[(r%nrings)*nranks+i];
@@ -191,7 +189,6 @@ int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nrin
if (connect) {
inTheRing[rings[0]] = 1;
nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, rings[1], nranks-2, connect);
- nrings = copyRings(nranks, rings, nrings, nringsMax);
} else {
rings[0] = 0;
nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, 0, nranks-1, connect);
@@ -209,9 +206,9 @@ static inline int findConnect(int nranks, int* ranks) {
int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) {
if (nrings == 0) return 0;
- if (nrings > MAXRINGS) {
- WARN("Max rings reached, limiting to %d", MAXRINGS);
- nrings = MAXRINGS;
+ if (nrings > MAXCHANNELS) {
+ WARN("Max rings reached, limiting to %d", MAXCHANNELS);
+ nrings = MAXCHANNELS;
}
// Find existing constraints / connections
int connect = 0;
@@ -239,9 +236,9 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
if (compNrings && compNrings < nrings && nranks <= 4) {
// Try to oversubscribe to get a better result
- int *rings2 = (int *)malloc(sizeof(int)*MAXRINGS*nranks);
- if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXRINGS*nranks); return 0; }
- for (int i=0; i<MAXRINGS*nranks; i++) rings2[i] = -1;
+ int *rings2 = (int *)malloc(sizeof(int)*MAXCHANNELS*nranks);
+ if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXCHANNELS*nranks); return 0; }
+ for (int i=0; i<MAXCHANNELS*nranks; i++) rings2[i] = -1;
int nThreads = *nthreads;
int compNrings2 = p2pComputeRingsNvLink(values, nranks, rings2, nrings, prev, next, 1, &nThreads);
if (compNrings2 > compNrings*2) {
@@ -255,7 +252,6 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
// Duplicate the rings for direct NVLink
compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
- if (ncclCudaCompCap() == 6) *nthreads /= 2;
return compNrings;
}
@@ -367,8 +363,8 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings,
ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
if (*nringsRet == 0) return ncclSuccess;
int *rings;
- NCCLCHECK(ncclCalloc(&rings, MAXRINGS*nranks));
- for (int i=0; i<MAXRINGS*nranks; i++) rings[i] = -1;
+ NCCLCHECK(ncclCalloc(&rings, MAXCHANNELS*nranks));
+ for (int i=0; i<MAXCHANNELS*nranks; i++) rings[i] = -1;
int nrings = *nringsRet;
// NVswitch
@@ -446,39 +442,47 @@ end:
} while (0)
/* Send: Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
- struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
- struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+ struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+
+ struct p2pSendResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ send->transportResources = resources;
+ const int sendSize = sizeof(struct ncclSendMem);
+ NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
+
struct p2pConnectInfo info;
if (myInfo->pidHash == peerInfo->pidHash) {
info.direct = 1;
- info.directPtr = ring->devMemSend;
+ info.directPtr = resources->devMem;
if (myInfo->cudaDev == peerInfo->cudaDev) {
- INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
+ INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank);
} else {
// Enable P2P access
cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
- WARN("failed to peer with device %d: %d %s",
- peerInfo->cudaDev, err, cudaGetErrorString(err));
+ WARN("failed to peer with device %d(=%d): %d %s",
+ peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
- ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
}
} else {
+ // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+ int peerCudaDev = busIdToCudaDev(peerInfo->busId);
info.direct = 0;
// Map IPC and enable P2P access
- cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemSend);
+ cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
if (err != cudaSuccess) {
- WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
- myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+ WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
+ myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
- ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
//TRACE_DUMP_IPC(&info.devIpc);
}
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -487,13 +491,19 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
}
/* Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
- struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
- struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+ struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
+
+ struct p2pRecvResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ recv->transportResources = resources;
+ const int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+ NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
+
struct p2pConnectInfo info;
if (myInfo->pidHash == peerInfo->pidHash) {
info.direct = 1;
- info.directPtr = ring->devMemRecv;
+ info.directPtr = resources->devMem;
if (myInfo->cudaDev == peerInfo->cudaDev) {
TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
} else {
@@ -502,22 +512,24 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
- WARN("failed to peer with device %d: %d %s",
- peerInfo->cudaDev, err, cudaGetErrorString(err));
+ WARN("failed to peer with device %d(=%d): %d %s",
+ peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
- TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
}
} else {
+ // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+ int peerCudaDev = busIdToCudaDev(peerInfo->busId);
info.direct = 0;
// Map IPC and enable P2P access
- cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemRecv);
+ cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
if (err != cudaSuccess) {
- WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
- myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+ WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
+ myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
- TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
//TRACE_DUMP_IPC(&info.devIpc);
}
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -527,22 +539,16 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
/* Connect/Send to this peer */
static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
- void** resources = &send->transportResources;
+ struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
struct ncclRecvMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
if (info->direct) {
remDevMem = (struct ncclRecvMem*)(info->directPtr);
send->conn.direct = 1;
- *resources = NULL;
} else {
- void* remPtr = NULL;
//TRACE_DUMP_IPC(&info->devIpc);
- cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
- void** ipcPtrSave;
- NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
- *resources = ipcPtrSave;
- *ipcPtrSave = remPtr;
- remDevMem = (struct ncclRecvMem*)remPtr;
+ cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+ remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
if (err != cudaSuccess) {
WARN("failed to open CUDA IPC handle : %d %s",
err, cudaGetErrorString(err));
@@ -553,30 +559,26 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
send->conn.buff = remDevMem->buff;
send->conn.llBuff = remDevMem->llBuff;
send->conn.tail = &remDevMem->tail;
- send->conn.opCount = &remDevMem->opCount;
- // send->conn->head should have been set to devMemSend already
+ send->conn.opCountRem = &remDevMem->opCount;
+ send->conn.head = &resources->devMem->head;
+ send->conn.ptrExchange = &resources->devMem->ptrExchange;
+ send->conn.opCountLoc = &resources->devMem->opCount;
return ncclSuccess;
}
/* Connect/Recv from this peer */
ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
- void** resources = &recv->transportResources;
+ struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
struct ncclSendMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
if (info->direct) {
remDevMem = (struct ncclSendMem*)(info->directPtr);
recv->conn.direct = 1;
recv->conn.ptrExchange = &remDevMem->ptrExchange;
- *resources = NULL;
} else {
- void* remPtr = NULL;
//TRACE_DUMP_IPC(&info->devIpc);
- cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
- void** ipcPtrSave;
- NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
- *resources = ipcPtrSave;
- *ipcPtrSave = remPtr;
- remDevMem = (struct ncclSendMem*)remPtr;
+ cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+ remDevMem = (struct ncclSendMem*)resources->ipcPtr;
if (err != cudaSuccess) {
WARN("failed to open CUDA IPC handle : %d %s",
err, cudaGetErrorString(err));
@@ -584,28 +586,35 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
}
}
- // recv->conn->buff should have been set to devMemRecv already
- // recv->conn->tail should have been set to devMemRecv already
- // recv->conn->opCount should have been set to devMemRecv already
+ recv->conn.buff = resources->devMem->buff;
+ recv->conn.llBuff = resources->devMem->llBuff;
+ recv->conn.tail = &resources->devMem->tail;
+ recv->conn.opCountLoc = &resources->devMem->opCount;
recv->conn.head = &remDevMem->head;
- recv->conn.llHead = &remDevMem->llHead;
+ recv->conn.opCountRem = &remDevMem->opCount;
return ncclSuccess;
}
-ncclResult_t p2pFree(void* resources) {
- if (resources != NULL) {
- void** ipcPtrSave = (void**) resources;
- CUDACHECK(cudaIpcCloseMemHandle(*ipcPtrSave));
- free(resources);
- }
+ncclResult_t p2pSendFree(void* resources) {
+ struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
+ if (sendRes->ipcPtr)
+ CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
+ CUDACHECK(cudaFree(sendRes->devMem));
+ return ncclSuccess;
+}
+
+ncclResult_t p2pRecvFree(void* resources) {
+ struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
+ if (recvRes->ipcPtr)
+ CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
+ CUDACHECK(cudaFree(recvRes->devMem));
return ncclSuccess;
}
struct ncclTransport p2pTransport = {
"P2P",
- p2pFillInfo,
p2pCanConnect,
p2pGetRings,
- { p2pSendSetup, p2pSendConnect, p2pFree, NULL },
- { p2pRecvSetup, p2pRecvConnect, p2pFree, NULL }
+ { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
+ { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
};
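
When sender and receiver live in different processes, the reworked p2p transport exports the newly allocated devMem as a CUDA IPC handle in p2pSendSetup()/p2pRecvSetup(), the peer maps it with cudaIpcOpenMemHandle() in the connect functions, and p2pSendFree()/p2pRecvFree() close the mapping and free the allocation. A minimal exporter/importer sketch of that CUDA IPC pattern follows; how the handle travels between processes (NCCL carries it inside struct ncclConnect) is left abstract, and the function names are illustrative.

// Illustrative sketch only: the CUDA IPC export/import/close pattern used by
// the p2p transport above. exportBuffer() runs in the owning process,
// importBuffer() in the peer; the handle exchange between them is not shown.
#include <cuda_runtime.h>
#include <cstdio>

#define DEMO_CUDA_CHECK(cmd) do { cudaError_t e = (cmd); if (e != cudaSuccess) { \
  std::fprintf(stderr, "%s failed: %s\n", #cmd, cudaGetErrorString(e)); return -1; } } while (0)

// Owning process: allocate device memory and export an IPC handle for it
// (cf. ncclCudaCalloc + cudaIpcGetMemHandle in p2pSendSetup/p2pRecvSetup).
int exportBuffer(size_t bytes, void** devMem, cudaIpcMemHandle_t* handle) {
  DEMO_CUDA_CHECK(cudaMalloc(devMem, bytes));
  DEMO_CUDA_CHECK(cudaIpcGetMemHandle(handle, *devMem));
  return 0;
}

// Peer process: map the remote allocation into this address space
// (cf. cudaIpcOpenMemHandle in p2pSendConnect/p2pRecvConnect).
int importBuffer(cudaIpcMemHandle_t handle, void** mappedPtr) {
  DEMO_CUDA_CHECK(cudaIpcOpenMemHandle(mappedPtr, handle, cudaIpcMemLazyEnablePeerAccess));
  return 0;
}

// Peer process teardown: unmap the imported allocation
// (cf. cudaIpcCloseMemHandle in p2pSendFree/p2pRecvFree).
int releaseImported(void* mappedPtr) {
  DEMO_CUDA_CHECK(cudaIpcCloseMemHandle(mappedPtr));
  return 0;
}
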
diff --git a/src/transport/shm.cu b/src/transport/shm.cu
index 317f652..56e0242 100644
--- a/src/transport/shm.cu
+++ b/src/transport/shm.cu
@@ -12,13 +12,6 @@
#include <unistd.h>
#include <cuda_runtime.h>
-struct shmInfo {
- int rank;
- int cudaDev;
- uint64_t hostHash;
- uint64_t pidHash;
-};
-
struct shmSendConnectInfo {
uint64_t pidHash;
int id;
@@ -51,24 +44,10 @@ struct shmRecvResources {
struct ncclRecvMem* devHostMem;
};
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t shmFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
- struct shmInfo* info = (struct shmInfo*)opaqueInfo;
- static_assert(sizeof(struct shmInfo) <= sizeof(ncclTinfo_t), "shm Info too large");
- info->rank = rank;
- CUDACHECK(cudaGetDevice(&info->cudaDev));
- info->hostHash=getHostHash();
- info->pidHash=getPidHash();
- return ncclSuccess;
-}
-
NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
/* Determine if we can communicate with the peer */
-ncclResult_t shmCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
- struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
- struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo;
+ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
*ret = ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) ? 0 : 1;
return ncclSuccess;
}
@@ -88,7 +67,7 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid)
}
ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
- if (*nringsRet == MAXRINGS) *nringsRet = 1;
+ if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
int nGroups = groups[nranks-1] + 1;
int starts[nGroups];
int ends[nGroups];
@@ -156,43 +135,40 @@ ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
#define MAX_SHM_NAME_LEN 1024
/* Create and return connect structures for this peer to connect to me */
-ncclResult_t shmSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
- struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
- struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo;
+ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
struct shmSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
- ring->send.transportResources = resources;
+ send->transportResources = resources;
struct shmRecvConnectInfo info;
char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
+ sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
- INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
- info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
+ INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
return ncclSuccess;
}
-ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
- struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
+ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
struct shmRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
- ring->recv.transportResources = resources;
+ recv->transportResources = resources;
struct shmSendConnectInfo info;
char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
- info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+ sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
+ info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize;
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
- info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
+ info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo));
return ncclSuccess;
@@ -216,10 +192,10 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
send->conn.buff = resources->devRemHostMem->buff;
send->conn.llBuff = resources->devRemHostMem->llBuff;
send->conn.tail = &resources->devRemHostMem->tail;
- send->conn.opCount = &resources->devRemHostMem->opCount;
+ send->conn.opCountRem = &resources->devRemHostMem->opCount;
send->conn.head = &resources->devHostMem->head;
- send->conn.llHead = &resources->devHostMem->llHead;
+ send->conn.opCountLoc = &resources->devHostMem->opCount;
return ncclSuccess;
}
@@ -235,12 +211,12 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
NCCLCHECK(shmUnlink(shmName));
recv->conn.head = &resources->devRemHostMem->head;
- recv->conn.llHead = &resources->devRemHostMem->llHead;
+ recv->conn.opCountRem = &resources->devRemHostMem->opCount;
recv->conn.buff = resources->devHostMem->buff;
recv->conn.llBuff = resources->devHostMem->llBuff;
recv->conn.tail = &resources->devHostMem->tail;
- recv->conn.opCount = &resources->devHostMem->opCount;
+ recv->conn.opCountLoc = &resources->devHostMem->opCount;
return ncclSuccess;
}
@@ -262,7 +238,6 @@ ncclResult_t shmRecvFree(void* transportResources) {
struct ncclTransport shmTransport = {
"SHM",
- shmFillInfo,
shmCanConnect,
shmGetRings,
{ shmSendSetup, shmSendConnect, shmSendFree, NULL },