merge with newest master

author: Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> 2020-01-07 02:48:51 +0300
committer: Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> 2020-01-07 02:49:13 +0300
commit: 7bed5ac058c3ca3c4617d133f3240b6981f95428 (patch)
tree: 49e7c6b2e08790b5d7b9eda124ce6cd130c87f1f
parent: 8e3a3f7c5b520babff49cec54a866fa3eda3a3b6 (diff)
parent: 3899f6e0f219b0cbf58537f791b0dd104b377750 (diff)
102 files changed, 7275 insertions, 4727 deletions
diff --git a/LICENSE.txt b/LICENSE.txt
index 3593a7a..e318c66 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,5 +1,5 @@
 
- Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
diff --git a/Makefile b/Makefile
index 605e3bf..caed3d4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/README.md b/README.md
index fa51453..7f0a72f 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ To install NCCL on the system, create a package then install it as root.
 Debian/Ubuntu :
 ```shell
 $ # Install tools to create debian packages
-$ sudo apt install build-essential devscripts debhelper
+$ sudo apt install build-essential devscripts debhelper fakeroot
 $ # Build NCCL deb package
 $ make pkg.debian.build
 $ ls build/pkg/deb/
@@ -89,4 +89,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
 
 ## Copyright
 
-All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
diff --git a/ext-net/dummy/Makefile b/ext-net/dummy/Makefile
index d1eb4c5..efa841c 100644
--- a/ext-net/dummy/Makefile
+++ b/ext-net/dummy/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/ext-net/dummy/plugin.c b/ext-net/dummy/plugin.c
index f11b365..67d7d88 100644
--- a/ext-net/dummy/plugin.c
+++ b/ext-net/dummy/plugin.c
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/makefiles/common.mk b/makefiles/common.mk
index a81a1b5..37e81be 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -15,6 +15,7 @@ PROFAPI ?= 0
 NVCC = $(CUDA_HOME)/bin/nvcc
 
 CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
 CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
 #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
 CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
@@ -24,8 +25,7 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 
 # Better define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
-CUDA8_GENCODE = -gencode=arch=compute_30,code=sm_30 \
-                -gencode=arch=compute_35,code=sm_35 \
+CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
                 -gencode=arch=compute_50,code=sm_50 \
                 -gencode=arch=compute_60,code=sm_60 \
                 -gencode=arch=compute_61,code=sm_61
@@ -42,9 +42,13 @@ else
 endif
 #$(info NVCC_GENCODE is ${NVCC_GENCODE})
 
-CXXFLAGS   := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
-CXXFLAGS   += -Wall -Wno-sign-compare -Wno-unused-function
-NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
+CXXFLAGS   := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
+CXXFLAGS   += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
+CXXFLAGS   += -I $(CUDA_INC)
+# Maxrregcount needs to be set according to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors)
+# 512 : 120, 640 : 96, 768 : 80, 1024 : 60
+# We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions.
+NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
 # Use addprefix so that we can specify more than one path
 NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
 
@@ -67,7 +71,7 @@ CXXFLAGS  += -O0 -g -ggdb3
 endif
 
 ifneq ($(VERBOSE), 0)
-NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
+NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
 CXXFLAGS  += -Wall -Wextra
 else
 .SILENT:
diff --git a/makefiles/formatting.mk b/makefiles/formatting.mk
index 4a4ab88..a543131 100644
--- a/makefiles/formatting.mk
+++ b/makefiles/formatting.mk
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/makefiles/version.mk b/makefiles/version.mk
index a8c6e3a..80f6221 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 4
-NCCL_PATCH   := 2
+NCCL_MINOR   := 5
+NCCL_PATCH   := 6
 NCCL_SUFFIX  :=
-PKG_REVISION := 1
+PKG_REVISION := 2
diff --git a/pkg/Makefile b/pkg/Makefile
index 04b23da..ab6487b 100644
--- a/pkg/Makefile
+++ b/pkg/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/pkg/debian/Makefile b/pkg/debian/Makefile
index 439635f..0494f3e 100644
--- a/pkg/debian/Makefile
+++ b/pkg/debian/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -16,13 +16,8 @@ DEBFILES   := compat copyright libnccl-dev.install rules $(DEBGEN)
 DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))
 
 PKG_TIMESTAMP  := $(shell date -R)
-ARCH           := $(shell uname -m)
-PKG_ARCH       ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g")
-PKG_MULTIARCH  ?= $(shell $(CXX) -print-multiarch)
-ifeq ($(PKG_MULTIARCH),)
-# Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it
-PKG_MULTIARCH  := $(ARCH)-linux-gnu
-endif
+PKG_ARCH       ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
+PKG_MULTIARCH  ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH)
 
 prep : $(DEBTARGETS)
 	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
diff --git a/pkg/redhat/Makefile b/pkg/redhat/Makefile
index ffcc973..0808478 100644
--- a/pkg/redhat/Makefile
+++ b/pkg/redhat/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in
index f9d83a3..5fad346 100644
--- a/pkg/redhat/nccl.spec.in
+++ b/pkg/redhat/nccl.spec.in
@@ -7,6 +7,7 @@ Group:          Development/Libraries
 License:        BSD
 URL:            http://developer.nvidia.com/nccl
 Source0:        nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz
+Prereq:         /sbin/ldconfig
 
 %description
 NCCL (pronounced "Nickel") is a stand-alone library of standard collective
@@ -50,6 +51,12 @@ ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
 # static
 install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir}
 
+%post -p /sbin/ldconfig
+%postun -p /sbin/ldconfig
+
+%post devel -p /sbin/ldconfig
+%postun devel -p /sbin/ldconfig
+
 %clean
 rm -rf $RPM_BUILD_ROOT
 
diff --git a/pkg/srctxz/Makefile b/pkg/srctxz/Makefile
index ed677fe..01cab95 100644
--- a/pkg/srctxz/Makefile
+++ b/pkg/srctxz/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/pkg/srctxz/create_srctxz.sh.in b/pkg/srctxz/create_srctxz.sh.in
index ae7d01f..11bdd52 100644
--- a/pkg/srctxz/create_srctxz.sh.in
+++ b/pkg/srctxz/create_srctxz.sh.in
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/pkg/txz/Makefile b/pkg/txz/Makefile
index fa587ef..b7d9aa5 100644
--- a/pkg/txz/Makefile
+++ b/pkg/txz/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/pkg/txz/create_txz.sh.in b/pkg/txz/create_txz.sh.in
index 73922e0..deae854 100644
--- a/pkg/txz/create_txz.sh.in
+++ b/pkg/txz/create_txz.sh.in
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/src/Makefile b/src/Makefile
index fe60b11..b11de5e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -9,34 +9,40 @@ include ../makefiles/version.mk
 
 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \
-                misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \
-		transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
-                collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
+                misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
+		transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
+                collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
+                graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc
 
 ##### lib files
 LIBNAME     := libnccl.so
 STATICLIBNAME := libnccl_static.a
+##### pkgconfig files
+PKGCONFIGFILE := nccl.pc
 ##### dirs
 BUILDDIR ?= $(abspath ../build)
 INCDIR := $(BUILDDIR)/include
 LIBDIR := $(BUILDDIR)/lib
 OBJDIR := $(BUILDDIR)/obj
+PKGDIR := $(BUILDDIR)/lib/pkgconfig
 ##### target files
+CUDARTLIB  ?= cudart_static
 INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
 LIBSONAME  := $(LIBNAME:%=%.$(NCCL_MAJOR))
 LIBTARGET  := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
 STATICLIBTARGET := $(STATICLIBNAME)
-LIBOBJ     := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
+PKGTARGET  := $(PKGCONFIGFILE)
+LIBOBJ     := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
 DEPFILES   := $(LIBOBJ:%.o=%.d)
-LDFLAGS    += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
+LDFLAGS    += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl
 
 DEVICELIB  := $(BUILDDIR)/obj/collectives/device/colldevice.a
 
 ##### rules
 build : lib staticlib
 
-lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
+lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET)
 
 staticlib : $(LIBDIR)/$(STATICLIBTARGET)
 
@@ -53,7 +59,7 @@ $(INCDIR)/nccl.h : nccl.h.in
 # NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
 	@$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
 	mkdir -p $(INCDIR)
-	printf "Generating %-35s > %s\n" $< $@
+	@printf "Generating %-35s > %s\n" $< $@
 	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
 	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
 	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
@@ -77,37 +83,53 @@ $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
 	ar cr $@ $(LIBOBJ) $(TMP)/*.o
 	rm -Rf $(TMP)
 
+$(PKGDIR)/nccl.pc : nccl.pc.in
+	mkdir -p $(PKGDIR)
+	@printf "Generating %-35s > %s\n" $< $@
+	sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \
+	    -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    $< > $@
+
 $(INCDIR)/%.h : %.h
 	@printf "Grabbing   %-35s > %s\n" $< $@
 	mkdir -p $(INCDIR)
-	cp -f $< $@
+	install -m 644 $< $@
 
 $(INCDIR)/nccl_%.h : include/nccl_%.h
 	@printf "Grabbing   %-35s > %s\n" $< $@
 	mkdir -p $(INCDIR)
-	cp -f $< $@
+	install -m 644 $< $@
+
+$(PKGDIR)/%.pc : %.pc
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(PKGDIR)
+	install -m 644 $< $@
 
-$(OBJDIR)/%.o : %.cu
+$(OBJDIR)/%.o : %.cc
 	@printf "Compiling  %-35s > %s\n" $< $@
 	mkdir -p `dirname $@`
-	$(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
-	@$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
+	$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
+	@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
 	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
 	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
                 sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
 	@rm -f $(@:%.o=%.d.tmp)
 
 clean :
-	rm -rf ${INCDIR} ${LIBDIR} ${OBJDIR}
 	$(MAKE) -C collectives/device clean
+	rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
 
 install : lib
 	mkdir -p $(PREFIX)/lib
+	mkdir -p $(PREFIX)/lib/pkgconfig
 	mkdir -p $(PREFIX)/include
-	cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
+	cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
+	cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
 	cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
 
-FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
+FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
 # Note that formatting.mk defines a new target so in order to not overwrite the default target,
 # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
 # as the BUILDDIR variable.
diff --git a/src/bootstrap.cu b/src/bootstrap.cc
index 6b1d573..11ffc35 100644
--- a/src/bootstrap.cu
+++ b/src/bootstrap.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,48 +9,142 @@
 #include "utils.h"
 #include "bootstrap.h"
 #include "net.h"
+#include "socket.h"
 #include <unistd.h>
 #include <sys/types.h>
 
-// Always use sockets for bootstrap
-ncclNet_t* ncclBootstrapNet = &ncclNetSocket;
+struct bootstrapNetComm {  // One bootstrap connection (listen, send or recv side): wraps a single socket.
+  int fd;  // Socket descriptor; bootstrapNetNewComm() initializes it to -1.
+};
 
-static ncclResult_t bootstrapNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
-static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
+/* Init functions */
+static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];  // Flat name table: interface i starts at offset i*MAX_IF_NAME_SIZE.
+static union socketAddress bootstrapNetIfAddrs[MAX_IFS];  // Address of interface i, filled by findInterfaces().
+static int bootstrapNetIfs = -1;  // Value returned by findInterfaces(); -1 means bootstrapNetInit() has not scanned yet.
+pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;  // Held by bootstrapNetInit() while it performs the scan.
+
+ncclResult_t bootstrapNetInit() {  // Scans socket interfaces while bootstrapNetIfs is -1; ncclInternalError if none is found.
+  if (bootstrapNetIfs == -1) {
+    pthread_mutex_lock(&bootstrapNetLock);
+    if (bootstrapNetIfs == -1) {  // Re-check under the lock: another thread may have completed the scan meanwhile.
+      bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
+      if (bootstrapNetIfs <= 0) {
+        WARN("Bootstrap : no socket interface found");
+        pthread_mutex_unlock(&bootstrapNetLock);  // Release before failing; otherwise a later call reaching the lock blocks forever.
+        return ncclInternalError;
+      } else {
+        char line[1024], addrline[1024];  // line: accumulated log text; addrline: scratch buffer for socketToString().
+        line[0] = '\0';
+        for (int i=0; i<bootstrapNetIfs; i++) {
+          snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, bootstrapNetIfNames+i*MAX_IF_NAME_SIZE,
+              socketToString(&bootstrapNetIfAddrs[i].sa, addrline));
+        }
+        line[1023] = '\0';
+        INFO(NCCL_INIT, "Bootstrap : Using%s", line);
+      }
+    }
+    pthread_mutex_unlock(&bootstrapNetLock);
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetNewComm(struct bootstrapNetComm** comm) {  // Allocates one comm through ncclCalloc() and stores it in *comm.
+  NCCLCHECK(ncclCalloc(comm, 1));
+  (*comm)->fd = -1;  // No socket attached yet; the caller fills fd afterwards.
+  return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) {  // Copies the address of interface `dev` into *addr.
+  if (dev < 0 || dev >= bootstrapNetIfs) return ncclInternalError;  // Only [0, bootstrapNetIfs) is valid; a negative dev would read before the array.
+  memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr));
+  return ncclSuccess;
+}
+
+/* Socket interface selection modes, passed as a negative `dev` to bootstrapNetListen() */
+enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };  // findSubnetIf: pick a local interface matching the remote address's subnet; dontCareIf: listen on the address already in the handle.
+
+static ncclResult_t bootstrapNetListen(int dev, ncclNetHandle_t* netHandle, void** listenComm) {  // Creates a listening comm; the address used lives in *netHandle.
+  union socketAddress* connectAddr = (union socketAddress*) netHandle;  // The handle's bytes are used directly as a socket address.
+  static_assert(sizeof(union socketAddress) < NCCL_NET_HANDLE_MAXSIZE, "union socketAddress size is too large");
+  // if dev >= 0, listen based on dev
+  if (dev >= 0) {
+    NCCLCHECK(bootstrapNetGetSocketAddr(dev, connectAddr));
+  } else if (dev == findSubnetIf) {
+    // handle stores a remote address
+    // need to find a local addr that is in the same network as the remote addr
+    union socketAddress localAddr;
+    char ifName[MAX_IF_NAME_SIZE];  // Receives the matched interface name; not used afterwards.
+    if (findInterfaceMatchSubnet(ifName, &localAddr, connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+      WARN("NET/Socket : No usable listening interface found");
+      return ncclSystemError;
+    }
+    // pass the local address back
+    memcpy(connectAddr, &localAddr, sizeof(localAddr));
+  } // Otherwise, handle stores a local address
+  struct bootstrapNetComm* comm;
+  NCCLCHECK(bootstrapNetNewComm(&comm));
+  NCCLCHECK(createListenSocket(&comm->fd, connectAddr));  // NOTE(review): comm looks leaked if this fails, assuming NCCLCHECK returns early - confirm.
+  *listenComm = comm;  // Released later through bootstrapNetCloseListen().
+  return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetConnect(int dev, ncclNetHandle_t* netHandle, void** sendComm) {  // Connects to the address stored in *netHandle; `dev` is not read here.
+  union socketAddress* connectAddr = (union socketAddress*) netHandle;  // The handle's bytes are used directly as the peer address.
+  struct bootstrapNetComm* comm;
+  NCCLCHECK(bootstrapNetNewComm(&comm));
+  NCCLCHECK(connectAddress(&comm->fd, connectAddr));  // NOTE(review): comm looks leaked if this fails, assuming NCCLCHECK returns early - confirm.
+  *sendComm = comm;  // Released later through bootstrapNetCloseSend().
+  return ncclSuccess;
+}
 
-// Additional sync functions based on async + test for bootstrap, using host ptrs.
+static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) {  // Accepts one connection on listenComm's socket and returns it as a new comm.
+  struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm;
+  struct bootstrapNetComm* rComm;
+  NCCLCHECK(bootstrapNetNewComm(&rComm));
+  struct sockaddr_in sockaddr;  // Peer address output of accept(); never read afterwards. NOTE(review): IPv4-sized buffer - confirm this is intended for IPv6 listeners.
+  socklen_t socklen = sizeof(struct sockaddr_in);
+  SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);  // Presumably stores accept()'s result in rComm->fd - verify against the macro.
+  *recvComm = rComm;  // Released later through bootstrapNetCloseRecv().
+  return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetClose(void* opaqueComm) {  // Closes the comm's socket and frees the comm; always returns ncclSuccess.
+  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)opaqueComm;
+  if (comm) {  // A NULL comm is accepted and ignored.
+    close(comm->fd);  // The result of close() is not checked.
+    free(comm);
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(bootstrapNetClose(sendComm)); return ncclSuccess; }  // Send-side name for bootstrapNetClose().
+static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(bootstrapNetClose(recvComm)); return ncclSuccess; }  // Recv-side name for bootstrapNetClose().
+static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(bootstrapNetClose(listenComm)); return ncclSuccess; }  // Listen-side name for bootstrapNetClose().
+
+// Additional sync functions
 static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
-  void* request, *mhandle;
-  NCCLCHECK(ncclBootstrapNet->regMr(sendComm, data, size, NCCL_PTR_HOST, &mhandle));
-  NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, mhandle, &request));
-  NCCLCHECK(ncclBootstrapNet->deregMr(sendComm, mhandle));
-  int done = 0;
-  while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL));
+  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)sendComm;
+  NCCLCHECK(socketSend(comm->fd, &size, sizeof(int)));
+  NCCLCHECK(socketSend(comm->fd, data, size));
   return ncclSuccess;
 }
 static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
-  void* request, *mhandle;
-  NCCLCHECK(ncclBootstrapNet->regMr(recvComm, data, size, NCCL_PTR_HOST, &mhandle));
-  NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, mhandle, &request));
-  NCCLCHECK(ncclBootstrapNet->deregMr(recvComm, mhandle));
-  int done = 0;
-  while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL));
+  struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm;
+  int recvSize;
+  NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int)));
+  if (recvSize > size) {
+    WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
+    return ncclInternalError;
+  }
+  NCCLCHECK(socketReceive(comm->fd, data, std::min(recvSize, size)));
   return ncclSuccess;
 }
 
-struct extId {
-  ncclNetHandle_t extHandleRoot;
-  void* extListenComm;
-  uint64_t hostHash;
-  pid_t pid;
-  int fd;
-  pthread_t boostrapThread;
-};
+ncclResult_t bootstrapNetCreateHandle(ncclNetHandle_t* netHandle, const char* str) {  // Fills *netHandle with the socket address parsed from `str`.
+  union socketAddress* connectAddr = (union socketAddress*) netHandle;  // The handle's bytes are written directly as a socket address.
+  NCCLCHECK(GetSocketAddrFromString(connectAddr, str));
+  return ncclSuccess;
+}
 
 struct extInfo {
   int rank;
@@ -69,9 +163,8 @@ static ncclResult_t setFilesLimit() {
   return ncclSuccess;
 }
 
-static void *bootstrapRoot(void* commId) {
+static void *bootstrapRoot(void* listenComm) {
   struct extInfo info;
-  struct extId* id = (struct extId*)commId;
   ncclNetHandle_t *rankHandles = NULL;
   ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
   ncclNetHandle_t zero = { 0 }; // for sanity checking
@@ -83,7 +176,7 @@ static void *bootstrapRoot(void* commId) {
   /* Receive addresses from all ranks */
   int nranks = 0, c = 0;
   do {
-    NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetAccept(listenComm, &tmpComm), res, out);
     NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
     NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
 
@@ -108,22 +201,22 @@ static void *bootstrapRoot(void* commId) {
     memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
 
     ++c;
+    TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d",  info.rank, c, nranks);
   } while (c < nranks);
-  TRACE(NCCL_INIT, "COLLECTED HANDLES");
+  TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks);
 
   // Send the connect handle for the next rank in the AllGather ring
   for (int r=0; r<nranks; ++r) {
     int next = (r+1) % nranks;
     void *tmpSendComm;
-    NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out);
     NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
     NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
   }
-  TRACE(NCCL_INIT, "SENT OUT HANDLES");
+  TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
 
 out:
-  bootstrapNetCloseListen(id->extListenComm);
-  free(commId);
+  bootstrapNetCloseListen(listenComm);
   if (rankHandles) free(rankHandles);
   if (rankHandlesRoot) free(rankHandlesRoot);
 
@@ -131,31 +224,28 @@ out:
   return NULL;
 }
 
-ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
-  struct extId* id = (struct extId*)commId;
-  id->hostHash = getHostHash();
-  NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
-  ncclUniqueId* threadIdCopy;
-  NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
-  memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
-  pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
+ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
+  ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
+  void* listenComm;
+  NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm));
+  pthread_t thread;
+  pthread_create(&thread, NULL, bootstrapRoot, listenComm);
   return ncclSuccess;
 }
 
-ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
-  static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
-  extId* id = (extId*)out;
+ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
+  static_assert(sizeof(ncclNetHandle_t) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
+  memset(id, 0, sizeof(ncclUniqueId));
+  ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
 
   char* env = getenv("NCCL_COMM_ID");
   if (env) {
-    if (ncclSocketCreateHandle(&id->extHandleRoot, env) != 0) {
+    if (bootstrapNetCreateHandle(netHandle, env) != 0) {
       WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
       return ncclInvalidArgument;
     }
-    id->pid = -1;
   } else {
-    id->pid = getpid();
-    NCCLCHECK(bootstrapCreateRoot(out, false));
+    NCCLCHECK(bootstrapCreateRoot(id, false));
   }
 
   return ncclSuccess;
@@ -178,9 +268,9 @@ struct extState {
   int dev;
 };
 
-ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
-  struct extId* id = (struct extId*)commId;
-  bool idFromEnv = id->pid < 0;
+ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
+  ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
+  bool idFromEnv = getenv("NCCL_COMM_ID") != NULL;
   struct extState* state;
   NCCLCHECK(ncclCalloc(&state, 1));
   state->rank = rank;
@@ -195,8 +285,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
   void *tmpSendComm, *tmpRecvComm;
   // Pass the remote address to listen via info
   if (idFromEnv) {
-    memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t));
-    memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+    memcpy(&info.extHandleListen, netHandle, sizeof(ncclNetHandle_t));
+    memcpy(&info.extHandleListenRoot, netHandle, sizeof(ncclNetHandle_t));
   }
   // listen will return the local address via info (specify interface type 'findSubnetIf')
   state->dev = idFromEnv ? findSubnetIf : 0;
@@ -215,7 +305,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
   }
 
   // send info on my listening socket to root
-  NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm));
+  NCCLCHECK(bootstrapNetConnect(state->dev, netHandle, &tmpSendComm));
   NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
   NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
 
@@ -226,7 +316,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
   NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
   NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
 
-  NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
+  NCCLCHECK(bootstrapNetConnect(state->dev, &extHandleNext, &state->extBstrapRingSendComm));
   // Accept the connect request from the previous rank in the AllGather ring
   NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
 
@@ -269,7 +359,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
 ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
   struct extState* state = (struct extState*)commState;
   void* tmpSendComm;
-  NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm));
+  NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles+peer, &tmpSendComm));
   NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
   NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
   NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
@@ -357,3 +447,13 @@ ncclResult_t bootstrapClose(void* commState) {
 
   return ncclSuccess;
 }
+
+ncclResult_t bootstrapAbort(void* commState) {  // Tears down the bootstrap state: closes its three comms, then frees it. Always returns ncclSuccess.
+  struct extState* state = (struct extState*)commState;  // Dereferenced below without a NULL check.
+  bootstrapNetCloseListen(state->extBstrapListenComm);  // Close results are deliberately not propagated; NULL comms are tolerated by bootstrapNetClose().
+  bootstrapNetCloseSend(state->extBstrapRingSendComm);
+  bootstrapNetCloseRecv(state->extBstrapRingRecvComm);
+  free(state->peerBstrapHandles);
+  free(state);
+  return ncclSuccess;
+}
diff --git a/src/channel.cu b/src/channel.cc
index 937e84e..b053e5b 100644
--- a/src/channel.cu
+++ b/src/channel.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -47,5 +47,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
     if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
     if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
   }
+
+  // Free the peer structures.
+  CUDACHECK(cudaFree(channel->devPeers));
+  free(channel->peers);
+
   return ncclSuccess;
 }
diff --git a/src/collectives/all_gather.cu b/src/collectives/all_gather.cc
index db21dee..348c176 100644
--- a/src/collectives/all_gather.cu
+++ b/src/collectives/all_gather.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/all_reduce.cu b/src/collectives/all_reduce.cc
index 1492c90..7796d5b 100644
--- a/src/collectives/all_reduce.cu
+++ b/src/collectives/all_reduce.cc
@@ -1,11 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
 #include "enqueue.h"
-#include "collectives.h"
 
 NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
diff --git a/src/collectives/broadcast.cu b/src/collectives/broadcast.cc
index 6a3d0a8..042301b 100644
--- a/src/collectives/broadcast.cu
+++ b/src/collectives/broadcast.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
index 8e92596..001059c 100644
--- a/src/collectives/device/Makefile
+++ b/src/collectives/device/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -68,4 +68,4 @@ $(DEVOBJ) : $(LIBOBJ)
 	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
 
 clean:
-	rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(STATICLIB) test
+	rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB)
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu
index 530bf14..109c341 100644
--- a/src/collectives/device/all_gather.cu
+++ b/src/collectives/device/all_gather.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index 36809c9..0ad5ba9 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -1,33 +1,33 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = args->nThreads-WARP_SIZE;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
   const int nranks = comm->nRanks;
   const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+  const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
   const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
 
   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
 
-  ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
+  ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
@@ -129,3 +129,67 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
 
 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T>  // UNUSED is never referenced in the body
+__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {  // Ring AllGather driven through ncclLL128Primitives
+  const int tid = threadIdx.x;
+  const int bid = args->bid;
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;  // channel is selected by blockIdx.x; chunk offsets below use args->bid
+  struct ncclRing* ring = &channel->ring;
+
+  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);  // presumably recv peer = prev, send peer = next — confirm in prims_ll128.h
+
+  const ssize_t size = args->N;
+  // `size` is the per-rank element count: the segment for rank r lives at thisOutput + r*size (see the offsets below).
+  const int nranks = comm->nRanks;
+  ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));  // in elements of T (byte count divided by sizeof(T))
+  // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+
+  const ssize_t loopSize = args->nChannels*chunkSize;  // fixed from the initial chunkSize; not updated when chunkSize shrinks below
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);  // remaining work split over channels, rounded up to a multiple of minChunkSize; never grows
+
+    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+    /////////////// begin AllGather steps ///////////////
+    ssize_t offset;
+    int nelem = min(chunkSize, size-chunkOffset);  // NOTE(review): not clamped at 0 when chunkOffset >= size; presumably the primitives tolerate nelem <= 0 — confirm
+    int rankDest;
+
+    // step 0: push data to next GPU
+    rankDest = ring->devUserRanks[0];  // presumably this rank's own user rank (entry 0 of the ring order) — confirm
+    offset = chunkOffset + rankDest * size;
+
+    if (thisInput + chunkOffset == thisOutput + offset) { // In place
+      LLprims.send(thisInput+chunkOffset, nelem);
+    } else {
+      LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
+    }
+
+    // k-2 steps: copy to next GPU
+    for (int j=1; j<nranks-1; ++j) {
+      rankDest = ring->devUserRanks[nranks-j];  // walks the ring order backwards: entries nranks-1 down to 2
+      offset = chunkOffset + rankDest * size;
+
+      LLprims.recvCopySend(thisOutput+offset, nelem);
+    }
+
+    // step k-1: final store
+    rankDest = ring->devUserRanks[1];
+    offset = chunkOffset + rankDest * size;
+
+    LLprims.recv(thisOutput+offset, nelem);  // last segment is only received, not forwarded
+  }
+}
+
+template<int UNUSED, class FUNC, typename T>  // none of the template parameters are used
+__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }  // empty body: this tree/LL128 AllGather variant does nothing
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
index aaa96b4..85d007e 100644
--- a/src/collectives/device/all_reduce.cu
+++ b/src/collectives/device/all_reduce.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index ea89a71..2449c2b 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -1,19 +1,19 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = args->nThreads-WARP_SIZE;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
@@ -27,7 +27,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
 
   ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
+    prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
     int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
@@ -85,23 +85,28 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = args->nThreads-WARP_SIZE;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclTree* tree = &channel->tree;
   const ssize_t size = args->N;
   const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = args->lastChunkSize;
+  int chunkSize = args->lastChunkSize;
+  const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
   const ssize_t loopSize = args->nChannels*chunkSize;
 
+  if (loopSize > size) {
+    chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+  }
+
   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
 
   do {
+    struct ncclTree* tree = &channel->treeUp;
     // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
-    ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+    ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       // Up
       ssize_t offset = gridOffset + bid*chunkSize;
@@ -117,8 +122,9 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
   } while(0);
 
   do {
+    struct ncclTree* tree = &channel->treeDn;
     // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
-    ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+    ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       // Down
       ssize_t offset = gridOffset + bid*chunkSize;
@@ -139,7 +145,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
@@ -149,6 +155,8 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
   //const int rank = comm->rank;
   const int nranks = comm->nRanks;
   ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+  const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
+
   const ssize_t loopSize = args->nChannels*nranks*chunkSize;
 
   // Compute pointers
@@ -156,10 +164,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    if (size-gridOffset < loopSize) {
-      chunkSize = args->lastChunkSize;
-    }
-    ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
+    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
 
     /////////////// begin AllReduce steps ///////////////
     ssize_t offset;
@@ -168,7 +173,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
 
     // step 0: push data to next GPU
     slice = ring->devUserRanks[nranks-1];
-    offset = chunkOffset + slice * chunkSize;
+    offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
     nelem = min(chunkSize, size-offset);
 
     LLprims.send(thisInput+offset, nelem);
@@ -176,7 +181,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
     // k-2 steps: reduce and copy to next GPU
     for (int j=2; j<nranks; ++j) {
       slice = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + slice * chunkSize;
+      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
       nelem = min(chunkSize, size-offset);
 
       LLprims.recvReduceSend(thisInput+offset, nelem);
@@ -185,7 +190,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
     // step k-1: reduce this buffer and data, which will produce the final
     // result that we store in this data and push to the next GPU
     slice = ring->devUserRanks[0];
-    offset = chunkOffset + slice * chunkSize;
+    offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
     nelem = min(chunkSize, size-offset);
 
     LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
@@ -193,7 +198,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
     // k-2 steps: copy to next GPU
     for (int j=1; j<nranks-1; ++j) {
       slice = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + slice * chunkSize;
+      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
       nelem = min(chunkSize, size-offset);
 
       LLprims.recvCopySend(thisOutput+offset, nelem);
@@ -201,7 +206,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
 
     // Make final copy from buffer to dest.
     slice = ring->devUserRanks[1];
-    offset = chunkOffset + slice * chunkSize;
+    offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
     nelem = min(chunkSize, size-offset);
 
     // Here we need to copy from buffer to this output.
@@ -214,18 +219,23 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = args->nThreads;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclTree* tree = &channel->tree;
   const ssize_t size = args->N;
   ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+  const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
   const ssize_t loopSize = args->nChannels*chunkSize;
 
+  if (loopSize > size) {
+    chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+  }
+
   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
 
   do {
+    struct ncclTree* tree = &channel->treeUp;
     // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
     ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
@@ -243,6 +253,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
   } while(0);
 
   do {
+    struct ncclTree* tree = &channel->treeDn;
     // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
     ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
@@ -259,3 +270,141 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
     }
   } while(0);
 }
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T>  // UNUSED is never referenced in the body
+__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {  // Ring AllReduce driven through ncclLL128Primitives
+  const int tid = threadIdx.x;
+  const int bid = args->bid;
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;  // channel is selected by blockIdx.x; slice offsets below use args->bid
+  struct ncclRing* ring = &channel->ring;
+
+  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);  // presumably recv peer = prev, send peer = next — confirm in prims_ll128.h
+
+  const ssize_t size = args->N;
+  // Per pass, the buffer is cut into nranks slices, each holding nChannels chunks; a block works on chunk `bid` of every slice.
+  const int nranks = comm->nRanks;
+  ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));  // in elements of T (byte count divided by sizeof(T))
+  // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+
+  const ssize_t loopSize = args->nChannels*nranks*chunkSize;  // fixed from the initial chunkSize; not updated when chunkSize shrinks below
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);  // remaining work split over channels*ranks, rounded up to a multiple of minChunkSize; never grows
+
+    /////////////// begin AllReduce steps ///////////////
+    ssize_t offset;
+    int nelem;  // NOTE(review): min(chunkSize, size-offset) below is not clamped at 0; presumably the primitives tolerate nelem <= 0 — confirm
+    int slice;
+
+    // step 0: push data to next GPU
+    slice = ring->devUserRanks[nranks-1];
+    offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+    nelem = min(chunkSize, size-offset);
+
+    LLprims.send(thisInput+offset, nelem);
+
+    // k-2 steps: reduce and copy to next GPU
+    for (int j=2; j<nranks; ++j) {
+      slice = ring->devUserRanks[nranks-j];  // ring-order entries nranks-2 down to 1
+      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+      nelem = min(chunkSize, size-offset);
+
+      LLprims.recvReduceSend(thisInput+offset, nelem);
+    }
+
+    // step k-1: reduce this buffer and data, which will produce the final
+    // result that we store in this data and push to the next GPU
+    slice = ring->devUserRanks[0];
+    offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+    nelem = min(chunkSize, size-offset);
+
+    LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);  // first write into thisOutput in this pass
+
+    // k-2 steps: copy to next GPU
+    for (int j=1; j<nranks-1; ++j) {
+      slice = ring->devUserRanks[nranks-j];  // ring-order entries nranks-1 down to 2
+      offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+      nelem = min(chunkSize, size-offset);
+
+      LLprims.recvCopySend(thisOutput+offset, nelem);
+    }
+
+    // Make final copy from buffer to dest.
+    slice = ring->devUserRanks[1];
+    offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+    nelem = min(chunkSize, size-offset);
+
+    // Here we need to copy from buffer to this output.
+    LLprims.recv(thisOutput+offset, nelem);
+  }
+}
+
+template<int UNUSED, class FUNC, typename T>  // UNUSED is never referenced in the body
+__device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {  // Tree AllReduce via ncclLL128Primitives: reduce along treeUp, broadcast along treeDn
+  const int tid = threadIdx.x;
+  const int nthreads = args->nThreads;
+  const int bid = args->bid;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;  // channel is selected by blockIdx.x; chunk offsets below use args->bid
+  struct ncclTree* treeUp = &channel->treeUp;
+  struct ncclTree* treeDn = &channel->treeDn;
+  const ssize_t size = args->N;
+  ssize_t chunkSize = args->lastChunkSize;  // starting chunk size is supplied by the caller through args
+  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
+  const ssize_t loopSize = args->nChannels*chunkSize;  // taken from the unadjusted chunkSize; not recomputed after the shrink below
+  int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);  // thread-index boundary between the reduce group and the broadcast group (non-root path)
+
+  if (loopSize > size) {
+    chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;  // small message: split size over channels, rounded up to a multiple of minChunkSize
+  }
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  if (treeUp->up == -1) {  // -1 presumably means "no parent", i.e. this rank is the tree root — confirm
+    // ReduceAndBroadcast : max number of recv is 3, max number of send is 3
+    ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, channel, comm, args->opCount);  // all nthreads threads share one primitives object here
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
+    }
+  } else {
+    if (tid < nthreadsSplit) {  // first nthreadsSplit threads handle the upward reduction
+      // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+      ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, channel, comm, args->opCount);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        // Up
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        if (treeUp->down[0] == -1) {  // first down slot empty: nothing to receive, only send local input upward
+          LLprims.send(thisInput+offset, nelem);
+        } else {
+          LLprims.recvReduceSend(thisInput+offset, nelem);
+        }
+      }
+    } else {  // remaining threads handle the downward broadcast, with tid rebased to start at 0
+      // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+      ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, channel, comm, args->opCount);
+      for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+        // Down
+        ssize_t offset = gridOffset + bid*chunkSize;
+        int nelem = min(chunkSize, size-offset);
+        if (treeDn->down[0] == -1) {  // first down slot empty: receive the result without forwarding
+          LLprims.recv(thisOutput+offset, nelem);
+        } else {
+          LLprims.recvCopySend(thisOutput+offset, nelem);
+        }
+      }
+    }
+  }
+}
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu
index b83ee70..8c8dbb6 100644
--- a/src/collectives/device/broadcast.cu
+++ b/src/collectives/device/broadcast.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index fb18312..de8b989 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -1,19 +1,19 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = args->nThreads-WARP_SIZE;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
@@ -29,7 +29,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
 
   ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+    prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
@@ -100,3 +100,51 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
 
 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T>  // UNUSED is never referenced in the body
+__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {  // Ring Broadcast from args->root driven through ncclLL128Primitives
+  const int tid = threadIdx.x;
+  const int bid = args->bid;
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;  // channel is selected by blockIdx.x; chunk offsets below use args->bid
+  struct ncclRing* ring = &channel->ring;
+
+  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);  // presumably recv peer = prev, send peer = next — confirm in prims_ll128.h
+
+  const ssize_t size = args->N;
+  const int rank = ring->devUserRanks[0];  // compared against root below, so entry 0 is treated as this rank
+  const int nextRank = ring->devUserRanks[1];  // entry 1 is treated as the rank that follows this one
+  const int root = args->root;
+
+  ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));  // in elements of T (byte count divided by sizeof(T))
+  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));  // no trailing /2 here, unlike the AllGather/AllReduce ring LL128 kernels
+
+  const ssize_t loopSize = args->nChannels*chunkSize;  // fixed from the initial chunkSize; not updated when chunkSize shrinks below
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);  // remaining work split over channels, rounded up to a multiple of minChunkSize; never grows
+    ssize_t offset = gridOffset + bid*chunkSize;
+
+    int nelem = min(chunkSize, size-offset);  // NOTE(review): not clamped at 0 when offset >= size; presumably the primitives tolerate nelem <= 0 — confirm
+    if (rank == root) {  // root only sends; it also fills its own output when the broadcast is out of place
+      if (thisInput == thisOutput) {
+        LLprims.send(thisInput+offset, nelem);
+      } else {
+        LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+      }
+    } else if (nextRank == root) {  // the next rank is the root, so the data is stored here and not forwarded
+      LLprims.recv(thisOutput + offset, nelem);
+    } else {  // every other rank stores the data and passes it on
+      LLprims.recvCopySend(thisOutput + offset, nelem);
+    }
+  }
+}
+
+template<int UNUSED, class FUNC, typename T>  // none of the template parameters are used
+__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }  // empty body: this tree/LL128 Broadcast variant does nothing
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index e4aecbd..46eb9f5 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,9 +7,8 @@
 #ifndef NCCL_DEVICE_COMMON_H_
 #define NCCL_DEVICE_COMMON_H_
 
-#include "../collectives.h"
-#include "core.h"
-#include "nccl.h"
+#include "collectives.h"
+#include "devcomm.h"
 
 // Exit If Abort Barrier across CTA: make sure all threads exit consistently
 // Each thread sets a predicate to true if abort == 1
@@ -31,17 +30,19 @@ extern __device__ ncclKern_t ncclFuncs[];
 static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
   int* d = (int*)dst;
   int* s = (int*)src;
-  // When aggregation is effective, if some threads have aborted inside the LL kernel,
-  // make sure the rest of the threads abort as well
-  exitIfAbortBarrier(0);
   for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
-  __syncthreads();
 }
-static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
+static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm) {
+  // Check whether the last operation was aborted and make sure all threads exit
+  int abort = tid == 0 ? *(comm->abortFlag) : 0;
+  exitIfAbortBarrier(abort);
   load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
+  __syncthreads();
   if (tid == 0) hostColl->active = 0;
 }
 
+extern __device__ volatile uint64_t* ncclShmem;
+
 /* Functions for aggregation case */
 #define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
 __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
@@ -51,13 +52,14 @@ __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
 #if NCCL_OP == 0
 /* Kernels with the first operation inlined */
 #define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
-__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
 __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
   int tid = threadIdx.x; \
   int bid = blockIdx.x; \
+  __shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \
+  ncclShmem = shmem; \
   __shared__ struct ncclColl localColl; \
  \
-  struct ncclComm* comm = firstColl.args.comm; \
+  struct ncclDevComm* comm = firstColl.args.comm; \
   struct ncclChannel* channel = comm->channels+bid; \
   struct ncclColl* c; \
   if (bid == 0) { \
@@ -65,7 +67,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
     c = &firstColl; \
   } else { \
     c = &localColl; \
-    load_coll(c, channel->devCollectives+channel->collFifoHead, tid); \
+    load_coll(c, channel->devCollectives+channel->collFifoHead, tid, comm); \
   } \
   while (1) { \
     if (tid < c->args.nThreads) { \
@@ -84,7 +86,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
  \
     /* Load next collective operation*/ \
     c = &localColl; /* for bid 0 */ \
-    load_coll(c, channel->devCollectives+nextIndex, tid); \
+    load_coll(c, channel->devCollectives+nextIndex, tid, comm); \
   } \
 }
 #else
@@ -93,13 +95,14 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
 
 // Only generate inline kernels for LL
 #define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
-  IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
   IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
-  IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \
+  IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) /* device function for the LL128-suffixed variant */ \
+  IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) /* device function for the unsuffixed variant, now emitted last */ \
+  IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, al, NCCL_PROTO_LL)) /* kernel only for LL; FUNC_INDEX now takes the algorithm before the protocol */ \
 
 #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
-  IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \
-  IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1)
+  IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
+  IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) // Tree is instantiated before Ring, the same order NCCL_FUNC4 uses in functions.cu; named NCCL_ALGO_* constants replace the literal 0/1
 
 #if NCCL_TYPE == 0
 #define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index e1fb096..aa1e936 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,7 +7,7 @@
 #ifndef NCCL_COMMON_KERNEL_H_
 #define NCCL_COMMON_KERNEL_H_
 
-#include "core.h"
+#include "devcomm.h"
 #include <cstdio>
 #include <cstdint>
 
@@ -263,8 +263,6 @@ __device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthread
   }
 }
 
-#define WARP_SIZE 32
-
 template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
 __device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
     int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index ea06b68..034fe96 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -1,20 +1,23 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "collectives.h"
 #include "common.h"
 
+__device__ volatile uint64_t* ncclShmem; // Single device-global pointer assigned by every kernel instance (ncclShmem = shmem in IMPL_COLL_KERN). NOTE(review): all blocks write it concurrently; presumably they all store the same generic address - confirm
+
 #define NCCL_FUNC5(coll, op, dtype) \
-  NCCL_COLL_NAME(coll, op, dtype), \
-  NCCL_COLL_NAME(coll##LL, op, dtype)
+  NCCL_COLL_NAME(coll##LL, op, dtype), \
+  NCCL_COLL_NAME(coll##LL128, op, dtype), \
+  NCCL_COLL_NAME(coll, op, dtype) // three table entries per algorithm, ordered LL, LL128, unsuffixed. NOTE(review): presumably must match the NCCL_PROTO_* numbering used by FUNC_INDEX - confirm
 
 #define NCCL_FUNC4(coll, op, dtype) \
-  NCCL_FUNC5(coll##Ring, op, dtype), \
-  NCCL_FUNC5(coll##Tree, op, dtype)
+  NCCL_FUNC5(coll##Tree, op, dtype), \
+  NCCL_FUNC5(coll##Ring, op, dtype) // Tree entries now precede Ring entries. NOTE(review): presumably must match the NCCL_ALGO_* numbering used by FUNC_INDEX - confirm
 
 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(coll, op) \
@@ -50,7 +53,7 @@
   NCCL_FUNCS3B(coll, copy), \
   NCCL_FUNCS3B(coll, copy)
 
-// Must be consistent with ncclColl_t
+// Must be consistent with ncclFunc_t
 #define NCCL_FUNCS() { \
   NCCL_FUNCS2B(ncclBroadcast), \
   NCCL_FUNCS2A(ncclReduce), \
@@ -59,7 +62,7 @@
   NCCL_FUNCS2A(ncclAllReduce) }
 
 // Must be consistent with the ncclFuncSet enum
-__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
+__device__ ncclKern_t ncclFuncs[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
 // Don't try to initialize the host shadow copy of this device-side global
 // variable. There is no host pointer to a device-side function, which
 // confuses clang. This will be fixed in the next clang release.
diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh
index 3942c8c..4413213 100755
--- a/src/collectives/device/gen_rules.sh
+++ b/src/collectives/device/gen_rules.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
diff --git a/src/collectives/device/op128.h b/src/collectives/device/op128.h
new file mode 100644
index 0000000..9405dc2
--- /dev/null
+++ b/src/collectives/device/op128.h
@@ -0,0 +1,36 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef OP128_H_
+#define OP128_H_
+
+inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) { // Reads two consecutive 64-bit values starting at ptr into v0 and v1 with a single volatile vector load.
+  asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" // PTX: .global state space, .v2.u64 = vector of two u64. NOTE(review): PTX vector accesses require alignment to the full vector size (16 bytes) - confirm callers guarantee it
+      : "=l"(v0), "=l"(v1) : "l"(ptr)); // "l" = 64-bit register operand; v0/v1 are outputs, ptr is the input address
+}
+
+inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) { // Writes v0 and v1 to two consecutive 64-bit slots starting at ptr with a single volatile vector store.
+  asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};" // PTX: .global state space, .v2.u64 = vector of two u64. NOTE(review): same 16-byte alignment expectation as load128 - confirm
+      :: "l"(v0), "l"(v1), "l"(ptr)); // no outputs; all three operands are 64-bit register inputs
+}
+
+inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) { // Converts a generic pointer into a shared-state-space address using PTX cvta.to.shared.
+  uint64_t* shmemAsmPtr; // written by the asm statement below before it is returned
+  asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr)); // %0 = converted address (output), %1 = generic address (input)
+  return shmemAsmPtr; // NOTE(review): presumably only meant to be passed to loadShmem128/storeShmem128, not dereferenced as an ordinary C++ pointer - confirm
+}
+
+inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) { // Reads two consecutive 64-bit values from the .shared state space into v0 and v1 with one volatile vector load.
+  asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];" // the address operand is interpreted in the .shared state space; NOTE(review): presumably must come from shmemCvtPtr - confirm
+      : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr)); // v0/v1 are outputs, shmemAsmPtr is the input address
+}
+
+inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) { // Writes v0 and v1 to two consecutive 64-bit slots in the .shared state space with one volatile vector store.
+  asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};" // the address operand is interpreted in the .shared state space; NOTE(review): presumably must come from shmemCvtPtr - confirm
+      :: "l"(v0), "l"(v1), "l"(shmemAsmPtr)); // no outputs; all three operands are 64-bit register inputs
+}
+
+#endif
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index c5aaf54..aa3d20d 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -37,20 +37,32 @@ class ncclPrimitives {
  private:
   const int tid;
   const int nthreads;
+  const int wid;
+  const int stepSize;
   int nrecv = 0;
   int nsend = 0;
-  const int stepSize;
-  struct ncclConnInfo* recvConn[NRECV];
-  struct ncclConnInfo* sendConn[NSEND];
-  volatile uint64_t* waitPtr;
+  struct ncclConnInfo* recvConn = NULL;
+  volatile uint64_t* recvConnHeadPtr = NULL;
+  uint64_t recvConnHead;
+  volatile uint64_t* recvConnTailPtr = NULL;
+  uint64_t recvConnTail;
+  uint64_t recvConnTailCache; // Cache last seen value
+
+  struct ncclConnInfo* sendConn = NULL;
+  volatile int* sendConnFifoPtr = NULL;
+  volatile uint64_t* sendConnTailPtr = NULL;
+  uint64_t sendConnTail;
+  volatile uint64_t* sendConnHeadPtr = NULL;
+  uint64_t sendConnHead;
+  uint64_t sendConnHeadCache; // Cache last seen value
+
   uint64_t recvStep[NRECV];
   uint64_t sendStep[NSEND];
-  uint64_t sendConnHead[NSEND];
   const T* recvDirectBuff[NRECV];
   T* sendDirectBuff[NSEND];
   const T* recvBuff[NRECV];
   T* sendBuff[NSEND];
-  struct ncclComm* comm;
+  struct ncclDevComm* comm;
 
   inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
   inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
@@ -60,15 +72,18 @@ class ncclPrimitives {
   inline __device__ void barrier() {
     asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
   }
+  inline __device__ void subBarrier() { // Barrier for nthreads-WARP_SIZE threads on barrier id 2, separate from barrier() which uses id 1 with nthreads threads; GenericOp calls it from threads with tid < nthreads-WARP_SIZE.
+    asm volatile ("bar.sync 2, %0;" :: "r"(nthreads-WARP_SIZE)); // PTX bar.sync a, b: a = barrier id, b = number of threads expected at the barrier
+  }
 
   uint32_t mismatch = 0;
   const uint64_t opCount;
 
-  inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
+  inline __device__ void checkMismatch(struct ncclConnInfo* conn) { // Two-step check: a call that sees the peer's opCount ahead of ours bumps mismatch; a later call with mismatch already set writes ncclDevAssertedMismatch to comm->fatalDevError.
     if (mismatch) {
       // In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch
       *(comm->fatalDevError) = ncclDevAssertedMismatch;
-    } else if (remoteOpCount && *remoteOpCount > opCount) {
+    } else if (conn && *conn->opCountRem > opCount) { // conn can be NULL: sendConn/recvConn default to NULL and are only set for threads with wid == i in loadSendConn/loadRecvConn
       mismatch += 1;
     }
   }
@@ -76,49 +91,55 @@ class ncclPrimitives {
   uint32_t spins = 0;
   uint32_t abort = 0;
 
-  inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
+  inline __device__ int checkAbort(int i, int send) { // Called once per spin iteration; every SPINS_BEFORE_CHECK_ABORT calls it re-reads comm->abortFlag and runs the mismatch check. Returns the current abort value (non-zero = stop waiting).
     spins++;
-    if (spins == SPINS_BEFORE_CHECK_ABORT) {
+    if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) { // once abort is non-zero the flag is no longer re-read, so the abort state is sticky
       abort = *(comm->abortFlag);
-      checkMismatch(remoteOpCount);
+      if (wid == i) checkMismatch(send ? sendConn : recvConn); // send != 0 selects the send connection, otherwise the receive connection; visible callers pass i == wid
       spins = 0;
     }
     return abort;
   }
 
-  inline __device__ void waitRecv(int i) {
+  inline __device__ void waitSend(int nbytes) { // Spins until the send connection has room for SLICESTEPS more steps, records nbytes in the size fifo when one exists, then advances the local head counter.
     spins = 0;
     mismatch = 0;
-    recvStep[i] += SLICESTEPS;
-    if (tid == i) {
-      while (*(waitPtr) < recvStep[i]) {
-        if (checkAbort(recvConn[i]->opCountRem)) break;
+    if (sendConnHeadPtr) { // non-NULL only on threads selected in loadSendSync() (tid < nsend); all other threads fall through
+      while (sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS) { // at most NCCL_STEPS steps may be outstanding beyond the last head value read from the peer
+        sendConnHeadCache = *sendConnHeadPtr; // refresh the cached head from the volatile pointer
+        if (checkAbort(wid, 1)) break; // stop spinning when an abort was observed
+      }
+      if (sendConnFifoPtr) { // fifo pointer is copied from sendConn->fifo and may be NULL
+        sendConnFifoPtr[sendConnHead%NCCL_STEPS] = nbytes; // slot index is taken before sendConnHead is advanced below
       }
+      sendConnHead += SLICESTEPS;
     }
   }
 
-  inline __device__ void waitSend(int i) {
+  inline __device__ void waitRecv() { // Spins until the receive connection's tail shows SLICESTEPS more steps are available, then advances the local tail counter.
     spins = 0;
     mismatch = 0;
-    sendStep[i] += SLICESTEPS;
-    if (tid == WARP_SIZE+i) {
-      while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) {
-        sendConnHead[i] = *waitPtr;
-        if (checkAbort(sendConn[i]->opCountRem)) break;
+    if (recvConnTailPtr) { // non-NULL only on threads selected in loadRecvSync() (WARP_SIZE <= tid < 2*WARP_SIZE with wid < nrecv)
+      while (recvConnTailCache < recvConnTail + SLICESTEPS) { // wait until the last tail value read covers the slice about to be consumed
+        recvConnTailCache = *recvConnTailPtr; // refresh the cached tail from the volatile pointer
+        if (checkAbort(wid, 0)) break; // stop spinning when an abort was observed
       }
+      recvConnTail += SLICESTEPS;
     }
   }
 
-  inline __device__ void postRecv(int i) {
-    *(recvConn[i]->head) = recvStep[i] += SLICESTEPS;
+  inline __device__ void incRecv(int i) { // Advances this thread's step counter for receive peer i by one slice; touches no shared or remote memory.
+    recvStep[i] += SLICESTEPS; // recvStep feeds recvOffset(i), which selects the buffer slot as (recvStep[i]%NCCL_STEPS)*stepSize
   }
-
-  inline __device__ void postSend(int i) {
-    *(sendConn[i]->tail) = sendStep[i] += SLICESTEPS;
+  inline __device__ void postRecv() { // Advances the local receive head by one slice and publishes it through the connection's head pointer.
+    if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += SLICESTEPS; // pointer is non-NULL only on threads selected in loadRecvSync() (tid >= nthreads-WARP_SIZE with wid < nrecv)
   }
 
-  inline __device__ void postSendSize(int i, int size) {
-    if (sendConn[i]->fifo) sendConn[i]->fifo[sendStep[i]%NCCL_STEPS] = size;
+  inline __device__ void incSend(int i) { // Advances this thread's step counter for send peer i by one slice; touches no shared or remote memory.
+    sendStep[i] += SLICESTEPS; // sendStep feeds sendOffset(i), which selects the buffer slot as (sendStep[i]%NCCL_STEPS)*stepSize
+  }
+  inline __device__ void postSend() { // Advances the local send tail by one slice and publishes it through the connection's tail pointer.
+    if (sendConnTailPtr) *sendConnTailPtr = sendConnTail += SLICESTEPS; // pointer is non-NULL only on threads selected in loadSendSync() (tid >= nthreads-WARP_SIZE with wid < nsend); GenericOp calls this after a conditional __threadfence_system()
   }
 
   template <int DIRECTRECV>
@@ -131,11 +152,22 @@ class ncclPrimitives {
     return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
   }
 
+  template <int DIRECTRECV> // compile-time switch: when 0 the direct branch can never be taken
+  inline __device__ int directRecvInc(int i, int directInc, int sliceInc) { // Picks the pointer increment for receive source i: directInc when a direct receive buffer is in use, otherwise sliceInc.
+    return DIRECTRECV && recvDirectBuff[i] ? directInc : sliceInc; // same condition directRecvPtr uses to choose the direct buffer
+  }
+
+  template <int DIRECTSEND> // compile-time switch: when 0 the direct branch can never be taken
+  inline __device__ int directSendInc(int i, int directInc, int sliceInc) { // Picks the pointer increment for send destination i: directInc when a direct send buffer is in use, otherwise sliceInc.
+    return DIRECTSEND && sendDirectBuff[i] ? directInc : sliceInc; // same condition directSendPtr uses to choose the direct buffer
+  }
+
   template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
   inline __device__ void
   GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
     int offset = 0;
-    int sliceSize = stepSize * SLICESTEPS;
+    int sliceSize = stepSize*SLICESTEPS;
+    int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32);
 
     const T* srcs[RECV*NRECV+SRC];
     srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
@@ -151,101 +183,126 @@ class ncclPrimitives {
       for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
     }
 
-    #pragma unroll 1
+    bool syncThread = tid >= nthreads-WARP_SIZE;
+
+    #pragma unroll
     for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
-      int realSize = max(0, min(sliceSize, nelem-offset));
-      if (tid < nthreads) {
-        FOR_SEND(waitSend);
-        FOR_RECV(waitRecv);
+      int realSize = max(0, min(dataSize, nelem-offset));
+      if (!syncThread) {
+        if (SEND) waitSend(realSize*sizeof(T));
+        if (RECV) waitRecv();
         if (realSize > 0) {
-          barrier();
+          subBarrier();
           if (DIRECTRECV && recvDirectBuff[0]) {
             // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
             if (SEND) {
-              ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
+              ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads-WARP_SIZE, 1, srcs, nsend, dsts+1, realSize);
             }
           } else {
-            ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
+            ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads-WARP_SIZE, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
           }
         }
-        exitIfAbortBarrier(abort);
-      } else {
-        exitIfAbortBarrier(abort);
-        FOR_SEND(postSendSize, realSize*sizeof(T));
-        if (SEND) __threadfence_system();
-        FOR_SEND(postSend);
-        FOR_RECV(postRecv);
       }
-      for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
-      for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
-      offset += sliceSize;
+      barrier();
+      FOR_SEND(incSend);
+      FOR_RECV(incRecv);
+      if (syncThread) {
+        if (SEND) {
+          if (realSize > 0 && wid == 0) __threadfence_system();
+          __syncwarp();
+          postSend();
+        }
+        if (RECV) postRecv();
+      }
+      srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize);
+      for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize;
+      dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize);
+      for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize);
+      offset += realSize;
     }
   }
 
   __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
-    recvConn[i] = conn;
-    recvBuff[i] = (const T*)recvConn[i]->buff;
-    recvStep[i] = recvConn[i]->step;
+    recvBuff[i] = (const T*)conn->buff; // every thread records the buffer and step for receive peer i
+    recvStep[i] = conn->step; // starting step saved by the previous operation; rounded up on the next line
     recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
-    // Return credits in case we rounded up.
-    if (tid == nthreads) *recvConn[i]->head = recvStep[i];
-    if (tid == i) {
-      waitPtr = recvConn[i]->tail;
-      *(recvConn[i]->opCountLoc) = opCount;
-    }
     recvDirectBuff[i] = NULL;
-    if (directBuff && recvConn[i]->direct) {
+    if (directBuff && conn->direct) { // direct mode needs both a caller-provided buffer and a connection flagged direct
       recvDirectBuff[i] = directBuff;
-      if (tid == 0) *recvConn[i]->ptrExchange = directBuff;
+      if (tid == 0) *conn->ptrExchange = directBuff; // thread 0 publishes the buffer address; loadSendConn spins on ptrExchange until it is non-NULL
     }
+    if (wid == i) recvConn = conn; // only threads whose wid (tid%WARP_SIZE) equals the peer index keep the connection pointer; the head/tail writes formerly done here now live in loadRecvSync()
+    if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up
     nrecv++;
   }
+  __device__ __forceinline__ void loadRecvSync() { // Assigns receive-side sync roles to specific threads; the constructor calls it after the loadRecvConn() loop has set recvConn and nrecv.
+    if (tid >= WARP_SIZE && tid < 2*WARP_SIZE && wid<nrecv) { // threads WARP_SIZE..2*WARP_SIZE-1 with wid < nrecv become the tail pollers used by waitRecv()
+      recvConnTailPtr = recvConn->tail;
+      recvConnTailCache = *recvConnTailPtr; // seed the cache with the current tail value
+    }
+    if (tid >= nthreads-WARP_SIZE && wid < nrecv) { // threads at or above nthreads-WARP_SIZE with wid < nrecv own the head pointer used by postRecv()
+      recvConnHeadPtr = recvConn->head;
+      // Return credits in case we rounded up.
+      *recvConnHeadPtr = recvConnHead; // recvConnHead was set to the rounded-up step in loadRecvConn()
+      // Update opCount in case we skipped some operations
+      *(recvConn->opCountLoc) = opCount;
+    }
+  }
 
   __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
-    sendConn[i] = conn;
-    sendBuff[i] = (T*)sendConn[i]->buff;
-    sendStep[i] = sendConn[i]->step;
+    sendBuff[i] = (T*)conn->buff; // every thread records the buffer and step for send peer i
+    sendStep[i] = conn->step; // starting step saved by the previous operation; rounded up on the next line
     sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
-    if (tid == WARP_SIZE+i) {
-      waitPtr = sendConn[i]->head;
-      sendConnHead[i] = *waitPtr;
-      *(sendConn[i]->opCountLoc) = opCount;
-    }
     sendDirectBuff[i] = NULL;
-    if (directBuff && sendConn[i]->direct) {
-      void* volatile* ptr = sendConn[i]->ptrExchange;
+    if (directBuff && conn->direct) { // direct mode needs both a caller-provided buffer and a connection flagged direct
+      void* volatile* ptr = conn->ptrExchange; // slot through which the receiving side publishes its direct buffer (see loadRecvConn)
       while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
-      __syncthreads();
+      barrier(); // named barrier 1 over nthreads threads replaces the former __syncthreads(); all threads read *ptr before thread 0 resets it below
       if (tid == 0) *ptr = NULL;
     }
+    if (wid == i) sendConn = conn; // only threads whose wid (tid%WARP_SIZE) equals the peer index keep the connection pointer; the head/opCount setup formerly done here now lives in loadSendSync()
+    if (wid == i) sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up
     nsend++;
   }
+  __device__ __forceinline__ void loadSendSync() { // Assigns send-side sync roles to specific threads; the constructor calls it after the loadSendConn() loop has set sendConn and nsend.
+    if (tid < nsend) { // threads 0..nsend-1 become the head pollers used by waitSend() and later save the step in saveSendSync()
+      sendConnHeadPtr = sendConn->head;
+      sendConnHeadCache = *sendConnHeadPtr; // seed the cache with the current head value
+      sendConnFifoPtr = sendConn->fifo; // may be NULL; waitSend() checks before writing sizes
+      *(sendConn->opCountLoc) = opCount; // publish the operation count for this connection
+    }
+    if (tid >= nthreads-WARP_SIZE && wid<nsend) { // threads at or above nthreads-WARP_SIZE with wid < nsend own the tail pointer used by postSend()
+      sendConnTailPtr = sendConn->tail;
+    }
+  }
 
-  __device__ __forceinline__ void saveRecvConn(int i) {
-    if (tid == i) {
-      recvConn[i]->step = recvStep[i];
+  __device__ __forceinline__ void saveRecvSync() { // Stores the receive progress back into the connection so the next operation can resume from it; called from the destructor.
+    if (tid >= nthreads-WARP_SIZE && wid < nrecv) { // same threads that own recvConnHeadPtr in loadRecvSync(), i.e. the ones whose recvConnHead is advanced by postRecv()
+      recvConn->step = recvConnHead;
+      *(recvConn->opCountLoc) = opCount+1; // written as an absolute value; the removed code incremented the stored value after the fence instead
       __threadfence_system();
-      *(recvConn[i]->opCountLoc) += 1;
     }
   }
 
-  __device__ __forceinline__ void saveSendConn(int i) {
-    if (tid == WARP_SIZE+i) {
-      sendConn[i]->step = sendStep[i];
+  __device__ __forceinline__ void saveSendSync() { // Stores the send progress back into the connection so the next operation can resume from it; called from the destructor.
+    if (tid < nsend) { // same threads that poll the head in waitSend(), i.e. the ones whose sendConnHead is advanced there
+      sendConn->step = sendConnHead;
+      *(sendConn->opCountLoc) = opCount+1; // written as an absolute value; the removed code incremented the stored value after the fence instead
       __threadfence_system();
-      *(sendConn[i]->opCountLoc) += 1;
     }
   }
 
  public:
   __device__ __forceinline__
-  ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
-    : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
-    // Make sure step is updated before we read it
-    __syncthreads();
+  ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) // recvPeers/sendPeers are read until a negative entry or NRECV/NSEND entries, whichever comes first
+    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize), opCount(opCount) { // wid = tid%WARP_SIZE, used to map threads to peer connections
+    // Make sure step is updated before we read it.
+    barrier(); // named barrier 1 over nthreads threads replaces the former block-wide __syncthreads()
 
     for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff);
     for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff);
+    loadRecvSync(); // must follow the loops above: relies on recvConn and nrecv set by loadRecvConn()
+    loadSendSync(); // must follow the loops above: relies on sendConn and nsend set by loadSendConn()
   }
 
   __device__ __forceinline__ void
@@ -305,283 +362,13 @@ class ncclPrimitives {
   }
 
   __device__ __forceinline__ ~ncclPrimitives() {
-    // Save steps for next collective. Have thread 0 do it to be compatible
-    // with the way LL works.
-    for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
-    for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
+    // Save steps for the next operation
+    saveRecvSync(); // no per-peer loop any more: each selected thread saves the single connection it owns
+    saveSendSync(); // likewise for the send side (threads with tid < nsend)
   }
 };
 
-template <typename T, class FUNC, int NRECV, int NSEND>
-class ncclLLPrimitives {
- private:
-  const int tid;
-  const int nthreads;
-  int nrecv = 0;
-  int nsend = 0;
-  struct ncclConnInfo* recvConn[NRECV];
-  struct ncclConnInfo* sendConn[NSEND];
-  volatile uint64_t* waitPtr;
-  volatile uint64_t* postPtr;
-  volatile int* fifoPtr;
-  uint64_t recvStep[NRECV];
-  uint64_t sendStep[NSEND];
-  uint64_t sendConnHead;
-  union ncclLLFifoLine* recvBuff[NRECV];
-  union ncclLLFifoLine* sendBuff[NSEND];
-  struct ncclComm* comm;
-
-  inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
-  inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
-  inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
-  inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
-  inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; }
-  inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; }
-
-  // Exit If Abort Barrier : make sure all threads exit consistently
-  // Each thread sets a predicate to true if val == 1
-  // all CTA's threads enter the barrier and do a popc on their predicates being True
-  // If any of the thread's predicate was True, all the threads call exit()
-  inline __device__ void exitIfAbortLocalBarrier() {
-    uint32_t popc;
-    asm ("{");
-    asm volatile ("   .reg .pred barr_pred;");
-    asm volatile ("   setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
-    asm volatile ("   bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads));
-    asm ("}");
-    if (popc) {
-      // Make sure threads not participating in the operation get the abort and all threads exit
-      exitIfAbortBarrier(1);
-    }
-  }
-
-  inline __device__ void barrier() {
-    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
-  }
-
-  uint32_t mismatch = 0;
-  const uint64_t opCount;
-
-  inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
-    if (mismatch > 20) {
-      // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
-      // Note that we are not using _threadfence_system in LL so the error cannot be asserted
-      *(comm->fatalDevError) = ncclDevSuspectedMismatch;
-    } else if (remoteOpCount && *remoteOpCount > opCount) {
-      mismatch += 1;
-    }
-  }
-
-  uint32_t spins = 0;
-  uint32_t abort = 0;
-
-  inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
-    spins++;
-    if (spins == SPINS_BEFORE_CHECK_ABORT) {
-      abort = *(comm->abortFlag);
-      checkMismatch(remoteOpCount);
-      spins = 0;
-    }
-    return abort;
-  }
-
-  inline __device__ void waitSend(int i, int nbytes) {
-    spins = 0;
-    mismatch = 0;
-    if (tid == WARP_SIZE+i) {
-      while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) {
-        sendConnHead = *waitPtr;
-        if (checkAbort(sendConn[i]->opCountRem)) break;
-      }
-      if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes;
-    }
-  }
+#include "prims_ll.h"
+//#include "prims_ll128.h"
 
-  inline __device__ void postRecv(int i) {
-    recvStep[i]++;
-    if (tid == i) *postPtr = recvStep[i];
-  }
-
-  inline __device__ void postSend(int i) {
-    sendStep[i]++;
-  }
-
-  __device__ uint64_t readLL(int i, int offset) {
-    union ncclLLFifoLine* src = recvPtr(i) + offset;
-    uint32_t flag = recvFlag(i);
-    uint32_t data1, flag1, data2, flag2;
-    spins = 0;
-    mismatch = 0;
-    do {
-      asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
-      if (checkAbort(recvConn[i]->opCountRem)) break;
-    } while ((flag1 != flag) || (flag2 != flag));
-    uint64_t val64 = data1 + (((uint64_t)data2) << 32);
-    return val64;
-  }
-
-  __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
-    asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
-  }
-
-  // Using memcpy handles misaligned pointers.
-  __device__ uint64_t readAL(uint64_t* src) {
-    uint64_t val;
-    memcpy((char*)&val, (char*)src, sizeof(uint64_t));
-    return val;
-  }
-
-  __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
-    memcpy((char*)dst, (char*)&val, nbytes);
-  }
-
-  template <int RECV, int SEND, int SRC, int DST>
-  __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
-    uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
-    FOR_SEND(waitSend, nbytes*2);
-    barrier();
-    uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
-    uint64_t* srcPack = (uint64_t*)srcPtr;
-    uint64_t* dstPack = (uint64_t*)dstPtr;
-    // Do multiples of 64 bits
-    #pragma unroll 2
-    for (int offset=tid; offset<npack; offset+=nthreads) {
-      // Recv : local, then intra-node, then inter-node
-      uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
-      if (RECV) {
-        if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
-        for (int i=1; i<NRECV && i<nrecv; i++) {
-          val = MULTI<FUNC, T>()(readLL(i, offset), val);
-        }
-      }
-
-      // Send : inter-node, then intra-node, then local
-      if (SEND) {
-        for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
-        storeLL(sendPtr(0)+offset, val, sendFlag(0));
-      }
-      if (DST) {
-        if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
-          // Last incomplete word
-          storeAL(dstPack+offset, val, nbytes & 0x7);
-        } else {
-          storeAL(dstPack+offset, val, sizeof(uint64_t));
-        }
-      }
-    }
-    exitIfAbortLocalBarrier();
-    FOR_RECV(postRecv);
-    FOR_SEND(postSend);
-  }
-
-  __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
-    recvConn[i] = conn;
-    recvBuff[i] = recvConn[i]->llBuff;
-    recvStep[i] = recvConn[i]->step;
-    if (tid == i) {
-      postPtr = recvConn[i]->head;
-      *(recvConn[i]->opCountLoc) = opCount;
-    }
-    nrecv++;
-  }
-
-  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
-    sendConn[i] = conn;
-    sendBuff[i] = sendConn[i]->llBuff;
-    sendStep[i] = sendConn[i]->step;
-    if (tid == WARP_SIZE+i) {
-      waitPtr = sendConn[i]->head;
-      fifoPtr = sendConn[i]->fifo;
-      sendConnHead = *waitPtr;
-      *(sendConn[i]->opCountLoc) = opCount;
-    }
-    nsend++;
-  }
-
-  __device__ __forceinline__ void saveRecvConn(int i) {
-    if (tid == i) {
-      recvConn[i]->step = recvStep[i];
-      *(recvConn[i]->opCountLoc) += 1;
-      __threadfence_block();
-    }
-  }
-
-  __device__ __forceinline__ void saveSendConn(int i) {
-    if (tid == WARP_SIZE+i) {
-      sendConn[i]->step = sendStep[i];
-      *(sendConn[i]->opCountLoc) += 1;
-      __threadfence_block();
-    }
-  }
-
-  __device__ __forceinline__ void llSendCleaning(int i) {
-    if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      /* Reset all flags */
-      static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS");
-      static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS");
-      for (int s=0; s<NCCL_STEPS; s++) {
-        waitSend(i, 0);
-        for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) {
-          const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) };
-          sendPtr(i)[o].i4 = resetLine.i4;
-        }
-      }
-      if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i];
-    }
-  }
-
-  __device__ __forceinline__ void llRecvCleaning(int i) {
-    if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      recvStep[i] += NCCL_STEPS;
-      if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i];
-    }
-  }
-
- public:
-  __device__ __forceinline__
-  ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
-    : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
-    // Make sure step is updated before we read it.
-    barrier();
-
-    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
-    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
-  }
-
-  __device__ void send(const T* src, int nelem) {
-    return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
-  }
-
-  __device__ void recv(T* dst, int nelem) {
-    return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
-  }
-
-  __device__ void recvReduceSend(const T* src, int nelem) {
-    return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
-  }
-
-  __device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
-    return LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
-  }
-
-  __device__ void copySend(const T* src, T* dst, int nelem) {
-    return LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
-  }
-
-  __device__ void recvCopySend(T* dst, int nelem) {
-    return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
-  }
-
-  __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
-    return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
-  }
-
-  __device__ __forceinline__ ~ncclLLPrimitives() {
-    for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i);
-    for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i);
-    // Save steps for the next operation
-    for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
-    for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
-  }
-};
 #endif
diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h
new file mode 100644
index 0000000..f919493
--- /dev/null
+++ b/src/collectives/device/prims_ll.h
@@ -0,0 +1,259 @@
+template <typename T, class FUNC, int NRECV, int NSEND>
+class ncclLLPrimitives {  // Send/recv/reduce primitives over LL FIFO lines; every line carries data words together with flag words (see readLL/storeLL)
+ private:
+  const int tid;
+  const int nthreads;
+  const int wid;  // tid%WARP_SIZE, set by the constructor
+  int nrecv = 0;  // Incremented once per loadRecvConn() call
+  int nsend = 0;  // Incremented once per loadSendConn() call
+  struct ncclConnInfo* recvConn = NULL;  // Set only on threads whose wid equals a loaded recv index
+  volatile uint64_t* recvConnHeadPtr = NULL;  // Non-NULL only on the threads selected in loadRecvSync()
+  uint64_t recvConnHead;
+
+  struct ncclConnInfo* sendConn = NULL;  // Set only on threads whose wid equals a loaded send index
+  volatile int* sendConnFifoPtr = NULL;
+  volatile uint64_t* sendConnHeadPtr = NULL;  // Non-NULL only on the threads selected in loadSendSync()
+  uint64_t sendConnHead;
+  uint64_t sendConnHeadCache; // Cache last seen value
+
+  uint64_t recvStep[NRECV];
+  uint64_t sendStep[NSEND];
+  union ncclLLFifoLine* recvBuff[NRECV];
+  union ncclLLFifoLine* sendBuff[NSEND];
+  struct ncclDevComm* comm;
+  // Per-peer helpers: the slice in use is selected by step%NCCL_STEPS and the expected flag is derived from step+1.
+  inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+  inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+  inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
+  inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
+  inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
+  inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
+  // Named barrier 1, with nthreads participating threads.
+  inline __device__ void barrier() {
+    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+  }
+
+  uint32_t mismatch = 0;  // Times checkMismatch() saw the remote opCount ahead of ours since the last reset
+  const uint64_t opCount;
+  // Flags a suspected mismatch once the counter exceeds 20; otherwise bumps the counter when *conn->opCountRem > opCount.
+  inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
+    if (mismatch > 20) {
+      // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
+      // Note that we are not using _threadfence_system in LL so the error cannot be asserted
+      *(comm->fatalDevError) = ncclDevSuspectedMismatch;
+    } else if (conn && *conn->opCountRem > opCount) {
+      mismatch += 1;
+    }
+  }
+
+  uint32_t spins = 0;
+  uint32_t abort = 0;
+  // Counts one spin; every SPINS_BEFORE_CHECK_ABORT spins re-reads the abort flag and lets the thread with wid==i run checkMismatch(). Returns the abort value.
+  inline __device__ int checkAbort(int i, int send) {
+    spins++;
+    if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
+      abort = *(comm->abortFlag);
+      if (wid == i) checkMismatch(send ? sendConn : recvConn);
+      spins = 0;
+    }
+    return abort;
+  }
+  // Threads holding a send head pointer spin until one more step fits, post a size to the fifo (if any) and advance the head; every thread then hits the barrier.
+  inline __device__ void waitSend(int nbytes) {
+    spins = 0;
+    mismatch = 0;
+    if (sendConnHeadPtr) {
+      while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
+        sendConnHeadCache = *sendConnHeadPtr;
+        if (checkAbort(wid, 1)) break;
+      }
+      if (sendConnFifoPtr) {
+        int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;  // same mask test as the cleanup in incSend(), which writes the whole slice
+        sendConnFifoPtr[sendConnHead%NCCL_STEPS] = size;
+      }
+      sendConnHead += 1;
+    }
+    barrier();
+  }
+
+  inline __device__ void incRecv(int i) {
+    recvStep[i] += 1;
+  }
+  inline __device__ void postRecv() {
+    barrier();  // presumably so that every thread is done reading before the head is published - TODO confirm
+    if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1;
+  }
+
+  inline __device__ void incSend(int i, int offset) {
+    // LL Cleanup : write all flags in the slice to make sure we don't have
+    // data corruption when flag loops over.
+    if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
+      for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
+    }
+    sendStep[i]++;
+  }
+  // Spin on a 16-byte volatile load until both flag words equal the expected flag (or abort is set), then return the two data words as one 64-bit value.
+  __device__ uint64_t readLL(int i, int offset) {
+    union ncclLLFifoLine* src = recvPtr(i) + offset;
+    uint32_t flag = recvFlag(i);
+    uint32_t data1, flag1, data2, flag2;
+    spins = 0;
+    mismatch = 0;
+    do {
+      asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
+      if (checkAbort(i, 0)) break;
+    } while ((flag1 != flag) || (flag2 != flag));
+    uint64_t val64 = data1 + (((uint64_t)data2) << 32);
+    return val64;
+  }
+  // Write one line as {low 32 bits, flag, high 32 bits, flag} with a single 16-byte volatile store.
+  __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
+    asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
+  }
+
+  // Using memcpy handles misaligned pointers.
+  __device__ uint64_t readAL(uint64_t* src) {
+    uint64_t val;
+    memcpy((char*)&val, (char*)src, sizeof(uint64_t));
+    return val;
+  }
+
+  __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
+    memcpy((char*)dst, (char*)&val, nbytes);
+  }
+  // Generic step. RECV/SEND: use the recv/send peers; SRC/DST: read srcPtr / write dstPtr. nelem counts elements of T; a negative nelem is treated as 0 bytes.
+  template <int RECV, int SEND, int SRC, int DST>
+  __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
+    uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
+    uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));  // number of 64-bit words, rounded up
+    uint64_t* srcPack = (uint64_t*)srcPtr;
+    uint64_t* dstPack = (uint64_t*)dstPtr;
+    int offset = tid;
+
+    // Always waitSend in case of cleanup
+    if (SEND) waitSend(npack*sizeof(union ncclLLFifoLine));
+
+    // Do multiples of 64 bits
+    #pragma unroll 2
+    for (; offset<npack; offset+=nthreads) {
+      // Recv : local, then intra-node, then inter-node
+      uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
+      if (RECV) {
+        if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
+        for (int i=1; i<NRECV && i<nrecv; i++) {
+          val = MULTI<FUNC, T>()(readLL(i, offset), val);
+        }
+      }
+
+      // Send : inter-node, then intra-node, then local
+      if (SEND) {
+        for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
+        storeLL(sendPtr(0)+offset, val, sendFlag(0));
+      }
+      if (DST) {
+        if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
+          // Last incomplete word
+          storeAL(dstPack+offset, val, nbytes & 0x7);
+        } else {
+          storeAL(dstPack+offset, val, sizeof(uint64_t));
+        }
+      }
+    }
+    FOR_RECV(incRecv); if (RECV) postRecv();
+    FOR_SEND(incSend, offset);  // offset is this thread's first index past npack, where the cleanup writes in incSend() start
+  }
+  // Cache buffer and step of connection i; the thread with wid==i also keeps the connection pointer.
+  __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
+    recvBuff[i] = conn->llBuff;
+    recvStep[i] = conn->step;
+    if (wid == i) recvConn = conn;
+    nrecv++;
+  }
+  __device__ __forceinline__ void loadRecvSync() {
+    if (tid >= nthreads-WARP_SIZE && wid < nrecv) {  // among the last WARP_SIZE threads, those with wid < nrecv
+      recvConnHeadPtr = recvConn->head;
+      recvConnHead = recvConn->step;
+      // Update opCount in case we skipped some operations
+      *(recvConn->opCountLoc) = opCount;
+    }
+  }
+
+  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
+    sendBuff[i] = conn->llBuff;
+    sendStep[i] = conn->step;
+    if (wid == i) sendConn = conn;
+    nsend++;
+  }
+  __device__ __forceinline__ void loadSendSync() {
+    if (tid < nsend) {  // the first nsend threads, one per send connection
+      sendConnHeadPtr = sendConn->head;
+      sendConnHeadCache = *sendConnHeadPtr;
+      sendConnHead = sendConn->step;
+      sendConnFifoPtr = sendConn->fifo;
+      *(sendConn->opCountLoc) = opCount;
+    }
+  }
+  // Write the local head back to the connection and store opCount+1 to opCountLoc (same thread selection as loadRecvSync()).
+  __device__ __forceinline__ void saveRecvSync() {
+    if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+      recvConn->step = recvConnHead;
+      *(recvConn->opCountLoc) = opCount+1;
+      __threadfence_block();
+    }
+  }
+
+  __device__ __forceinline__ void saveSendSync() {
+    if (tid < nsend) {
+      sendConn->step = sendConnHead;
+      *(sendConn->opCountLoc) = opCount+1;
+      __threadfence_block();
+    }
+  }
+
+ public:
+  __device__ __forceinline__
+  ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
+    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), opCount(opCount) {
+    // Make sure step is updated before we read it.
+    barrier();
+    // Peer lists are consumed until the first negative entry, or until NRECV/NSEND entries have been loaded.
+    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
+    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
+    loadRecvSync();
+    loadSendSync();
+  }
+  // Public operations: thin wrappers selecting the <RECV, SEND, SRC, DST> combination of LLGenericOp.
+  __device__ void send(const T* src, int nelem) {
+    return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
+  }
+
+  __device__ void recv(T* dst, int nelem) {
+    return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
+  }
+
+  __device__ void recvReduceSend(const T* src, int nelem) {
+    return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
+  }
+
+  __device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
+    return LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ void copySend(const T* src, T* dst, int nelem) {
+    return LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ void recvCopySend(T* dst, int nelem) {
+    return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
+  }
+
+  __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
+    return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ __forceinline__ ~ncclLLPrimitives() {
+    // Save steps for the next operation
+    saveRecvSync();
+    saveSendSync();
+  }
+};
diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h
new file mode 100644
index 0000000..40a8cff
--- /dev/null
+++ b/src/collectives/device/prims_ll128.h
@@ -0,0 +1,410 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "op128.h"
+
+#define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)
+
+template <typename T, class FUNC, int NRECV, int NSEND>
+class ncclLL128Primitives {  // Send/recv/reduce primitives moving data with 128-bit accesses; threads with (tid%8)==7 carry a flag instead of a second data word
+ private:
+  const int tid;
+  const int nthreads;
+  const int wid;  // tid%WARP_SIZE, set by the constructor
+  const int warp;  // tid/WARP_SIZE, set by the constructor
+  const bool flagThread;  // (tid%8)==7: this thread's second 64-bit word of each 128-bit access is the flag
+  int nrecv = 0;  // Incremented once per loadRecvConn() call
+  int nsend = 0;  // Incremented once per loadSendConn() call
+  struct ncclConnInfo* recvConn = NULL;  // Set only on threads whose wid equals a loaded recv index
+  volatile uint64_t* recvConnHeadPtr = NULL;
+  uint64_t recvConnHead;
+
+  struct ncclConnInfo* sendConn = NULL;  // Set only on threads whose wid equals a loaded send index
+  volatile int* sendConnFifoPtr = NULL;
+  volatile uint64_t* sendConnTailPtr = NULL;  // Set in loadSendSync() only when the connection has a fifo
+  uint64_t sendConnTail;
+  volatile uint64_t* sendConnHeadPtr = NULL;
+  uint64_t sendConnHead;
+  uint64_t sendConnHeadCache; // Cache last seen value
+
+  uint64_t recvStep[NRECV];
+  uint64_t sendStep[NSEND];
+  uint64_t* recvBuff[NRECV];
+  uint64_t* sendBuff[NSEND];
+  struct ncclDevComm* comm;
+
+  volatile uint64_t* shmem;  // Per-thread pointer into ncclShmem, computed in the constructor (per-warp region plus 2*wid)
+  // Per-peer helpers: the slice in use is selected by step%NCCL_STEPS and the flag is step+1, kept as a full 64-bit value.
+  inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; }
+  inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; }
+  inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
+  inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
+  inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; }
+  inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; }
+  // Named barrier 2 when NSEND>NRECV, named barrier 3 otherwise, with nthreads participating threads.
+  inline __device__ void barrier() {
+    if (NSEND>NRECV) {
+      asm volatile ("bar.sync 2, %0;" :: "r"(nthreads));
+    } else {
+      asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
+    }
+  }
+
+  uint32_t mismatch = 0;  // Times checkMismatch() saw the remote opCount ahead of ours since the last reset
+  const uint64_t opCount;
+  // Flags a suspected mismatch once the counter exceeds 20; otherwise bumps the counter when *conn->opCountRem > opCount.
+  inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
+    if (mismatch > 20) {
+      // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
+      // Note that we are not using _threadfence_system in LL so the error cannot be asserted
+      *(comm->fatalDevError) = ncclDevSuspectedMismatch;
+    } else if (conn && *conn->opCountRem > opCount) {
+      mismatch += 1;
+    }
+  }
+
+  uint32_t spins = 0;
+  uint32_t abort = 0;
+  // Counts one spin; every SPINS_BEFORE_CHECK_ABORT spins re-reads the abort flag and lets the thread with wid==i run checkMismatch(). Returns the abort value.
+  inline __device__ int checkAbort(int i, int send) {
+    spins++;
+    if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
+      abort = *(comm->abortFlag);
+      if (wid == i) checkMismatch(send ? sendConn : recvConn);
+      spins = 0;
+    }
+    return abort;
+  }
+  // Threads holding a send head pointer spin until one more step fits, post nbytes to the fifo (if any) and advance the head. No barrier is issued here.
+  inline __device__ void waitSend(int nbytes) {
+    spins = 0;
+    mismatch = 0;
+    if (sendConnHeadPtr) {
+      while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
+        sendConnHeadCache = *sendConnHeadPtr;
+        if (checkAbort(wid, 1)) break;
+      }
+      if (sendConnFifoPtr) {
+        sendConnFifoPtr[sendStep[wid]%NCCL_STEPS] = nbytes;
+      }
+      sendConnHead += 1;
+    }
+  }
+
+  inline __device__ void incRecv(int i) {
+    recvStep[i] += 1;
+  }
+  inline __device__ void postRecv() {
+    if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1;
+  }
+
+  inline __device__ void incSend(int i) {
+    sendStep[i] += 1;
+  }
+  inline __device__ void postSend() {
+    if (sendConnTailPtr) { __threadfence(); *sendConnTailPtr = sendConnTail += 1; }  // fence first, then publish the incremented tail
+  }
+  // Copies pairs of 64-bit words from src64Ptr into this thread's shared-memory slot, one 128-bit access per pair at word offset u*WARP_SIZE, skipped once u*WARP_SIZE reaches maxOffset.
+  template <int ELEMS_PER_THREAD>
+  inline __device__ void loadSrcToShmem128(int maxOffset, const uint64_t* src64Ptr) {
+#if 0
+    uint64_t v[ELEMS_PER_THREAD];
+    #pragma unroll
+    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+      if (u*WARP_SIZE < maxOffset) load128(src64Ptr+u*WARP_SIZE, v[u], v[u+1]);
+    }
+    uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
+    #pragma unroll
+    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+      storeShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
+    }
+#else
+    uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
+    #pragma unroll
+    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+      if (u*WARP_SIZE < maxOffset) {
+        uint64_t v0, v1;
+        load128(src64Ptr+u*WARP_SIZE, v0, v1);
+        storeShmem128(shmemAsmPtr+u*WARP_SIZE, v0, v1);
+      }
+    }
+#endif
+  }
+  // Element-wise copy of [start, end) from srcPtr into shared memory, each thread striding by WARP_SIZE; shmem-2*wid removes the per-thread 2*wid offset added in the constructor.
+  inline __device__ void loadSrcToShmem(int start, int end, const T* srcPtr) {
+    T* shmemPtr = (T*)(shmem-2*wid);
+    for (int offset = start+wid; offset < end; offset += WARP_SIZE) {
+      shmemPtr[offset] = srcPtr[offset];
+    }
+  }
+  // Counterpart of loadSrcToShmem128: all pairs are read from shared memory, but only those with u*WARP_SIZE < maxOffset are stored to dst64Ptr.
+  template <int ELEMS_PER_THREAD>
+  inline __device__ void storeShmemToDst128(int maxOffset, uint64_t* dst64Ptr) {
+    uint64_t v[ELEMS_PER_THREAD];
+    uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
+    #pragma unroll
+    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+      loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
+    }
+    #pragma unroll
+    for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+      if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
+    }
+  }
+  // Element-wise copy of [start, end) from shared memory to dstPtr, each thread striding by WARP_SIZE.
+  inline __device__ void storeShmemToDst(int start, int end, T* dstPtr) {
+    T* shmemPtr = (T*)(shmem-2*wid);
+    for (int offset = start+wid; offset < end; offset += WARP_SIZE) {
+      dstPtr[offset] = shmemPtr[offset];
+    }
+  }
+
+  #define WARP_MASK 0xffffffff
+  // One warp-level step at ll128Offset: load from shmem (SRC), poll and reduce from the recv peers (RECV), store to the send peers (SEND), write back to shmem (DST).
+  template <int ELEMS_PER_THREAD, int RECV, int SEND, int SRC, int DST>
+  __device__ __forceinline__ void recvReduceSendCopy(int ll128Offset) {
+    uint64_t v[ELEMS_PER_THREAD];
+
+    /************* Data Loading : SHMEM -> REG **************/
+    if (SRC) {
+      volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS;
+      #pragma unroll
+      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+        v[u] = shmem64Ptr[u*(WARP_SIZE-2)];
+        if (!flagThread) v[u+1] = shmem64Ptr[u*(WARP_SIZE-2)+1];  // flag threads have no second data word
+      }
+    }
+    /*********** End Data Loading : SHMEM -> REG ************/
+
+    /************************ Recv **************************/
+    if (RECV) {
+      uint64_t flag = recvFlag(0);
+      uint64_t* ptr = recvPtr(0)+ll128Offset;
+      bool needReload;
+      uint64_t v0, v1;
+      do {
+        needReload = false;
+        #pragma unroll
+        for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+          load128(ptr+u*WARP_SIZE, v0, v1);
+          needReload |= flagThread && (v1 != flag);  // only flag threads compare; the warp vote below makes every lane retry
+        }
+      } while (__any_sync(WARP_MASK, needReload) && checkAbort(0, 0) == 0);
+      #pragma unroll
+      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+        load128(ptr+u*WARP_SIZE, v0, v1);
+        v[u] = SRC ? MULTI<FUNC, T>()(v0, v[u]) : v0;
+        v[u+1] = SRC ? MULTI<FUNC, T>()(v1, v[u+1]) : v1;
+      }
+
+      for (int i=1; i<NRECV && i<nrecv; i++) {
+        uint64_t flag = recvFlag(i);
+        uint64_t* ptr = recvPtr(i)+ll128Offset;
+        uint64_t v0, v1;
+        do {
+          needReload = false;
+          #pragma unroll
+          for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+            load128(ptr+u*WARP_SIZE, v0, v1);
+            needReload |= flagThread && (v1 != flag);
+          }
+        } while (__any_sync(WARP_MASK, needReload) && checkAbort(i, 0) == 0);
+        #pragma unroll
+        for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+          load128(ptr+u*WARP_SIZE, v0, v1);
+          v[u] = MULTI<FUNC, T>()(v0, v[u]);
+          v[u+1] = MULTI<FUNC, T>()(v1, v[u+1]);
+        }
+      }
+    }
+    /********************** End Recv ************************/
+
+    /************************ Send **************************/
+    if (SEND) {
+      for (int i=1; i<NSEND && i<nsend; i++) {
+        uint64_t flag = sendFlag(i);  // keep all 64 bits: the receive side compares against a uint64_t flag, an int would narrow it
+        uint64_t* ptr = sendPtr(i)+ll128Offset;
+        #pragma unroll
+        for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+          store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
+        }
+      }
+      uint64_t flag = sendFlag(0);  // same 64-bit flag type as in the loop above
+      uint64_t* ptr = sendPtr(0)+ll128Offset;
+      #pragma unroll
+      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+        store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
+      }
+    }
+    /********************** End Send ************************/
+
+    /************* Data Storing : REG -> SHMEM **************/
+    if (DST) {
+      volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS;
+      #pragma unroll
+      for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+        shmem64Ptr[u*(WARP_SIZE-2)] = v[u];
+        if (!flagThread) shmem64Ptr[u*(WARP_SIZE-2)+1] = v[u+1];
+      }
+    }
+    /*********** End data Storing : REG -> SHMEM ************/
+  }
+
+  #define LL128INC (WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD)
+  #define ELEMINC (LL128INC-(LL128INC/NCCL_LL128_LINEELEMS))
+  // Generic step over nelem elements of T. RECV/SEND/SRC/DST select the peers and user buffers involved; each warp handles ELEMINC words per iteration and advances by nwarps.
+  template <int RECV, int SEND, int SRC, int DST>
+  __device__ void GenericOp(const T* srcPtr, T* dstPtr, int nelem) {
+    if (nelem <= 0) {
+      // Don't move any data but still increase steps and sync with prev/next
+      if (SEND) waitSend(0);
+      FOR_SEND(incSend); if (SEND) postSend();
+      FOR_RECV(incRecv); if (RECV) postRecv();
+      return;
+    }
+    const int nelem64 = ((nelem*sizeof(T))/(2*sizeof(uint64_t)))*2;  // whole 128-bit pairs in the input, counted in 64-bit words
+    const uint64_t* src64Ptr = ((uint64_t*)srcPtr);
+    uint64_t* dst64Ptr = ((uint64_t*)dstPtr);
+
+    int ll128Offset = LL128INC*warp+2*wid;
+    int elemOffset = ELEMINC*warp;
+    const int nwarps = nthreads/WARP_SIZE;
+
+    if (SEND) waitSend(DIVUP(nelem*sizeof(T), ELEMINC*sizeof(uint64_t))*LL128INC*sizeof(uint64_t));
+    barrier();
+
+    while (elemOffset*(sizeof(uint64_t)/sizeof(T)) < nelem) {
+      const int maxOffset128 = min(nelem64-elemOffset, (int)ELEMINC);
+      const int maxOffset = min(nelem-(elemOffset*((int)(sizeof(uint64_t)/sizeof(T)))), (int)(ELEMINC*(sizeof(uint64_t)/sizeof(T))));
+      if (SRC) {
+        int done = 0;
+        if ((((uint64_t)srcPtr)&0xf) == 0) {  // 128-bit path only when srcPtr is 16-byte aligned
+          loadSrcToShmem128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, src64Ptr+elemOffset+2*wid);
+          done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
+        }
+        loadSrcToShmem(done, maxOffset, (T*)(src64Ptr+elemOffset));  // element-wise copy of whatever the 128-bit path did not cover
+      }
+      __syncwarp();
+      recvReduceSendCopy<NCCL_LL128_SHMEM_ELEMS_PER_THREAD, RECV, SEND, SRC, DST>(ll128Offset);
+      __syncwarp();
+      if (DST) {
+        int done = 0;
+        if ((((uint64_t)dstPtr)&0xf) == 0) {  // 128-bit path only when dstPtr is 16-byte aligned
+          storeShmemToDst128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, dst64Ptr+elemOffset+2*wid);
+          done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
+        }
+        storeShmemToDst(done, maxOffset, (T*)(dst64Ptr+elemOffset));
+      }
+      __syncwarp();
+      ll128Offset += LL128INC*nwarps;
+      elemOffset += ELEMINC*nwarps;
+    }
+
+    barrier();
+    FOR_SEND(incSend); if (SEND) postSend();
+    FOR_RECV(incRecv); if (RECV) postRecv();
+  }
+  // Cache buffer and step of connection i; the thread with wid==i also keeps the connection pointer.
+  __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
+    recvBuff[i] = conn->ll128Buff;
+    recvStep[i] = conn->step;
+    if (wid == i) recvConn = conn;
+    nrecv++;
+  }
+  __device__ __forceinline__ void loadRecvSync() {
+    if (tid >= nthreads-WARP_SIZE && wid < nrecv) {  // among the last WARP_SIZE threads, those with wid < nrecv
+      recvConnHeadPtr = recvConn->head;
+      recvConnHead = recvConn->step;
+      // Update opCount in case we skipped some operations
+      *(recvConn->opCountLoc) = opCount;
+    }
+  }
+
+  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
+    sendBuff[i] = conn->ll128Buff;
+    sendStep[i] = conn->step;
+    if (wid == i) sendConn = conn;
+    nsend++;
+  }
+  __device__ __forceinline__ void loadSendSync() {
+    if (tid < nsend) {  // the first nsend threads track the head and the fifo
+      sendConnHeadPtr = sendConn->head;
+      sendConnHeadCache = *sendConnHeadPtr;
+      sendConnHead = sendConn->step;
+      sendConnFifoPtr = sendConn->fifo;
+      *(sendConn->opCountLoc) = opCount;
+    }
+    if (tid >= nthreads-WARP_SIZE && wid<nsend) {  // among the last WARP_SIZE threads, those with wid < nsend track the tail
+      if (sendConn->fifo) {
+        sendConnTailPtr = sendConn->tail;
+        sendConnTail = sendConn->step;
+      }
+    }
+  }
+  // Write the local head back to the connection and store opCount+1 to opCountLoc (same thread selection as loadRecvSync()).
+  __device__ __forceinline__ void saveRecvSync() {
+    if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+      recvConn->step = recvConnHead;
+      *(recvConn->opCountLoc) = opCount+1;
+      __threadfence_block();
+    }
+  }
+
+  __device__ __forceinline__ void saveSendSync() {
+    if (tid < nsend) {
+      sendConn->step = sendConnHead;
+      *(sendConn->opCountLoc) = opCount+1;
+      __threadfence_block();
+    }
+  }
+
+ public:
+  __device__ __forceinline__
+  ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
+    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), opCount(opCount), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
+    // Make sure step is updated before we read it.
+    barrier();
+    // Peer lists are consumed until the first negative entry, or until NRECV/NSEND entries have been loaded.
+    for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
+    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
+    loadRecvSync();
+    loadSendSync();
+  }
+  // Public operations: thin wrappers selecting the <RECV, SEND, SRC, DST> combination of GenericOp.
+  __device__ void send(const T* src, int nelem) {
+    return GenericOp<0, 1, 1, 0>(src, NULL, nelem);
+  }
+
+  __device__ void recv(T* dst, int nelem) {
+    return GenericOp<1, 0, 0, 1>(NULL, dst, nelem);
+  }
+
+  __device__ void recvReduceSend(const T* src, int nelem) {
+    return GenericOp<1, 1, 1, 0>(src, NULL, nelem);
+  }
+
+  __device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
+    return GenericOp<1, 0, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ void copySend(const T* src, T* dst, int nelem) {
+    return GenericOp<0, 1, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ void recvCopySend(T* dst, int nelem) {
+    return GenericOp<1, 1, 0, 1>(NULL, dst, nelem);
+  }
+
+  __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
+    return GenericOp<1, 1, 1, 1>(src, dst, nelem);
+  }
+
+  __device__ __forceinline__ ~ncclLL128Primitives() {
+    // Save steps for the next operation
+    saveRecvSync();
+    saveSendSync();
+  }
+};
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu
index 1ef66d4..a2caac5 100644
--- a/src/collectives/device/reduce.cu
+++ b/src/collectives/device/reduce.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index 302d053..0680abe 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -1,19 +1,19 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = args->nThreads-WARP_SIZE;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
@@ -30,7 +30,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
 
   ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+    prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -55,7 +55,7 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
@@ -93,3 +93,48 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
 
 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
+  // Ring reduce built on the LL128 primitives with one recv peer (ring->prev) and one send peer (ring->next).
+  struct ncclDevComm* devComm = args->comm;
+  struct ncclChannel* chan = devComm->channels+blockIdx.x;
+  struct ncclRing* ringInfo = &chan->ring;
+  const int threadCount = args->nThreads;
+  const int blockId = args->bid;
+
+  ncclLL128Primitives<T, FUNC, 1, 1> ll128(threadIdx.x, threadCount, &ringInfo->prev, &ringInfo->next, chan, devComm, args->opCount);
+
+  const ssize_t totalElems = args->N;
+  // Role of this rank: pure sender when the last entry of devUserRanks is the root,
+  // final reducer when this rank is the root, relay otherwise.
+  const bool sendOnly = (ringInfo->devUserRanks[devComm->nRanks-1] == args->root);
+  const bool isRoot = (devComm->rank == args->root);
+
+  // Initial (maximum) chunk size and the unit that chunk sizes are rounded up to, both in elements of T.
+  ssize_t chunkElems = (NCCL_LL128_ELEMS_PER_THREAD*threadCount*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+  const ssize_t chunkUnit = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*threadCount*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+  const ssize_t loopStride = args->nChannels*chunkElems;
+
+  // User buffers
+  const T * __restrict__ userIn = (const T*)args->ThisInput;
+  T * __restrict__ userOut = (T*)args->ThisOutput;
+
+  for (ssize_t gridBase = 0; gridBase < totalElems; gridBase += loopStride) {
+    // Cap the chunk at the remaining size split across channels, rounded up to a multiple of chunkUnit.
+    chunkElems = min(DIVUP(totalElems-gridBase, args->nChannels*chunkUnit)*chunkUnit, chunkElems);
+    const ssize_t elemStart = gridBase + blockId*chunkElems;
+    const int elemCount = min(chunkElems, totalElems-elemStart);
+
+    if (sendOnly) {
+      ll128.send(userIn+elemStart, elemCount);
+      continue;
+    }
+    if (isRoot) ll128.recvReduceCopy(userIn+elemStart, userOut+elemStart, elemCount);
+    else ll128.recvReduceSend(userIn+elemStart, elemCount);
+  }
+}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }  // Intentionally empty: this kernel does nothing and ignores args
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu
index 10857ed..8b45299 100644
--- a/src/collectives/device/reduce_scatter.cu
+++ b/src/collectives/device/reduce_scatter.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index c70c845..1985148 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -1,25 +1,25 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
 
 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = blockDim.x - 1;
+  const int nthreads = args->nThreads-WARP_SIZE;
   const int bid = args->bid;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
   const int nranks = comm->nRanks;
   const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
-  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+  const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
   const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
 
   // Compute pointers
@@ -27,7 +27,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
 
   ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+    prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
 
   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -69,7 +69,7 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
   const int nthreads = args->nThreads;
-  struct ncclComm* comm = args->comm;
+  struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
 
@@ -121,3 +121,64 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
 
 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
+
+#include "prims_ll128.h"
+// LL128 ring ReduceScatter: per loop iteration, pushes one input chunk to the next GPU, relays nranks-2 reduced chunks, then writes the final reduction to thisOutput.
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int bid = args->bid;
+  const int nthreads = args->nThreads;
+  struct ncclDevComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x; // channel is picked by the CUDA block index
+  struct ncclRing* ring = &channel->ring;
+
+  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+  const ssize_t size = args->N;
+  const int nranks = comm->nRanks;
+  ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+  // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+
+  const ssize_t loopSize = args->nChannels*chunkSize; // computed once from the initial chunkSize; chunkSize itself may shrink inside the loop
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize); // never grows; stays a multiple of minChunkSize when reduced
+
+    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+    /////////////// begin ReduceScatter steps ///////////////
+    ssize_t offset;
+    int nelem = min(chunkSize, size-chunkOffset); // NOTE(review): negative if chunkOffset > size; presumably tolerated by the primitives - confirm
+    int rankDest;
+
+    // step 0: push data to next GPU
+    rankDest = ring->devUserRanks[nranks-1];
+    offset = chunkOffset + rankDest * size; // element offset: rankDest selects a span of `size` elements in thisInput
+
+    LLprims.send(thisInput+offset, nelem);
+
+    // k-2 steps: reduce and copy to next GPU
+    for (int j=2; j<nranks; ++j) {
+      rankDest = ring->devUserRanks[nranks-j];
+      offset = chunkOffset + rankDest * size;
+
+      LLprims.recvReduceSend(thisInput+offset, nelem);
+    }
+
+    // step k-1: reduce this buffer and data, which will produce the final
+    // result that we store in this data
+    rankDest = ring->devUserRanks[0];
+    offset = chunkOffset + rankDest * size;
+
+    LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem); // output is indexed by chunkOffset only, without the rankDest term
+  }
+}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { } // empty body: this kernel performs no work
diff --git a/src/collectives/reduce.cu b/src/collectives/reduce.cc
index 302d4bc..67f2fae 100644
--- a/src/collectives/reduce.cu
+++ b/src/collectives/reduce.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/collectives/reduce_scatter.cu b/src/collectives/reduce_scatter.cc
index 4ee77ef..5ad7f5f 100644
--- a/src/collectives/reduce_scatter.cu
+++ b/src/collectives/reduce_scatter.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/debug.cc b/src/debug.cc
new file mode 100644
index 0000000..03a77ae
--- /dev/null
+++ b/src/debug.cc
@@ -0,0 +1,169 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "nccl_net.h"
+#include <stdlib.h>
+#include <stdarg.h>
+
+int ncclDebugLevel = -1;
+thread_local int ncclDebugNoWarn = 0;
+uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
+FILE *ncclDebugFile = stdout;
+pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
+
+/* One-time setup of the debug subsystem from the NCCL_DEBUG, NCCL_DEBUG_SUBSYS
+ * and NCCL_DEBUG_FILE environment variables; later calls return immediately.
+ * An unset or unrecognized NCCL_DEBUG value selects NCCL_LOG_NONE.
+ */
+void ncclDebugInit() {
+  pthread_mutex_lock(&ncclDebugLock);
+  if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } // never return with the lock held
+  int level = NCCL_LOG_NONE;
+  const char* nccl_debug = getenv("NCCL_DEBUG");
+  if (nccl_debug == NULL) nccl_debug = "";
+  if (strcasecmp(nccl_debug, "VERSION") == 0) {
+    level = NCCL_LOG_VERSION;
+  } else if (strcasecmp(nccl_debug, "WARN") == 0) {
+    level = NCCL_LOG_WARN;
+  } else if (strcasecmp(nccl_debug, "INFO") == 0) {
+    level = NCCL_LOG_INFO;
+  } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
+    level = NCCL_LOG_ABORT;
+  } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
+    level = NCCL_LOG_TRACE;
+  }
+
+  /* Parse the NCCL_DEBUG_SUBSYS env var
+   * This can be a comma separated list such as INIT,COLL
+   * or ^INIT,COLL etc
+   */
+  const char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS");
+  if (ncclDebugSubsysEnv != NULL) {
+    int invert = 0;
+    if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
+    ncclDebugMask = invert ? ~0ULL : 0ULL;
+    char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv);
+    char *saveptr = NULL; // strtok_r state; avoids strtok's hidden static state
+    char *subsys = (ncclDebugSubsys != NULL) ? strtok_r(ncclDebugSubsys, ",", &saveptr) : NULL;
+    while (subsys != NULL) {
+      uint64_t mask = 0;
+      if (strcasecmp(subsys, "INIT") == 0) {
+        mask = NCCL_INIT;
+      } else if (strcasecmp(subsys, "COLL") == 0) {
+        mask = NCCL_COLL;
+      } else if (strcasecmp(subsys, "P2P") == 0) {
+        mask = NCCL_P2P;
+      } else if (strcasecmp(subsys, "SHM") == 0) {
+        mask = NCCL_SHM;
+      } else if (strcasecmp(subsys, "NET") == 0) {
+        mask = NCCL_NET;
+      } else if (strcasecmp(subsys, "GRAPH") == 0) {
+        mask = NCCL_GRAPH;
+      } else if (strcasecmp(subsys, "TUNING") == 0) {
+        mask = NCCL_TUNING;
+      } else if (strcasecmp(subsys, "ALL") == 0) {
+        mask = NCCL_ALL;
+      }
+      if (mask) {
+        if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
+      }
+      subsys = strtok_r(NULL, ",", &saveptr);
+    }
+    free(ncclDebugSubsys);
+  }
+
+  /* Parse and expand the NCCL_DEBUG_FILE path and
+   * then create the debug file. But don't bother unless the
+   * NCCL_DEBUG level is > VERSION
+   */
+  const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
+  char debugFn[PATH_MAX+1] = "";
+  FILE *file = NULL;
+  if (level > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
+    size_t used = 0; // bytes written to debugFn so far, never more than PATH_MAX
+    for (const char* s = ncclDebugFileEnv; *s != '\0' && used < PATH_MAX; s++) {
+      if (*s != '%') { debugFn[used++] = *s; continue; }
+      const size_t room = sizeof(debugFn) - used; // space left, including the terminating NUL
+      int n;
+      char hostname[1024];
+      switch (*++s) {
+        case '%': n = snprintf(debugFn+used, room, "%%"); break; // Double %
+        case 'h': getHostName(hostname, 1024, '.'); n = snprintf(debugFn+used, room, "%s", hostname); break; // %h = hostname
+        case 'p': n = snprintf(debugFn+used, room, "%d", getpid()); break; // %p = pid
+        case '\0': n = snprintf(debugFn+used, room, "%%"); s--; break; // trailing '%': keep it and let the loop stop at the NUL
+        default: n = snprintf(debugFn+used, room, "%%%c", *s); break; // Echo everything we don't understand
+      }
+      if (n > 0) used += ((size_t)n < room) ? (size_t)n : room-1; // snprintf reports the untruncated length
+    }
+    debugFn[used] = '\0';
+    if (debugFn[0] != '\0') file = fopen(debugFn, "w");
+  }
+
+#ifdef ENABLE_TRACE
+  ncclEpoch = std::chrono::high_resolution_clock::now();
+#endif
+  // Publish the level before announcing: ncclDebugLog() re-enters ncclDebugInit() while it is still -1.
+  ncclDebugLevel = level;
+  pthread_mutex_unlock(&ncclDebugLock);
+  if (file != NULL) {
+    // INFO() takes ncclDebugLock itself, so it must run unlocked. The message goes to
+    // the previous stream; only afterwards is output switched to the new file.
+    INFO(NCCL_ALL,"DEBUG file is '%s'", debugFn);
+    pthread_mutex_lock(&ncclDebugLock);
+    ncclDebugFile = file;
+    pthread_mutex_unlock(&ncclDebugLock);
+  }
+}
+
+/* Common logging function used by the INFO, WARN and TRACE macros
+ * Also exported to the dynamically loadable Net transport modules so
+ * they can share the debugging mechanisms and output files.
+ * Formats a prefix plus the printf-style fmt/args into one line of at most 1023 characters.
+ */
+void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
+  if (ncclDebugLevel == -1) ncclDebugInit();
+  if (ncclDebugNoWarn == 1 && level == NCCL_LOG_WARN) level = NCCL_LOG_INFO;
+
+  char hostname[1024];
+  getHostName(hostname, 1024, '.');
+  int cudaDev = -1; // printed as -1 if cudaGetDevice() does not fill it in
+  cudaGetDevice(&cudaDev);
+  char buffer[1024];
+  size_t len = 0;
+  pthread_mutex_lock(&ncclDebugLock);
+  if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
+    len = snprintf(buffer, sizeof(buffer),
+                   "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
+  else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
+    len = snprintf(buffer, sizeof(buffer),
+                   "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
+#ifdef ENABLE_TRACE
+  else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
+    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
+    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
+    len = snprintf(buffer, sizeof(buffer),
+                   "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
+  }
+#endif
+  if (len) {
+    if (len >= sizeof(buffer)) len = sizeof(buffer)-1; // snprintf returns the untruncated length; keep buffer+len in bounds
+    va_list vargs;
+    va_start(vargs, fmt);
+    (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
+    va_end(vargs);
+    fprintf(ncclDebugFile,"%s\n", buffer);
+    fflush(ncclDebugFile);
+  }
+  pthread_mutex_unlock(&ncclDebugLock);
+
+  // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
+  if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
+    fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
+            hostname, getpid(), gettid(), cudaDev, filefunc, line);
+    abort();
+  }
+}
diff --git a/src/enqueue.cu b/src/enqueue.cc
index d283223..2239865 100644
--- a/src/enqueue.cu
+++ b/src/enqueue.cc
@@ -1,23 +1,21 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
 #include "enqueue.h"
-#include "checks.h"
-#include "param.h"
-
-#include "collectives/collectives.h"
+#include "argcheck.h"
 
 // Only generate inline kernels for LL
 #define NCCL_FUNC5(coll, op, dtype) \
   (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
+  (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
   (void*)NCCL_KERN_NAME(coll##LL, op, dtype)
 
 #define NCCL_FUNC4(coll, op, dtype) \
-  (void*)NCCL_FUNC5(coll##Ring, op, dtype), \
-  (void*)NCCL_FUNC5(coll##Tree, op, dtype)
+  (void*)NCCL_FUNC5(coll##Tree, op, dtype), \
+  (void*)NCCL_FUNC5(coll##Ring, op, dtype)
 
 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(coll, op) \
@@ -54,7 +52,7 @@
   NCCL_FUNCS3B(coll, copy)
 
 // Must be consistent with the ncclFuncSet enum
-static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
+static void* const ncclKerns[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
   NCCL_FUNCS2B(ncclBroadcast),
   NCCL_FUNCS2A(ncclReduce),
   NCCL_FUNCS2B(ncclAllGather),
@@ -87,7 +85,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par
 }
 
 ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
-  params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels);
+  params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
 
   // Set active = 2 for the last operation
   for (int r=0; r<params->gridDim.x; r++) {
@@ -207,6 +205,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
     channel->collCount = 0;
   }
   params->gridDim.x = params->blockDim.x = 0;
+  comm->lastOpCount = comm->opCount;
   NCCLCHECK(transportStartProxy(comm));
   return ncclSuccess;
 }
@@ -228,20 +227,70 @@ ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
 /* Enqueueing system : computation of kernel and proxy operations parameters */
 /*****************************************************************************/
 
-static ncclResult_t getPatternInfo(struct ncclInfo* info) {
-  if (info->coll == ncclCollBroadcast) info->pattern = ncclPatternPipelineFrom;
-  else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo;
-  else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing;
-  else if (info->coll == ncclCollAllReduce) {
-    if (info->nBytes <= info->comm->treeThreshold)
-      info->pattern = ncclPatternTreeUpDown;
-    else
-      info->pattern = ncclPatternRingTwice;
+// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
+// factor is not ideal but works quite well. Powers of two, 64 B to 1 GB.
+static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
+  { 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .7,  .7,  .7,  .7,  .6,  .5,  .5,  .5,  .6,  .7,  .8,  .9,  .9, 1.0, 1.0, 1.0 },
+  { 1.0, 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .8,  .8,  .8,  .7,  .7,  .7,  .6,  .6,  .7,  .7,  .8,  .8,  .9,  .9, 1.0 },
+  {  .9,  .9,  .9,  .9,  .9,  .9,  .9,  .8,  .7,  .6,  .6,  .5,  .5,  .5,  .5,  .5,  .5,  .6,  .6,  .7,  .8,  .9 }
+};
+
+static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
+  struct ncclComm* comm = info->comm;
+  float minTime = 3600000.0; // Hopefully no operation will take an hour to complete.
+  // Find algorithm / protocol.
+  info->algorithm = -1;
+  info->protocol = -1;
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      float bw = comm->bandwidths[info->coll][a][p];
+      if (bw == 0) continue;
+      int logSize = log2i(info->nBytes>>6);
+      if (a == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[p][logSize];
+      float time = comm->latencies[info->coll][a][p] + (info->nBytes) / (1000 * bw);
+      if (time < minTime) {
+        info->algorithm = a;
+        info->protocol = p;
+        minTime = time;
+      }
+    }
   }
-  else {
-    WARN("Unknown collective %d", info->coll);
+  if (info->algorithm == -1 || info->protocol == -1) {
+    WARN("Error : no algorithm/protocol available");
     return ncclInternalError;
   }
+  //if (comm->rank == 0) INFO(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %d", info->nBytes, info->algorithm, info->protocol, minTime);
+  TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
+
+  int nc = comm->nChannels;
+  int nt = comm->maxThreads[info->protocol];
+  int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
+  while (info->nBytes < nc*nt*threadThreshold) {
+    if (nc >= 2) nc--;
+    else if ((nt % 128) == 0) nt/=2;
+    else break;
+  }
+  if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
+  info->nChannels = nc;
+  info->nThreads = nt;
+  return ncclSuccess;
+}
+
+static ncclResult_t getPatternInfo(struct ncclInfo* info) {
+  switch (info->coll) {
+    case ncclCollBroadcast:
+      info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break;
+    case ncclCollReduce:
+      info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break;
+    case ncclCollReduceScatter:
+    case ncclCollAllGather:
+      info->pattern = ncclPatternRing; break;
+    case ncclCollAllReduce:
+      info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
+    default:
+      WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
+      return ncclInternalError;
+  }
   return ncclSuccess;
 }
 
@@ -264,40 +313,9 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
   return ncclSuccess;
 }
 
-static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
-  // Compute thresholds and limits that users can override
-  int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD);
-  int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
-
-  // First compute nThreads
-  int nt = NCCL_LL_MIN_NTHREADS;
-  while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2;
-
-  // Then compute nChannels
-  int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold);
-  if (nc == 0) nc = 1;
-  if (nc > info->comm->nChannels) nc = info->comm->nChannels;
-
-  // Check if we have a fixed LL threshold, otherwise compute it.
-  int perThreadThreshold = info->comm->threadThreshold;
-  if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4;
-  ssize_t llThreshold = info->comm->llThreshold >= 0 ?
-    info->comm->llThreshold :
-    nc*nt*info->nchunksPerLoop*perThreadThreshold;
-
-  if (info->nBytes <= llThreshold) {
-    *llMode = 1;
-    *nChannels = nc;
-    *nThreads = nt;
-  } else {
-    *llMode = 0;
-    *nChannels = info->comm->nChannels;
-    *nThreads = info->comm->nThreads+1;
-  }
-}
-
 static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
   // Set nstepsPerLoop and nchunksPerLoop
+  NCCLCHECK(getAlgoInfo(info));
   NCCLCHECK(getPatternInfo(info));
   NCCLCHECK(getLoopInfo(info));
 
@@ -307,48 +325,52 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
   coll->args.ThisOutput = info->recvbuff;
   coll->args.comm = info->comm->devComm;
   coll->args.opCount = info->comm->opCount;
+  coll->args.nChannels = info->nChannels;
+  coll->args.nThreads = info->nThreads;
 
-  // Compute llMode, nChannels, nThreads
-  int llMode;
-  getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode);
-
-  int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0;
-  coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode);
+  coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
 
-  int stepSize   = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
-  int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps;
-  int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps;
+  int stepSize   = (info->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : info->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
+  int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
+  int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1;
   int chunkSize  = stepSize*chunkSteps;
 
   // Compute lastChunkSize
-  if (treeMode == 1 && llMode == 0) {
+  if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) {
     if (info->pattern == ncclPatternTreeUpDown) {
       // Optimize chunkSize / nSteps
-      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
-      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
-      while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
+      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*8 && chunkSize > 131072) chunkSize /= 2;
+      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
+      while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2;
     }
     // Use lastChunkSize as chunkSize
     coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
-  } else if (llMode == 1) {
+  } else if (info->protocol == NCCL_PROTO_LL) {
     int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
-    const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
-    coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop);
-    ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t));
+    const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
+    coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
+    ALIGN_SIZE(coll->args.lastChunkSize, info->nThreads*sizeof(uint64_t));
     coll->args.lastChunkSize /= ncclTypeSize(info->datatype);
+  } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
+    int nstepsInter = 1+log2i(info->comm->nNodes);
+    while (info->nBytes / (info->nChannels*chunkSize) < nstepsInter*4 && chunkSize > 32768) chunkSize /= 2;
+    // Use lastChunkSize as chunkSize
+    coll->args.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
   }
 
   // Compute nSteps for proxies
-  size_t nBytes  = llMode ? info->nBytes*2 : info->nBytes;
-
-  int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize)));
+  int chunkEffectiveSize = chunkSize;
+  if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2;
+  if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
+  //if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
+  int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
   proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
   proxyArgs->sliceSteps = sliceSteps;
   proxyArgs->chunkSteps = chunkSteps;
-  proxyArgs->llMode = llMode;
+  proxyArgs->protocol = info->protocol;
   proxyArgs->opCount = info->comm->opCount;
-  TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
-      coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads,
+  TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
+      coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
       nLoops, proxyArgs->nsteps, info->comm);
   return ncclSuccess;
 }
@@ -365,7 +387,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
   memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
   NCCLCHECK(computeColl(info, &coll, &proxyArgs));
 
-  info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads);
+  info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
   if (info->comm->userStreamSet == false) {
     info->comm->userStream = info->stream;
     info->comm->userStreamSet = true;
@@ -401,7 +423,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
     channel->collFifoTail = opIndex;
     channel->collCount++;
   }
-  /*if (llMode == 0)*/ info->comm->opCount++;
+  info->comm->opCount++;
   return ncclSuccess;
 }
 
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
new file mode 100644
index 0000000..af481d2
--- /dev/null
+++ b/src/graph/connect.cc
@@ -0,0 +1,268 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "graph.h"
+#include "trees.h"
+#include "rings.h"
+
+/******************************************************************/
+/********************* Internode connection ***********************/
+/******************************************************************/
+
+ncclResult_t ncclTopoPreset(struct ncclComm* comm, // Resets each channel's ring/tree links, then fills them from this rank's position in the per-channel intra lists.
+    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, // in: graphs whose intra arrays are read in slices of localRanks entries per channel
+    struct ncclTopoRanks* topoRanks) { // out: per-channel ring/tree recv/send ranks and ring prev/next; always returns ncclSuccess
+  int rank = comm->rank;
+  int localRanks = comm->localRanks;
+  int nChannels = comm->nChannels;
+
+  for (int c=0; c<nChannels; c++) {
+    struct ncclChannel* channel = comm->channels+c;
+    channel->ring.prev = channel->ring.next = -1; // every link starts out as -1 and is only overwritten when this rank is found below
+    channel->treeUp.up = -1;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1;
+    channel->treeDn.up = -1;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1;
+
+    int* ringIntra = ringGraph->intra+c*localRanks; // this channel's slice of the ring list
+    int* treeIntra = treeGraph->intra+c*localRanks; // this channel's slice of the tree list
+
+    for (int i=0; i<localRanks; i++) {
+      if (ringIntra[i] == rank) {
+        topoRanks->ringRecv[c] = ringIntra[0];
+        topoRanks->ringSend[c] = ringIntra[localRanks-1];
+        channel->ring.prev = (i == 0) ? -1 : ringIntra[i-1]; // first entry of the list has no prev here
+        channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1]; // last entry of the list has no next here
+      }
+      if (treeIntra[i] == rank) {
+        int recvIndex = 0, sendIndex = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
+        int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks; // neighbours with wrap-around, unlike the ring case above
+
+        // Tree loop always flows in the same direction. Other trees are symmetric, i.e.
+        // up/down go in reverse directions
+        int sym = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP ? 0 : 1;
+
+        // Down tree is common
+        topoRanks->treeDnRecv[c] = treeIntra[recvIndex];
+        topoRanks->treeDnSend[c] = treeIntra[sendIndex];
+        channel->treeDn.up       = treeIntra[prev];
+        channel->treeDn.down[0]  = treeIntra[next];
+        // Up tree depends on the pattern
+        topoRanks->treeUpRecv[c] = sym ? topoRanks->treeDnSend[c] : topoRanks->treeDnRecv[c];
+        topoRanks->treeUpSend[c] = sym ? topoRanks->treeDnRecv[c] : topoRanks->treeDnSend[c];
+        channel->treeUp.down[0]  = sym ? channel->treeDn.down[0]  : channel->treeDn.up ;
+        channel->treeUp.up       = sym ? channel->treeDn.up       : channel->treeDn.down[0];
+      }
+    }
+    topoRanks->ringPrev[c] = channel->ring.prev;
+    topoRanks->ringNext[c] = channel->ring.next;
+  }
+  // Duplicate channels rings/trees
+  struct ncclChannel* channel0 = comm->channels;
+  struct ncclChannel* channel1 = channel0+nChannels;
+  memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel)); // copies channels [0,nChannels) into [nChannels,2*nChannels); assumes comm->channels has room for both - TODO confirm
+  return ncclSuccess;
+}
+
+static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext, int* firstRanks) { // Links each node's ring receiver to the previous node's sender and its sender to the next node's receiver.
+  int nChannels = comm->nChannels;
+  int nNodes = comm->nNodes;
+  for (int c=0; c<nChannels; c++) {
+    int* recv = ringRecv+c*comm->nRanks; // the four arrays are read/written in slices of nRanks entries per channel
+    int* send = ringSend+c*comm->nRanks;
+    int* prev = ringPrev+c*comm->nRanks;
+    int* next = ringNext+c*comm->nRanks;
+    struct ncclChannel* channel0 = comm->channels+c;
+    struct ncclChannel* channel1 = channel0+nChannels; // counterpart channel at index c+nChannels gets the same links
+    for (int n=0; n<nNodes; n++) {
+      int recvRank = recv[firstRanks[n]]; // looked up through node n's entry in firstRanks
+      int prevSendRank = send[firstRanks[(n-1+nNodes)%nNodes]]; // previous node, wrapping around
+      prev[recvRank] = prevSendRank;
+      if (comm->rank == recvRank) {
+        channel0->ring.prev = prevSendRank;
+        channel1->ring.prev = prevSendRank;
+      }
+      int sendRank = send[firstRanks[n]];
+      int nextRecvRank = recv[firstRanks[(n+1)%nNodes]]; // next node, wrapping around
+      next[sendRank] = nextRecvRank;
+      if (comm->rank == sendRank) {
+        channel0->ring.next = nextRecvRank;
+        channel1->ring.next = nextRecvRank;
+      }
+    }
+    TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next);
+    TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next);
+  }
+  return ncclSuccess; // no failure path in this function
+}
+
+static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstRanks) {
+  for (int node = 0; node < nNodes; ++node) { const int first = firstRanks[node]; indexes[node] = ranks[first]; } // indexes[node] = ranks entry at that node's firstRanks value
+  return ncclSuccess;
+}
+
+static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) {
+  struct ncclTree* const trees[2] = { tree0, tree1 }; const int ups[2] = { u0, u1 };
+  for (int t = 0; t < 2; ++t) { if (ups[t] != -1) trees[t]->up = indexes[ups[t]]; } // an index of -1 leaves that tree's up link unchanged
+  return ncclSuccess;
+}
+
+static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) {
+  // Start at slot 0, or at slot 1 when slot 0 already holds a non-negative entry.
+  int slot = (down[0] >= 0) ? 1 : 0;
+  if (down[slot] >= 0) {
+    WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]);
+    return ncclInternalError;
+  }
+  const int added[2] = { r0, r1 };
+  for (int i = 0; i < 2; ++i) { if (added[i] != -1) down[slot++] = indexes[added[i]]; } // -1 entries are skipped
+  return ncclSuccess;
+}
+
+static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) {
+  struct ncclTree* const trees[2] = { tree0, tree1 }; const int kids[2][2] = { { d0_0, d0_1 }, { d1_0, d1_1 } };
+  for (int t = 0; t < 2; ++t) { NCCLCHECK(addRanksDown(trees[t]->down, indexes, kids[t][0], kids[t][1])); } // tree0 first, then tree1
+  return ncclSuccess;
+}
+
+static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) {
+  tree->down[0] = (tree->down[0] == upRank) ? -1 : tree->down[0]; // clear the first down link when it points at upRank
+  tree->up = (upRank == rank) ? -1 : tree->up;                     // clear the up link when this rank is upRank
+  return ncclSuccess;
+}
+
+static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks) { // Adds the inter-node up/down links returned by ncclGetDtree to every channel's treeUp/treeDn.
+  const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
+  int* indexesSend, *indexesRecv; // scratch arrays of nNodes entries, refilled by getIndexes for each channel
+  NCCLCHECK(ncclCalloc(&indexesSend, nNodes));
+  NCCLCHECK(ncclCalloc(&indexesRecv, nNodes)); // NOTE(review): if NCCLCHECK returns on error, this and the checks below skip the free() calls at the end - confirm
+
+  // Compute tree depth. Not an exact value but a good approximation in most
+  // cases
+  int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
+
+  int u0, d0_0, d0_1, u1, d1_0, d1_1; // up index and two down indexes for each of the two trees; -1 entries are skipped by setTreeUp/addRanksDown
+  NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
+  for (int c=0; c<nChannels; c++) {
+     struct ncclChannel* channel0 = comm->channels+c;
+     struct ncclChannel* channel1 = channel0+nChannels; // tree 0 links go to channel c, tree 1 links to channel c+nChannels
+     NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
+     NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
+     NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node]));
+     NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node]));
+     int root = indexesSend[node]; // saved before indexesSend is refilled for the down tree
+     if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1));
+     if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1));
+     NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks)); // from here on the index arrays describe the down tree
+     NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
+     NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node]));
+     NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? root : indexesRecv[node]));
+     if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1));
+     if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1));
+     TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c,           channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]);
+     TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]);
+     TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c,           channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]);
+     TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]);
+     channel0->treeUp.depth = channel1->treeUp.depth = depth; // only treeUp.depth is written here; treeDn.depth is not touched
+  }
+  free(indexesSend);
+  free(indexesRecv);
+  return ncclSuccess;
+}
+
+// Legacy naming
+NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
+NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
+// New naming
+NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2);
+NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
+
+int ncclMinNchannels() {
+  // Legacy MIN_NRINGS parameter first; MIN_NCHANNELS overrides it when set (-2 is the default of both).
+  int requested = (ncclParamMinNrings() != -2) ? ncclParamMinNrings() : 0;
+  if (ncclParamMinNchannels() != -2) requested = ncclParamMinNchannels();
+  if (requested <= MAXCHANNELS) {
+    // Within the upper bound: only negative requests need clamping.
+    return requested < 0 ? 0 : requested;
+  }
+  WARN("User asked for a minimum of %d channels, limiting to %d\n", requested, MAXCHANNELS);
+  return MAXCHANNELS;
+}
+int ncclMaxNchannels() {
+  // Legacy MAX_NRINGS parameter first; MAX_NCHANNELS overrides it when set (-2 is the default of both).
+  int limit = (ncclParamMaxNrings() != -2) ? ncclParamMaxNrings() : MAXCHANNELS;
+  if (ncclParamMaxNchannels() != -2) limit = ncclParamMaxNchannels();
+  // Requests above MAXCHANNELS are capped without a warning.
+  if (limit > MAXCHANNELS) limit = MAXCHANNELS;
+  if (limit >= 1) return limit;
+  // Anything below one channel is reported and replaced by 1.
+  WARN("User asked for a maximum of %d channels, setting it to 1\n", limit);
+  return 1;
+}
+// Gather per-rank ring/tree data, connect rings and trees, duplicate channels, apply the min/max channel limits and build `rings`.
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings) {
+  // Gather data from all ranks
+  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
+  int nranks = comm->nRanks;
+  int nChannels = comm->nChannels;
+  NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS)); // each array holds nranks*MAXCHANNELS ints, indexed [c*nranks+i]
+  NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS)); // NOTE(review): NCCLCHECK presumably returns on error; arrays already allocated would then not be freed
+  NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS));
+  for (int i=0; i<nranks; i++) {
+    for (int c=0; c<nChannels;c++) {
+      ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c]; // transpose: per-rank [c] values become per-channel rows of nranks entries
+      ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c];
+      ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c];
+      ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c];
+      treeUpRecv[c*nranks+i] = allTopoRanks[i]->treeUpRecv[c];
+      treeUpSend[c*nranks+i] = allTopoRanks[i]->treeUpSend[c];
+      treeDnRecv[c*nranks+i] = allTopoRanks[i]->treeDnRecv[c];
+      treeDnSend[c*nranks+i] = allTopoRanks[i]->treeDnSend[c];
+    }
+  }
+
+  // Connect rings and trees. This should also duplicate the channels.
+  NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
+  NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks));
+
+  // Duplicate ringPrev/ringNext for ncclBuildRing
+  memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); // NOTE(review): stays inside the nranks*MAXCHANNELS buffer only if 2*nChannels <= MAXCHANNELS - confirm
+  memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
+
+  // Duplication should be complete now
+  nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
+
+  // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
+  // We permit combining max, then min, to only use the first channels, then duplicate them.
+  nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels);
+  int c;
+  for (c=nChannels; c<ncclMinNchannels(); c++) { // pad up to the requested minimum by copying channel c-nChannels into slot c
+    memcpy(ringPrev+c*nranks, ringPrev+(c-nChannels)*nranks, nranks*sizeof(int));
+    memcpy(ringNext+c*nranks, ringNext+(c-nChannels)*nranks, nranks*sizeof(int));
+    memcpy(comm->channels+c, comm->channels+c-nChannels, sizeof(struct ncclChannel));
+  }
+  nChannels = comm->nChannels = c; // c is the larger of the capped channel count and ncclMinNchannels()
+
+  // Create rings array and check all is fine
+  NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext)); // NOTE(review): an early return here would skip the free() calls below
+
+  free(ringRecv);
+  free(ringSend);
+  free(ringPrev);
+  free(ringNext);
+  free(treeUpRecv);
+  free(treeUpSend);
+  free(treeDnRecv);
+  free(treeDnSend);
+
+  return ncclSuccess;
+}
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
new file mode 100644
index 0000000..ce1772c
--- /dev/null
+++ b/src/graph/paths.cc
@@ -0,0 +1,363 @@
+/*************************************************************************
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "graph.h"
+#include "topo.h"
+#include "comm.h"
+#include "net.h"
+
+// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
+
+struct ncclTopoNodeList { // fixed-capacity list of node pointers, used as the work list of the breadth-first search below
+  struct ncclTopoNode* list[NCCL_TOPO_MAX_NODES]; // only the first `count` entries are meaningful
+  int count; // number of valid entries in `list`
+};
+// Locate the node of type t with the given id and return, through *path, `node`'s path entry at that node's index.
+static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* node, int t, int64_t id, struct ncclTopoLinkList** path) {
+  int index = 0;
+  while (index < system->nodes[t].count && system->nodes[t].nodes[index].id != id) index++;
+  if (index == system->nodes[t].count) {
+    WARN("Could not find node of type %d id %lx\n", t, id);
+    return ncclInternalError;
+  }
+  *path = node->paths[t]+index;
+  return ncclSuccess;
+}
+// Breadth-first search from baseNode: each node reached keeps the widest path found so far back to baseNode.
+static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
+  if (baseNode->paths[baseNode->type] == NULL) {
+    NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
+  }
+
+  // breadth-first search to set all paths to that node in the system
+  struct ncclTopoNodeList nodeList;
+  struct ncclTopoNodeList nextNodeList;
+  nodeList.count = 1; nodeList.list[0] = baseNode;
+  nextNodeList.count = 0;
+  struct ncclTopoLinkList* basePath;
+  NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
+  basePath->count = 0; // the path from baseNode to itself has no link
+  basePath->width = LOC_WIDTH;
+  basePath->type = LINK_LOC;
+
+  while (nodeList.count) {
+    nextNodeList.count = 0;
+    for (int n=0; n<nodeList.count; n++) {
+      struct ncclTopoNode* node = nodeList.list[n];
+      struct ncclTopoLinkList* path;
+      NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
+      for (int l=0; l<node->nlinks; l++) {
+        struct ncclTopoLink* link = node->links+l;
+        struct ncclTopoNode* remNode = link->remNode;
+        if (remNode->paths[baseNode->type] == NULL) {
+          NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
+        }
+        struct ncclTopoLinkList* remPath;
+        NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
+        int width = std::min(path->width, link->width);
+        if (remPath->width < width) { // only a strictly wider path replaces the one already stored
+          // Find reverse link. Use a local so that a stale list[0] left by a previous,
+          // narrower path cannot hide a missing reverse link.
+          struct ncclTopoLink* revLink = NULL;
+          for (int r=0; r<remNode->nlinks && revLink == NULL; r++) {
+            if (remNode->links[r].remNode == node) revLink = remNode->links+r;
+          }
+          if (revLink == NULL) {
+            WARN("Failed to find reverse path from remNode id %lx type %d nlinks %d to node id %lx type %d",
+                 remNode->id, remNode->type, remNode->nlinks, node->id, node->type);
+            return ncclInternalError;
+          }
+          remPath->list[0] = revLink;
+          // Copy the rest of the path
+          for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
+          remPath->count = path->count + 1;
+          remPath->width = width;
+
+          // Consider the path is QPI when going through the CPU
+          // Also don't consider LINK_NET as we only care about the NIC->GPU path.
+          int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type;
+          remPath->type = std::max(path->type, type);
+
+          // Add to the list for the next iteration if not already in the list
+          // Disallow GPUs as intermediate steps for now
+          if (remNode->type != GPU) {
+            int i;
+            for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
+            if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
+          }
+        }
+      }
+    }
+    memcpy(&nodeList, &nextNodeList, sizeof(nodeList)); // the nodes improved in this round are expanded in the next one
+  }
+  return ncclSuccess;
+}
+// Log the paths stored on `node`: one INFO line per path with ENABLE_TRACE, otherwise a single summary line.
+static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* node) {
+  char line[1024]; // all writes below are bounded by snprintf, so a large topology truncates the text instead of overflowing
+#ifdef ENABLE_TRACE
+  INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id);
+#else
+  snprintf(line, sizeof(line), "%s/%lX :", topoNodeTypeStr[node->type], node->id);
+  int offset = strlen(line);
+#endif
+  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+    if (node->paths[t] == NULL) continue;
+    for (int n = 0; n<system->nodes[t].count; n++) {
+#ifdef ENABLE_TRACE
+      line[0] = 0;
+      int offset = 0;
+      for (int i=0; i<node->paths[t][n].count; i++) {
+        struct ncclTopoLink* link = node->paths[t][n].list[i];
+        struct ncclTopoNode* remNode = link->remNode;
+        snprintf(line+offset, sizeof(line)-offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
+        offset = strlen(line); // strlen is at most sizeof(line)-1, so the remaining size passed above is never 0
+      }
+      INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width);
+#else
+      snprintf(line+offset, sizeof(line)-offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type);
+      offset = strlen(line); // strlen is at most sizeof(line)-1, so the remaining size passed above is never 0
+#endif
+    }
+  }
+#ifndef ENABLE_TRACE
+  INFO(NCCL_GRAPH, "%s", line);
+#endif
+}
+// Print the stored paths of every GPU node, then of every NET node.
+ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) {
+  const int printedTypes[2] = { GPU, NET };
+  for (int k=0; k<2; k++) {
+    const int t = printedTypes[k];
+    // The node count is re-read on every iteration, as in a plain per-type loop.
+    for (int i=0; i<system->nodes[t].count; i++) printNodePaths(system, system->nodes[t].nodes+i);
+  }
+  return ncclSuccess;
+}
+// Return in *retCpu the index of the CPU with the fewest hops from GPU index `gpu`; fails when no CPU is reachable.
+static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
+  // Find the closest CPU to a GPU. A path with 0 hops means "no path" (a GPU is never a CPU), so it is skipped.
+  int minHops = 0;
+  int localCpu = -1;
+  struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU];
+  for (int c=0; c<system->nodes[CPU].count; c++) {
+    int hops = paths[c].count;
+    if (hops > 0 && (localCpu == -1 || hops < minHops)) {
+      localCpu = c;
+      minHops = hops;
+    }
+  }
+  if (localCpu == -1) {
+    WARN("Error : could not find CPU close to GPU %d", gpu);
+    return ncclInternalError;
+  }
+  *retCpu = localCpu;
+  return ncclSuccess;
+}
+// Rebuild the path from node (t1,i1) to node (t2,i2) as [source -> CPU c] followed by [CPU c -> destination].
+static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int i1, int t2, int i2) {
+  struct ncclTopoNode* src = system->nodes[t1].nodes+i1;
+  struct ncclTopoLinkList* dstPath = src->paths[t2]+i2;                        // path being rewritten
+  struct ncclTopoLinkList* toCpu = src->paths[CPU]+c;                          // source -> CPU
+  struct ncclTopoLinkList* fromCpu = system->nodes[CPU].nodes[c].paths[t2]+i2; // CPU -> destination
+
+  // Concatenate both segments; the counts are re-read on every iteration, as in a plain indexed loop.
+  int len = 0;
+  for (int i=0; i<toCpu->count; i++) dstPath->list[len++] = toCpu->list[i];
+  for (int i=0; i<fromCpu->count; i++) dstPath->list[len++] = fromCpu->list[i];
+  // Update path characteristics: the path is marked LINK_QPI and is as wide as its narrower segment.
+  dstPath->count = len;
+  dstPath->type = LINK_QPI;
+  dstPath->width = std::min(toCpu->width, fromCpu->width);
+  return ncclSuccess;
+}
+
+// Free the paths[nodeType] array of every node in the system and reset each pointer to NULL.
+static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
+  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+    for (int n=0; n<system->nodes[t].count; n++) {
+      struct ncclTopoLinkList** slot = &system->nodes[t].nodes[n].paths[nodeType];
+      free(*slot);
+      *slot = NULL;
+    }
+  }
+}
+// Recompute all paths towards CPUs, GPUs and NICs; when peerInfos is given, reroute or disable paths that cannot be used directly.
+ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
+  // Precompute paths between GPUs/NICs.
+
+  // Remove everything in case we're re-computing
+  for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
+
+  // Set direct paths from/to CPUs. We need them in many cases.
+  for (int c=0; c<system->nodes[CPU].count; c++) {
+    NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
+  }
+
+  // Set direct paths from/to GPUs.
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    // Compute paths to GPU g
+    NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system));
+
+    if (peerInfos == NULL) continue; // without peer information the paths are left as computed from the topology
+    // Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM
+    struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank; // peerInfos is indexed by rank
+    for (int p=0; p<system->nodes[GPU].count; p++) {
+      if (p == g) continue;
+      struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank;
+      int p2p;
+      NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
+      if (p2p == 0) {
+        int shm;
+        NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
+        if (shm == 1) {
+          // We cannot use GPU Direct, so we need all traffic to go through a CPU
+          int cpu;
+          NCCLCHECK(getLocalCpu(system, g, &cpu)); // the CPU closest to the destination GPU g is used
+          NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
+        } else {
+          // We cannot communicate with that peer.
+          system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; // a count of 0 is what the search code treats as "no path"
+        }
+      }
+    }
+  }
+
+  // Set direct paths from/to NICs.
+  for (int n=0; n<system->nodes[NET].count; n++) {
+    struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;
+    NCCLCHECK(ncclTopoSetPaths(netNode, system));
+
+    if (peerInfos == NULL) continue;
+    for (int g=0; g<system->nodes[GPU].count; g++) {
+      if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) == 0) { // bit n of gdrSupport is tested for NIC index n
+        // We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths
+        // to go through a CPU
+        int localCpu;
+        NCCLCHECK(getLocalCpu(system, g, &localCpu));
+        NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g)); // both directions are rerouted through the same CPU
+        NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n));
+      }
+    }
+  }
+
+  return ncclSuccess;
+}
+// Remove from the system every GPU that is not in the same domain as this rank's GPU; drop the network if all ranks are local.
+ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) {
+  int *domains;
+  int64_t *ids;
+  NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
+  NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count)); // NOTE(review): if this allocation fails, NCCLCHECK presumably returns without freeing `domains`
+  int myDomain = 0;
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+    domains[g] = g; // start with one domain per GPU ...
+    ids[g] = gpu->id; // ids are saved because GPU indexes change once nodes are removed below
+    for (int p=0; p<g; p++) {
+      if (gpu->paths[GPU][p].count > 0) {
+        domains[g] = std::min(domains[g], domains[p]); // ... then join the lowest domain among the earlier GPUs this one has a path to
+      }
+    }
+    if (gpu->rank == comm->rank) myDomain = domains[g];
+  }
+
+  int ngpus = system->nodes[GPU].count; // original GPU count; system->nodes[GPU].count shrinks inside the loop
+  for (int i=0; i<ngpus; i++) {
+    if (domains[i] == myDomain) continue;
+    struct ncclTopoNode* gpu = NULL;
+    int g;
+    for (g=0; g<system->nodes[GPU].count /* This one varies over the loops */; g++) {
+      gpu = system->nodes[GPU].nodes+g;
+      if (gpu->id == ids[i]) break; else gpu=NULL;
+    }
+    if (gpu == NULL) {
+      WARN("Could not find id %lx", ids[i]);
+      free(domains);
+      free(ids);
+      return ncclInternalError;
+    }
+
+    // Remove GPUs I can't access (even indirectly) from my view of the node
+    for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+      for (int n=0; n<system->nodes[t].count; n++) {
+        struct ncclTopoNode* node = system->nodes[t].nodes+n;
+        if (node == gpu) continue;
+        for (int l=0; l<node->nlinks; l++) {
+          while (l<node->nlinks && node->links[l].remNode == gpu) { // drop every link pointing at the GPU being removed
+            if (l<node->nlinks-1)
+              memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
+            node->nlinks--;
+          }
+          if (l<node->nlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) {
+            node->links[l].remNode--; // GPUs stored after `gpu` move down one slot in the memmove below, so pointers to them follow
+          }
+        }
+      }
+    }
+    if (g != system->nodes[GPU].count-1)
+      memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode)); // close the gap left in the GPU array
+    system->nodes[GPU].count--;
+  }
+
+  comm->localRanks = system->nodes[GPU].count;
+  if (system->nodes[GPU].count == comm->nRanks) { // every rank of the communicator is one of the remaining GPUs
+    // Trim network
+    ncclTopoRemovePathType(system, NET);
+    system->nodes[NET].count = 0;
+  }
+  free(domains);
+  free(ids);
+  return ncclSuccess;
+}
+// Lower *speed (an in/out running minimum) to this GPU's link speed, taken as max(NVLink speed, PCI speed).
+static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) {
+  int nvlSpeed = 0;
+  int pciSpeed = 0;
+  for (int l=0; l<node->nlinks; l++) {
+    // NVLink widths are summed over all NVLink links ...
+    if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width;
+    // ... while for PCI the width of the last PCI link in the list is kept.
+    if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width;
+  }
+  *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed));
+  return ncclSuccess;
+}
+// Compute system->maxSpeed (the max speed used to try to accelerate the search) from the GPU links and the NICs.
+ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) {
+  // Start from the local width and let every GPU lower it.
+  system->maxSpeed = LOC_WIDTH;
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    NCCLCHECK(getGpuSpeed(system->nodes[GPU].nodes+g, &system->maxSpeed));
+  }
+
+  // Without NICs there is nothing more to take into account.
+  if (system->nodes[NET].count == 0) return ncclSuccess;
+
+  // Try to assign one NIC per GPU: count the NICs whose best path width to a GPU equals the best over all NICs.
+  int bestWidth = 0;
+  int bestCount = 0;
+  for (int n=0; n<system->nodes[NET].count; n++) {
+    struct ncclTopoNode* nic = system->nodes[NET].nodes+n;
+    int nicWidth = 0;
+    for (int g=0; g<system->nodes[GPU].count; g++) nicWidth = std::max(nicWidth, nic->paths[GPU][g].width);
+    if (nicWidth == bestWidth) {
+      bestCount++;
+    } else if (nicWidth > bestWidth) {
+      bestWidth = nicWidth;
+      bestCount = 1;
+    }
+  }
+  // Cap by that number of NICs times NET_WIDTH.
+  system->maxSpeed = std::min(system->maxSpeed, bestCount*NET_WIDTH);
+  return ncclSuccess;
+}
+// Free every node's path arrays for all node types, then the system structure itself.
+void ncclTopoFree(struct ncclTopoSystem* system) {
+  for (int type=0; type<NCCL_TOPO_NODE_TYPES; type++) { ncclTopoRemovePathType(system, type); }
+  free(system);
+}
diff --git a/src/graph/rings.cc b/src/graph/rings.cc
new file mode 100644
index 0000000..5aacbb5
--- /dev/null
+++ b/src/graph/rings.cc
@@ -0,0 +1,57 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+
+#define MAXWIDTH 20
+#define PREFIXLEN 15
+#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
+// Log `prefix` followed by at most MAXWIDTH entries of `values` as a single INFO(NCCL_INIT) line.
+void dumpLine(int* values, int nranks, const char* prefix) {
+  char line[STRLENGTH+1];
+  int offset = snprintf(line, sizeof(line), "%s", prefix);
+  // Advance by what was really written so an entry wider than 3 characters is not clobbered by the next one; stop when full.
+  for (int i=0; i<nranks && i<MAXWIDTH && offset >= 0 && offset < (int)sizeof(line)-1; i++)
+    offset += snprintf(line+offset, sizeof(line)-offset, " %3d", values[i]);
+  INFO(NCCL_INIT,"%s", line);
+}
+// For each ring r, fill rings[r*nranks..] with the rank order found by following next[] from `rank`, then validate it. `prev` is unused.
+ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
+  for (int r=0; r<nrings; r++) {
+    char prefix[30];
+    int current = rank;
+    for (int i=0; i<nranks; i++) {
+      rings[r*nranks+i] = current;
+      current = next[r*nranks+current];
+      // Reject out-of-range successors before they are used as an index into next[].
+      if (current < 0 || current >= nranks) {
+        WARN("Error : ring %d has invalid next rank %d", r, current);
+        return ncclInternalError;
+      }
+    }
+    snprintf(prefix, sizeof(prefix), "Channel %02d/%02d : ", r, nrings);
+    if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
+    if (current != rank) {
+      WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
+      return ncclInternalError;
+    }
+    // Check that all ranks are there
+    for (int i=0; i<nranks; i++) {
+      int found = 0;
+      for (int j=0; j<nranks; j++) {
+        if (rings[r*nranks+j] == i) {
+          found = 1;
+          break;
+        }
+      }
+      if (found == 0) {
+        WARN("Error : ring %d does not contain rank %d", r, i);
+        return ncclInternalError;
+      }
+    }
+  }
+  return ncclSuccess;
+}
diff --git a/src/graph/rings.h b/src/graph/rings.h
new file mode 100644
index 0000000..c52b1ca
--- /dev/null
+++ b/src/graph/rings.h
@@ -0,0 +1,7 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
diff --git a/src/graph/search.cc b/src/graph/search.cc
new file mode 100644
index 0000000..3a8b4e7
--- /dev/null
+++ b/src/graph/search.cc
@@ -0,0 +1,594 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "graph.h"
+#include "topo.h"
+// Take (width > 0) or give back (width <= 0) `width` on every link of `path`; *node becomes the path's end node on success, NULL otherwise.
+static ncclResult_t ncclTopoFollowPath(struct ncclTopoGraph* graph, struct ncclTopoLinkList* path, struct ncclTopoNode** node, int width, int typeSave) {
+  if (path->count == 0) return ncclSuccess; // empty path: *node is left as the caller set it
+
+  *node = NULL;
+  if (width > 0) {
+    if (path->type > graph->type) return ncclSuccess;
+    graph->type = std::max(graph->type, path->type);
+  } else {
+    graph->type = typeSave;
+  }
+
+  for (int i=0; i<path->count; i++) {
+    if (path->list[i]->width < width) {
+      // Can't follow this path, rewind and exit
+      for (int j=0; j<i; j++) path->list[j]->width += width;
+      return ncclSuccess;
+    }
+    path->list[i]->width -= width;
+  }
+  // Update the hop count only now, so that a failed attempt above leaves graph->nHops unchanged.
+  graph->nHops += (width > 0) ? path->count : -path->count;
+  *node = path->list[path->count-1]->remNode;
+  return ncclSuccess;
+}
+// Min width of the first PCI link of `gpu` whose remote node links back to it and of that reverse link; -1 if there is none.
+static int gpuPciWidth(struct ncclTopoNode* gpu) {
+  for (int out=0; out<gpu->nlinks; out++) {
+    struct ncclTopoLink* down = gpu->links+out;
+    if (down->type == LINK_PCI) {
+      struct ncclTopoNode* peer = down->remNode;
+      for (int back=0; back<peer->nlinks; back++) {
+        struct ncclTopoLink* up = peer->links+back;
+        if (up->remNode == gpu) return std::min(down->width, up->width);
+      }
+    }
+  }
+  return -1;
+}
+
+/* Choose the order in which we try next GPUs. This is critical for the search
+   to quickly converge to the best solution even if it eventually times out. */
+struct ncclGpuScore {
+  int g;             // Retain the index
+  int startIndex;    // Least important (offset of the GPU from the current one, used as the final tie-break)
+  int intraNhops;    // hop count of the path from the current GPU to GPU g
+  int intraWidth;    // width of that path
+  int interNhops;    // hop count of the selected NIC's path for GPU g (left at 0 when no NIC paths are used)
+  int interPciWidth; // result of gpuPciWidth() for GPU g
+  int interWidth;    // Most important (width of the selected NIC's path for GPU g)
+};
+// qsort comparator: larger interWidth, larger interPciWidth, fewer interNhops, larger intraWidth, fewer intraNhops, lower startIndex first.
+static int cmpScore(const void * g1, const void * g2) {
+  struct ncclGpuScore* a = (struct ncclGpuScore*)g1;
+  struct ncclGpuScore* b = (struct ncclGpuScore*)g2;
+  // Keys by decreasing importance; the first non-zero difference decides the order.
+  const int diffs[6] = {
+    b->interWidth - a->interWidth, b->interPciWidth - a->interPciWidth,
+    a->interNhops - b->interNhops, b->intraWidth - a->intraWidth,
+    a->intraNhops - b->intraNhops, a->startIndex - b->startIndex };
+  for (int k=0; k<5; k++) if (diffs[k]) return diffs[k];
+  return diffs[5];
+}
+// Return 0 when all `count` entries have the same intraWidth and intraNhops as scores[0], 1 otherwise.
+static int cmpIntraScores(struct ncclGpuScore* scores, int count) {
+  const int refWidth = scores[0].intraWidth, refNhops = scores[0].intraNhops;
+  int differs = 0;
+  for (int i=1; i<count && !differs; i++) {
+    differs = !(scores[i].intraWidth == refWidth && scores[i].intraNhops == refNhops);
+  }
+  return differs;
+}
+// Return through *netPaths the GPU paths of the first NET node whose `used` mask has a bit in common with `flag`.
+static ncclResult_t getNetPaths(struct ncclTopoSystem* system, const uint64_t flag, struct ncclTopoLinkList** netPaths) {
+  struct ncclTopoNode* nets = system->nodes[NET].nodes;
+  int n = 0;
+  while (n < system->nodes[NET].count && (nets[n].used & flag) == 0) n++;
+  // No NET node carries the flag: report an error and leave *netPaths untouched.
+  if (n == system->nodes[NET].count) return ncclInternalError;
+  *netPaths = nets[n].paths[GPU];
+  return ncclSuccess;
+}
+// Fill next[] with the GPUs that can follow `gpu` in the current channel, sorted with cmpScore; *countPtr receives how many there are.
+ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* gpu, int* next, int* countPtr, int sortNet) {
+  const uint64_t flag = 1ULL<<(graph->nChannels); // bit used in the `used` masks for the channel currently being built
+  int ngpus = system->nodes[GPU].count;
+  struct ncclTopoLinkList* paths = gpu->paths[GPU];
+  struct ncclTopoLinkList* netPaths = NULL;
+  if (sortNet) NCCLCHECK(getNetPaths(system, flag, &netPaths)); // any non-zero sortNet brings the NIC paths into the scores
+
+  struct ncclGpuScore scores[NCCL_TOPO_MAX_NODES];
+  memset(scores, 0, ngpus*sizeof(struct ncclGpuScore));
+  int start = gpu-system->nodes[GPU].nodes; // index of `gpu` in the GPU array
+  int count = 0;
+  for (int i=1; i<ngpus; i++) { // visit all other GPUs, starting right after `gpu` and wrapping around
+    int g = (start+i)%ngpus;
+    if (paths[g].count == 0) continue; // There is no path to that GPU
+    if (system->nodes[GPU].nodes[g].used & flag) continue; // already part of the current channel
+    scores[count].g = g;
+    scores[count].startIndex = i;
+    scores[count].intraNhops = paths[g].count;
+    scores[count].intraWidth = paths[g].width;
+    if (netPaths) { // the inter* fields stay 0 (from the memset) when no NIC paths are used
+      scores[count].interNhops = netPaths[g].count;
+      scores[count].interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g);
+      scores[count].interWidth = netPaths[g].width;
+    }
+    count++;
+  }
+
+  // Sort GPUs
+  qsort(scores, count, sizeof(struct ncclGpuScore), cmpScore);
+
+  // Check if all have the same intra-node score in which case we go reverse for sortNet = -1
+  if (sortNet == -1 && cmpIntraScores(scores, count) == 0) {
+    for (int i=0; i<count; i++) next[i] = scores[count-1-i].g;
+  } else {
+    for (int i=0; i<count; i++) next[i] = scores[i].g;
+  }
+  *countPtr = count;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time);
+
+#define NCCL_SEARCH_TIMEOUT (1ULL<<20) // This should contain the whole search within a second or so.
+
+#define FORCED_ORDER_PCI 1 // Try the GPUs in PCI order (the next GPU tried is index step+1)
+#define FORCED_ORDER_REPLAY 2 // Try the same GPU order as the last channel
+// Return in *g the index of the GPU whose rank sits at position step+1 of the previous channel; *g is -1 on error.
+ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int step, int* g) {
+  *g = -1;
+  if (graph->nChannels == 0) return ncclInternalError;
+  const int ngpus = system->nodes[GPU].count;
+  const int wantedRank = graph->intra[(graph->nChannels-1)*ngpus+step+1];
+  int i = 0;
+  while (i < ngpus && system->nodes[GPU].nodes[i].rank != wantedRank) i++;
+  // No GPU has that rank: *g keeps its -1 value.
+  if (i == ngpus) return ncclInternalError;
+  *g = i;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time);
+// Try GPU index g as the next step: reserve `speed` on the path to it (if any), recurse, then undo the reservation and the marking.
+ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoLinkList* paths, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time, int g, int speed) {
+  int typeSave = graph->type; // restored by the ncclTopoFollowPath call made with a negative speed below
+  const uint64_t flag = 1ULL<<(graph->nChannels);
+  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+  if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, speed, typeSave)); // sets gpu to NULL when the path cannot be followed
+  if (gpu) {
+    gpu->used ^= flag; // mark the GPU as used in the current channel for the duration of the recursion
+    NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, maxSpeed, time));
+    gpu->used ^= flag; // unmark it again
+    if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, -speed, typeSave)); // give the reserved width back
+  }
+  return ncclSuccess;
+}
+// Set *copy to 1 when `graph` should replace `refGraph`; *copy is left untouched otherwise.
+ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
+  // 0. When we are trying to increase speedIntra, do not copy if the solution has less channels
+  // since it would likely impact the rings algorithms too.
+  const bool raisingIntra = graph->speedIntra > graph->speedInter;
+  if (raisingIntra && graph->nChannels < refGraph->nChannels) return ncclSuccess;
+
+  // 1. Try to get better bandwidth
+  const int bw = graph->nChannels*graph->speedIntra;
+  const int refBw = refGraph->nChannels*refGraph->speedIntra;
+  bool better = bw > refBw;
+  if (bw == refBw) {
+    // 2. Give an advantage when all channels are the same
+    // 3. Less hops
+    better = (graph->nChannels > 1 && graph->sameChannels && refGraph->sameChannels == 0)
+          || graph->nHops < refGraph->nHops;
+  }
+
+  if (better) *copy = 1;
+  return ncclSuccess;
+}
+// One recursive search step: record `gpu` at position `step` of the current channel, then try the possible continuations.
+ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time) {
+  if ((*time) <= 0) return ncclSuccess; // search budget used up
+  (*time)--;
+
+  int ngpus = system->nodes[GPU].count;
+  if (step == ngpus) { // the current channel is complete
+    // Determine whether we found a better solution or not
+    int copy = 0;
+    int sameChannels = graph->sameChannels; // saved so it can be restored when backtracking
+    if (graph->nChannels > 0) {
+      int* intra = graph->intra+graph->nChannels*ngpus;
+      for (int g=0; g<ngpus; g++) if (intra[g] != intra[g-ngpus]) graph->sameChannels = 0; // compare with the previous channel's order
+    }
+    graph->nChannels++;
+    NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, &copy));
+    if (copy) {
+      memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph));
+      if (graph->nChannels*graph->speedIntra == maxSpeed) *time = -1; // maxSpeed reached: make every pending call return immediately
+    }
+    if (graph->nChannels < MAXCHANNELS/2) {
+      NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, maxSpeed, time)); // look for one more channel
+    }
+    graph->nChannels--; // backtrack
+    graph->sameChannels = sameChannels;
+    return ncclSuccess;
+  }
+  graph->intra[graph->nChannels*ngpus+step] = gpu->rank;
+  if (step == backToNet) {
+    // first get back to NIC
+    if (system->nodes[NET].count) {
+      int maxWidth = 0;
+      struct ncclTopoLinkList* paths = gpu->paths[NET];
+      for (int n=0; n<system->nodes[NET].count; n++) { // first pass: best path width among the eligible NICs
+        if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue; // unless crossNic is 1, only the NIC recorded in inter[nChannels*2] is eligible
+        maxWidth = std::max(paths[n].width, maxWidth);
+      }
+      for (int n=0; n<system->nodes[NET].count; n++) { // second pass: try every eligible NIC that has that width
+        if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
+        if (paths[n].width == maxWidth) {
+          struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+          int typeSave = graph->type;
+          NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, graph->speedInter, typeSave));
+          if (net) {
+            graph->inter[graph->nChannels*2+1] = net->id;
+            NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, maxSpeed, time)); // same step, with backToNet disabled
+            NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, -graph->speedInter, typeSave)); // give the reserved width back
+          }
+        }
+      }
+    }
+  } else if (step < system->nodes[GPU].count-1) {
+    // Go to next GPU
+    struct ncclTopoLinkList* paths = gpu->paths[GPU];
+    int next[NCCL_TOPO_MAX_NODES];
+    int count;
+    if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
+      next[0] = step+1;
+      count = 1;
+    } else if (forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order
+      NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next));
+      count = 1;
+    } else { // Normal search
+      NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 )); // sortNet: 0 without backToNet, 1 when the next step is the backToNet step, -1 otherwise
+    }
+    for (int i=0; i<count; i++) {
+      int g = next[i];
+      int nvlink = graph->nvlink; // saved and restored around each candidate
+      graph->nvlink &= paths[g].type <= LINK_NVL ? 1 : 0;
+      int speed = graph->speedIntra;
+      if (paths[g].type == LINK_QPI) speed = INTEL_P2P_OVERHEAD(speed);
+      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, step+1, backToNet, backToFirstRank, forcedOrder, maxSpeed, time, g, speed));
+      graph->nvlink = nvlink;
+    }
+  } else if (step == backToFirstRank) {
+    // Find first GPU and loop back to it
+    int g;
+    int rank = graph->intra[graph->nChannels*ngpus]; // rank recorded at step 0 of the current channel
+    for (g=0; g<ngpus; g++) {
+      if (system->nodes[GPU].nodes[g].rank == rank) break;
+    }
+    if (g == ngpus) {
+      WARN("Could not find GPU with rank %d\n", rank);
+      return ncclInternalError;
+    }
+    struct ncclTopoLinkList* paths = gpu->paths[GPU];
+    struct ncclTopoNode* firstGpu = system->nodes[GPU].nodes+g;
+    int typeSave = graph->type;
+    NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, graph->speedIntra, typeSave));
+    if (firstGpu) {
+      NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, maxSpeed, time));
+      NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, -graph->speedIntra, typeSave)); // give the reserved width back
+    }
+  } else {
+    // Next path
+    NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, maxSpeed, time)); // jump to the "channel complete" case above
+  }
+  return ncclSuccess;
+}
+
+// Search for a channel starting from each network device (NET node).
+// For every NET node whose `used` mask is 0, the node is recorded as the entry
+// NIC of the channel being built (graph->inter[2*nChannels]) and several first
+// GPUs are tried in turn: GPU index 0 with the forced PCI order, then (if a
+// previous channel exists) the GPU returned by ncclTopoReplayGetGpu, then the
+// GPUs having the widest / shortest path from this NET node.
+// NOTE(review): the `used` bits toggled below are only toggled back on the
+// normal path; an early return through NCCLCHECK leaves them set.
+ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int maxSpeed, int* time) {
+  // One bit per channel index, used to mark NET nodes taken by this channel
+  const uint64_t flag = 1ULL<<(graph->nChannels);
+  const int speed = graph->speedInter;
+  for (int n=0; n<system->nodes[NET].count; n++) {
+    struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+    struct ncclTopoNode* gpu;
+    if (net->used == 0) {
+      graph->inter[graph->nChannels*2] = net->id;
+      // Mark every NET node sharing this node's rank as used by this channel
+      for (int i=0; i<system->nodes[NET].count; i++) {
+        if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
+      }
+      struct ncclTopoLinkList* paths = net->paths[GPU];
+
+      // First try the PCI order to set a reference
+      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, speed));
+      // Then try to replay the last channel
+      if (graph->nChannels > 0) {
+        int g;
+        NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
+        NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, g, speed));
+      }
+
+      // Then try the most local GPUs
+      // maxWidth : largest path width from this NET node to any GPU
+      // minHops  : smallest path length among the GPUs reaching maxWidth
+      int maxWidth = 0, minHops = 0xfffffff;
+      for (int g=0; g<system->nodes[GPU].count; g++) {
+        if (paths[g].width > maxWidth) {
+          maxWidth = paths[g].width;
+          minHops = paths[g].count;
+        } else if (paths[g].width == maxWidth && paths[g].count < minHops) {
+          minHops = paths[g].count;
+        }
+      }
+      if (maxWidth >= speed) {
+        // In the first loop, avoid using GPUs in both directions between channels (one channel
+        // sending from that GPU and one channel receiving to that GPU), since that usually leads
+        // to lower BW.
+        for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
+          for (int g=0; g<system->nodes[GPU].count; g++) {
+            if (paths[g].width == maxWidth && paths[g].count == minHops) {
+              gpu = system->nodes[GPU].nodes+g;
+              // gpuPciWidth() is defined outside this view; presumably it reports the
+              // PCI width still available on this GPU -- TODO confirm.
+              int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1;
+              if (tryGpuBidir == gpuUsed) {
+                NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, 0, maxSpeed, time, g, speed));
+              }
+            }
+          }
+        }
+      }
+      // Undo the marking done above (XOR with the same flag)
+      for (int i=0; i<system->nodes[NET].count; i++) {
+        if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
+      }
+    }
+  }
+  return ncclSuccess;
+}
+
+/* Search Patterns
+ *
+ *     Intra-node
+ * Ring            : GPU a -> GPU b -> .. -> GPU x -> GPU a
+ * (=Split Tree Loop)
+ * Tree            : GPU a -> GPU b -> .. -> GPU x
+ * (=Split Tree)
+ *
+ *     Inter-node
+ * Ring            : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic)
+ * Tree            : NET n -> GPU a -> GPU b -> .. -> GPU x
+ *                              `--> NET n (or m if crossNic)
+ * Split Tree      : NET n -> GPU a -> GPU b -> .. -> GPU x
+ *                                       `--> NET n (or m if crossNic)
+ * Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a
+ *                                       `--> NET n (or m if crossNic)
+ */
+// Translate a search pattern into the two step indexes driving the search :
+//  *backToNet       : step at which the path returns to the network (-1 = never)
+//  *backToFirstRank : step at which the path loops back to the first GPU (-1 = never)
+ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
+  const int lastStep = system->nodes[GPU].count-1;
+  const int isRing = (pattern == NCCL_TOPO_PATTERN_RING);
+  const int isTreeLoop = (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP);
+
+  if (system->nodes[NET].count == 0) {
+    // Intra-node only : no NIC to return to; rings and split-tree loops close on the first GPU.
+    *backToNet = -1;
+    *backToFirstRank = (isRing || isTreeLoop) ? lastStep : -1;
+    return ncclSuccess;
+  }
+
+  // Inter-node : ring exits after the last GPU, tree after the first, split trees after the second.
+  *backToNet = isRing ? lastStep : ((pattern == NCCL_TOPO_PATTERN_TREE) ? 0 : 1);
+  *backToFirstRank = isTreeLoop ? lastStep : -1;
+  return ncclSuccess;
+}
+
+// Run one search pass for the pattern stored in graph->pattern.
+// With network devices present the search starts from the NET nodes; otherwise
+// it starts from GPU index 0, trying the PCI order, then (when a previous channel
+// exists) a replay of the last channel, then the normal search order.
+// Any error reported by the underlying search is propagated to the caller.
+ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time) {
+  int backToNet, backToFirstRank;
+  NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
+  if (system->nodes[NET].count) {
+    // Start from NET
+    // The result used to be dropped here, hiding failures of the NET-based search.
+    NCCLCHECK(ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, maxSpeed, time));
+  } else {
+    // Start from GPU 0
+    NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, graph->speedIntra));
+    if (graph->nChannels > 0) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, 0, graph->speedIntra));
+    NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, 0, maxSpeed, time, 0, graph->speedIntra));
+  }
+  return ncclSuccess;
+}
+
+/* Parse user defined rings. Format is like :
+ * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
+ * Rings with a non-matching number of ranks are ignored so we can provide
+ * rings for multiple cases.
+ *
+ * str          : NUL-terminated description; any non-digit character separates
+ *                numbers, '|' (or the end of the string) closes a ring.
+ * nChannelsRet : receives the number of rings accepted.
+ * ngpus        : number of entries a ring must have to be accepted.
+ * channels     : output, ring c is written at channels[c*ngpus .. c*ngpus+ngpus-1].
+ * Always returns ncclSuccess; malformed rings are silently dropped.
+ *
+ * NOTE(review): nothing here limits nChannels to the capacity of `channels`,
+ * and a very long digit sequence overflows the int accumulator -- the input
+ * comes from an environment variable, confirm the caller's buffer size.
+ */
+#define MAX_ENV_RANKS 512
+static ncclResult_t parseGraph(const char* str, int* nChannelsRet, int ngpus, int* channels) {
+  int ranks[MAX_ENV_RANKS];   // numbers of the ring currently being read
+  int nChannels = 0;
+  int rank = 0;               // count of numbers read so far in the current ring
+  int offset = 0;
+  int status = 0; // 0 : between numbers, 1 : inside number
+  do {
+    int digit = str[offset] - '0';
+    if (digit >= 0 && digit <= 9) {
+      if (status == 0) {
+        ranks[rank] = digit;
+        status = 1;
+      } else {
+        ranks[rank] = ranks[rank]*10+digit;
+      }
+    } else {
+      // A non-digit ends the number being read, if any
+      if (status == 1) {
+        rank++;
+        // Too many numbers for the scratch buffer : stop parsing altogether
+        if (rank == MAX_ENV_RANKS) goto end;
+      }
+      status = 0;
+      if (str[offset] == '|' || str[offset] == '\0') {
+        // Ignore if ngpus doesn't match
+        if (rank != ngpus) goto newchannel;
+
+        for (int r=0; r<ngpus; r++) {
+          // Note : this inner `rank` shadows the outer counter
+          int rank = ranks[r];
+          // Ignore if ranks are out of bounds
+          if (rank < 0 || rank >= ngpus) goto newchannel;
+          // Ignore if ranks are duplicate
+          for (int i=0; i<r; i++)
+            if (ranks[i] == rank) goto newchannel;
+
+          // Entries written before a later rejection are simply overwritten by the next ring
+          channels[nChannels*ngpus+r] = rank;
+        }
+        nChannels++;
+newchannel:
+        rank = 0;
+      }
+    }
+  // The terminating NUL is processed by the body once before the loop exits
+  } while (str[offset++] != 0);
+end:
+  *nChannelsRet = nChannels;
+  return ncclSuccess;
+}
+
+// Compute the channels (graph->intra / graph->inter) for graph->pattern.
+// If the NCCL_GRAPH environment variable provides at least one valid ring for
+// this number of GPUs, those rings are used as is. Otherwise ncclTopoSearchRec
+// is run repeatedly on a scratch copy (tmpGraph), progressively relaxing the
+// pattern, the link type, crossNic and the per-channel speed, with results
+// saved into `graph`. If nothing is found, a single channel following the GPU
+// index order is used as a fallback.
+// On entry graph->pattern and graph->crossNic are read; most other fields are reset.
+ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+  int ngpus = system->nodes[GPU].count;
+  // Must be computed before graph->crossNic == 2 is reset to 0 below
+  int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
+  graph->speedIntra = graph->speedInter = 0;
+  if (graph->crossNic == 2) graph->crossNic = 0;
+  graph->nvlink = 0;
+  graph->type = LINK_LOC;
+  graph->nChannels = 0;
+  graph->sameChannels = 1;
+
+  char* str = getenv("NCCL_GRAPH");
+  if (str) {
+    NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra));
+    for (int i=0; i<graph->nChannels*ngpus; i++) {
+      // Translate gpu numbers into ranks
+      graph->intra[i] = system->nodes[GPU].nodes[graph->intra[i]].rank;
+    }
+    // TODO : let user specify NICs
+    graph->inter[0] = graph->inter[1] = 0;
+    graph->speedIntra = graph->speedInter = PCI_WIDTH+2;
+    graph->nvlink = 0;
+    if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
+      // Reverse the loop
+      // Element 0 stays in place; element i is swapped with element ngpus-i.
+      for (int c=0; c<graph->nChannels; c++) {
+        for (int i=0; i<=ngpus/2; i++) {
+          int tmp = graph->intra[ngpus*c+i];
+          graph->intra[ngpus*c+i] = graph->intra[ngpus*c+(ngpus-i)%ngpus];
+          // Wrap the destination index as well : for i == 0 the unwrapped index
+          // ngpus*c+ngpus is the first slot of the next channel (or past the last one).
+          graph->intra[ngpus*c+(ngpus-i)%ngpus] = tmp;
+        }
+      }
+    }
+    if (graph->nChannels) return ncclSuccess;
+  }
+
+  if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
+
+  struct ncclTopoGraph tmpGraph;
+  memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
+  int bestSpeed = 0;
+
+  // First try crossnic, then decrease speed and finally increase speedIntra.
+  tmpGraph.speedIntra = tmpGraph.speedInter = system->maxWidth;
+  int maxSpeed = system->maxSpeed;
+  tmpGraph.pattern = graph->pattern;
+
+search:
+  // Each pass starts with a fresh time budget and an empty scratch graph
+  int time = NCCL_SEARCH_TIMEOUT;
+  tmpGraph.nvlink = 1;
+  tmpGraph.nChannels = 0;
+  tmpGraph.sameChannels = 1;
+  NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, maxSpeed, &time));
+#if 0
+  printf("Pattern %d, crossNic %d, Speed %d/%d, type %d -> nChannels %dx%d/%d %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.type, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
+  for (int c=0; c<graph->nChannels; c++) {
+    printf("%2d : ", c);
+    for (int g=0; g<ngpus; g++) {
+      printf("%d ", graph->intra[c*ngpus+g]);
+    }
+    printf("\n");
+  }
+#endif
+  // time == -1 is treated as "stop searching"; it is set by the search code
+  // outside this view and by the `done` section below.
+  if (time == -1) goto done;
+  // We already have a solution and we timed out so lower speed will just timeout as well
+  if (time == 0 && graph->nChannels > 0) goto done;
+  if ((graph->nChannels > 0) && (bestSpeed == 0)) bestSpeed = graph->speedIntra;
+
+  if (tmpGraph.speedIntra == tmpGraph.speedInter) {
+    // First pass, we don't have a solution yet ; try to go slower.
+
+    // Try a simpler tree
+    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
+      tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+      goto search;
+    }
+    if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
+      tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
+      goto search;
+    }
+    tmpGraph.pattern = graph->pattern;
+
+    // Allow progressively more distant link types, up to LINK_QPI
+    if (tmpGraph.type < LINK_QPI) {
+      tmpGraph.type += 1;
+      goto search;
+    }
+    tmpGraph.type = graph->type;
+
+    if (crossNic && tmpGraph.crossNic == 0) {
+      // Try again with crossNic if permitted
+      tmpGraph.crossNic = crossNic;
+      goto search;
+    }
+    tmpGraph.crossNic = graph->crossNic;
+
+    // Try to reduce speed per channel
+    tmpGraph.speedIntra = tmpGraph.speedInter -= 3;
+    if (tmpGraph.speedIntra >= bestSpeed/2 && tmpGraph.speedIntra >= 3) goto search;
+  }
+
+done:
+  // We have a solution now. See if we can increase speedIntra
+  if (tmpGraph.speedIntra == tmpGraph.speedInter) {
+    time = -1;
+    memcpy(&tmpGraph, graph, sizeof(tmpGraph));
+  }
+  if (time != 0 && tmpGraph.pattern != NCCL_TOPO_PATTERN_RING && tmpGraph.speedIntra == graph->speedIntra) {
+    // Try to increase the intra speed only but keeping nChannels the same
+    tmpGraph.speedIntra += 3;
+    maxSpeed = tmpGraph.speedIntra * graph->nChannels;
+    if (tmpGraph.speedIntra <= tmpGraph.speedInter*2) goto search;
+  }
+
+  if (graph->nChannels == 0) {
+    WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
+    for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].rank;
+    graph->inter[0] = graph->inter[1] = 0;
+    graph->speedIntra = graph->speedInter = 3;
+    graph->nvlink = 0;
+    graph->nChannels = 1;
+  }
+  return ncclSuccess;
+}
+
+// Log a graph : one summary line, then one line per channel listing the entry
+// NET device (if the system has any), the GPU ranks in order, and the exit NET device.
+// Lines longer than the local buffer are truncated instead of overflowing it.
+ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+  INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %d/%d, nvlink %d, type %d, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, graph->nvlink, graph->type, graph->sameChannels);
+  int ngpus = system->nodes[GPU].count;
+
+  char line[1024];
+  for (int c=0; c<graph->nChannels; c++) {
+    // snprintf always NUL-terminates, so strlen(line) < sizeof(line) and the
+    // remaining size passed below is always at least 1.
+    snprintf(line, sizeof(line), "%2d :", c);
+    int offset = strlen(line);
+    if (system->nodes[NET].count > 0) {
+      snprintf(line+offset, sizeof(line)-offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c]);
+      offset = strlen(line);
+    }
+    for (int i=0; i<ngpus; i++) {
+      snprintf(line+offset, sizeof(line)-offset, " %s/%d", topoNodeTypeStr[GPU], graph->intra[ngpus*c+i]);
+      offset = strlen(line);
+    }
+    if (system->nodes[NET].count > 0) {
+      snprintf(line+offset, sizeof(line)-offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c+1]);
+      offset = strlen(line);
+    }
+    INFO(NCCL_GRAPH, "%s", line);
+  }
+  return ncclSuccess;
+}
+
+// Return in *dev the network device recorded for a channel.
+// graph->inter holds two entries per channel and `dir` (0 or 1) selects one of
+// them; channel ids beyond graph->nChannels wrap around.
+// graph->nChannels must be non-zero.
+ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* dev) {
+  const int channel = channelId % graph->nChannels;
+  const int slot = channel*2 + dir;
+  *dev = graph->inter[slot];
+  return ncclSuccess;
+}
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
new file mode 100644
index 0000000..a1b3209
--- /dev/null
+++ b/src/graph/topo.cc
@@ -0,0 +1,641 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "graph.h"
+#include "topo.h"
+#include "comm.h"
+#include "nvmlwrap.h"
+#include "net.h"
+#include <sys/stat.h>
+#include <fcntl.h>
+
+// Sizes, including the terminating NUL, of a full PCI bus id ("domain:bus:device.function")
+// and of its "domain:bus" prefix.
+#define BUSID_SIZE (sizeof("0000:00:00.0"))
+#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+
+// Names of path distances. NOTE(review): not referenced in the part of the file
+// visible here; presumably indexed by a distance enum declared elsewhere.
+const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
+
+// Printable names, indexed by node type (GPU, NET, ...) and by link type (LINK_*).
+// The order is presumably that of the corresponding enums/defines in the headers -- keep in sync.
+const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" };
+
+/******************************************************************/
+/******************* Graph Creation Functions *********************/
+/******************************************************************/
+// Read the integer stored in "<path>/numa_node".
+// Returns -1 when the file cannot be opened or holds no number.
+static int getNumaId(char *path) {
+  char numaFile[PATH_MAX];
+  snprintf(numaFile, sizeof(numaFile), "%s/numa_node", path);
+  numaFile[sizeof(numaFile)-1] = '\0';
+
+  FILE *fp = fopen(numaFile, "r");
+  if (!fp) return -1;
+
+  int node = -1;
+  int parsed = fscanf(fp, "%d", &node);
+  fclose(fp);
+
+  // On a matching failure `node` is left at its initial -1
+  return (parsed == EOF) ? -1 : node;
+}
+
+// Resolve the sysfs path of a PCI device.
+// busId : bus id of the form "0000:00:00.0"; it is lower-cased IN PLACE and must
+//         be at least BUSID_SIZE bytes long.
+// path  : receives a buffer allocated by realpath(); the caller frees it.
+// Returns ncclSystemError if the path cannot be resolved.
+static ncclResult_t getPciPath(char* busId, char** path) {
+  for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
+  // Template whose "0000:00" and "0000:00:00.0" placeholders are overwritten below
+  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
+  memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
+  memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
+  *path = realpath(busPath, NULL);
+  if (*path == NULL) {
+    WARN("Could not find real path of %s", busPath);
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
+
+// Get an int64 from a PCI path : the path component ending at `offset` (for
+// example "0000:02:00.0" in sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/) is
+// handed to busIdToInt64().
+// path      : PCI path.
+// offset    : index of the '/' or NUL ending the component of interest.
+// minOffset : lowest index the backward scan for the leading '/' may reach.
+// id        : receives the converted value.
+// Returns ncclInternalError if no '/' precedes the component within bounds.
+ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
+  char* str = path+offset;
+  char* limit = path + (minOffset > 0 ? minOffset : 0);
+  // Remove trailing "/"
+  if (*str == '/' && str > limit) str--;
+  // Find next /, never walking below the lower bound
+  while (str > limit && *str != '/') str--;
+  if (*str != '/') {
+    WARN("Could not find a PCI device in path %s", path);
+    return ncclInternalError;
+  }
+  str++;
+  NCCLCHECK(busIdToInt64(str, id));
+  return ncclSuccess;
+}
+
+// Find the index of the GPU node whose id equals `id`.
+// *index is set to -1 when there is no such GPU. Should several GPUs carry the
+// same id, the highest index is reported.
+static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) {
+  int found = -1;
+  // Walk from the end so the first hit is the highest matching index
+  for (int g = system->nodes[GPU].count-1; g >= 0; g--) {
+    if (system->nodes[GPU].nodes[g].id == id) {
+      found = g;
+      break;
+    }
+  }
+  *index = found;
+  return ncclSuccess;
+}
+
+
+// Resolve the sysfs path of the PCI device identified by `id`.
+// The id is first converted to a bus id string by int64ToBusId().
+// *path is allocated by getPciPath(); the caller frees it.
+static ncclResult_t getPath(int64_t id, char** path) {
+  // Placeholder giving the buffer the right size; overwritten by int64ToBusId
+  char busId[] = "0000:00:00.0";
+  NCCLCHECK(int64ToBusId(id, busId));
+  NCCLCHECK(getPciPath(busId, path));
+  return ncclSuccess;
+}
+
+// Resolve the sysfs path of CUDA device `cudaDev`, using the PCI bus id reported
+// by cudaDeviceGetPCIBusId(). *path is allocated by getPciPath(); the caller frees it.
+ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) {
+  char busId[BUSID_SIZE];
+  CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
+  NCCLCHECK(getPciPath(busId, path));
+  return ncclSuccess;
+}
+
+
+// Cached link widths, filled by getCpuWidths(). Both are 0 until detection has run.
+int interCpuWidth = 0;
+int cpuPciWidth = 0;
+
+// Detect the CPU and set interCpuWidth / cpuPciWidth accordingly.
+// Defaults are PCI_WIDTH for both; POWER uses P9_WIDTH between CPUs; Intel CPUs
+// use QPI_WIDTH, or SKL_QPI_WIDTH for family 6 with display model >= 0x55.
+// The result is cached : later calls return immediately.
+static ncclResult_t getCpuWidths() {
+  // Check if already detected
+  if (interCpuWidth + cpuPciWidth) return ncclSuccess;
+
+  // Defaults
+  char cpu[256];
+  sprintf(cpu, "Generic");
+  cpuPciWidth = interCpuWidth = PCI_WIDTH;
+
+#ifdef __PPC__
+  sprintf(cpu, "ppc64");
+  interCpuWidth = P9_WIDTH;
+#endif
+#ifdef __x86_64__
+  sprintf(cpu, "x86_64");
+  union {
+    struct {
+      // CPUID 0 String register order
+      uint32_t ebx;
+      uint32_t edx;
+      uint32_t ecx;
+    };
+    char vendor[12];
+  } cpuid0;
+
+  // CPUID overwrites EAX, EBX, ECX and EDX : all four are declared as outputs
+  // so the compiler does not assume any of them survives the instruction.
+  uint32_t maxLeaf;
+  asm volatile("cpuid" : "=a" (maxLeaf), "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "0" (0));
+  (void)maxLeaf;
+  if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel");
+
+  if (strcmp(cpu, "Intel") == 0) {
+    // Layout of EAX for CPUID leaf 1. Fields are unsigned so that values with
+    // the top bit set are not read back as negative numbers.
+    union {
+      struct {
+        unsigned steppingId:4;
+        unsigned modelId:4;
+        unsigned familyId:4;
+        unsigned processorType:2;
+        unsigned resv0:2;
+        unsigned extModelId:4;
+        unsigned extFamilyId:8;
+        unsigned resv1:4;
+      };
+      uint32_t val;
+    } cpuid1;
+    uint32_t ebx1, ecx1, edx1;
+    asm volatile("cpuid" : "=a" (cpuid1.val), "=b" (ebx1), "=c" (ecx1), "=d" (edx1) : "0" (1));
+    (void)ebx1; (void)ecx1; (void)edx1;
+    // For family 6 the displayed model is the extended model in the high nibble
+    // and the base model in the low nibble.
+    int displayModel = (cpuid1.extModelId << 4) + cpuid1.modelId;
+    if (cpuid1.familyId == 6 && displayModel >= 0x55) { // Skylake
+      sprintf(cpu, "Intel/Skylake (or later)");
+      interCpuWidth = SKL_QPI_WIDTH;
+    } else {
+      interCpuWidth = QPI_WIDTH;
+    }
+  }
+#endif
+  INFO(NCCL_GRAPH, "%s CPU (PCI %d, InterCpu %d)", cpu, cpuPciWidth, interCpuWidth);
+  return ncclSuccess;
+}
+
+// Return in *width the CPU-to-CPU link width, running CPU detection first if needed.
+static ncclResult_t ncclTopoGetInterCpuWidth(int* width) {
+  NCCLCHECK(getCpuWidths());
+  *width = interCpuWidth;
+  return ncclSuccess;
+}
+// Return in *width the width used for PCI links attached to a CPU (cpuPciWidth),
+// running CPU detection first if needed.
+static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) {
+  NCCLCHECK(getCpuWidths());
+  *width = cpuPciWidth;
+  return ncclSuccess;
+}
+// Return in *width the width of a PCI link (constant PCI_WIDTH).
+static ncclResult_t ncclTopoGetPciWidth(int* width) {
+  *width = PCI_WIDTH;
+  return ncclSuccess;
+}
+// Return in *width the width of a NIC-to-network link (constant NET_WIDTH).
+static ncclResult_t ncclTopoGetNetWidth(int* width) {
+  *width = NET_WIDTH;
+  return ncclSuccess;
+}
+
+// Kind of device found at the remote end of an NVLink, as derived by
+// ncclDeviceType() from the PCI class of that device.
+enum ncclNvLinkDeviceType {
+  ncclNvLinkDeviceUnknown, // PCI class not recognized
+  ncclNvLinkDeviceGpu,     // PCI class 0x030200 or 0x030000
+  ncclNvLinkDeviceSwitch,  // PCI class 0x068000 (NVswitch)
+  ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
+};
+
+// Classify the PCI device `busId` by reading its sysfs "class" file.
+// busId : lower case bus id of the form "0000:00:00.0".
+// type  : receives the device kind; left untouched on failure.
+// Returns ncclSystemError when the class file cannot be resolved or opened
+// (for example when the device is not visible from inside a VM).
+static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
+  char classPath[] =  "/sys/bus/pci/devices/0000:00:00.0/class";
+  memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
+  char* rPath = realpath(classPath, NULL);
+  // realpath() returns NULL when the device does not exist : do not hand NULL to open()
+  int fd = (rPath == NULL) ? -1 : open(rPath, O_RDONLY);
+  if (fd == -1) {
+    // Could not find device. It might be because we're in a VM and
+    // we don't see the whole machine. This is handled silently so
+    // we don't want to print an INFO error.
+    TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath ? rPath : classPath, strerror(errno));
+    free(rPath); // was leaked on this path
+    return ncclSystemError;
+  }
+  free(rPath);
+  // Pre-filled so that the buffer stays NUL-terminated after reading at most 8 bytes
+  char pciClass[9];
+  strncpy(pciClass, "0x000000", 9);
+  int len;
+  SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
+  SYSCHECK(close(fd), "close");
+  if (strcmp(pciClass, "0x068000") == 0) {
+    // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
+    *type = ncclNvLinkDeviceSwitch;
+  } else if (strcmp(pciClass, "0x068001") == 0) {
+    // PCI device is of type "Bridge: IBM Device 04ea"
+    *type = ncclNvLinkDeviceBridge;
+  } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
+      || strcmp(pciClass, "0x030000") == 0) {  // "VGA Controller" (GeForce)
+    *type = ncclNvLinkDeviceGpu;
+  } else {
+    *type = ncclNvLinkDeviceUnknown;
+  }
+  return ncclSuccess;
+}
+
+// Connect `node` to the CPU node whose id is `numaId`, in both directions,
+// with the given link type and width. The CPU node is created if no CPU node
+// with that id exists yet.
+ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) {
+  // Scan from the end : if several CPU nodes had the same id, the last one is selected
+  struct ncclTopoNode* cpu = NULL;
+  for (int idx = system->nodes[CPU].count-1; idx >= 0; idx--) {
+    struct ncclTopoNode* candidate = system->nodes[CPU].nodes+idx;
+    if (candidate->id == numaId) {
+      cpu = candidate;
+      break;
+    }
+  }
+  if (cpu == NULL) {
+    // Create CPU
+    NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
+  }
+  NCCLCHECK(ncclTopoConnectNodes(node, cpu, linkType, linkWidth));
+  NCCLCHECK(ncclTopoConnectNodes(cpu, node, linkType, linkWidth));
+  return ncclSuccess;
+}
+
+// Add the NVLink connections of every GPU to the topology and derive
+// system->maxSpeed / system->maxWidth from them.
+// nvmlDevs : NVML handles, indexed like system->nodes[GPU].nodes.
+// For each usable link, the device at the other end decides what is connected :
+// another GPU of the system, the closest CPU (IBM bridge) or a single shared
+// NVswitch node. A remote device which cannot be found in sysfs is assumed to
+// be an NVswitch.
+ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) {
+  struct ncclTopoNode* nvsNode = NULL;
+
+  // Minimum, over all GPUs, of the number of connected links and of the link width
+  int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH;
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+    int cudaMajor, cudaMinor;
+    NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor));
+    // Links to probe and per-link width, chosen from the compute capability
+    int maxNvLinks, width;
+    if (cudaMajor < 6) {
+      maxNvLinks = 0;
+      width = 0;
+    } else if (cudaMajor == 6) {
+      maxNvLinks = 4;
+      width = PASCAL_NVLINK_WIDTH;
+    } else {
+      maxNvLinks = 6;
+      width = VOLTA_NVLINK_WIDTH;
+    }
+
+    int nvlinks = 0;
+    for (int l=0; l<maxNvLinks; ++l) {
+      // Check whether we can use this NVLink for P2P
+      unsigned canP2P;
+      if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDevs[g], l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
+
+      // Make sure the Nvlink is up. The previous call should have trained the link.
+      nvmlEnableState_t isActive;
+      if ((wrapNvmlDeviceGetNvLinkState(nvmlDevs[g], l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+
+      // Try to figure out what's on the other side of the NVLink
+      nvmlPciInfo_t remoteProc;
+      if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevs[g], l, &remoteProc) != ncclSuccess) continue;
+
+      // Make a lower case copy of the bus ID for calling ncclDeviceType
+      // PCI system path is in lower case
+      char* p = remoteProc.busId;
+      char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+      for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+        lowerId[c] = tolower(p[c]);
+        if (p[c] == 0) break;
+      }
+
+      // A failure here means the remote device is not visible in sysfs (e.g.
+      // inside a VM). This must not abort detection : fall through to the
+      // "unknown device, assume NVswitch" case below.
+      enum ncclNvLinkDeviceType type = ncclNvLinkDeviceUnknown;
+      if (ncclDeviceType(lowerId, &type) != ncclSuccess) type = ncclNvLinkDeviceUnknown;
+      if (type == ncclNvLinkDeviceGpu) {
+        int64_t remoteId;
+        NCCLCHECK(busIdToInt64(lowerId, &remoteId));
+        int peer;
+        NCCLCHECK(idToIndex(system, remoteId, &peer));
+        // peer == -1 : the remote GPU is not part of this system, ignore the link
+        if (peer != -1) {
+          NCCLCHECK(ncclTopoConnectNodes(gpu, system->nodes[GPU].nodes+peer, LINK_NVL, width));
+          nvlinks++;
+        }
+      } else if (type == ncclNvLinkDeviceBridge) {
+        // Nvlink between GPU and CPU (PPC)
+        // Since the remote bridge does not have a valid numa_node, assume we
+        // are connected to the closest CPU.
+        char* path;
+        NCCLCHECK(getPath(gpu->id, &path));
+        int numaId = getNumaId(path);
+        free(path);
+        NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width));
+        nvlinks++;
+      } else { // Nvswitch
+        if (type == ncclNvLinkDeviceUnknown) {
+          // The NVLink is up but we couldn't find the PCI device on the other
+          // side. Assume it's an NVswitch outside a VM.
+          if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId);
+        }
+        if (nvsNode == NULL) { // Create nvswitch
+          NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0));
+        }
+        NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH));
+        NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH));
+        nvlinks++;
+      }
+    }
+    minNvlinks = std::min(minNvlinks, nvlinks);
+    minWidth = std::min(minWidth, width);
+  }
+  // If at least one GPU has no NVLink connection, fall back to PCI figures
+  int pciWidth;
+  NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
+  system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth;
+  system->maxWidth = minNvlinks ? minWidth : pciWidth;
+  return ncclSuccess;
+}
+
+// Attach `endNode` to the topology by following its sysfs PCI path upwards.
+// PCI nodes are created (or reused if a node with the same id already exists)
+// for intermediate components of `path`, each linked in both directions to the
+// node below it. If an existing PCI node is found the function stops there;
+// otherwise the top-most node is attached to the CPU node given by the
+// numa_node of `path`.
+ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) {
+  struct ncclTopoNode* lastNode = endNode;
+  int pciWidth;
+  NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
+  // Find intermediate PCI switches
+  // offsetRC : index of the 4th '/' of the path (or the path length if there are
+  // fewer); components before it are not turned into PCI nodes.
+  int slashCount = 0;
+  int offsetRC = 0;
+  while (offsetRC < strlen(path)) {
+    if (path[offsetRC] == '/') slashCount++;
+    if (slashCount == 4) break;
+    offsetRC++;
+  }
+  // Walk the path backwards, from its end down to offsetRC
+  int offset = strlen(path);
+  slashCount = 0;
+  while (--offset > offsetRC) {
+    if (path[offset] == '/') {
+      slashCount++;
+      // Find if already existing
+      // Only every second '/' is considered. NOTE(review): presumably because a
+      // PCI switch shows up as two consecutive path components -- confirm.
+      if ((slashCount%2) == 0) {
+        int64_t pciId;
+        NCCLCHECK(pciPathToInt64(path, offset, offsetRC, &pciId));
+        for (int p=0; p<system->nodes[PCI].count; p++) {
+          if (system->nodes[PCI].nodes[p].id == pciId) {
+            // Found our PCI switch. Attach and stop since the rest should already
+            // be connected
+            NCCLCHECK(ncclTopoConnectNodes(system->nodes[PCI].nodes+p, lastNode, LINK_PCI, pciWidth));
+            NCCLCHECK(ncclTopoConnectNodes(lastNode, system->nodes[PCI].nodes+p, LINK_PCI, pciWidth));
+            return ncclSuccess;
+          }
+        }
+        // Not known yet : create the PCI node and continue upwards from it
+        struct ncclTopoNode* pciNode;
+        NCCLCHECK(ncclTopoCreateNode(system, &pciNode, PCI, pciId));
+        NCCLCHECK(ncclTopoConnectNodes(pciNode, lastNode, LINK_PCI, pciWidth));
+        NCCLCHECK(ncclTopoConnectNodes(lastNode, pciNode, LINK_PCI, pciWidth));
+        lastNode = pciNode;
+      }
+    }
+  }
+  // Then attach to a CPU node
+  // getNumaId() returns -1 when the numa_node file cannot be read; that value is then used as CPU id.
+  int numaId = getNumaId(path);
+  int width;
+  NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
+  NCCLCHECK(ncclTopoConnectCpu(system, numaId, lastNode, LINK_PCI, width));
+  return ncclSuccess;
+}
+
+// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports.
+#include <glob.h>
+#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid"
+// Read the system image GUID of the mlx5 device found under the PCI path `path`.
+// Returns the 64-bit GUID assembled from the four 16-bit groups of the
+// sys_image_guid file, or 0 if no such file can be opened or fully parsed.
+uint64_t getIbGuid(char* path) {
+  uint64_t guid = 0ULL;
+  char guidPath[PATH_MAX];
+  snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path);
+  // PATH has a wildcard in it so use glob()
+  glob_t globbuf;
+  memset(&globbuf, 0, sizeof(globbuf));
+  // Only look at the results when glob() succeeded; otherwise guidPath keeps
+  // the pattern and the fopen() below fails.
+  if (glob(guidPath, 0, NULL, &globbuf) == 0 && globbuf.gl_pathc > 0)
+    strncpy(guidPath, globbuf.gl_pathv[0], PATH_MAX);
+  globfree(&globbuf);
+  guidPath[PATH_MAX-1] = '\0';
+  FILE *file = fopen(guidPath, "r");
+  if (file != NULL) {
+    uint64_t a = 0, b = 0, c = 0, d = 0;
+    // Require all four groups : a partial match would leave some of them unset
+    if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) == 4) {
+      guid = (a << 48) + (b << 32) + (c<<16) + d;
+      TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid);
+    }
+    fclose(file);
+  }
+  return guid;
+}
+
+// Per network device information used while attaching NICs to the topology.
+// Filled by ncclTopoConnectPCI (path) and ncclTopoComputeNetInfo (other fields).
+struct netInfo {
+  char* path;    // PCI path of the device, or NULL if it could not be obtained
+  int64_t nic;   // id of the NIC node : device index, or PCI id with the last path character forced to '0'
+  uint64_t asic; // device index, or the IB system image GUID when one is found
+  int port;      // number of earlier devices sharing the same `nic`
+  int net;       // network id; devices with the same asic and port share it
+};
+
+// Fill nic / asic / port / net for each of the `ndev` entries of netInfos.
+// By default every device is its own NIC, ASIC and network. When the device has
+// a PCI path and an IB GUID can be read from it, devices sharing a PCI device
+// (ignoring the last character of the path) are treated as ports of the same NIC,
+// and devices with the same GUID and port number share the same network id.
+// Returns ncclSystemError if a temporary copy of the path cannot be allocated.
+ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) {
+  for (int n=0; n<ndev; n++) {
+    struct netInfo* info = netInfos+n;
+    uint64_t ibGuid;
+    info->nic = n;
+    info->asic = n;
+    info->port = 0;
+    info->net = n;
+    if (info->path && info->path[0] != '\0' && (ibGuid = getIbGuid(info->path)) != 0) {
+      info->asic = ibGuid;
+
+      // Ignore PCI subdevice when computing the ID to merge multi-port cards
+      // and make them use the same PCI link.
+      char* path = strdup(info->path);
+      if (path == NULL) {
+        WARN("Could not duplicate path %s", info->path);
+        return ncclSystemError;
+      }
+      path[strlen(path)-1]='0';
+      // Free the copy before checking the result so it is not leaked on error
+      ncclResult_t res = pciPathToInt64(path, strlen(path), 0, &info->nic);
+      free(path);
+      NCCLCHECK(res);
+
+      // Same PCI path -> different ports of the same NIC
+      for (int i=0; i<n; i++) if (netInfos[i].nic == info->nic) info->port++;
+
+      // Same GUID -> same network links as the other NIC
+      for (int i=0; i<n; i++) if (netInfos[i].asic == info->asic && netInfos[i].port == info->port) info->net = netInfos[i].net;
+    }
+    // nic is 64-bit : print it with %lx like asic; never pass NULL to %s
+    INFO(NCCL_GRAPH, "%s -> %lx/%lx/%d/%d", info->path ? info->path : "(null)", info->nic, info->asic, info->port, info->net);
+  }
+  return ncclSuccess;
+}
+
+// Build the PCI part of the topology :
+//  1. attach every GPU to the PCI tree / CPU nodes following its sysfs path,
+//  2. create a NIC node per distinct netInfo.nic and a NET node per network
+//     device, linking NET <-> NIC with LINK_NET,
+//  3. connect every pair of CPU nodes with a LINK_QPI link.
+// NOTE(review): buffers (`path`, netInfos and its paths) are not released when
+// an NCCLCHECK returns early.
+ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
+  for (int g=0; g<system->nodes[GPU].count; g++) {
+    struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+    char* path;
+    NCCLCHECK(getPath(gpu->id, &path));
+    NCCLCHECK(ncclTopoCreatePciPath(system, gpu, path));
+    free(path);
+  }
+
+  // Connect the NICs
+  int netDevCount;
+  NCCLCHECK(ncclNetDevices(&netDevCount));
+  int netWidth;
+  NCCLCHECK(ncclTopoGetNetWidth(&netWidth));
+
+  struct netInfo* netInfos;
+  NCCLCHECK(ncclCalloc(&netInfos, netDevCount));
+
+  // A device without a PCI path keeps path == NULL and is handled as a virtual NIC below
+  for (int n=0; n<netDevCount; n++) {
+    ncclResult_t res = ncclNetPciPath(n, &netInfos[n].path);
+    if (res != ncclSuccess) netInfos[n].path = NULL;
+  }
+
+  NCCLCHECK(ncclTopoComputeNetInfo(netInfos, netDevCount));
+
+  for (int n=0; n<netDevCount; n++) {
+    struct netInfo* info = netInfos+n;
+    // Create NIC and attach it to the PCI tree
+    // Devices sharing the same info->nic reuse the NIC node created for the first of them
+    struct ncclTopoNode* nicNode = NULL;
+    for (int i=0; i<system->nodes[NIC].count; i++) {
+      if (system->nodes[NIC].nodes[i].id == info->nic) {
+        nicNode = system->nodes[NIC].nodes+i;
+        break;
+      }
+    }
+    if (!nicNode) {
+      NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic));
+      if (info->path) {
+        // Create the PCI path
+        NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path));
+      } else {
+        // This is probably a virtual NIC. Just attach it directly to CPU 0
+        int width;
+        NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
+        NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width));
+      }
+    }
+    // The path is no longer needed once the NIC is attached (free(NULL) is a no-op)
+    free(info->path);
+
+    // Create the network side
+    // The NET node id is the network device index
+    struct ncclTopoNode* netNode;
+    NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, n));
+
+    // Use rank to store the net information
+    netNode->rank = info->net;
+
+    NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth));
+    NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth));
+  }
+  free(netInfos);
+
+  // And connect all CPU nodes together
+  // Each ordered pair (n, p) gets its own link, so both directions are covered
+  for (int n=0; n<system->nodes[CPU].count; n++) {
+    for (int p=0; p<system->nodes[CPU].count; p++) {
+      if (n == p) continue;
+      int width;
+      NCCLCHECK(ncclTopoGetInterCpuWidth(&width));
+      NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width));
+    }
+  }
+  return ncclSuccess;
+}
+
+// Log `node` and, recursively, everything reachable from it through PCI links.
+// node     : node to print.
+// prevNode : node we came from; links going back to it are skipped.
+// line     : scratch buffer shared by the whole recursion.
+// offset   : column at which this node's text starts in `line`.
+// NOTE(review): writes into `line` are not bounded; the caller's buffer must be
+// large enough for the deepest branch.
+static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
+  if (node->type == GPU) {
+    sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank);
+  } else {
+    sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
+  }
+  INFO(NCCL_GRAPH, "%s", line);
+  // Blank the prefix so that the lines of the children are indented under this node
+  for (int i=0; i<offset; i++) line[i] = ' ';
+
+  for (int l=0; l<node->nlinks; l++) {
+    struct ncclTopoLink* link = node->links+l;
+    if (link->type == LINK_LOC) continue;
+    if (link->remNode != prevNode) {
+      sprintf(line+offset, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width);
+      int nextOffset = strlen(line);
+      if (link->type == LINK_PCI) {
+        // Only PCI links are followed recursively
+        NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
+      } else {
+        // Other links : print the remote node on the same line, without descending into it
+        if (link->remNode->type == NET) {
+          sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->rank);
+        } else {
+          sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
+        }
+        INFO(NCCL_GRAPH, "%s", line);
+      }
+    }
+  }
+  return ncclSuccess;
+}
+
+// Dumps the topology to the NCCL_GRAPH log : a header, one tree per CPU node,
+// a footer, then the paths (through ncclTopoPrintPaths).
+ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
+  char line[1024];
+  struct ncclTopoNodeSet* cpus = s->nodes+CPU;
+  INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed);
+  for (int c=0; c<cpus->count; c++) {
+    NCCLCHECK(ncclTopoPrintRec(cpus->nodes+c, NULL, line, 0));
+  }
+  INFO(NCCL_GRAPH, "==========================================");
+  NCCLCHECK(ncclTopoPrintPaths(s));
+  return ncclSuccess;
+}
+
+// Moves the link of `node` pointing back to `upNode` to the last used slot, keeping
+// the other links in their current order, then recurses into every node reachable
+// through a PCI link. `upNode` is NULL for the root of a tree.
+// Returns ncclInternalError if `node` has no link to `upNode`.
+static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) {
+  // Shift all links to have upLink as last link
+  if (upNode) {
+    int l=0;
+    // Bound the search by nlinks so that a missing up link cannot run past the array
+    while (l < node->nlinks && node->links[l].remNode != upNode) l++;
+    if (l == node->nlinks) {
+      WARN("Error : could not find up link in node of type %d", node->type);
+      return ncclInternalError;
+    }
+    struct ncclTopoLink upLink;
+    memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink));
+    // links[l+1] does not exist when every slot is used, so check the count before the sentinel
+    while (l+1 < node->nlinks && node->links[l+1].remNode) {
+      memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink));
+      l++;
+    }
+    memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink));
+  }
+
+  // Recursively sort the PCI tree
+  for (int l=0; l<node->nlinks; l++) {
+    struct ncclTopoLink* link = node->links+l;
+    if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node));
+  }
+  return ncclSuccess;
+}
+
+// We want the graph to be organized to ease/accelerate traversal :
+// 1. NVLinks (already the case)
+// 2. PCI down
+// 3. PCI up
+// 4. QPI (already the case)
+// Sorting is started from every CPU node, with no up node.
+ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
+  struct ncclTopoNodeSet* cpus = system->nodes+CPU;
+  for (int c=0; c<cpus->count; c++) {
+    NCCLCHECK(ncclTopoSort(cpus->nodes+c, NULL));
+  }
+  return ncclSuccess;
+}
+
+// Builds the topology of the local node and returns it in *system.
+// A GPU node is created for every rank which has the same hostHash as ours and whose
+// device can be found through NVML; NVLink and PCI connections are then added and the
+// links are sorted.
+// On error, everything allocated here is released and *system is left untouched.
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
+  struct ncclTopoSystem* s = NULL;
+  nvmlDevice_t* nvmlDevs = NULL;
+  ncclResult_t res = ncclSuccess;
+  int g = 0;
+  NCCLCHECK(ncclCalloc(&s, 1));
+  NCCLCHECKGOTO(ncclCalloc(&nvmlDevs, comm->nRanks), res, fail);
+  for (int r=0; r<comm->nRanks; r++) {
+    if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
+      // Consider the GPU as outside of our node if we can't see it through NVML.
+      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+      NCCLCHECKGOTO(int64ToBusId(comm->peerInfo[r].busId, busId), res, fail);
+      if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+g) != ncclSuccess) continue;
+      g++;
+      struct ncclTopoNode* gpuNode;
+      NCCLCHECKGOTO(ncclTopoCreateNode(s, &gpuNode, GPU, comm->peerInfo[r].busId), res, fail);
+      gpuNode->rank = r;
+    }
+  }
+
+  NCCLCHECKGOTO(ncclTopoConnectNVLink(nvmlDevs, s), res, fail);
+  NCCLCHECKGOTO(ncclTopoConnectPCI(s), res, fail);
+  NCCLCHECKGOTO(ncclTopoSortSystem(s), res, fail);
+
+  free(nvmlDevs);
+  *system = s;
+  return ncclSuccess;
+
+fail:
+  // Early returns used to leak both allocations
+  free(nvmlDevs);
+  free(s);
+  return res;
+}
+
+// Sets *nvlink to 1 when both bus ids resolve to a GPU index and the path between the
+// two GPUs is of type LINK_NVL, to 0 otherwise.
+ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink) {
+  int src, dst;
+  NCCLCHECK(idToIndex(system, busId1, &src));
+  NCCLCHECK(idToIndex(system, busId2, &dst));
+  if (src == -1 || dst == -1) {
+    // At least one of the GPUs is not part of this topology
+    *nvlink = 0;
+    return ncclSuccess;
+  }
+  struct ncclTopoLinkList* path = system->nodes[GPU].nodes[src].paths[GPU]+dst;
+  *nvlink = (path->type == LINK_NVL) ? 1 : 0;
+  return ncclSuccess;
+}
+
+// Sets *nvlink to 1 if the GPU identified by busId has a path of type LINK_NVL to at
+// least one other GPU, to 0 otherwise.
+// A bus id which does not resolve to a GPU index (index -1, the same convention
+// ncclTopoGetNvlink checks for) reports 0 instead of indexing the GPU array with -1.
+ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) {
+  int g;
+  NCCLCHECK(idToIndex(system, busId, &g));
+  *nvlink = 0;
+  if (g == -1) return ncclSuccess;
+  for (int i=0; i<system->nodes[GPU].count; i++) {
+    if (i == g) continue;
+    if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) {
+      *nvlink = 1;
+      return ncclSuccess;
+    }
+  }
+  return ncclSuccess;
+}
+
+// Classifies a path as one of the PATH_* distances.
+// Paths start as PATH_PIX, or PATH_PXB when they have more than 2 links. Each link
+// whose remote node is a CPU then turns PATH_PHB into PATH_SYS, and any other value
+// into PATH_PHB.
+static int pathDistance(struct ncclTopoLinkList* links) {
+  int distance = (links->count > 2) ? PATH_PXB : PATH_PIX;
+  for (int hop=0; hop<links->count; hop++) {
+    if (links->list[hop]->remNode->type != CPU) continue;
+    // PHB if we go through 1 CPU, SYS if we go through 2 CPUs
+    if (distance == PATH_PHB) {
+      distance = PATH_SYS;
+    } else {
+      distance = PATH_PHB;
+    }
+  }
+  return distance;
+}
+
+// Returns in *distance the PATH_* classification of the path between two GPUs.
+// NOTE(review): the indexes returned by idToIndex are used without checking for -1,
+// unlike in ncclTopoGetNvlink -- confirm that callers only pass known bus ids.
+ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance) {
+  int src, dst;
+  NCCLCHECK(idToIndex(system, busId1, &src));
+  NCCLCHECK(idToIndex(system, busId2, &dst));
+  struct ncclTopoLinkList* path = system->nodes[GPU].nodes[src].paths[GPU]+dst;
+  *distance = pathDistance(path);
+  return ncclSuccess;
+}
+
+// Returns in *distance the PATH_* classification of the path from a GPU to the
+// NET node at index netDev.
+// NOTE(review): neither the GPU index nor netDev is range-checked here -- confirm
+// that callers only pass valid values.
+ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) {
+  int gpu;
+  NCCLCHECK(idToIndex(system, busId, &gpu));
+  struct ncclTopoLinkList* path = system->nodes[GPU].nodes[gpu].paths[NET]+netDev;
+  *distance = pathDistance(path);
+  return ncclSuccess;
+}
+
+// Returns in *count the number of CPU nodes of the topology.
+ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count) {
+  struct ncclTopoNodeSet* cpus = system->nodes+CPU;
+  *count = cpus->count;
+  return ncclSuccess;
+}
diff --git a/src/graph/topo.h b/src/graph/topo.h
new file mode 100644
index 0000000..6b8a2f9
--- /dev/null
+++ b/src/graph/topo.h
@@ -0,0 +1,138 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TOPO_H_
+#define NCCL_TOPO_H_
+
+#include "graph.h"
+#include "core.h"
+
+// Link widths for the different kinds of links.
+// NOTE(review): presumably GB/s, like the bandwidths in tuning.cc -- confirm.
+#define LOC_WIDTH 5000         // Width of the link a GPU has to itself (see ncclTopoCreateNode)
+#define PASCAL_NVLINK_WIDTH 18
+#define VOLTA_NVLINK_WIDTH 21
+#define PCI_WIDTH 12           // PCI Gen3 x16
+#define QPI_WIDTH 8
+#define SKL_QPI_WIDTH 12
+#define P9_WIDTH 32
+#define NET_WIDTH 12           // 100Gbit
+
+// Intel CPUs convert GPU P2P traffic into 64B PCI TLPs, so GPU
+// to GPU traffic consumes more PCI bandwidth.
+// INTEL_P2P scales a speed by 9/12 and INTEL_P2P_OVERHEAD by 12/9. The argument is
+// parenthesized so that an expression such as a+b is scaled as a whole.
+#define INTEL_P2P(speed) ((speed)*9/12)
+#define INTEL_P2P_OVERHEAD(speed) ((speed)*12/9)
+
+// Node types. The values index the per-type arrays ncclTopoSystem.nodes and
+// ncclTopoNode.paths, both sized by NCCL_TOPO_NODE_TYPES.
+#define NCCL_TOPO_NODE_TYPES 6
+#define GPU 0
+#define PCI 1
+#define NVS 2
+#define CPU 3 // Actually NUMA domains
+#define NIC 4
+#define NET 5
+// Printable names, indexed by node type
+extern const char* topoNodeTypeStr[];
+
+// Link types
+#define LINK_LOC 0  // Link from a GPU to itself (see ncclTopoCreateNode)
+#define LINK_NVL 1
+#define LINK_PCI 2
+#define LINK_QPI 3
+#define LINK_NET 4
+// Printable names, indexed by link type
+extern const char* topoLinkTypeStr[];
+
+struct ncclTopoNode;
+// One directed link from a node to a remote node.
+struct ncclTopoLink {
+  int type;   // One of the LINK_* values
+  int width;  // ncclTopoConnectNodes adds to it on every call for the same remote node and type
+  struct ncclTopoNode* remNode;  // Remote end of the link; NULL marks an unused slot
+};
+#define NCCL_TOPO_MAX_LINKS 32  // Size of ncclTopoNode.links
+#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)  // Size of ncclTopoLinkList.list
+#define SELECT_PATH 1
+#define SELECT_LAST 2
+
+#define NET_GDR_MASK 0x70000000
+
+// A path, stored as the list of links to go through (see ncclTopoNode.paths).
+struct ncclTopoLinkList {
+  struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS];
+  int count;  // Number of entries used in list
+  int width;
+  int type;   // Compared against LINK_* values, e.g. LINK_NVL in ncclTopoGetNvlink
+};
+
+// A node of the topology graph, with its outgoing links.
+struct ncclTopoNode {
+  int type;    // Node type : GPU, PCI, NVS, CPU, NIC or NET
+  int64_t id;  // Identifies the node within its type (ncclTopoCreateNode looks nodes up by id)
+  int rank;    // Rank for GPU nodes; NET nodes store their net information here
+  int nlinks;  // Number of entries used in links
+  struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
+  // Pre-computed paths to GPUs and NICs
+  // paths[t][i] is the path to the node of type t at index i
+  struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES];
+  // Used during search
+  uint64_t used;
+};
+
+// All the nodes of one type.
+struct ncclTopoNodeSet {
+  int count;  // Number of entries used in nodes
+  struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES];
+};
+
+// The whole topology : one node set per node type, plus global values.
+struct ncclTopoSystem {
+  struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];  // Indexed by node type (GPU, PCI, ...)
+  int maxSpeed;
+  int maxWidth;
+  int searchInitDone;
+};
+
+// Returns in *node the node of the given type and id, creating it when no node of
+// that type has this id yet.
+// Fails with ncclInternalError once NCCL_TOPO_MAX_NODES nodes of that type exist.
+static ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
+  struct ncclTopoNodeSet* set = system->nodes+type;
+
+  // Reuse the node if it already exists
+  for (int i=0; i<set->count; i++) {
+    struct ncclTopoNode* existing = set->nodes+i;
+    if (existing->id == id) {
+      *node = existing;
+      return ncclSuccess;
+    }
+  }
+
+  if (set->count == NCCL_TOPO_MAX_NODES) {
+    WARN("Error : tried to create too many nodes of type %d\n", type);
+    return ncclInternalError;
+  }
+
+  // Take the next free slot
+  struct ncclTopoNode* created = set->nodes+set->count;
+  set->count++;
+  created->type = type;
+  created->id = id;
+  if (type == GPU) {
+    // Create link to itself (used in some corner cases)
+    struct ncclTopoLink* self = created->links;
+    self->type = LINK_LOC;
+    self->remNode = created;
+    self->width = LOC_WIDTH;
+    created->nlinks=1;
+  }
+  *node = created;
+  return ncclSuccess;
+}
+
+// Adds a link of the given type and width from `node` to `remNode`.
+// If such a link already exists, `width` is added to it instead of creating a new link.
+// The links of `node` are kept sorted by decreasing width.
+// Returns ncclInternalError when all NCCL_TOPO_MAX_LINKS slots are used by other links.
+static ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, int width) {
+  // Aggregate links into higher width for NVLink
+  struct ncclTopoLink* link;
+  struct ncclTopoLink* end = node->links+NCCL_TOPO_MAX_LINKS;
+  // Stop on the matching link, on the first unused slot, or at the end of the array
+  for (link = node->links; link != end && link->remNode; link++) {
+    if (link->remNode == remNode && link->type == type) break;
+  }
+  if (link == end) {
+    // No matching link and no free slot : writing here would corrupt the rest of the node
+    WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
+    return ncclInternalError;
+  }
+  if (link->remNode == NULL) node->nlinks++;
+  link->type = type;
+  link->remNode = remNode;
+  link->width += width;
+
+  // Sort links in BW descending order
+  // Only the updated link can be out of place : move it up past every narrower link
+  struct ncclTopoLink linkSave;
+  memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
+  while (link != node->links) {
+    if ((link-1)->width >= linkSave.width) break;
+    memcpy(link, link-1, sizeof(struct ncclTopoLink));
+    link--;
+  }
+  memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
+
+#endif
diff --git a/src/misc/trees.cu b/src/graph/trees.cc
index e53ea0b..722e61b 100644
--- a/src/misc/trees.cu
+++ b/src/graph/trees.cc
@@ -1,12 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
-#include "net.h"
-#include "param.h"
+#include "nccl.h"
 
 #define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank)
 
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
new file mode 100644
index 0000000..87afb2f
--- /dev/null
+++ b/src/graph/tuning.cc
@@ -0,0 +1,212 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "devcomm.h"
+#include "comm.h"
+#include "topo.h"
+
+NCCL_PARAM(Nthreads, "NTHREADS", -2);
+NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);
+
+// Returns the number of threads to use given the user setting `env`.
+// - env <= 0 : not set, `def` is returned.
+// - env not a multiple of WARP_SIZE, or above `max` : warns and returns `max`.
+// - env below `min` : warns and returns `min`.
+// `name` is only used in the warnings.
+static int getNthreads(const char* name, int env, int min, int max, int def) {
+  if (env <= 0) return def;
+
+  if (env % WARP_SIZE != 0) {
+    WARN("Invalid %s %d (must be a multiple of %d)", name, env, WARP_SIZE);
+    return max;
+  }
+  if (env > max) {
+    WARN("Invalid %s %d (maximum %d).", name, env, max);
+    return max;
+  }
+  if (env < min) {
+    WARN("Invalid %s %d (minimum %d).", name, env, min);
+    return min;
+  }
+  return env;
+}
+
+// Parses a comma-separated list of names and fills list[0..nelems-1] with 0/1 flags,
+// list[i] corresponding to elems[i]. Names are matched ignoring case.
+// - "a,b"  : listed elements are set to 1, all others to 0.
+// - "^a,b" : listed elements are set to 0, all others to 1.
+// Names which match no element are ignored.
+// Returns ncclSystemError, leaving list untouched, if the string cannot be duplicated.
+ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) {
+  int def, set;
+  if (str[0] == '^') {
+    def = 1; set = 0; str++;
+  } else {
+    def = 0; set = 1;
+  }
+  // strtok_r modifies its input, so work on a copy
+  char* tokStr = strdup(str);
+  if (tokStr == NULL) {
+    WARN("Failed to duplicate list string");
+    return ncclSystemError;
+  }
+  for (int i=0; i<nelems; i++) list[i] = def;
+  char* tmpStr;
+  char* token = strtok_r(tokStr, ",", &tmpStr);
+  while (token) {
+    for (int i=0; i<nelems; i++)
+      if (strcasecmp(token, elems[i]) == 0) list[i] = set;
+    token = strtok_r(NULL, ",", &tmpStr);
+  }
+  free(tokStr);
+  return ncclSuccess;
+}
+
+static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
+static const char* ncclAlgoStr[] = { "Tree", "Ring" };
+static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" };
+
+// Latencies in us, Bandwidths in GB/s
+// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
+static const float baseLat  [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4,  0 }, { 3.6, 3.6, 8.4 } };
+
+// NVLink, PCI, Network
+#define NCCL_HW_NVLINK 0
+#define NCCL_HW_PCI 1
+#define NCCL_HW_NET 2
+// Tree/Simple is the latency of a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network).
+static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
+{ /* NVLINK */
+  { /* Tree (LL/LL128/Simple)*/ {  .5, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ {  .4, 2.5, 5.7 } },
+  /* PCI */
+  { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 } },
+  /* NET */
+  { /* Tree (LL/LL128/Simple)*/ { 5.0, 7.5, 50 }, /* Ring (LL/LL128/Simple)*/ {  .9, 2.5, 6.6 } }
+};
+
+// LL128 max BW for the different collectives
+static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 };
+
+// Fills the tuning tables of comm from the tree and ring graphs :
+// - comm->maxThreads[protocol], from the NCCL_NTHREADS / NCCL_LL128_NTHREADS parameters;
+// - comm->bandwidths and comm->latencies[collective][algorithm][protocol], from a
+//   model based on the graphs' channels and speeds and on the tables above;
+// - comm->threadThresholds[algorithm][protocol], from defaults which can be overridden
+//   with the NCCL_THREAD_THRESHOLDS environment variable.
+// NCCL_PROTO and NCCL_ALGO can disable protocols/algorithms; a disabled combination
+// gets a bandwidth of 0.
+// With a single rank, only maxThreads is set.
+ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph) {
+  // The Simple protocol defaults to 256 threads when the tree graph's speedIntra*nChannels is at most 12
+  int simpleDefaultThreads = (treeGraph->speedIntra*treeGraph->nChannels <= 12) ? 256 : NCCL_MAX_NTHREADS;
+  comm->maxThreads[NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
+  comm->maxThreads[NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+  comm->maxThreads[NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
+
+  INFO(NCCL_INIT, "Threads per block : %d/%d/%d", comm->maxThreads[NCCL_PROTO_LL], comm->maxThreads[NCCL_PROTO_LL128], comm->maxThreads[NCCL_PROTO_SIMPLE]);
+
+  if (comm->nRanks <= 1) return ncclSuccess;
+
+  // Indexed by algorithm, in the same order as ncclAlgoStr (Tree, Ring)
+  struct ncclTopoGraph* graphs[2] = { treeGraph, ringGraph };
+  // intraHw : hardware used inside a node. hw : NET as soon as there is more than one node.
+  int intraHw[2], hw[2];
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->nvlink ? NCCL_HW_NVLINK : NCCL_HW_PCI;
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
+
+  for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
+    // Number of steps : 2*(nRanks-1) for AllReduce, nRanks-1 for ReduceScatter and AllGather, nRanks otherwise
+    int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) :
+      coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 :
+      comm->nRanks;
+
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      // Tree is only modeled for AllReduce; other Tree entries are not written here
+      if (coll != ncclCollAllReduce && a == NCCL_ALGO_TREE) continue;
+
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+        // The intra-node speed is used up to 2 nodes, the inter-node speed beyond
+        int speed = comm->nNodes <= 2 ? graphs[a]->speedIntra : graphs[a]->speedInter;
+        float busBw = graphs[a]->nChannels * speed * 1.0;
+
+        // Various model refinements
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL)    busBw *= 1.0/4.0;
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw*120.0/128.0, ll128MaxBw[coll]);
+        if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, comm->nNodes > 1 ? 70.0 : 90.0);
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.0;
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0;
+
+        // Convert bus BW to algorithm BW
+        float ratio = a == NCCL_ALGO_TREE ? .5 : (1.0 * comm->nRanks) / nsteps;
+        comm->bandwidths[coll][a][p] = busBw * ratio;
+
+        // Latency : base latency plus a hardware term depending on the algorithm
+        comm->latencies[coll][a][p] = baseLat[a][p];
+        if (a == NCCL_ALGO_RING) {
+          float lat = hwLat[hw[a]][a][p];
+          if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) {
+            if (ringGraph->sameChannels) {
+              comm->latencies[coll][a][p] += lat;
+            } else {
+              if (p == NCCL_PROTO_SIMPLE) lat = hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
+              comm->latencies[coll][a][p] += nsteps*lat;
+            }
+          } else {
+            comm->latencies[coll][a][p] += nsteps*lat;
+          }
+        } else {
+          // Tree : twice (ranks per node - 1) intra-node hops plus log2i(nNodes) inter-node hops
+          float intraLat = hwLat[intraHw[a]][a][p];
+          float interLat = hwLat[NCCL_HW_NET][a][p];
+          comm->latencies[coll][a][p] +=
+            2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
+        }
+      }
+    }
+  }
+
+  // Protocols/Algorithms enable/disable, and user overrides.
+  // All are enabled except ll128 which is enabled by default only in certain cases.
+  // The value 2 (LL128 entry) means "not set by the user" : it is resolved to 0 or 1 below.
+  int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
+  int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1 };
+
+  const char *protoStr = getenv("NCCL_PROTO");
+  if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
+  const char *algoStr = getenv("NCCL_ALGO");
+  if (algoStr) NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
+
+  // Disabled combinations get a bandwidth of 0
+  for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    int pEnable = protoEnable[p];
+    if (pEnable == 2 && p == NCCL_PROTO_LL128) {
+      // Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption.
+      pEnable = (graphs[a]->type <= LINK_PCI) && graphs[a]->nvlink && minCompCap == 70 && maxCompCap == 70 ? 1 : 0;
+    }
+    if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
+  }
+
+  // Rank 0 logs the resulting table : one column per algorithm/protocol, one row per collective
+  if (comm->rank == 0) {
+    char line[1024];
+    int offset = 0;
+    sprintf(line, "Latency/AlgBw |");
+    offset = strlen(line);
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+        sprintf(line+offset, " %4s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
+        offset = strlen(line);
+      }
+    }
+    INFO(NCCL_TUNING, "%s", line);
+    for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
+      sprintf(line, "%13s |", ncclFuncStr[c]);
+      offset = strlen(line);
+      for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+        for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+          sprintf(line+offset, "%7.1f/%5.1f|", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
+          offset = strlen(line);
+        }
+      }
+      INFO(NCCL_TUNING, "%s", line);
+    }
+  }
+
+  // Set per-thread amount of work before we increase nThreads and nChannels
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+    comm->threadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD;
+    comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
+    comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
+  }
+  comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= comm->nRanks;
+
+  // Override defaults with user env
+  char* str = getenv("NCCL_THREAD_THRESHOLDS");
+  if (str) {
+    // Up to 6 space-separated values : Tree LL/LL128/Simple then Ring LL/LL128/Simple.
+    // Entries which are not parsed keep the -2 initializer and, like any negative value, are ignored.
+    ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2}};
+    sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+        if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p];
+      }
+    }
+  }
+
+  INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld",
+      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL],
+      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128],
+      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE],
+      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL],
+      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128],
+      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
+  return ncclSuccess;
+}
diff --git a/src/misc/group.cu b/src/group.cc
index c428a22..9bf8ac9 100644
--- a/src/misc/group.cu
+++ b/src/group.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -51,11 +51,6 @@ struct ncclAsyncArgs {
 
 thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
 
-ncclResult_t ncclSetDevice(int cudaDev) {
-  CUDACHECK(cudaSetDevice(cudaDev));
-  return ncclSuccess;
-}
-
 #define CHECK(a) do { \
   if ((args->ret = (a)) != ncclSuccess) { \
     INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
@@ -65,15 +60,14 @@ ncclResult_t ncclSetDevice(int cudaDev) {
 
 void* ncclAsyncThreadMain(void* args_) {
   struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
-  CHECK(ncclSetDevice(args->init.cudaDev));
-  CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank));
+  CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev));
   return args;
 }
 
-ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
+ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev) {
   if (ncclGroupIndex >= MAX_ASYNC_OPS) {
     WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
-    return ncclAsyncErrCheck(ncclInternalError);
+    return ncclAsyncErrCheck(ncclInvalidUsage);
   }
   int index = ncclGroupIndex++;
   struct ncclAsyncArgs* args = ncclGroupArgs+index;
@@ -84,8 +78,6 @@ ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm
   args->init.ndev = ndev;
   memcpy(&args->init.commId, &commId, sizeof(commId));
   args->init.myrank = myrank;
-  // We need to use threads for Init
-  pthread_create(ncclGroupThreads+index, NULL, ncclAsyncThreadMain, args);
   return ncclSuccess;
 }
 
@@ -97,7 +89,7 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) {
   }
   if (ncclGroupIndex >= MAX_ASYNC_OPS) {
     WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
-    return ncclAsyncErrCheck(ncclInternalError);
+    return ncclAsyncErrCheck(ncclInvalidUsage);
   }
   ncclGroupIndex++;
   args->funcType = ASYNC_FUNC_COLL;
@@ -118,12 +110,20 @@ ncclResult_t ncclGroupEnd() {
   int savedDev;
   CUDACHECK(cudaGetDevice(&savedDev));
   int done = ncclGroupIndex;
-  int doneArray[ncclGroupIndex];
+  int doneArray[MAX_ASYNC_OPS];
   for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
 
   ncclResult_t ret = ncclGroupError;
   if (ret != ncclSuccess) goto group_cleanup;
 
+  /* Launch async ncclCommInitRank */
+  for (int i=0; i<ncclGroupIndex; i++) {
+    struct ncclAsyncArgs* args = ncclGroupArgs+i;
+    if (args->funcType == ASYNC_FUNC_INIT) {
+      pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args);
+    }
+  }
+
   /* Collectives are done in three steps :
    * 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
    * 2. Barrier Wait. No CUDA call is permitted
@@ -166,8 +166,8 @@ ncclResult_t ncclGroupEnd() {
       if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
         int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
         if (err == EBUSY) continue;
-        if (err != 0) { ret = ncclSystemError; goto end; }
-        if (args->ret != ncclSuccess) { ret = args->ret; goto end; }
+        if (err != 0) ret = ncclSystemError;
+        if (args->ret != ncclSuccess) ret = args->ret;
         doneArray[i] = 1;
         done--;
       }
@@ -175,20 +175,47 @@ ncclResult_t ncclGroupEnd() {
   }
   goto end;
 group_cleanup:
-  // At least one call in the group failed. Since we want to make that group
-  // an atomic operation, we need to cancel all operations.
-  for (int i=0; i<ncclGroupIndex; i++) {
-    struct ncclComm* comm = ncclGroupArgs[i].coll.comm;
-    for (int c=0; c<comm->nChannels; c++) {
-      struct ncclChannel* channel = comm->channels+c;
-      for (int i=0; i<channel->collCount; i++) {
-        channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
+  if (ret != ncclSuccess) {
+    // At least one call in the group failed. Since we want to make that group
+    // an atomic operation, we need to cancel all operations.
+    for (int i=0; i<ncclGroupIndex; i++) {
+      struct ncclAsyncArgs* args = ncclGroupArgs+i;
+      if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
+        if (args->init.newcomm) NCCLCHECK(ncclCommDestroy(*args->init.newcomm));
+        *args->init.newcomm = NULL;
+      } else {
+        struct ncclComm* comm = args->coll.comm;
+        for (int c=0; c<comm->nChannels; c++) {
+          struct ncclChannel* channel = comm->channels+c;
+          for (int i=0; i<channel->collCount; i++) {
+            channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
+          }
+          channel->collFifoTail = channel->collStart;
+          channel->collCount = 0;
+        }
+        /* Cancel all proxy ops : mark them as ncclProxyOpNone and they should be freed later on */
+        struct ncclProxyState* state = &comm->proxyState;
+        struct ncclProxyArgs *op, *start;
+        pthread_mutex_lock(&state->mutex);
+        op = start = state->ops;
+        while (op) {
+          if (op->opCount >= comm->lastOpCount) op->state = ncclProxyOpNone;
+          struct ncclProxyArgs* peerOp = op->nextPeer;
+          while (peerOp) {
+            if (peerOp->opCount >= comm->lastOpCount) peerOp->state = ncclProxyOpNone;
+            peerOp = peerOp->nextPeer;
+          }
+          op = op->next;
+          if (op == start) break;
+        }
+        comm->opCount = comm->lastOpCount;
+        pthread_cond_signal(&state->cond);
+        pthread_mutex_unlock(&state->mutex);
+
+        comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
+        comm->userStreamSet = false;
       }
-      channel->collFifoTail = channel->collStart;
-      channel->collCount = 0;
     }
-    comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
-    comm->userStreamSet = false;
   }
 end:
   ncclGroupError = ncclSuccess;
diff --git a/src/include/alloc.h b/src/include/alloc.h
new file mode 100644
index 0000000..bcdbd18
--- /dev/null
+++ b/src/include/alloc.h
@@ -0,0 +1,51 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ALLOC_H_
+#define NCCL_ALLOC_H_
+
+#include "nccl.h"
+#include "checks.h"
+#include <sys/mman.h>
+
+// Allocates `size` bytes of zeroed host memory with cudaHostAlloc(cudaHostAllocMapped).
+// The same address is returned as the host pointer (*ptr) and as the device pointer (*devPtr).
+static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
+  CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
+  void* hostMem = *ptr;
+  memset(hostMem, 0, size);
+  *devPtr = hostMem;
+  return ncclSuccess;
+}
+
+// Frees memory with cudaFreeHost; the counterpart of ncclCudaHostAlloc.
+// Returns ncclUnhandledCudaError (through CUDACHECK) if the CUDA call fails.
+static inline ncclResult_t ncclCudaHostFree(void* ptr) {
+  CUDACHECK(cudaFreeHost(ptr));
+  return ncclSuccess;
+}
+
+// Allocates a zero-initialized host array of nelem elements of type T and returns it in *ptr.
+// Returns ncclSystemError, leaving *ptr untouched, if the allocation fails.
+template <typename T>
+static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
+  // calloc zeroes the memory and fails instead of under-allocating when
+  // nelem*sizeof(T) would overflow
+  void* p = calloc(nelem, sizeof(T));
+  if (p == NULL) {
+    WARN("Failed to malloc %zu elements of %zu bytes", nelem, sizeof(T));
+    return ncclSystemError;
+  }
+  *ptr = (T*)p;
+  return ncclSuccess;
+}
+
+// Allocates nelem elements of type T with cudaMalloc and sets them to zero with cudaMemset.
+// Returns ncclUnhandledCudaError (through CUDACHECK) if either CUDA call fails.
+template <typename T>
+static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
+  const size_t bytes = nelem*sizeof(T);
+  CUDACHECK(cudaMalloc(ptr, bytes));
+  CUDACHECK(cudaMemset(*ptr, 0, bytes));
+  return ncclSuccess;
+}
+
+// Copies nelem elements of type T from src to dst with cudaMemcpy(cudaMemcpyDefault).
+// Returns ncclUnhandledCudaError (through CUDACHECK) if the copy fails.
+template <typename T>
+static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
+  const size_t bytes = nelem*sizeof(T);
+  CUDACHECK(cudaMemcpy(dst, src, bytes, cudaMemcpyDefault));
+  return ncclSuccess;
+}
+
+#endif
diff --git a/src/include/argcheck.h b/src/include/argcheck.h
new file mode 100644
index 0000000..8d8b74e
--- /dev/null
+++ b/src/include/argcheck.h
@@ -0,0 +1,16 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ARGCHECK_H_
+#define NCCL_ARGCHECK_H_
+
+#include "core.h"
+#include "info.h"
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
+ncclResult_t ArgsCheck(struct ncclInfo* info);
+
+#endif
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index a1aaf50..a7d6be9 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,6 +9,7 @@
 
 #include "nccl.h"
 
+ncclResult_t bootstrapNetInit();
 ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
 ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
 ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
@@ -16,4 +17,5 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
 ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
 ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
 ncclResult_t bootstrapClose(void* commState);
+ncclResult_t bootstrapAbort(void* commState);
 #endif
diff --git a/src/include/channel.h b/src/include/channel.h
index 76c5e8a..e2da325 100644
--- a/src/include/channel.h
+++ b/src/include/channel.h
@@ -1,12 +1,12 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
 #ifndef NCCL_CHANNEL_H_
 #define NCCL_CHANNEL_H_
-#include "core.h"
+#include "comm.h"
 
 ncclResult_t initChannel(struct ncclComm* comm, int channelid);
 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
diff --git a/src/include/checks.h b/src/include/checks.h
index bf7750e..50737b0 100644
--- a/src/include/checks.h
+++ b/src/include/checks.h
@@ -1,10 +1,73 @@
 /*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#ifndef NCCL_CHECKS_H_
+#define NCCL_CHECKS_H_
 
-ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
-ncclResult_t ArgsCheck(struct ncclInfo* info);
+#include "debug.h"
+
+// Check CUDA calls
+// CUDACHECK evaluates cmd once; on failure it logs the CUDA error string and makes
+// the enclosing function return ncclUnhandledCudaError, so it can only be used in
+// functions returning ncclResult_t.
+#define CUDACHECK(cmd) do {                                 \
+    cudaError_t e = cmd;                                    \
+    if( e != cudaSuccess ) {                                \
+        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
+        return ncclUnhandledCudaError;                      \
+    }                                                       \
+} while(false)
+
+// Same check, but on failure stores ncclUnhandledCudaError in res and jumps to label
+// instead of returning, so that the caller can clean up.
+#define CUDACHECKGOTO(cmd, res, label) do {                 \
+    cudaError_t e = cmd;                                    \
+    if( e != cudaSuccess ) {                                \
+        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
+        res = ncclUnhandledCudaError;                       \
+        goto label;                                         \
+    }                                                       \
+} while(false)
+
+#include <errno.h>
+// Check system calls
+// In all three macros, name must be a string literal : it is concatenated into the
+// message format.
+// SYSCHECK runs call like SYSCHECKVAL, without exposing the return value.
+#define SYSCHECK(call, name) do { \
+  int retval; \
+  SYSCHECKVAL(call, name, retval); \
+} while (false)
+
+// SYSCHECKVAL runs call through SYSCHECKSYNC and stores its result in retval. If the
+// result is -1, it logs strerror(errno) and makes the enclosing function return ncclSystemError.
+#define SYSCHECKVAL(call, name, retval) do { \
+  SYSCHECKSYNC(call, name, retval); \
+  if (retval == -1) { \
+    WARN("Call to " name " failed : %s", strerror(errno)); \
+    return ncclSystemError; \
+  } \
+} while (false)
+
+// SYSCHECKSYNC re-evaluates call for as long as it returns -1 with errno set to
+// EINTR, EWOULDBLOCK or EAGAIN, and leaves the last result in retval.
+#define SYSCHECKSYNC(call, name, retval) do { \
+  retval = call; \
+  if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
+    INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
+  } else { \
+    break; \
+  } \
+} while(true)
+
+// Propagate errors up
+// NCCLCHECK evaluates call once; if the result is not ncclSuccess, it logs the
+// file and line and makes the enclosing function return that result.
+// NOTE(review): the definition ends with "while (0);", so NCCLCHECK(x); expands to
+// two statements and cannot be the unbraced body of an if which has an else.
+#define NCCLCHECK(call) do { \
+  ncclResult_t res = call; \
+  if (res != ncclSuccess) { \
+    /* Print the back trace*/ \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    return res; \
+  } \
+} while (0);
+
+// Same check, but the result is stored in the caller's variable res and, on failure,
+// control jumps to label instead of returning, so that the caller can clean up.
+#define NCCLCHECKGOTO(call, res, label) do { \
+  res = call; \
+  if (res != ncclSuccess) { \
+    /* Print the back trace*/ \
+    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    goto label; \
+  } \
+} while (0);
+
+#endif
diff --git a/src/collectives/collectives.h b/src/include/collectives.h
index e6b19cb..69c8e74 100644
--- a/src/collectives/collectives.h
+++ b/src/include/collectives.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,7 +7,10 @@
 #ifndef NCCL_COLLECTIVES_H_
 #define NCCL_COLLECTIVES_H_
 
-#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll))
+#include "core.h"
+#include "info.h"
+
+#define FUNC_INDEX(coll, redop, dtype, al, pr) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
 
 #define NCCL_COLL_NAME(coll, op, dtype) \
   coll##_##op##_##dtype
@@ -22,7 +25,8 @@
 
 #define DECL_COLL4(coll, op, dtype) \
   DECL_COLL5(coll, op, dtype) \
-  DECL_COLL5(coll##LL, op, dtype)
+  DECL_COLL5(coll##LL, op, dtype) \
+  DECL_COLL5(coll##LL128, op, dtype)
 
 #define DECL_COLL3(coll, op, dtype) \
   DECL_COLL4(coll##Ring, op, dtype) \
diff --git a/src/include/comm.h b/src/include/comm.h
new file mode 100644
index 0000000..7164dc0
--- /dev/null
+++ b/src/include/comm.h
@@ -0,0 +1,141 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_COMM_H_
+#define NCCL_COMM_H_
+
+#include "transport.h"
+
+#if CUDART_VERSION < 9000
+struct cudaLaunchParams {
+  void *func;
+  dim3 gridDim;
+  dim3 blockDim;
+  void **args;
+  size_t sharedMem;
+  cudaStream_t stream;
+};
+#endif
+
+#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
+
+#define CACHE_LINE_SIZE 128
+#define MEM_ALIGN 4096
+#define CUDA_IPC_MIN 2097152UL
+
+// Channels / LL tuning
+#define NCCL_LL_THREAD_THRESHOLD 8
+#define NCCL_LL128_THREAD_THRESHOLD 8
+#define NCCL_SIMPLE_THREAD_THRESHOLD 64
+
+struct ncclSendMem {
+  union {
+    struct {
+      uint64_t head;
+      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+      void* ptrExchange;
+      char pad2[CACHE_LINE_SIZE-sizeof(void*)];
+      uint64_t opCount;
+    };
+    char pad3[MEM_ALIGN];
+  };
+};
+
+struct ncclRecvMem {
+  union {
+    struct {
+      uint64_t tail;
+      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+      uint64_t opCount;
+      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
+      int sizesFifo[NCCL_STEPS];
+    };
+    char pad4[MEM_ALIGN];
+  };
+  ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
+  uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS];
+  char buff[1]; // Actually larger than that
+};
+
+struct ncclComm {
+  struct ncclChannel channels[MAXCHANNELS];
+
+  struct ncclPeerInfo* peerInfo;
+  struct ncclTopoSystem* topo;
+
+  void* bootstrap;
+
+  int rank;    // my rank in the communicator
+  int nRanks;  // number of GPUs in communicator
+  int cudaDev; // my cuda device index
+  int64_t busId;   // my PCI bus ID in int format
+
+  int node;
+  int nNodes;
+  int localRanks;
+
+  enum { GROUP, PARALLEL } launchMode;
+  cudaStream_t userStream;
+  bool userStreamSet;
+  cudaEvent_t doneEvent;
+  bool checkPointers;
+
+  // Counter to make sure collectives match (needed for bcast/reduce
+  // where syncs are not symmetric).
+  uint64_t opCount;
+  uint64_t lastOpCount;
+
+  // Channels for collectives
+  int nChannels;
+
+  // Only nvlink is used for inter-GPU communication
+  int nvlink;
+
+  // Algorithm/Protocols thresholds
+  ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  int maxThreads[NCCL_NUM_PROTOCOLS];
+
+  // An internal CUDA stream for NCCL kernel CGMD launches
+  int groupCudaStream;
+  cudaStream_t groupStream;
+
+  // Whether there has been a fatal error in this communicator.
+  ncclResult_t fatalError;
+
+  // Error reported by GPU
+  volatile ncclDevError_t* fatalDevError;
+
+  // Flag to ask NCCL kernels to abort
+  volatile uint32_t *abortFlag;
+
+  // Device side of the communicator
+  struct ncclDevComm *devComm;
+  // Host copy of the devComm (to free CUDA allocs)
+  struct ncclDevComm hostDevComm;
+
+  // Intra-process sync
+  int intraRank;
+  int intraRanks;
+  int* intraBarrier;
+  int intraPhase;
+
+  // Storage for deferred intra-process launch
+  struct cudaLaunchParams * intraParams;
+  struct cudaLaunchParams *myParams;
+  int* intraCudaDevs;
+  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
+  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
+  struct ncclColl args;
+  void* argsptr;
+
+  // Global proxy thread
+  pthread_t proxyThread;
+  struct ncclProxyState proxyState;
+};
+
+#endif
diff --git a/src/include/core.h b/src/include/core.h
index d57d271..250f43b 100644
--- a/src/include/core.h
+++ b/src/include/core.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,385 +7,12 @@
 #ifndef NCCL_CORE_H_
 #define NCCL_CORE_H_
 
-#define NCCL_MAX_OPS 2048
-#define NCCL_STEPS 8
-
-#include "nccl.h"
-#include "transport.h"
-#include "debug.h"
-#include <cstdio>
-#include <algorithm> // std::min/std::max
+#include <pthread.h>
 #include <unistd.h>
 #include <stdlib.h>
-#include <cuda_runtime.h>
-
-#if CUDART_VERSION < 9000
-struct cudaLaunchParams {
-  void *func;
-  dim3 gridDim;
-  dim3 blockDim;
-  void **args;
-  size_t sharedMem;
-  cudaStream_t stream;
-};
-#endif
-
-#define MAXCHANNELS 16
-#define MAXTHREADS 256
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-
-// Channels / LL tuning
-#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 64  // Per thread size before we switch to non-LL
-#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
-#define NCCL_LL_MAX_NTHREADS MAXTHREADS
-#define NCCL_LL_MIN_NTHREADS 64
-
-#define DIVUP(x, y) \
-    (((x)+(y)-1)/(y))
-#define ROUNDUP(x, y) \
-    (DIVUP((x), (y))*(y))
-
-#define ALIGN_SIZE(size, align) \
-  size = ((size + (align) - 1) / (align)) * (align);
-
-union ncclLLFifoLine {
-  /* Flags have to be *after* data, because otherwise, an incomplete receive
-     from the network may receive the flag but not the data.
-     Note this is assuming that either we receive contiguous chunks of data
-     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
-  struct {
-    uint32_t data1;
-    uint32_t flag1;
-    uint32_t data2;
-    uint32_t flag2;
-  };
-  uint64_t v[2];
-  int4 i4;
-};
-
-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
-
-typedef enum {
-  ncclPatternRing,
-  ncclPatternRingTwice,
-  ncclPatternPipelineFrom,
-  ncclPatternPipelineTo,
-  ncclPatternTreeUp,
-  ncclPatternTreeDown,
-  ncclPatternTreeUpDown
-} ncclPattern_t;
-
-typedef enum {
-  ncclDevSuccess,
-  ncclDevAssertedMismatch,
-  ncclDevSuspectedMismatch
-} ncclDevError_t;
-
-// Used to pass NCCL call information between functions
-struct ncclInfo {
-  ncclColl_t coll;
-  const char* opName;
-  // NCCL Coll Args
-  const void* sendbuff;
-  void* recvbuff;
-  size_t count;
-  ncclDataType_t datatype;
-  ncclRedOp_t op;
-  int root;
-  ncclComm_t comm;
-  cudaStream_t stream;
-  // Algorithm details
-  int chunkSteps;
-  int sliceSteps;
-  // Computed later
-  ncclPattern_t pattern;
-  size_t nBytes;
-  int nstepsPerLoop;
-  int nchunksPerLoop;
-};
-
-struct ncclConnInfo {
-  // Regular comm mechanism
-  char *buff;         // Local for recv, remote for send
-  uint64_t *tail;     // Local for recv, remote for send
-  uint64_t *head;     // Local for send, remote for recv
-  uint64_t *opCountLoc; // opCount of local rank
-  uint64_t *opCountRem; // opCount of remote rank
-
-  int direct;         // Direct communication
-  void **ptrExchange; // Pointer exchange for direct communication
-
-  int *fifo;          // Size fifo for proxy
-
-  uint64_t step;      // Keep where we are
-
-  // Low latency mechanism
-  union ncclLLFifoLine *llBuff; // Local for recv, remote for send
-  uint64_t llLastCleaning;
-};
-
-struct ncclConnector {
-  int connected;
-  struct ncclProxyArgs *proxyAppend;
-  struct ncclTransportComm* transportComm;
-  void* transportResources; // Host-side resources
-  struct ncclConnInfo conn;
-  struct ncclComm *comm;
-};
-
-#define CACHE_LINE_SIZE 128
-#define MEM_ALIGN 4096
-#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
-
-#define NUM_LINES_PER_THREAD 8
-#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
-#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
-#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
-#define NCCL_LL_CLEAN_FREQ 0x10000000
-
-struct ncclSendMem {
-  union {
-    struct {
-      uint64_t head;
-      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      void* ptrExchange;
-      char pad2[CACHE_LINE_SIZE-sizeof(void*)];
-      uint64_t opCount;
-    };
-    char pad3[MEM_ALIGN];
-  };
-};
-
-struct ncclRecvMem {
-  union {
-    struct {
-      uint64_t tail;
-      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      uint64_t opCount;
-      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      int sizesFifo[NCCL_STEPS];
-    };
-    char pad4[MEM_ALIGN];
-  };
-  ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
-  char buff[1]; // Actually larger than that
-};
-
-struct ncclRing {
-  // Shortcuts for userRanks[1] and userRanks[n-1]
-  int prev;
-  int next;
-
-  // Maps an internal nccl index to user-specified rank order. This is necessary
-  // since we need to know how the user expects data to be ordered across
-  // devices. Ordered from current device.
-  int* userRanks;
-  int* devUserRanks;
-};
-
-#define NCCL_MAX_TREE_ARITY 3
-struct ncclTree {
-  int depth;
-  int up;
-  int down[NCCL_MAX_TREE_ARITY];
-};
-
-struct ncclPeer {
-  struct ncclConnector send;
-  struct ncclConnector recv;
-};
-
-struct ncclChannel {
-  union {
-    struct {
-      struct ncclRing ring;
-      struct ncclTree tree;
-
-      int id;
-      int nthreads;
-      int buffSize;
-
-      // Communication structures
-      struct ncclPeer* peers;
-      struct ncclPeer* devPeers;
-
-      // Operation list for aggregation
-      struct ncclColl* collectives;
-      struct ncclColl* devCollectives;
-      int collStart;
-      int collCount;
-      int collFifoHead; // Only used by GPU
-      int collFifoTail; // Only used by CPU
-    };
-    int data[0x80];
-  };
-};
-static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
-
-/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
-/* to make sure reads to host from the CUDA kernel are aligned. */
-/* Make sure to adjust padding at the end of ncclColl. */
-struct CollectiveArgs {
-  struct ncclComm* comm;
-  uint64_t opCount;
-
-  // local and remote input, output, and buffer
-  const void * ThisInput;
-  void * ThisOutput;
-
-  // general parameters
-  size_t N;
-  uint32_t root;
-  uint8_t bid;
-  uint8_t nChannels;
-  uint16_t nThreads;
-
-  int lastChunkSize;
-};
-struct ncclColl {
-  union {
-    struct {
-      struct CollectiveArgs args;
-      uint16_t funcIndex;
-      uint16_t nextIndex;
-      uint8_t  active;
-    };
-    int data[0x10];
-  };
-};
-static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
-
-struct ncclComm {
-  struct ncclChannel channels[MAXCHANNELS];
-
-  struct ncclPeerInfo* peerInfo;
-
-  void* bootstrap;
-
-  int rank;    // my rank in the communicator
-  int nRanks;  // number of GPUs in communicator
-  int cudaDev; // my cuda device index
-  int nvmlDev; // my NVML device number
-
-  enum { GROUP, PARALLEL } launchMode;
-  cudaStream_t userStream;
-  bool userStreamSet;
-  cudaEvent_t doneEvent;
-  bool checkPointers;
-
-  // Counter to make sure collectives match (needed for bcast/reduce
-  // where syncs are not symmetric).
-  uint64_t opCount;
-
-  // Channels for collectives
-  int nChannels;
-  int nThreads;
-
-  // Low-latency algorithm threshold
-  ssize_t llThreshold;
-  ssize_t threadThreshold;
-
-  // Tree algorithm threshold
-  ssize_t treeThreshold;
-
-  // An internal CUDA stream for NCCL kernel CGMD launches
-  int groupCudaStream;
-  cudaStream_t groupStream;
-
-  // Whether there has been a fatal error in this communicator.
-  ncclResult_t fatalError;
-
-  // Error reported by GPU
-  volatile ncclDevError_t* fatalDevError;
-
-  // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped)
-  // On device:  this pointer has been obtained from cudaHostGetDevicePointer()
-  volatile uint32_t *abortFlag;
-
-  // Device copy of the communicator
-  struct ncclComm *devComm;
-
-  // Intra-process sync
-  int intraRank;
-  int intraRanks;
-  int* intraBarrier;
-  int intraPhase;
-
-  // Storage for deferred intra-process launch
-  struct cudaLaunchParams * intraParams;
-  struct cudaLaunchParams *myParams;
-  int* intraCudaDevs;
-  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
-  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
-  struct ncclColl args;
-  void* argsptr;
-
-  // Global proxy thread
-  pthread_t proxyThread;
-  struct ncclProxyState proxyState;
-};
-
-// Check CUDA calls
-#define CUDACHECK(cmd) do {                                 \
-    cudaError_t e = cmd;                                    \
-    if( e != cudaSuccess ) {                                \
-        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
-        return ncclUnhandledCudaError;                      \
-    }                                                       \
-} while(false)
-
-#define CUDACHECKGOTO(cmd, res, label) do {                 \
-    cudaError_t e = cmd;                                    \
-    if( e != cudaSuccess ) {                                \
-        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
-        res = ncclUnhandledCudaError;                       \
-        goto label;                                         \
-    }                                                       \
-} while(false)
-
-#include <errno.h>
-// Check system calls
-#define SYSCHECK(call, name) do { \
-  int retval; \
-  SYSCHECKVAL(call, name, retval); \
-} while (false)
-
-#define SYSCHECKVAL(call, name, retval) do { \
-  SYSCHECKSYNC(call, name, retval); \
-  if (retval == -1) { \
-    WARN("Call to " name " failed : %s", strerror(errno)); \
-    return ncclSystemError; \
-  } \
-} while (false)
-
-#define SYSCHECKSYNC(call, name, retval) do { \
-  retval = call; \
-  if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
-    INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
-  } else { \
-    break; \
-  } \
-} while(true)
-
-// Propagate errors up
-#define NCCLCHECK(call) do { \
-  ncclResult_t res = call; \
-  if (res != ncclSuccess) { \
-    /* Print the back trace*/ \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    return res; \
-  } \
-} while (0);
-
-#define NCCLCHECKGOTO(call, res, label) do { \
-  res = call; \
-  if (res != ncclSuccess) { \
-    /* Print the back trace*/ \
-    INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
-    goto label; \
-  } \
-} while (0);
+#include <stdint.h>
+#include <algorithm> // For std::min/std::max
+#include "nccl.h"
 
 #ifdef PROFAPI
 #define NCCL_API(ret, func, args...)        \
@@ -403,10 +30,6 @@ struct ncclComm {
     ret func(args)
 #endif // end PROFAPI
 
-int ncclCudaCompCap();
-ncclResult_t ncclNvlinkGpu(int* nvlink);
-int64_t ncclTreeThreshold();
-
 static __inline__ int ncclTypeSize(ncclDataType_t type) {
   switch (type) {
     case ncclInt8:
@@ -427,42 +50,22 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
   }
 }
 
-#include <sys/mman.h>
-static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
-  CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
-  memset(*ptr, 0, size);
-  *devPtr = *ptr;
-  return ncclSuccess;
-}
-
-static inline ncclResult_t ncclCudaHostFree(void* ptr) {
-  CUDACHECK(cudaFreeHost(ptr));
-  return ncclSuccess;
-}
+#define NCCL_NUM_FUNCTIONS 5
+typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t;
 
-template <typename T>
-static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
-  void* p = malloc(nelem*sizeof(T));
-  if (p == NULL) {
-    WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
-    return ncclSystemError;
-  }
-  memset(p, 0, nelem*sizeof(T));
-  *ptr = (T*)p;
-  return ncclSuccess;
-}
+#define NCCL_NUM_ALGORITHMS 2 // Tree/Ring
+#define NCCL_ALGO_TREE 0
+#define NCCL_ALGO_RING 1
 
-template <typename T>
-static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
-  CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
-  CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
-  return ncclSuccess;
-}
+#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
+#define NCCL_PROTO_LL 0
+#define NCCL_PROTO_LL128 1
+#define NCCL_PROTO_SIMPLE 2
 
-template <typename T>
-static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
-  CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
-  return ncclSuccess;
-}
+#include "debug.h"
+#include "checks.h"
+#include "alloc.h"
+#include "utils.h"
+#include "param.h"
 
 #endif // end include guard
diff --git a/src/include/cpuset.h b/src/include/cpuset.h
index f70d1d8..98b93de 100644
--- a/src/include/cpuset.h
+++ b/src/include/cpuset.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/debug.h b/src/include/debug.h
index 3acdf8c..89b6e42 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,15 +7,14 @@
 #ifndef NCCL_DEBUG_H_
 #define NCCL_DEBUG_H_
 
-#include <pthread.h>
+#include "core.h"
+
 #include <stdio.h>
 #include <chrono>
 
-#include <unistd.h>
 #include <sys/syscall.h>
 #include <limits.h>
 #include <string.h>
-#include "nccl.h"
 #include "nccl_net.h"
 
 #define gettid() (pid_t) syscall(SYS_gettid)
@@ -24,10 +23,17 @@ extern int ncclDebugLevel;
 extern uint64_t ncclDebugMask;
 extern pthread_mutex_t ncclDebugOutputLock;
 extern FILE *ncclDebugFile;
-extern ncclResult_t getHostName(char* hostname, int maxlen);
-extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
+extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
+
+void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
 
-extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
+// Let code temporarily downgrade WARN into INFO
+extern thread_local int ncclDebugNoWarn;
+#define NOWARN(a, ret) do { \
+  ncclDebugNoWarn = 1; \
+  ret = a; \
+  ncclDebugNoWarn = 0; \
+} while (0)
 
 #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
 #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
@@ -39,101 +45,4 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch;
 #define TRACE(...)
 #endif
 
-#include <stdlib.h>
-
-static inline void initDebug() {
-  const char* nccl_debug = getenv("NCCL_DEBUG");
-  if (nccl_debug == NULL) {
-    ncclDebugLevel = NCCL_LOG_NONE;
-  } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
-    ncclDebugLevel = NCCL_LOG_VERSION;
-  } else if (strcasecmp(nccl_debug, "WARN") == 0) {
-    ncclDebugLevel = NCCL_LOG_WARN;
-  } else if (strcasecmp(nccl_debug, "INFO") == 0) {
-    ncclDebugLevel = NCCL_LOG_INFO;
-  } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
-    ncclDebugLevel = NCCL_LOG_ABORT;
-  } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
-    ncclDebugLevel = NCCL_LOG_TRACE;
-  }
-
-  /* Parse the NCCL_DEBUG_SUBSYS env var
-   * This can be a comma separated list such as INIT,COLL
-   * or ^INIT,COLL etc
-   */
-  char* nccl_debug_subsys = getenv("NCCL_DEBUG_SUBSYS");
-  if (nccl_debug_subsys != NULL) {
-    char *subsys = strtok(nccl_debug_subsys, ",");
-    while (subsys != NULL) {
-      int invert = 0;
-      uint64_t mask = 0;
-      if (subsys[0] == '^') { invert = 1; subsys++; }
-      if (strcasecmp(subsys, "INIT") == 0) {
-        mask = NCCL_INIT;
-      } else if (strcasecmp(subsys, "COLL") == 0) {
-        mask = NCCL_COLL;
-      } else if (strcasecmp(subsys, "P2P") == 0) {
-        mask = NCCL_P2P;
-      } else if (strcasecmp(subsys, "SHM") == 0) {
-        mask = NCCL_SHM;
-      } else if (strcasecmp(subsys, "NET") == 0) {
-        mask = NCCL_NET;
-      } else if (strcasecmp(subsys, "ALL") == 0) {
-        mask = NCCL_ALL;
-      }
-      if (mask) {
-        if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
-      }
-      subsys = strtok(NULL, ",");
-    }
-  }
-
-  /* Parse and expand the NCCL_DEBUG_FILE path and
-   * then create the debug file. But don't bother unless the
-   * NCCL_DEBUG level is > VERSION
-   */
-  const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
-  if (ncclDebugLevel > NCCL_LOG_VERSION && nccl_debug_file != NULL) {
-    int c = 0;
-    char debug_fn[PATH_MAX+1] = "";
-    char *dfn = debug_fn;
-    while (nccl_debug_file[c] != '\0' && c < PATH_MAX) {
-      if (nccl_debug_file[c++] != '%') {
-        *dfn++ = nccl_debug_file[c-1];
-        continue;
-      }
-      switch (nccl_debug_file[c++]) {
-        case '%': // Double %
-          *dfn++ = '%';
-          break;
-        case 'h': // %h = hostname
-          char hostname[1024];
-          getHostName(hostname, 1024);
-          dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
-          break;
-        case 'p': // %p = pid
-          dfn += snprintf(dfn, PATH_MAX, "%d", getpid());
-          break;
-        default: // Echo everything we don't understand
-          *dfn++ = '%';
-          *dfn++ = nccl_debug_file[c-1];
-          break;
-      }
-    }
-    *dfn = '\0';
-    if (debug_fn[0] != '\0') {
-      FILE *file = fopen(debug_fn, "w");
-      if (file != NULL) {
-        INFO(NCCL_ALL,"DEBUG file is '%s'", debug_fn);
-        ncclDebugFile = file;
-      }
-    }
-  }
-  pthread_mutex_init(&ncclDebugOutputLock, NULL);
-
-#ifdef ENABLE_TRACE
-  ncclEpoch = std::chrono::high_resolution_clock::now();
-#endif
-}
-
 #endif
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
new file mode 100644
index 0000000..46d236b
--- /dev/null
+++ b/src/include/devcomm.h
@@ -0,0 +1,214 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_DEVICE_H_
+#define NCCL_DEVICE_H_
+
+#include "nccl.h"
+#include <stdint.h>
+
+#define NCCL_MAX_OPS 2048
+#define NCCL_STEPS 8
+
+#define DIVUP(x, y) \
+    (((x)+(y)-1)/(y))
+#define ROUNDUP(x, y) \
+    (DIVUP((x), (y))*(y))
+
+#define ALIGN_SIZE(size, align) \
+  size = ((size + (align) - 1) / (align)) * (align);
+
+union ncclLLFifoLine {
+  /* Flags have to be *after* data, because otherwise, an incomplete receive
+     from the network may receive the flag but not the data.
+     Note this is assuming that either we receive contiguous chunks of data
+     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
+  struct {
+    uint32_t data1;
+    uint32_t flag1;
+    uint32_t data2;
+    uint32_t flag2;
+  };
+  uint64_t v[2];
+  int4 i4;
+};
+
+#define WARP_SIZE 32
+#define MAXCHANNELS 32
+#define NCCL_MAX_NTHREADS 512
+#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
+#define NCCL_LL_LINES_PER_THREAD 8
+#define NCCL_LL_SLICE_LINES (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
+#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
+#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
+#ifdef TEST_LL_CLEANUP
+#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
+#define NCCL_LL_FLAG_MAX   0x100
+#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX))
+#else
+#define NCCL_LL_CLEAN_MASK 0x7ffffff8
+#define NCCL_LL_FLAG(a) ((uint32_t)(a))
+#endif
+// Make sure the clean mask will last for at least NCCL_STEPS
+static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
+
+#define NCCL_LL128_LINESIZE 128
+#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
+#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
+
+#define NCCL_LL128_MAX_NTHREADS 640
+#define NCCL_LL128_ELEMS_PER_THREAD 120
+
+// Receiving from up to 3 sources is more compute intensive than sending
+// to 3 dests. Use 70% for reduce and 30% for bcast (result is rounded down to a multiple of 32).
+#define NCCL_LL128_SPLIT(nt) ((((nt)*7)/(10*32))*32)
+
+#define NCCL_LL128_SLICE_ELEMS (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
+#define NCCL_LL128_BUFF_ELEMS (NCCL_LL128_SLICE_ELEMS*NCCL_STEPS)
+#define NCCL_LL128_BUFF_SIZE (NCCL_LL128_BUFF_ELEMS*sizeof(uint64_t))
+
+#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
+#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
+
+struct ncclConnInfo {
+  // Regular comm mechanism
+  char *buff;         // Local for recv, remote for send
+  uint64_t *tail;     // Local for recv, remote for send
+  uint64_t *head;     // Local for send, remote for recv
+  uint64_t *opCountLoc; // opCount of local rank
+  uint64_t *opCountRem; // opCount of remote rank
+
+  int direct;         // Direct communication
+  void **ptrExchange; // Pointer exchange for direct communication
+
+  int *fifo;          // Size fifo for proxy
+
+  uint64_t step;      // Keep where we are
+
+  // Low latency mechanism
+  union ncclLLFifoLine *llBuff; // Local for recv, remote for send
+  uint64_t llLastCleaning;
+
+  // High bandwidth, low latency protocol
+  uint64_t* ll128Buff; // Local for recv, remote for send
+};
+
+struct ncclConnector {
+  int connected;
+  struct ncclProxyArgs *proxyAppend;
+  struct ncclTransportComm* transportComm;
+  void* transportResources; // Host-side resources
+  struct ncclConnInfo conn;
+  struct ncclComm *comm;
+};
+
+struct ncclRing {
+  // Shortcuts for userRanks[1] and userRanks[n-1]
+  int prev;
+  int next;
+
+  // Maps an internal nccl index to user-specified rank order. This is necessary
+  // since we need to know how the user expects data to be ordered across
+  // devices. Ordered from current device.
+  int* userRanks;
+  int* devUserRanks;
+};
+
+
+#define NCCL_MAX_TREE_ARITY 3
+struct ncclTree {
+  int depth;
+  int up;
+  int down[NCCL_MAX_TREE_ARITY];
+};
+
+struct ncclPeer {
+  struct ncclConnector send;
+  struct ncclConnector recv;
+};
+
+struct ncclDevComm;
+
+/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
+/* to make sure reads to host from the CUDA kernel are aligned. */
+/* Make sure to adjust padding at the end of ncclColl. */
+struct CollectiveArgs {
+  struct ncclDevComm* comm;
+  uint64_t opCount;
+
+  // local and remote input, output, and buffer
+  const void * ThisInput;
+  void * ThisOutput;
+
+  // general parameters
+  size_t N;
+  uint32_t root;
+  uint8_t bid;
+  uint8_t nChannels;
+  uint16_t nThreads;
+
+  int lastChunkSize;
+};
+struct ncclColl {
+  union {
+    struct {
+      struct CollectiveArgs args;
+      uint16_t funcIndex;
+      uint16_t nextIndex;
+      uint8_t  active;
+    };
+    int data[0x10];
+  };
+};
+static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
+
+struct ncclChannel {
+  union {
+    struct {
+      struct ncclRing ring;
+      struct ncclTree treeUp;
+      struct ncclTree treeDn;
+
+      int id;
+      int nthreads;
+      int buffSize;
+
+      // Communication structures
+      struct ncclPeer* peers;
+      struct ncclPeer* devPeers;
+
+      // Operation list for aggregation
+      struct ncclColl* collectives;
+      struct ncclColl* devCollectives;
+      int collStart;
+      int collCount;
+      int collFifoHead; // Only used by GPU
+      int collFifoTail; // Only used by CPU
+    };
+    int data[0x80];
+  };
+};
+static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
+
+typedef enum {
+  ncclDevSuccess,
+  ncclDevAssertedMismatch,
+  ncclDevSuspectedMismatch
+} ncclDevError_t;
+
+struct ncclDevComm {
+  int rank;
+  int nRanks;
+
+  // Flag to ask NCCL kernels to abort
+  volatile uint32_t *abortFlag;
+  volatile ncclDevError_t *fatalDevError;
+
+  // Channels, device side
+  struct ncclChannel* channels;
+};
+
+#endif
diff --git a/src/include/enqueue.h b/src/include/enqueue.h
index 4db7094..cea486e 100644
--- a/src/include/enqueue.h
+++ b/src/include/enqueue.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,8 +7,9 @@
 #ifndef NCCL_ENQUEUE_H_
 #define NCCL_ENQUEUE_H_
 
-#include "core.h"
+#include "comm.h"
 #include "group.h"
+#include "collectives.h"
 
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
 ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
diff --git a/src/include/graph.h b/src/include/graph.h
new file mode 100644
index 0000000..3c8ba19
--- /dev/null
+++ b/src/include/graph.h
@@ -0,0 +1,94 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_GRAPH_H_
+#define NCCL_GRAPH_H_
+
+#include "nccl.h"
+#include "devcomm.h"
+#include <limits.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <stdio.h>
+
+enum ncclPathDist {
+  PATH_PIX  = 0,
+  PATH_PXB  = 1,
+  PATH_PHB  = 2,
+  PATH_NODE = 3,
+  PATH_SYS  = 4,
+  PATH_ARRAY_SIZE = 5 // Count of the path kinds above; used as the length of pathDists[]
+};
+
+extern const char* pathDists[PATH_ARRAY_SIZE]; // Defined elsewhere; NOTE(review): presumably one printable name per path kind - confirm
+
+ncclResult_t ncclTopoCudaPath(int cudaDev, char** path);
+
+struct ncclTopoSystem; // Forward declaration: this header only handles ncclTopoSystem through pointers
+// Build the topology
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system);
+ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
+ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
+
+ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
+void ncclTopoFree(struct ncclTopoSystem* system);
+ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
+ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system);
+
+// Query topology
+ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink);
+ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink);
+ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance);
+ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* net); // struct ncclTopoGraph is defined later in this header
+ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance);
+ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count);
+
+#define NCCL_TOPO_MAX_NODES 256
+
+#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
+#define NCCL_TOPO_PATTERN_SPLIT_TREE 2      // Split tree (send/recv from different ranks) flowing in both directions
+#define NCCL_TOPO_PATTERN_TREE 3            // Simple tree (send/recv from same rank) flowing in both directions
+#define NCCL_TOPO_PATTERN_RING 4            // Ring
+struct ncclTopoGraph {
+  // Input / output
+  int pattern; // NOTE(review): presumably one of the NCCL_TOPO_PATTERN_* values defined above - confirm
+  int crossNic;
+  // Output
+  int nChannels;
+  int speedIntra;
+  int speedInter;
+  int type;
+  int nvlink;
+  int sameChannels;
+  int nHops;
+  int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; // Sized MAXCHANNELS * NCCL_TOPO_MAX_NODES; NOTE(review): presumably NCCL_TOPO_MAX_NODES slots per channel - confirm
+  int inter[MAXCHANNELS*2]; // Sized MAXCHANNELS * 2; NOTE(review): presumably 2 slots per channel - confirm
+};
+ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
+
+ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
+
+struct ncclTopoRanks { // Eight int arrays, each MAXCHANNELS long (ring recv/send/prev/next, tree up/down recv/send)
+  int ringRecv[MAXCHANNELS];
+  int ringSend[MAXCHANNELS];
+  int ringPrev[MAXCHANNELS];
+  int ringNext[MAXCHANNELS];
+  int treeUpRecv[MAXCHANNELS];
+  int treeUpSend[MAXCHANNELS];
+  int treeDnRecv[MAXCHANNELS];
+  int treeDnSend[MAXCHANNELS];
+};
+
+ncclResult_t ncclTopoPreset(struct ncclComm* comm,
+    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
+    struct ncclTopoRanks* topoRanks);
+
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
+    struct ncclTopoRanks** allTopoRanks, int* rings);
+
+ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph);
+
+#endif
diff --git a/src/include/group.h b/src/include/group.h
index 76da30f..239b05f 100644
--- a/src/include/group.h
+++ b/src/include/group.h
@@ -8,14 +8,14 @@
 #define NCCL_GROUP_H_
 
 #include "nccl.h"
-#include "core.h"
+#include "comm.h"
 
 bool ncclAsyncMode();
 ncclResult_t ncclAsyncErrCheck(ncclResult_t ret);
 
-typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
+typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
 
-ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
+ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
 
 typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
     ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h
index 4f3e831..0943f99 100644
--- a/src/include/ibvwrap.h
+++ b/src/include/ibvwrap.h
@@ -4,7 +4,7 @@
  * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
  *
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/info.h b/src/include/info.h
new file mode 100644
index 0000000..9461759
--- /dev/null
+++ b/src/include/info.h
@@ -0,0 +1,50 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_INFO_H_
+#define NCCL_INFO_H_
+
+#include "nccl.h"
+#include "core.h"
+
+typedef enum { // Type of ncclInfo::pattern (struct declared below in this header)
+  ncclPatternRing,
+  ncclPatternRingTwice,
+  ncclPatternPipelineFrom,
+  ncclPatternPipelineTo,
+  ncclPatternTreeUp,
+  ncclPatternTreeDown,
+  ncclPatternTreeUpDown
+} ncclPattern_t;
+
+// Used to pass NCCL call information between functions
+struct ncclInfo {
+  ncclFunc_t coll;
+  const char* opName;
+  // NCCL Coll Args
+  const void* sendbuff;
+  void* recvbuff;
+  size_t count;
+  ncclDataType_t datatype;
+  ncclRedOp_t op;
+  int root;
+  ncclComm_t comm;
+  cudaStream_t stream;
+  // Algorithm details
+  int chunkSteps;
+  int sliceSteps;
+  // Computed later
+  int algorithm;
+  int protocol;
+  ncclPattern_t pattern; // One of the ncclPattern* enumerators declared above
+  int nChannels;
+  int nThreads;
+  size_t nBytes;
+  int nstepsPerLoop;
+  int nchunksPerLoop;
+};
+
+#endif
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index 89edbf5..d6ae9f8 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -15,7 +15,7 @@
 #define NCCL_PTR_CUDA 0x2
 
 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_ALL=~0} ncclDebugLogSubSys;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ALL=~0} ncclDebugLogSubSys;
 
 typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
 
@@ -80,12 +80,13 @@ typedef struct {
   // Finalize connection establishment after remote peer has called connectHandle
   ncclResult_t (*accept)(void* listenComm, void** recvComm);
   // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
   ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
   ncclResult_t (*deregMr)(void* comm, void* mhandle);
-  // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  // Asynchronous send to a peer.
   // May return request == NULL if the call cannot be performed (or would block)
   ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
-  // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  // Asynchronous recv from a peer.
   // May return request == NULL if the call cannot be performed (or would block)
   ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
   // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
diff --git a/src/include/net.h b/src/include/net.h
index e75e6bb..3d37c8c 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -13,16 +13,10 @@
 extern ncclNet_t* ncclNet;
 typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
 
-/* Socket Interface Selection type */
-typedef enum { findSubnetIf   = -1,
-    dontCareIf     = -2
-} ncclSocketIfSl_t;
-
 // Translation to external API
 static const char* ncclNetName() { return ncclNet->name; }
 static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
 static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; }
-static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; }
 static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
 static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
 static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
@@ -36,7 +30,37 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS
 static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
 static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
 
-extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
+#define GPU_BUF_SIZE (2*1024*1024) // Byte size of the scratch GPU buffer allocated by ncclNetPtrSupport below
+static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { // Reports the plugin's pointer types, keeping NCCL_PTR_CUDA only if a test registration works
+  int support;
+  NCCLCHECK(ncclNet->ptrSupport(dev, &support));
+  *supportedTypes = support & ~NCCL_PTR_CUDA; // Start without CUDA; it is added back only if the probe below succeeds
+  // The network supports GPU Direct RDMA ; verify the GPU supports it as well.
+  if (support & NCCL_PTR_CUDA) {
+    void *lComm = NULL, *sComm = NULL, *rComm = NULL;
+    ncclNetHandle_t handle;
+    void* gpuPtr = NULL;
+    void* mHandle = NULL;
+    ncclResult_t res;
+    NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), res, cleanup); // Loopback: listen, connect and accept all use this same dev/handle
+    NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), res, cleanup);
+    NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), res, cleanup);
+    CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), res, cleanup);
+    NOWARN(ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res); // NOTE(review): NOWARN presumably silences the warning for this possibly-failing call - confirm
+    if (res != ncclSuccess) goto cleanup;
+    NCCLCHECKGOTO(ncclNetDeregMr(sComm, mHandle), res, cleanup);
+    NCCLCHECKGOTO(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res, cleanup);
+    NCCLCHECKGOTO(ncclNetDeregMr(rComm, mHandle), res, cleanup);
+    *supportedTypes |= NCCL_PTR_CUDA; // Reached only when GPU memory registered on both the send and the recv comm
+cleanup:
+    if (gpuPtr) cudaFree(gpuPtr);
+    if (rComm) ncclNetCloseRecv(rComm);
+    if (sComm) ncclNetCloseSend(sComm);
+    if (lComm) ncclNetCloseListen(lComm);
+  }
+  return ncclSuccess; // res is never returned: a failed probe only leaves NCCL_PTR_CUDA cleared
+}
+
 extern ncclNet_t ncclNetIb;
 extern ncclNet_t ncclNetSocket;
 
diff --git a/src/include/nvlink.h b/src/include/nvlink.h
deleted file mode 100644
index 1baf9e5..0000000
--- a/src/include/nvlink.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_NVLINK_H_
-#define NCCL_NVLINK_H_
-
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "nvmlwrap.h"
-#include "topo.h"
-
-#define CONNECT_NVLINK 0x10
-#define CONNECT_NVSWITCH 0x100
-
-enum ncclNvLinkDeviceType {
-  ncclNvLinkDeviceGpu,
-  ncclNvLinkDeviceSwitch,
-};
-
-static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
-  char classPath[] =  "/sys/bus/pci/devices/0000:00:00.0/class";
-  memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
-  char* rPath = realpath(classPath, NULL);
-  int fd;
-  SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
-  free(rPath);
-  char pciClass[9];
-  strncpy(pciClass, "0x000000", 9);
-  int len;
-  SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
-  SYSCHECK(close(fd), "close");
-  if (strcmp(pciClass, "0x068000") == 0) {
-    // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
-    *type = ncclNvLinkDeviceSwitch;
-  } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
-      || strcmp(pciClass, "0x030000") == 0) {  // "VGA Controller" (GeForce)
-    *type = ncclNvLinkDeviceGpu;
-  } else {
-    // Ignore if we don't know what's on the other side.
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-/* Get the maximum number of NVLinks based on the GPU generation */
-static ncclResult_t getMaxNvlinks(int* maxLinks) {
-  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  int ccMajor;
-  CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
-  // 6 for Volta, 4 for Pascal
-  *maxLinks = (ccMajor > 6) ? 6 : 4;
-  // INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
-  return ncclSuccess;
-}
-
-static int getNvlinkGpu(const char* busId1, const char* busId2) {
-  // Determine if that connection is through NVLink
-  int links = 0;
-  int nvswitch_links = 0;
-  int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
-  nvmlDevice_t nvmlDev;
-  ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId1, &nvmlDev);
-  if (res != ncclSuccess) return 0;
-
-  for(int l=0; l<maxNvLinks; ++l) {
-    // Check whether we can use this NVLink for P2P
-    unsigned canP2P;
-    if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
-
-    // Make sure the Nvlink is up. The previous call should have trained the link.
-    nvmlEnableState_t isActive;
-    if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
-
-    // Try to figure out what's on the other side of the NVLink
-    nvmlPciInfo_t remoteProc;
-    if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
-
-    // Old versions of NVML return a lowercase PCI ID
-    char* p = remoteProc.busId;
-    for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
-      if (p[c] == 0) break;
-      p[c] = toupper(p[c]);
-    }
-
-    if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
-      links++;
-    } else {
-      // Make a lower case copy of the bus ID for calling ncclDeviceType
-      // PCI system path is in lower case
-      char* p = remoteProc.busId;
-      char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-      for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
-        if (p[c] == 0) break;
-        lowerId[c] = tolower(p[c]);
-      }
-
-      // Determine if the remote side is NVswitch or a GPU
-      enum ncclNvLinkDeviceType type;
-      ncclResult_t ret = ncclDeviceType(lowerId, &type);
-      if (ret == ncclSuccess) {
-        if (type == ncclNvLinkDeviceSwitch) {
-          //TODO: we are making an assumption that all GPUs are connected to this switch
-          //This assumption may change for future architectures
-          nvswitch_links++;
-        } else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) {
-          links++;
-        }
-      } else {
-        // The NVLink is up but we couldn't find the PCI device on the other
-        // side. Assume it's an NVswitch outside a VM.
-        if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch");
-        nvswitch_links++;
-      }
-    }
-  }
-  return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
-}
-
-#endif
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
index 0b6198a..01bbb7f 100644
--- a/src/include/nvmlwrap.h
+++ b/src/include/nvmlwrap.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,18 +9,31 @@
 
 #include "nccl.h"
 
-//#define NVML_DIRECT 1
-#ifdef NVML_DIRECT
-#include "nvml.h"
+// The NVML library doesn't appear to be thread safe: NVMLLOCKCALL/NVMLCHECK below serialize calls with nvmlLock
+#include <pthread.h>
+extern pthread_mutex_t nvmlLock;
+#define NVMLLOCK() pthread_mutex_lock(&nvmlLock)
+#define NVMLUNLOCK() pthread_mutex_unlock(&nvmlLock)
+
+#define NVMLLOCKCALL(cmd, ret) do {                      \
+    NVMLLOCK();                                          \
+    ret = cmd;                                           \
+    NVMLUNLOCK();                                        \
+} while(false)
 
 #define NVMLCHECK(cmd) do {                              \
-    nvmlReturn_t e = cmd;                                \
+    nvmlReturn_t e;                                      \
+    NVMLLOCKCALL(cmd, e);                                \
     if( e != NVML_SUCCESS ) {                            \
       WARN("NVML failure '%s'", nvmlErrorString(e));     \
       return ncclSystemError;                            \
     }                                                    \
 } while(false)
 
+//#define NVML_DIRECT 1
+#ifdef NVML_DIRECT
+#include "nvml.h"
+
 static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; }
 static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; }
 static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; }
@@ -57,6 +70,10 @@ static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned i
   NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
   return ncclSuccess;
 }
+static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { // NVML_DIRECT build: direct call into NVML
+  NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor)); // NVMLCHECK returns ncclSystemError on any non-NVML_SUCCESS result
+  return ncclSuccess;
+}
 #else
 // Dynamically handle dependencies on NVML
 
@@ -139,6 +156,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
 ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
                                                    nvmlNvLinkCapability_t capability, unsigned int *capResult);
 ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
+ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
 
 #endif // NVML_DIRECT
 
diff --git a/src/include/param.h b/src/include/param.h
index dd5f697..5431757 100644
--- a/src/include/param.h
+++ b/src/include/param.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) {
     s++;
     strncpy(envValue, line+s, 1024);
     setenv(envVar, envValue, 0);
-    char *str = getenv(envVar);
   }
   if (line) free(line);
   fclose(file);
diff --git a/src/include/rings.h b/src/include/rings.h
deleted file mode 100644
index 43fc595..0000000
--- a/src/include/rings.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_RINGS_H_
-#define NCCL_RINGS_H_
-
-static int getDefaultThreads() {
-  // On Kepler, rings are doubled later.
-  return ncclCudaCompCap() == 3 ? 128 : 256;
-}
-
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
-
-#endif
diff --git a/src/include/shm.h b/src/include/shm.h
index 4fb49cb..9cd9d05 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/socket.h b/src/include/socket.h
index fb5cfc0..96bf5db 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -18,8 +18,9 @@
 
 #define MAX_IFS 16
 #define MAX_IF_NAME_SIZE 16
-#define SLEEP_INT     1000  // sleep interval in usec
-#define RETRY_TIMES   2e4   // retry times before reporting a timeout (20 sec)
+#define SLEEP_INT            1000 // sleep between connect() retries, in usec
+#define RETRY_REFUSED_TIMES   2e4 // retry limit for ECONNREFUSED before giving up (2e4 * SLEEP_INT = 20 sec of sleeping)
+#define RETRY_TIMEDOUT_TIMES    3 // retry limit for ETIMEDOUT before giving up (each attempt can take 20s)
 
 /* Common socket address storage structure for IPv4/IPv6 */
 union socketAddress {
@@ -41,7 +42,7 @@ static inline const char *socketToString(struct sockaddr *saddr, char *buf) {
   return buf;
 }
 
-static inline short socketToPort(struct sockaddr *saddr) {
+static inline uint16_t socketToPort(struct sockaddr *saddr) { // Port in host byte order; unsigned, so ports above 32767 do not come back negative
   return ntohs(saddr->sa_family == AF_INET ? ((struct sockaddr_in*)saddr)->sin_port : ((struct sockaddr_in6*)saddr)->sin6_port);
 }
 
@@ -65,6 +66,9 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
 #endif
   struct netIf userIfs[MAX_IFS];
   bool searchNot = prefixList && prefixList[0] == '^';
+  if (searchNot) prefixList++;
+  bool searchExact = prefixList && prefixList[0] == '=';
+  if (searchExact) prefixList++;
   int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
 
   int found = 0;
@@ -91,7 +95,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
     }
 
     // check against user specified interfaces
-    if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs) ^ searchNot)) {
+    if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
       continue;
     }
 
@@ -116,17 +120,17 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
   return found;
 }
 
-static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
+static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) {
   /* Check family first */
   int family = local_if.ifa_addr->sa_family;
-  if (family != remote.sa.sa_family) {
+  if (family != remote->sa.sa_family) {
     return false;
   }
 
   if (family == AF_INET) {
     struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
     struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
-    struct sockaddr_in& remote_addr = remote.sin;
+    struct sockaddr_in& remote_addr = remote->sin;
     struct in_addr local_subnet, remote_subnet;
     local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
     remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
@@ -134,7 +138,7 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
   } else if (family == AF_INET6) {
     struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
     struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
-    struct sockaddr_in6& remote_addr = remote.sin6;
+    struct sockaddr_in6& remote_addr = remote->sin6;
     struct in6_addr& local_in6 = local_addr->sin6_addr;
     struct in6_addr& mask_in6 = mask->sin6_addr;
     struct in6_addr& remote_in6 = remote_addr.sin6_addr;
@@ -159,8 +163,11 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
   }
 }
 
-static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) {
-  char line[1024], line_a[1024];
+static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
+#ifdef ENABLE_TRACE
+  char line[1024];
+#endif
+  char line_a[1024];
   int found = 0;
   struct ifaddrs *interfaces, *interface;
   getifaddrs(&interfaces);
@@ -184,13 +191,13 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd
     // Store the interface name
     strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
 
-    INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
+    TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr->sa), line_a));
     found++;
     if (found == maxIfs) break;
   }
 
   if (found == 0) {
-    WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr.sa), line_a));
+    WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr->sa), line_a));
   }
   freeifaddrs(interfaces);
   return found;
@@ -295,7 +302,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam
         // Try to find interface that is in the same subnet as the IP in comm id
         union socketAddress idAddr;
         GetSocketAddrFromString(&idAddr, commId);
-        nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, idAddr, ifNameMaxSize, maxIfs);
+        nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
       }
     }
     // Then look for anything else (but not docker or lo)
@@ -322,7 +329,11 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
   if (socketToPort(&localAddr->sa)) {
     // Port is forced by env. Make sure we get the port.
     int opt = 1;
+#if defined(SO_REUSEPORT)
     SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
+#else
+    SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
+#endif
   }
 
   // localAddr port should be 0 (Any port)
@@ -370,14 +381,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
 #endif
 
   int ret;
-  int retries = 0;
+  int timedout_retries = 0;
+  int refused_retries = 0;
 retry:
   SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
   if (ret == 0) return ncclSuccess;
-  if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
-    INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \
-    usleep(SLEEP_INT);
-    goto retry;
+  if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
+    if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
+        (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
+      if (refused_retries % 1000 == 0) INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
+      usleep(SLEEP_INT);
+      goto retry;
+    }
   }
   WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
   return ncclSystemError;
@@ -385,12 +400,12 @@ retry:
 
 #define NCCL_SOCKET_SEND 0
 #define NCCL_SOCKET_RECV 1
-static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) {
+static ncclResult_t socketProgressOpt(int op, int fd, void* ptr, int size, int* offset, int block) {
   int bytes = 0;
   char* data = (char*)ptr;
   do {
-    if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), MSG_DONTWAIT);
-    if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), MSG_DONTWAIT);
+    if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
+    if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
     if (op == NCCL_SOCKET_RECV && bytes == 0) {
       WARN("Net : Connection closed by remote peer");
       return ncclSystemError;
@@ -408,9 +423,13 @@ static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* off
   return ncclSuccess;
 }
 
+static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) { // Non-blocking variant of socketProgressOpt
+  return socketProgressOpt(op, fd, ptr, size, offset, 0); // block=0: recv/send are issued with MSG_DONTWAIT
+}
+
 static ncclResult_t socketWait(int op, int fd, void* ptr, int size, int* offset) {
   while (*offset < size)
-    NCCLCHECK(socketProgress(op, fd, ptr, size, offset));
+    NCCLCHECK(socketProgressOpt(op, fd, ptr, size, offset, 1)); // block=1: recv/send are issued without MSG_DONTWAIT
   return ncclSuccess;
 }
 
diff --git a/src/include/topo.h b/src/include/topo.h
deleted file mode 100644
index e824a81..0000000
--- a/src/include/topo.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_TOPO_H_
-#define NCCL_TOPO_H_
-
-#include "nccl.h"
-#include <limits.h>
-#include <stdlib.h>
-#include <ctype.h>
-
-#define BUSID_SIZE (sizeof("0000:00:00.0"))
-#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
-
-static ncclResult_t getCudaPath(int cudaDev, char** path) {
-  char busId[BUSID_SIZE];
-  CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
-  for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
-  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
-  memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
-  memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
-  *path = realpath(busPath, NULL);
-  if (*path == NULL) {
-    WARN("Could not find real path of %s", busPath);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-enum ncclPathDist {
-  PATH_PIX = 0,
-  PATH_PXB = 1,
-  PATH_PHB = 2,
-  PATH_SOC = 3
-};
-
-static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
-
-static int pciDistance(char* path1, char* path2) {
-  int score = 0;
-  int depth = 0;
-  int same = 1;
-  for (int i=0; i<strlen(path1); i++) {
-    if (path1[i] != path2[i]) same = 0;
-    if (path1[i] == '/') {
-      depth++;
-      if (same == 1) score++;
-    }
-  }
-  if (score <= 3) return PATH_SOC;
-  if (score == 4) return PATH_PHB;
-  if (score == depth-1) return PATH_PIX;
-  return PATH_PXB;
-}
-
-#endif
diff --git a/src/include/transport.h b/src/include/transport.h
index 6231a71..8f9bf0e 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -7,11 +7,15 @@
 #ifndef NCCL_TRANSPORT_H_
 #define NCCL_TRANSPORT_H_
 
-#include "nccl.h"
-#include <stdint.h>
+#include "devcomm.h"
+#include "graph.h"
 #include "nvmlwrap.h"
+#include "core.h"
 
 #define NTRANSPORTS 3
+#define TRANSPORT_P2P 0
+#define TRANSPORT_SHM 1
+#define TRANSPORT_NET 2
 
 extern struct ncclTransport ncclTransports[];
 
@@ -23,21 +27,19 @@ struct ncclComm;
 struct ncclPeerInfo {
   int rank;
   int cudaDev;
-  int nvmlDev;
+  int gdrSupport;
   uint64_t hostHash;
   uint64_t pidHash;
-  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+  dev_t shmDev;
+  int64_t busId;
 };
 
-// Used to hold the transport connection values
-typedef int64_t ncclTvalue_t;
-
 #define CONNECT_SIZE 128
 struct ncclConnect {
   char data[CONNECT_SIZE];
 };
 
-enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone };
+enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
 
 struct ncclProxyArgs;
 typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
@@ -50,7 +52,7 @@ struct ncclProxyArgs {
   int chunkSteps;
   int nsteps;
   uint64_t opCount;
-  int llMode;
+  int protocol;
   int state;   // add component before this line -- it is left out during initialization
 
   // Internal state
@@ -77,7 +79,7 @@ struct ncclProxyState {
 };
 
 struct ncclTransportComm {
-  ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
+  ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
   ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
   ncclResult_t (*free)(void*);
   ncclResult_t (*proxy)(struct ncclProxyArgs*);
@@ -85,8 +87,7 @@ struct ncclTransportComm {
 
 struct ncclTransport {
   const char name[4];
-  ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*);
-  ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
+  ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
   struct ncclTransportComm send;
   struct ncclTransportComm recv;
 };
@@ -117,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) {
   }
 }
 
-inline void transportProxyIdle(int idle) {
-  sched_yield();
-}
-
 #endif
diff --git a/src/include/trees.h b/src/include/trees.h
index 1a151d1..7eadd85 100644
--- a/src/include/trees.h
+++ b/src/include/trees.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/include/utils.h b/src/include/utils.h
index 5a6a588..266abca 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -10,7 +10,16 @@
 #include "nccl.h"
 #include <stdint.h>
 
-ncclResult_t getHostName(char* hostname, int maxlen);
+int ncclCudaCompCap();
+
+// PCI Bus ID <-> int64 conversion functions
+ncclResult_t int64ToBusId(int64_t id, char* busId);
+ncclResult_t busIdToInt64(char* busId, int64_t* id);
+
+ncclResult_t getBusId(int cudaDev, int64_t *busId);
+
+ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
+uint64_t getHash(const char* string, int n);
 uint64_t getHostHash();
 uint64_t getPidHash();
 
@@ -20,6 +29,12 @@ struct netIf {
 };
 
 int parseStringList(const char* string, struct netIf* ifList, int maxList);
-bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize);
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
+
+static long log2i(long n) { // floor(log2(n)) for n >= 1; yields 0 for every n <= 1
+ long l = 0;
+ while (n > 1) { n >>= 1; l++; } // stop at 1: with an arithmetic right shift a negative n never reaches 0
+ return l;
+}
 
 #endif
diff --git a/src/init.cc b/src/init.cc
new file mode 100644
index 0000000..627f6c7
--- /dev/null
+++ b/src/init.cc
@@ -0,0 +1,934 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "channel.h"
+#include "nvmlwrap.h"
+#include "bootstrap.h"
+#include "transport.h"
+#include "group.h"
+#include "net.h"
+#include "enqueue.h"
+#include "graph.h"
+#include "argcheck.h"
+#include "cpuset.h"
+#include <sched.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <dlfcn.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#define STR2(v) #v
+#define STR(v) STR2(v)
+
+#ifdef ENABLE_TRACE
+std::chrono::high_resolution_clock::time_point ncclEpoch;
+#endif
+
+#if CUDART_VERSION >= 9020
+#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
+#else
+#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
+#endif
+
+NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
+
+NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
+
+ncclNet_t* ncclNet = NULL;
+
+// Probes one network: ncclInternalError if init()/devices() fail, ncclSystemError if it reports no device; callers skip the network on any failure.
+ncclResult_t initNet(ncclNet_t* net) {
+  int nDevices;
+  const bool usable = net->init(ncclDebugLog) == ncclSuccess && net->devices(&nDevices) == ncclSuccess;
+  if (!usable) return ncclInternalError;
+  if (nDevices < 1) return ncclSystemError;
+  return ncclSuccess;
+}
+
+ncclResult_t initNetPlugin(ncclNet_t** net) {
+  // A missing or unusable plugin is never reported as an error: every path
+  // below returns ncclSuccess and *net is only written when the plugin works.
+  void* pluginHandle = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
+  if (pluginHandle == NULL) {
+    // errno is only a hint here (dlopen is not required to set it), but it lets
+    // us tell "no plugin installed" apart from a genuine load failure.
+    if (errno != ENOENT) {
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
+    } else {
+      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so), using internal implementation");
+    }
+    return ncclSuccess;
+  }
+  ncclNet_t* pluginNet = (ncclNet_t*) dlsym(pluginHandle, STR(NCCL_PLUGIN_SYMBOL));
+  if (pluginNet == NULL) {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
+  } else if (initNet(pluginNet) == ncclSuccess) {
+    // Success: the library stays loaded because *net points into it.
+    *net = pluginNet;
+    return ncclSuccess;
+  }
+  // Symbol missing or plugin failed to initialize: unload the library again.
+  dlclose(pluginHandle);
+  return ncclSuccess;
+}
+
+ncclResult_t initNet() {
+  // The bootstrap network is set up whichever data network ends up selected.
+  NCCLCHECK(bootstrapNetInit());
+
+  // Selection order: external plugin, then ncclNetIb, then ncclNetSocket as the last resort.
+  NCCLCHECK(initNetPlugin(&ncclNet));
+  if (ncclNet == NULL) {
+    if (initNet(&ncclNetIb) != ncclSuccess) {
+      NCCLCHECK(initNet(&ncclNetSocket));
+      ncclNet = &ncclNetSocket;
+    } else ncclNet = &ncclNetIb;
+  }
+  return ncclSuccess;
+}
+
+pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
+static bool initialized = false;
+static ncclResult_t ncclInit() { // one-time library init (environment + networks), serialized by initLock
+  if (initialized) return ncclSuccess;
+  pthread_mutex_lock(&initLock);
+  ncclResult_t res = ncclSuccess;
+  if (!initialized) {
+    initEnv();
+    initialized = ((res = initNet()) == ncclSuccess); // latch only on success so a failed network init is reported instead of hidden
+  }
+  pthread_mutex_unlock(&initLock);
+  return res;
+}
+
+NCCL_API(ncclResult_t, ncclGetVersion, int* version);
+ncclResult_t ncclGetVersion(int* version) { // stores NCCL_VERSION_CODE in *version; a NULL pointer is rejected
+  const bool haveOut = (version != NULL);
+  if (haveOut) *version = NCCL_VERSION_CODE;
+  return haveOut ? ncclSuccess : ncclInvalidArgument;
+}
+
+NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
+ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { // public entry point: the id is produced by bootstrapGetUniqueId()
+  NCCLCHECK(ncclInit()); // one-time init (initEnv + initNet) runs before the argument is validated
+  NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
+  return bootstrapGetUniqueId(out);
+}
+
+// Prevent compiler from optimizing out these operations
+#ifdef __clang__
+#define NCCL_NO_OPTIMIZE __attribute__((optnone))
+#else
+#define NCCL_NO_OPTIMIZE __attribute__((optimize("O0")))
+#endif
+
+void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) {
+  comm->nRanks = -1; comm->busId = -1; comm->cudaDev = -1; comm->rank = -1; // mark the identifying fields as invalid before comm is freed
+}
+
+#undef NCCL_NO_OPTIMIZE
+
+static ncclResult_t commFree(ncclComm_t comm) { // releases what comm owns, then comm itself; a NULL comm is a no-op
+  if (comm == NULL)
+    return ncclSuccess;
+
+  free(comm->peerInfo);
+  ncclTopoFree(comm->topo);
+
+  if (comm->bootstrap)
+    NCCLCHECK(bootstrapClose(comm->bootstrap)); // NOTE(review): NCCLCHECK/CUDACHECK presumably return on error, which would skip the rest of the teardown
+
+  CUDACHECK(cudaFree(comm->hostDevComm.channels)); // device-side copies allocated in devCommSetup()
+  CUDACHECK(cudaFree(comm->devComm));
+
+  for (int channel=0; channel<comm->nChannels; channel++)
+    NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
+
+  if (comm->doneEvent != NULL)
+    CUDACHECK(cudaEventDestroy(comm->doneEvent));
+
+  if (comm->launchMode == ncclComm::GROUP) {
+    CUDACHECK(cudaStreamDestroy(comm->groupStream)); // groupStream is only created for GROUP launch mode in ncclCommSetIntra()
+  }
+
+  // Last rank frees shared resources between threads
+  int isLast;
+  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
+  if (isLast) {
+    free(comm->intraBarrier); // these five arrays are allocated once by intra rank 0 in ncclCommSetIntra()
+    free(comm->intraParams);
+    free(comm->intraCudaDevs);
+    free(comm->intraCGMode);
+    free(comm->intraCC);
+  }
+  CUDACHECK(cudaFreeHost((void *)comm->abortFlag)); // host allocations made with ncclCudaHostAlloc() in commAlloc()
+  CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
+
+  // Poison comm to try and catch a double free
+  commPoison(comm);
+
+  free(comm);
+  return ncclSuccess;
+}
+
+static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { // allocates and pre-fills a communicator for `rank` out of `ndev`; result goes to *comret
+  if (ndev < 1) {
+    WARN("invalid device count (%d) requested", ndev);
+    return ncclInvalidArgument;
+  }
+  if (rank >= ndev || rank < 0) {
+    WARN("rank %d exceeds ndev=%d", rank, ndev);
+    return ncclInvalidArgument;
+  }
+
+  // Try to create a CUDA object right away. If there is something wrong with
+  // the device we're on (failure cause #1) , better know it early.
+  cudaEvent_t doneEvent;
+  CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));
+
+  struct ncclComm* comm;
+  NCCLCHECK(ncclCalloc(&comm, 1));
+
+  comm->rank = comm->hostDevComm.rank =rank;
+  comm->nRanks = comm->hostDevComm.nRanks = ndev;
+  CUDACHECK(cudaGetDevice(&comm->cudaDev)); // checked: a silent failure would leave cudaDev at its zeroed value and look up the wrong bus id
+  NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
+  TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId); // busId is int64_t, hence %lx
+
+  comm->doneEvent = doneEvent;
+  comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
+#if CUDART_VERSION >= 9020
+  comm->groupCudaStream = ncclParamGroupCudaStream();
+#else
+  // Don't allow the user to overload the default setting in older CUDA builds
+  comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
+#endif
+  comm->fatalError = ncclSuccess;
+
+  NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t))); // host pointer kept in comm, second pointer stored in hostDevComm
+  *comm->fatalDevError = ncclDevSuccess;
+
+  NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
+  *comm->abortFlag = 0;
+
+  comm->argsptr = &comm->args;
+
+  *comret = comm;
+  return ncclSuccess;
+}
+
+static ncclResult_t devCommSetup(ncclComm_t comm) {
+  const int nChannels = comm->nChannels, nRanks = comm->nRanks;
+  // Mirror the host channel array into a freshly allocated device array.
+  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, nChannels));
+  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, nChannels));
+  // Per channel: copy the ring order and the peer table into their device buffers.
+  for (int c=0; c<nChannels; c++) {
+    struct ncclChannel* channel = comm->channels+c;
+    NCCLCHECK(ncclCudaMemcpy(channel->ring.devUserRanks, channel->ring.userRanks, nRanks));
+    NCCLCHECK(ncclCudaMemcpy(channel->devPeers, channel->peers, nRanks));
+  }
+  // Finally mirror the communicator descriptor (hostDevComm) itself.
+  NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
+  NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1));
+  return ncclSuccess;
+}
+
+// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
+#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
+static void showVersion() {
+  // Print the version banner at most once, and only when the debug level asks for it.
+  static int bannerShown = 0;
+  if (bannerShown != 0 || ncclDebugLevel < NCCL_LOG_VERSION) return;
+  printf("%s\n", VERSION_STRING);
+  fflush(stdout);
+  // Also log NCCL version in one of the files
+  if (ncclDebugFile != stdout) { INFO(NCCL_ALL,"%s", VERSION_STRING); }
+  bannerShown = 1;
+}
+
+static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { // fills *info with this rank's identity; commHash is mixed into both hashes
+  info->rank = comm->rank;
+  CUDACHECK(cudaGetDevice(&info->cudaDev));
+  info->hostHash=getHostHash()+commHash;
+  info->pidHash=getPidHash()+commHash;
+
+  // Get the device MAJOR:MINOR of /dev/shm so we can use that
+  // information to decide whether we can use SHM for inter-process
+  // communication in a container environment
+  struct stat statbuf;
+  SYSCHECK(stat("/dev/shm", &statbuf), "stat");
+  info->shmDev = statbuf.st_dev;
+
+  info->busId = comm->busId;
+  int netDevs;
+  info->gdrSupport = 0; // start from a clean bitmask instead of relying on the caller having zeroed *info
+  NCCLCHECK(ncclNetDevices(&netDevs));
+  for (int n=0; n<netDevs; n++) {
+    int ptrSupport;
+    NCCLCHECK(ncclNetPtrSupport(n, &ptrSupport));
+    if ((ptrSupport & NCCL_PTR_CUDA) && n < (int)(8*sizeof(info->gdrSupport))) info->gdrSupport |= (1 << n); // one bit per net device; devices beyond the mask width are left unmarked rather than shifting out of range
+  }
+  return ncclSuccess;
+}
+
+template <int type>
+static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
+  // Walk the transport table in order and take the first entry that accepts this pair of peers.
+  for (struct ncclTransport* candidate = ncclTransports; candidate != ncclTransports+NTRANSPORTS; candidate++) {
+    int accepted = 0;
+    NCCLCHECK(candidate->canConnect(&accepted, topo, graph, myInfo, peerInfo));
+    if (!accepted) continue;
+    // type == 1 picks the send half of the transport, any other value the receive half.
+    struct ncclTransportComm* side = (type == 1) ? &candidate->send : &candidate->recv;
+    connector->transportComm = side;
+    NCCLCHECK(side->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId));
+    return ncclSuccess;
+  }
+  WARN("No transport found !");
+  return ncclInternalError;
+}
+
+static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) {
+  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
+  NCCLCHECK(initChannel(comm, channelId));
+
+  struct ncclRing* ring = &comm->channels[channelId].ring;
+  // Find where this rank sits in the ring order. If it is missing, start ends up
+  // equal to nranks, which the modulo below folds back to position 0.
+  int start = 0;
+  while (start < nranks && ringRanks[start] != rank) start++;
+
+  // Store the ring rotated so that this rank's own entry comes first.
+  for (int dst = 0; dst < nranks; dst++) {
+    const int src = (start + dst) % nranks;
+    ring->userRanks[dst] = ringRanks[src];
+  }
+  return ncclSuccess;
+}
+
+void* waitForNonNullPtr(void* p) { // p is the address of a pointer slot; yields the CPU until that slot holds a non-NULL value, then returns it
+  void* volatile* ptr = (void* volatile*) p; // the slot itself is volatile-qualified so every iteration re-reads it (the old cast only qualified the pointee)
+  while (*ptr == NULL) sched_yield();
+  return *ptr;
+}
+
+ncclResult_t initParams(struct ncclComm* comm) {
+  comm->myParams = comm->intraParams+comm->intraRank; // this rank's entry in the intraParams array
+  struct cudaLaunchParams* launch = comm->myParams;
+  launch->args = &comm->argsptr;
+  launch->stream = NULL; launch->sharedMem = 0;
+  launch->blockDim.z = 1; launch->blockDim.y = 1; launch->blockDim.x = 0;
+  launch->gridDim.z = 1; launch->gridDim.y = 1; launch->gridDim.x = 0;
+  return ncclSuccess;
+}
+
+// Allocate/Set Intra Process Structures and set CG options
+ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) { // rank/ranks are the intra-process rank and count; comm0 is the comm of intra rank 0
+  comm->intraRank = rank;
+  comm->intraRanks = ranks;
+  comm->intraPhase = 0;
+
+  // Alloc shared structures
+  if (rank == 0) {
+    assert(comm == comm0);
+    int* bar;
+    NCCLCHECK(ncclCalloc(&bar, 2));
+    bar[0] = bar[1] = 0;
+    comm->intraBarrier = bar; // published only after initialisation: other ranks poll this field for non-NULL
+    NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
+    NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
+    int* CGMode;
+    NCCLCHECK(ncclCalloc(&CGMode, 1));
+    *CGMode = 0x11; // initial value; lowered to 0x10 below by any rank whose cgMdLaunch stays 0
+    comm->intraCGMode = CGMode;
+    int* CC;
+    NCCLCHECK(ncclCalloc(&CC, 1));
+    *CC = ncclCudaCompCap();
+    comm->intraCC = CC;
+  } else {
+    comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier); // spin until rank 0 has stored each shared pointer
+    comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
+    comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
+    comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
+    comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
+  }
+  comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
+  NCCLCHECK(initParams(comm));
+
+  int cgMdLaunch = 0;
+
+  // Set CG Mode
+  comm->launchMode = ncclComm::GROUP;
+  char* str = getenv("NCCL_LAUNCH_MODE");
+  if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) { // a single intra rank, or an explicit NCCL_LAUNCH_MODE=PARALLEL, selects PARALLEL
+    comm->launchMode = ncclComm::PARALLEL;
+  }
+  if (comm->launchMode == ncclComm::GROUP) {
+    CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking));
+#if CUDART_VERSION >= 9000
+    if (*comm->intraCC && (ncclCudaCompCap() == *comm->intraCC)) { // only query when this GPU's compute capability matches the one recorded by rank 0
+      // Check whether the GPU supports Cooperative Group Multi Device Launch
+      (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
+    }
+#endif
+  }
+
+  // Disable cgMdLaunch if any rank does not support it
+  if (cgMdLaunch == 0) {
+    *comm->intraCGMode = 0x10; // also reached in PARALLEL mode, where cgMdLaunch is never queried
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { // connects `channel` to the listed recv/send peers; -1 entries are ignored
+  TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
+  uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
+  struct ncclConnect connect;
+  struct ncclConnector* conn;
+  for (int i=0; i<nrecv; i++) { // pass 1: set up each recv connector and send its connect blob to the peer
+    int peer = peerRecv[i];
+    if (peer == -1) continue;
+    conn = &channel->peers[peer].recv;
+    if (conn->connected) { ++nSkippedRecv; continue; }
+    memset(&connect, 0, sizeof(connect));
+    NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
+    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+  }
+  for (int i=0; i<nsend; i++) { // pass 2: same for the send connectors
+    int peer = peerSend[i];
+    if (peer == -1) continue;
+    conn = &channel->peers[peer].send;
+    if (conn->connected) { ++nSkippedSend; continue; }
+    memset(&connect, 0, sizeof(connect));
+    NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
+    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+  }
+  for (int i=0; i<nsend; i++) { // pass 3: receive the peer's blob for each send connector and finish the connection
+    int peer = peerSend[i];
+    if (peer == -1) continue;
+    conn = &channel->peers[peer].send;
+    if (conn->connected) continue; // not counted again: connectors connected on entry were already tallied in pass 2
+    memset(&connect, 0, sizeof(connect));
+    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+    NCCLCHECK(conn->transportComm->connect(&connect, conn));
+    conn->connected = 1;
+  }
+  for (int i=0; i<nrecv; i++) { // pass 4: same for the recv connectors
+    int peer = peerRecv[i];
+    if (peer == -1) continue;
+    conn = &channel->peers[peer].recv;
+    if (conn->connected) continue; // not counted again: connectors connected on entry were already tallied in pass 1
+    memset(&connect, 0, sizeof(connect));
+    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+    NCCLCHECK(conn->transportComm->connect(&connect, conn));
+    conn->connected = 1;
+  }
+  TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
+  return ncclSuccess;
+}
+
+NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
+
+static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
+  // We use 2 AllGathers over the bootstrap network (the numbers 1 and 3 match the markers below):
+  // 1. { peerInfo, comm }
+  // 3. { compute capability, nvlink, nChannels, tree/ring graph summary, topoRanks }
+  // Per-connection data is exchanged separately, peer by peer, inside p2pSetup().
+
+  int rank = comm->rank;
+  int nranks = comm->nRanks;
+  uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
+  TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
+  NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
+
+  // AllGather1 - begin
+  struct {
+    struct ncclPeerInfo peerInfo;
+    struct ncclComm* comm;
+  } *allGather1Data;
+
+  NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
+  allGather1Data[rank].comm = comm;
+  struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
+  NCCLCHECK(fillInfo(comm, myInfo, commHash));
+  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
+
+  NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
+  for (int i = 0; i < nranks; i++) {
+    memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
+    if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) { // same host hash and same PCI bus id means two ranks share one GPU
+      WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId); // busId is int64_t, hence %lx
+      return ncclInvalidUsage;
+    }
+  }
+  // AllGather1 data is used again below
+  // AllGather1 - end
+
+  // Topo detection / System graph creation
+  NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
+  // Compute paths between GPUs and NICs
+  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
+  // Remove inaccessible GPUs and unused NICs
+  NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
+  // Recompute paths after trimming
+  NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
+  // Compute max speed to accelerate search
+  NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo));
+  // Print final topology
+  NCCLCHECK(ncclTopoPrint(comm->topo));
+
+  // Get rings and trees
+  struct ncclTopoGraph treeGraph;
+  treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+  treeGraph.crossNic = ncclParamCrossNic();
+  // We communicate only half the data between node with trees on 2 nodes.
+  NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
+  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
+  struct ncclTopoGraph ringGraph;
+  ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
+  ringGraph.crossNic = ncclParamCrossNic();
+  NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph));
+  NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph));
+
+  // AllGather3 - begin
+
+  struct {
+    int cudaCompCap;
+    int fullCudaCompCap;
+    int nvlink;
+    int nChannels;
+    struct {
+      int sameChannels;
+      int speedIntra;
+      int speedInter;
+      int nvlink;
+    } tree;
+    struct {
+      int sameChannels;
+      int speedIntra;
+      int speedInter;
+      int nvlink;
+    } ring;
+    struct ncclTopoRanks topoRanks;
+  } *allGather3Data;
+
+  NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
+  allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
+  allGather3Data[rank].nvlink = treeGraph.nvlink;
+  allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
+  allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
+  allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
+  allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
+  allGather3Data[rank].tree.nvlink = treeGraph.nvlink;
+  allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
+  allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
+  allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
+  allGather3Data[rank].ring.nvlink = ringGraph.nvlink;
+
+  NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks));
+
+  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
+
+  // Determine nNodes, firstRanks, ...
+  int* nodesFirstRank;
+  NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
+  for (int i=0; i<nranks; i++) { // ranks reporting the same ringRecv[0] are grouped into the same node
+    int node = -1;
+    int firstRank = allGather3Data[i].topoRanks.ringRecv[0];
+    for (int n=0; n<comm->nNodes; n++) {
+      if (nodesFirstRank[n] == firstRank) node = n;
+    }
+    if (node == -1) {
+      node = comm->nNodes++;
+      nodesFirstRank[node] = firstRank;
+    }
+    if (i == comm->rank) comm->node = node;
+  }
+
+  // Determine the minimum and maximum CUDA Compute capability of all GPUs
+  int myCompCap = allGather3Data[rank].cudaCompCap;
+  int minCompCap = myCompCap, maxCompCap = myCompCap;
+  for (int i = 0; i < nranks; i++) {
+    minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
+    maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap);
+  }
+
+  comm->nvlink = 1;
+  for (int i = 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink;
+
+  int nChannelsOrig = comm->nChannels;
+  struct ncclTopoRanks** allTopoRanks;
+  NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
+  for (int i=0; i<nranks; i++) {
+    allTopoRanks[i] = &allGather3Data[i].topoRanks;
+    // Make sure we align all ranks so that the tuning is consistent across ranks
+    treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
+    treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
+    treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
+    treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
+    treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink);
+    ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
+    ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
+    ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
+    ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink);
+  }
+
+  if (comm->nChannels < nChannelsOrig) {
+    // We started duplicating channels during Preset(), so we need to move the
+    // duplicated channels since we have removed some.
+    for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
+  }
+
+  int *rings;
+  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
+
+  NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
+
+  free(allTopoRanks);
+  free(nodesFirstRank);
+  free(allGather3Data);
+
+  // AllGather3 - end
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
+
+  NCCLCHECK(ncclSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph));
+
+  char line[1024];
+  line[0]='\0';
+  for (int c=0; c<comm->nChannels; c++) {
+    struct ncclTree* treeUp = &comm->channels[c].treeUp;
+    struct ncclTree* treeDn = &comm->channels[c].treeDn;
+    snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d",
+        c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up,
+        treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]);
+  }
+  line[1023] = '\0';
+  INFO(NCCL_INIT, "Trees%s", line);
+
+  // Connect with prev/next for each ring
+  struct ncclConnect *connect; // NOTE(review): allocated and freed below but not otherwise used in this function
+  NCCLCHECK(ncclCalloc(&connect, 2));
+  for (int c=0; c<comm->nChannels; c++) {
+    struct ncclChannel* channel = comm->channels+c;
+    NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks));
+    if (comm->nRanks == 1) continue;
+    NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
+    NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up));
+    NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down));
+  }
+  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
+  free(connect);
+  free(rings);
+
+  // Compute intra ranks (using AllGather1 data)
+  int intraRank0 = -1, intraRank = -1, intraRanks = 0;
+  for (int i = 0; i < nranks; i++) { // ranks with the same host hash and pid hash as ours belong to this process
+    if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
+        (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
+      if (intraRanks == 0) intraRank0 = i;
+      if (i == rank) intraRank = intraRanks;
+      intraRanks++;
+    }
+  }
+  TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+        rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+  if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
+    WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+         rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+    return ncclInternalError;
+  }
+  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
+
+  // Done with AllGather1 data
+  free(allGather1Data);
+
+  if (comm->nNodes) NCCLCHECK(transportCreateProxy(comm));
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+  return ncclSuccess;
+}
+
+static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
+  // Reads "<path from ncclTopoCudaPath()>/local_cpus" and parses it into *mask.
+  // *mask is cleared first, so it stays empty when the file yields no data.
+  CPU_ZERO_S(sizeof(cpu_set_t), mask);
+  char* cudaPath;
+  NCCLCHECK(ncclTopoCudaPath(cudaDev, &cudaPath));
+  char path[PATH_MAX];
+  snprintf(path, sizeof(path), "%s/local_cpus", cudaPath); // always NUL-terminated, unlike strncpy() followed by strlen()
+  free(cudaPath); // path holds its own copy; releasing here keeps every later error return leak-free
+  int fd;
+  SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
+  char affinityStr[sizeof(cpu_set_t)*2 + 1];
+  int r = read(fd, affinityStr, sizeof(cpu_set_t)*2);
+  close(fd); // closed before parsing so a parse failure cannot leak the descriptor
+  if (r > 0) {
+    affinityStr[r] = '\0';
+    NCCLCHECK(ncclStrToCpuset(affinityStr, mask));
+  }
+  return ncclSuccess;
+}
+
+NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
+
+static ncclResult_t setCpuAffinity(int cudaDev) { // restricts the calling thread (pid 0) to CPUs local to cudaDev when such a set exists
+  // Query the CPU affinity set we were provided
+  cpu_set_t mask;
+  SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
+
+#ifdef ENABLE_TRACE
+  {
+    char affinityStr[sizeof(cpu_set_t)*2]; // NOTE(review): getCpuGpuAffinity sizes its buffer with +1 for the terminator; confirm ncclCpusetToStr fits here
+    NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
+    TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr);
+  }
+#endif
+
+  // Find the CPUs that are local to the supplied GPU
+  cpu_set_t gpuMask;
+  NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
+
+#ifdef ENABLE_TRACE
+  {
+    char affinityStr[sizeof(cpu_set_t)*2];
+    NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr));
+    TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr);
+  }
+#endif
+
+  cpu_set_t finalMask;
+  if (ncclParamIgnoreCpuAffinity())
+    // Ignore the CPU affinity set and use the GPU one instead
+    finalMask = gpuMask;
+  else
+    // Use a subset of the GPU affinity set
+    CPU_AND(&finalMask, &mask, &gpuMask);
+
+  // If there is a non empty set, use it to set affinity
+  if (CPU_COUNT(&finalMask)) { // an empty set leaves the current affinity untouched
+    char affinityStr[sizeof(cpu_set_t)*2];
+    NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
+    INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr);
+    SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) { // builds a fully initialised communicator in *newcomm; on failure *newcomm is NULL
+  cpu_set_t affinitySave;
+  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); // saved so the caller's affinity can be restored on every exit below
+
+  NCCLCHECK(wrapNvmlSymbols());
+  NCCLCHECK(wrapNvmlInit());
+
+  // Make sure all host memory allocation are close to the GPU
+  CUDACHECK(cudaSetDevice(cudaDev));
+  NCCLCHECK(setCpuAffinity(cudaDev));
+  ncclResult_t res;
+  *newcomm = NULL; // commAlloc() may fail before writing *newcomm; the cleanup path must not read a stale caller value
+  NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
+  NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
+  NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
+
+  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
+
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId); // busId is int64_t, hence %lx
+
+  return ncclSuccess;
+cleanup:
+  if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap);
+  *newcomm = NULL; // NOTE(review): the partially built comm is not freed on this path
+  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  return res;
+}
+
+static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) {
+  // Shared entry point for ncclCommInitRank/ncclCommInitAll: validates the
+  // arguments, then builds the communicator directly or through ncclAsyncInit.
+  ncclResult_t res;
+  if (myrank == 0 && getenv("NCCL_COMM_ID") != NULL) {
+    NCCLCHECKGOTO(bootstrapCreateRoot(&commId, true), res, end);
+  }
+  NCCLCHECKGOTO(ncclInit(), res, end);
+  if (myrank == 0) showVersion();
+
+  // Make sure the CUDA runtime is initialized.
+  CUDACHECKGOTO(cudaFree(NULL), res, end);
+
+  NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, end);
+  if (!(nranks >= 1 && myrank >= 0 && myrank < nranks)) {
+    WARN("Invalid rank requested : %d/%d", myrank, nranks);
+    res = ncclInvalidArgument;
+    goto end;
+  }
+
+  if (!ncclAsyncMode()) {
+    NCCLCHECKGOTO(ncclCommInitRankSync(newcomm, nranks, commId, myrank, cudaDev), res, end);
+  } else {
+    NCCLCHECKGOTO(ncclAsyncInit(ncclCommInitRankSync, newcomm, nranks, commId, myrank, cudaDev), res, end);
+  }
+end:
+  // In async mode the result is passed through ncclAsyncErrCheck before being returned.
+  return ncclAsyncMode() ? ncclAsyncErrCheck(res) : res;
+}
+
+NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
+ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
+  int activeDev;  // the communicator is created on whichever CUDA device is current for the caller
+  CUDACHECK(cudaGetDevice(&activeDev));
+  NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, activeDev));
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
+ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
+  // Creates ndev communicators inside one group; rank r uses devlist[r], or device r when devlist is NULL.
+  NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
+  if (ndev < 0) {
+    WARN("Invalid device count requested : %d", ndev);
+    return ncclInvalidArgument;
+  }
+  ncclUniqueId sharedId;
+  NCCLCHECK(ncclGetUniqueId(&sharedId));
+  NCCLCHECK(ncclGroupStart());
+  for (int r = 0; r < ndev; ++r) {
+    // Per-rank return codes are deliberately dropped: ncclGroupEnd has to run to clean up anyway.
+    ncclCommInitRankDev(comms+r, ndev, sharedId, r, devlist == NULL ? r : devlist[r]);
+  }
+  NCCLCHECK(ncclGroupEnd());
+  return ncclSuccess;
+}
+
+static ncclResult_t commDestroy(ncclComm_t comm) {
+  // Tears down comm while its own CUDA device is current, then puts the caller's device back.
+#ifdef ENABLE_TRACE
+  int rank = comm->rank;
+#endif
+  int callerDev;
+  CUDACHECK(cudaGetDevice(&callerDev));
+  const int commDev = comm->cudaDev;
+  const bool switchDev = (callerDev != commDev);
+  if (switchDev) {
+    CUDACHECK(cudaSetDevice(commDev));
+  }
+
+  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError);
+
+  // Wait for the group stream before the proxy and the comm itself are released.
+  CUDACHECK(cudaStreamSynchronize(comm->groupStream));
+  NCCLCHECK(transportDestroyProxy(comm));
+  NCCLCHECK(commFree(comm));
+  if (switchDev) {
+    CUDACHECK(cudaSetDevice(callerDev));
+  }
+  TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
+ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+  // Destroying a NULL communicator succeeds without doing anything.
+  if (comm == NULL) return ncclSuccess;
+
+  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %x", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
+
+  // Try and prevent a double free of the comm struct (user error): reject a comm whose fields hold invalid values.
+  const bool looksDestroyed = comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1;
+  if (looksDestroyed) {
+    WARN("comm %p has already been destroyed", comm);
+    return ncclInvalidArgument;
+  }
+  return commDestroy(comm);
+}
+
+NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
+ncclResult_t ncclCommAbort(ncclComm_t comm) {
+  // Raises the comm's abort flag, then tears it down with commDestroy. A NULL comm is a successful no-op.
+  if (comm != NULL) {
+    // Ask anything that might still be running on the device to quit
+    comm->abortFlag[0] = 1;
+    return commDestroy(comm);
+  }
+  return ncclSuccess;
+}
+
+NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
+const char* ncclGetErrorString(ncclResult_t code) {
+  // Maps a result code to a constant, human-readable message.
+  if (code == ncclSuccess)            return "no error";
+  if (code == ncclUnhandledCudaError) return "unhandled cuda error";
+  if (code == ncclSystemError)        return "unhandled system error";
+  if (code == ncclInternalError)      return "internal error";
+  if (code == ncclInvalidArgument)    return "invalid argument";
+  if (code == ncclInvalidUsage)       return "invalid usage";
+  // Any value not listed above falls through to the generic message.
+  return "unknown result code";
+}
+
+NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
+  // Reports comm->fatalError through *asyncError after folding in any error flagged by the device.
+  NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
+  NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
+
+  // Last device error a warning was printed for; function-static, so shared by every communicator.
+  static ncclDevError_t printedDevErr = ncclDevSuccess;
+  const ncclDevError_t devErr = *comm->fatalDevError;
+  if (devErr == ncclDevAssertedMismatch) {
+    if (printedDevErr != devErr) {
+      WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+      printedDevErr = devErr;
+    }
+    // An asserted mismatch is recorded as the comm's error unless one is already stored.
+    if (comm->fatalError == ncclSuccess) {
+      comm->fatalError = ncclInvalidUsage;
+    }
+  } else if (devErr == ncclDevSuspectedMismatch) {
+    // Only suspected: warn (unless it was the last thing warned about) and leave fatalError alone.
+    if (printedDevErr != devErr) {
+      WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+      printedDevErr = devErr;
+    }
+  } else if (devErr != ncclDevSuccess) {
+    // Unrecognized code: *asyncError is not written on this path.
+    WARN("Unknown device error %d", *comm->fatalDevError);
+    return ncclInternalError;
+  }
+  *asyncError = comm->fatalError;
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
+ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { // Stores the communicator's rank count in *count.
+  NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));   // both pointers go through PtrCheck before being dereferenced
+  NCCLCHECK(PtrCheck(count, "CommCount", "count"));
+  *count = comm->nRanks;
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
+ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { // Stores the communicator's CUDA device index in *devid.
+  NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));   // both pointers go through PtrCheck before being dereferenced
+  NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
+  *devid = comm->cudaDev;
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
+ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { // Stores this communicator's own rank in *rank.
+  NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));   // both pointers go through PtrCheck before being dereferenced
+  NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
+  *rank = comm->rank;
+  return ncclSuccess;
+}
diff --git a/src/init.cu b/src/init.cu
deleted file mode 100644
index 75822e6..0000000
--- a/src/init.cu
+++ /dev/null
@@ -1,1240 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "nccl.h"
-#include "core.h"
-#include "channel.h"
-#include "param.h"
-#include "nvmlwrap.h"
-#include "rings.h"
-#include "trees.h"
-#include "bootstrap.h"
-#include "transport.h"
-#include "group.h"
-#include "utils.h"
-#include "net.h"
-#include "checks.h"
-#include "enqueue.h"
-#include "topo.h"
-#include "nvlink.h"
-#include "cpuset.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sched.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <cuda_runtime.h>
-#include <string.h>
-#include <errno.h>
-#include <assert.h>
-#include <dlfcn.h>
-
-#define STR2(v) #v
-#define STR(v) STR2(v)
-
-int ncclDebugLevel;
-uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
-pthread_mutex_t ncclDebugOutputLock;
-FILE *ncclDebugFile = stdout;
-
-#ifdef ENABLE_TRACE
-std::chrono::high_resolution_clock::time_point ncclEpoch;
-#endif
-
-#if CUDART_VERSION >= 9200
-#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
-#else
-#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
-#endif
-
-NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
-
-NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
-
-ncclNet_t* ncclNet = NULL;
-
-// We define this as weak to let tests redefine their own
-#pragma weak ncclNvlinkGpu
-ncclResult_t ncclNvlinkGpu(int* nvlink) {
-  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-  CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
-  *nvlink = getNvlinkGpu(busId, NULL);
-  return ncclSuccess;
-}
-// We define this as weak to let tests redefine their own
-#pragma weak ncclCudaCompCap
-int ncclCudaCompCap() {
-  int cudaDev;
-  if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
-  int ccMajor;
-  if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
-  return ccMajor;
-}
-int ncclCudaFullCompCap() {
-  int cudaDev;
-  if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
-  int ccMajor, ccMinor;
-  if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
-  if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0;
-  return ccMajor*10+ccMinor;
-}
-
-// Returns ncclInternalError if anything fails, causing that network to be ignored.
-ncclResult_t initNet(ncclNet_t* net) {
-  int ndev;
-  if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
-  if (net->devices(&ndev) != ncclSuccess) return ncclInternalError;
-  if (ndev <= 0) return ncclSystemError;
-  return ncclSuccess;
-}
-
-ncclResult_t initNetPlugin(ncclNet_t** net) {
-  void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
-  if (netPluginLib == NULL) {
-    // dlopen does not guarantee to set errno, but dlerror only gives us a
-    // string, so checking errno doesn't hurt to try to provide a better
-    // error message
-    if (errno == ENOENT) {
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so).");
-    } else {
-      INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
-    }
-    return ncclSuccess;
-  }
-  ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
-  if (extNet == NULL) {
-    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
-    goto cleanup;
-  }
-  if (initNet(extNet) == ncclSuccess) {
-    *net = extNet;
-    return ncclSuccess;
-  }
-cleanup:
-  if (netPluginLib != NULL) dlclose(netPluginLib);
-  return ncclSuccess;
-}
-
-ncclResult_t initNet() {
-  // Always initialize sockets as we use it for bootstrap
-  NCCLCHECK(initNet(&ncclNetSocket));
-
-  NCCLCHECK(initNetPlugin(&ncclNet));
-  if (ncclNet != NULL) return ncclSuccess;
-  if (initNet(&ncclNetIb) == ncclSuccess) {
-    ncclNet = &ncclNetIb;
-  } else {
-    ncclNet = &ncclNetSocket;
-  }
-  return ncclSuccess;
-}
-
-NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
-NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2);
-NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", -2);
-
-int ncclThreadThreshold(int minCompCap, int multiNode) {
-  int threshold = ncclParamThreadThreshold();
-  if (threshold == -2) { // user has not set this env variable
-    threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD;
-    // multiply by 2 if running on multiple nodes
-    if (multiNode) {
-      threshold *= 2;
-    }
-  }
-  return threshold;
-}
-
-pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
-static bool initialized = false;
-static ncclResult_t ncclInit() {
-  if (initialized) return ncclSuccess;
-  pthread_mutex_lock(&initLock);
-  if (!initialized) {
-    initEnv();
-    initDebug();
-    initNet();
-    initialized = true;
-  }
-  pthread_mutex_unlock(&initLock);
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclGetVersion, int* version);
-ncclResult_t ncclGetVersion(int* version) {
-  if (version == NULL) return ncclInvalidArgument;
-  *version = NCCL_VERSION_CODE;
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
-ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
-  NCCLCHECK(ncclInit());
-  NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
-  return bootstrapGetUniqueId(out);
-}
-
-static ncclResult_t commFree(ncclComm_t comm) {
-  if (comm == NULL)
-    return ncclSuccess;
-
-  free(comm->peerInfo);
-
-  if (comm->bootstrap)
-    NCCLCHECK(bootstrapClose(comm->bootstrap));
-
-  CUDACHECK(cudaFree(comm->devComm));
-
-  for (int channel=0; channel<comm->nChannels; channel++)
-    NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
-
-  if (comm->doneEvent != NULL)
-    CUDACHECK(cudaEventDestroy(comm->doneEvent));
-
-  if (comm->launchMode == ncclComm::GROUP) {
-    CUDACHECK(cudaStreamDestroy(comm->groupStream));
-  }
-
-  // Last rank frees shared resources between threads
-  int isLast;
-  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
-  if (isLast) {
-    free(comm->intraBarrier);
-    free(comm->intraParams);
-    free(comm->intraCudaDevs);
-    free(comm->intraCGMode);
-    free(comm->intraCC);
-  }
-  CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
-  CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
-
-  free(comm);
-  return ncclSuccess;
-}
-
-static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
-  if (ndev < 1) {
-    WARN("invalid device count (%d) requested", ndev);
-    return ncclInvalidArgument;
-  }
-  if (rank >= ndev || rank < 0) {
-    WARN("rank %d exceeds ndev=%d", rank, ndev);
-    return ncclInvalidArgument;
-  }
-
-  // Try to create a CUDA object right away. If there is something wrong with
-  // the device we're on (failure cause #1) , better know it early.
-  cudaEvent_t doneEvent;
-  CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));
-
-  struct ncclComm* comm;
-  NCCLCHECK(ncclCalloc(&comm, 1));
-
-  comm->rank = rank;
-  comm->nRanks = ndev;
-  cudaGetDevice(&comm->cudaDev);
-  getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
-  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
-
-  comm->doneEvent = doneEvent;
-  comm->llThreshold = ncclParamLlThreshold();
-  comm->treeThreshold = ncclParamTreeThreshold();
-  comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
-#if CUDART_VERSION >= 9200
-  comm->groupCudaStream = ncclParamGroupCudaStream();
-#else
-  // Don't allow the user to overload the default setting in older CUDA builds
-  comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
-#endif
-  comm->fatalError = ncclSuccess;
-
-  CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped));
-  *comm->fatalDevError = ncclDevSuccess;
-
-  CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped));
-  *comm->abortFlag = 0;
-
-  comm->argsptr = &comm->args;
-
-  *comret = comm;
-  return ncclSuccess;
-}
-
-static ncclResult_t devCommSetup(ncclComm_t comm) {
-  // Fully duplicate the comm on the device
-  NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
-  // Copy the comm on the device
-  NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
-  // Copy userRanks
-  for (int r=0; r<comm->nChannels; r++) {
-    NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
-    NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
-  }
-  // Copy the device-accessible pointer to comm->abortFlag
-  void *devAbortFlag;
-  CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0));
-  CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice));
-  // Copy the device-accessible pointer to comm->fatalDevError
-  void *devFatalError;
-  CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0));
-  CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice));
-  return ncclSuccess;
-}
-
-// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
-#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
-static void showVersion() {
-  static int shown = 0;
-  if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
-    printf("%s\n", VERSION_STRING);
-    fflush(stdout);
-    if (ncclDebugFile != stdout)
-      INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
-    shown = 1;
-  }
-}
-
-static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) {
-  info->rank = rank;
-  CUDACHECK(cudaGetDevice(&info->cudaDev));
-  NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev))
-  info->hostHash=getHostHash();
-  info->pidHash=getPidHash();
-
-  // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
-  // cudaDev is a CUDA runtime dev number which could be different from the
-  // NVML device number. Then we get the busID from NVML to be sure it is
-  // consistent with NVML remote PCI bus Ids.
-  CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
-  nvmlDevice_t nvmlDevice;
-  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
-  nvmlPciInfo_t pciInfo;
-  NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
-  strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
-  return ncclSuccess;
-}
-
-template <int type>
-static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
-  for (int t=0; t<NTRANSPORTS; t++) {
-    struct ncclTransport *transport = ncclTransports+t;
-    struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
-    ncclTvalue_t ret = 0;
-    NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo));
-    if (ret > 0) {
-      connector->transportComm = transportComm;
-      NCCLCHECK(transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId));
-      return ncclSuccess;
-    }
-  }
-  WARN("No transport found !");
-  return ncclInternalError;
-}
-
-static int log2(int n) {
- int l = 0;
- while (n>>=1) l++;
- return l;
-}
-
-static ncclResult_t ncclTreeThreshold(int nnodes, int nranks, int nChannels, ssize_t *treeThreshold) {
-  int nvlink;
-  NCCLCHECK(ncclNvlinkGpu(&nvlink));
-  float ringbw = nvlink ? 5000*nChannels : 5000; // approx, in MB/s or B/us
-  float ringlatinter = 6;
-  float treelatintra = 4;
-  float treelatinter = 15;
-  float treebw;
-  if (!nvlink) {
-    treebw = ringbw * 2 / 3;
-  } else {
-    treebw = ringbw * 3 / 4;
-    if (nnodes == 2) treebw *= 2;
-  }
-  float ringlat = ringlatinter*(nranks-1);
-  float treelat = treelatinter*log2(nnodes)+treelatintra*(nranks/nnodes-1);
-  if (nnodes < 2 || ringlat <= treelat)
-    *treeThreshold = 0;
-  else if (treebw > ringbw)
-    *treeThreshold = 0x7fffffffffffffff;
-  else
-    *treeThreshold = (ssize_t)(((ringbw*treebw/(ringbw-treebw)))*(ringlat-treelat));
-  return ncclSuccess;
-}
-
-static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks, int* treeMasters) {
-  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
-  NCCLCHECK(initChannel(comm, channelId));
-
-  struct ncclChannel* channel = comm->channels+channelId;
-  struct ncclRing* ring = &channel->ring;
-
-  // Reorganize ranks to start with rank.
-  int shift;
-  for (shift = 0; shift<nranks; shift++) {
-    if (ringRanks[shift] == rank) {
-      break;
-    }
-  }
-  for (int i=0; i<nranks; i++) {
-    ring->userRanks[i] = ringRanks[(i+shift)%nranks];
-  }
-  int prev = ring->prev = ring->userRanks[nranks-1];
-  int next = ring->next = ring->userRanks[1];
-
-  struct ncclTree* tree = &channel->tree;
-  tree->up = -1;
-  tree->down[0] = tree->down[1] = tree->down[2] = -1;
-
-  //
-  // Find per-node masters and connect them via a binary tree
-  //
-
-  int nMasters = 0;
-  for (int r=0; r<nranks; r++) nMasters += treeMasters[r];
-  if (nMasters == 0) {
-    nMasters = 1;
-    treeMasters[0] = 1;
-  }
-
-  if (comm->treeThreshold == -2)
-    NCCLCHECK(ncclTreeThreshold(nMasters, comm->nRanks, comm->nChannels, &comm->treeThreshold));
-
-  if (comm->treeThreshold > 0) {
-    // Compute tree depth. Not an exact value but a good approximation in most
-    // cases and consistent across nodes
-    tree->depth = nranks/nMasters + log2(nMasters);
-
-    // Find my master : go backwards in the ring to find my root
-    int master = 0;
-    for (int i = 0; i<nranks; i++) {
-      int r = ring->userRanks[(nranks-i)%nranks];
-      if (treeMasters[r]) {
-        master = r;
-        break;
-      }
-    }
-
-    int ranks[nMasters];
-    int i = 0, masterIndex = -1;
-    // Build binary tree
-    for (int r=0; r<nranks; r++) {
-      // Create index table
-      if (r == master) masterIndex = i;
-      if (treeMasters[r]) ranks[i++] = r;
-    }
-    int btreeUp, btreeDown0, btreeDown1;
-    int u0, d0_0, d0_1, u1, d1_0, d1_1;
-    NCCLCHECK(ncclGetDtree(nMasters, masterIndex, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
-    if (channelId < DIVUP(comm->nChannels, 2)) {
-      btreeUp = u0; btreeDown0 = d0_0; btreeDown1 = d0_1;
-    } else {
-      btreeUp = u1; btreeDown0 = d1_0; btreeDown1 = d1_1;
-    }
-
-    //
-    // Now build the full tree, combining the intra-node ring and the
-    // inter-node binary tree.
-    //
-
-    if (rank == master) {
-      int nDown = 0;
-      if (btreeUp != -1) tree->up = ranks[btreeUp];
-      if (treeMasters[next] == 0) tree->down[nDown++] = next;
-      if (btreeDown0 != -1) tree->down[nDown++] = ranks[btreeDown0];
-      if (btreeDown1 != -1) tree->down[nDown++] = ranks[btreeDown1];
-    } else {
-      tree->up = prev;
-      if (treeMasters[next] == 0) tree->down[0] = next;
-    }
-  }
-
-  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
-  return ncclSuccess;
-}
-
-static ncclResult_t fillConnect(struct ncclPeerInfo* peerInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
-  for (int r=0; r<nranks; r++) {
-    connectTransport[r] = -1;
-    for (int t=0; t<NTRANSPORTS; t++) {
-      NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, peerInfo+rank, peerInfo+r));
-      if (connectValue[r] > 0) {
-        connectTransport[r] = t;
-        break;
-      }
-    }
-  }
-  return ncclSuccess;
-}
-
-#define MAXWIDTH 20
-#define PREFIXLEN 15
-#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
-void dumpMatrix(int* connectMatrix, int nranks) {
-  char line[STRLENGTH+1];
-  line[STRLENGTH] = '\0';
-  memset(line, ' ', STRLENGTH);
-  for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j);
-  INFO(NCCL_INIT,"%s", line);
-  for (int i=0; i<nranks; i++) {
-    memset(line, ' ', STRLENGTH);
-    sprintf(line, "%3d ", i);
-    for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]);
-    INFO(NCCL_INIT,"%s", line);
-  }
-}
-
-void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) {
-  char line[STRLENGTH+1];
-  line[STRLENGTH] = '\0';
-  memset(line, ' ', STRLENGTH);
-  for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4d", j);
-  INFO(NCCL_INIT,"%s", line);
-  for (int i=0; i<nranks; i++) {
-    memset(line, ' ', STRLENGTH);
-    sprintf(line, "%3d ", i);
-    for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4o", (int)connectMatrix[i*nranks+j]);
-    INFO(NCCL_INIT,"%s", line);
-  }
-}
-
-
-void dumpLine(int* values, int nranks, const char* prefix) {
-  int prefixlen = strlen(prefix);
-  char line[STRLENGTH+1];
-  line[STRLENGTH] = '\0';
-  memset(line, ' ', STRLENGTH);
-  strncpy(line, prefix, PREFIXLEN);
-  for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
-  INFO(NCCL_INIT,"%s", line);
-}
-
-static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
-  for (int r=0; r<nrings; r++) {
-    char prefix[30];
-    /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
-    dumpLine(prev+r*nranks, nranks, prefix);
-    sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
-    dumpLine(next+r*nranks, nranks, prefix);*/
-
-    int current = rank;
-    for (int i=0; i<nranks; i++) {
-      rings[r*nranks+i] = current;
-      current = next[r*nranks+current];
-    }
-    sprintf(prefix, "Channel %02d : ", r);
-    if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
-    if (current != rank) {
-      WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
-      return ncclInternalError;
-    }
-    // Check that all ranks are there
-    for (int i=0; i<nranks; i++) {
-      int found = 0;
-      for (int j=0; j<nranks; j++) {
-        if (rings[r*nranks+j] == i) {
-          found = 1;
-          break;
-        }
-      }
-      if (found == 0) {
-        WARN("Error : ring %d does not contain rank %d", r, i);
-        return ncclInternalError;
-      }
-    }
-  }
-  return ncclSuccess;
-}
-
-void* waitForNonNullPtr(void* p) {
-  volatile void** ptr = (volatile void**) p;
-  while (*ptr == NULL) sched_yield();
-  return (void*)*ptr;
-}
-
-ncclResult_t initParams(struct ncclComm* comm) {
-  struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
-  params->args = &comm->argsptr;
-  params->stream = NULL;
-  params->sharedMem = 0;
-  params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
-  params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1;
-  return ncclSuccess;
-}
-
-// Allocate/Set Intra Process Structures and set CG options
-ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
-  comm->intraRank = rank;
-  comm->intraRanks = ranks;
-  comm->intraPhase = 0;
-
-  // Alloc shared structures
-  if (rank == 0) {
-    assert(comm == comm0);
-    int* bar;
-    NCCLCHECK(ncclCalloc(&bar, 2));
-    bar[0] = bar[1] = 0;
-    comm->intraBarrier = bar;
-    NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
-    NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
-    int* CGMode;
-    NCCLCHECK(ncclCalloc(&CGMode, 1));
-    *CGMode = 0x11;
-    comm->intraCGMode = CGMode;
-    int* CC;
-    NCCLCHECK(ncclCalloc(&CC, 1));
-    *CC = ncclCudaFullCompCap();
-    comm->intraCC = CC;
-  } else {
-    comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
-    comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
-    comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
-    comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
-    comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
-  }
-  comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
-  NCCLCHECK(initParams(comm));
-
-  int cgMdLaunch = 0;
-
-  // Set CG Mode
-  comm->launchMode = ncclComm::GROUP;
-  char* str = getenv("NCCL_LAUNCH_MODE");
-  if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
-    comm->launchMode = ncclComm::PARALLEL;
-  }
-  if (comm->launchMode == ncclComm::GROUP) {
-    CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking));
-#if CUDART_VERSION >= 9000
-    if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
-      // Check whether the GPU supports Cooperative Group Multi Device Launch
-      (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
-    }
-#endif
-  }
-
-  // Disable cgMdLaunch if any rank does not support it
-  if (cgMdLaunch == 0) {
-    *comm->intraCGMode = 0x10;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
-  TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
-  uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
-  struct ncclConnect connect;
-  struct ncclConnector* conn;
-  for (int i=0; i<nrecv; i++) {
-    int peer = peerRecv[i];
-    if (peer == -1) continue;
-    conn = &channel->peers[peer].recv;
-    if (conn->connected) { ++nSkippedRecv; continue; }
-    NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
-    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
-  }
-  for (int i=0; i<nsend; i++) {
-    int peer = peerSend[i];
-    if (peer == -1) continue;
-    conn = &channel->peers[peer].send;
-    if (conn->connected) { ++nSkippedSend; continue; }
-    NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
-    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
-  }
-  for (int i=0; i<nsend; i++) {
-    int peer = peerSend[i];
-    if (peer == -1) continue;
-    conn = &channel->peers[peer].send;
-    if (conn->connected) {++nSkippedSend; continue; }
-    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
-    NCCLCHECK(conn->transportComm->connect(&connect, conn));
-    conn->connected = 1;
-  }
-  for (int i=0; i<nrecv; i++) {
-    int peer = peerRecv[i];
-    if (peer == -1) continue;
-    conn = &channel->peers[peer].recv;
-    if (conn->connected) {++nSkippedRecv; continue; }
-    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
-    NCCLCHECK(conn->transportComm->connect(&connect, conn));
-    conn->connected = 1;
-  }
-  TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
-  return ncclSuccess;
-}
-
-static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
-  // We use 3 AllGathers
-  // 1. { peerInfo, comm }
-  // 2. ConnectTransport[nranks], ConnectValue[nranks]
-  // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
-
-  int rank = comm->rank;
-  int nranks = comm->nRanks;
-  TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
-  NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
-
-  // AllGather1 - begin
-  struct {
-    struct ncclPeerInfo peerInfo;
-    struct ncclComm* comm;
-  } *allGather1Data;
-
-  NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
-  allGather1Data[rank].comm = comm;
-  NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
-  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
-
-  NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
-  for (int i = 0; i < nranks; i++) {
-    memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
-  }
-  // AllGather1 data is used again below
-  // AllGather1 - end
-
-  // AllGather2 - begin
-  size_t allGather2DataRowSize = sizeof(int)*nranks + sizeof(ncclTvalue_t)*nranks;
-  void *allGather2Data;
-  NCCLCHECK(ncclCalloc((char **)&allGather2Data, allGather2DataRowSize*nranks));
-  int *myTransportRow = (int *)((char *)allGather2Data + allGather2DataRowSize*rank);
-  ncclTvalue_t *myValueRow = (ncclTvalue_t *)(myTransportRow + nranks);
-
-  NCCLCHECK(fillConnect(comm->peerInfo, nranks, rank, myTransportRow, myValueRow));
-  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather2Data, allGather2DataRowSize));
-
-  int* connectTransport;
-  ncclTvalue_t* connectValue;
-  NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
-  NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
-  for (int i = 0; i < nranks; i++) {
-    memcpy(connectTransport + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize, sizeof(int)*nranks);
-    memcpy(connectValue + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize + nranks*sizeof(int), sizeof(ncclTvalue_t)*nranks);
-  }
-  free(allGather2Data);
-  // AllGather2 - end
-
-  //if (rank == 0) dumpMatrix(connectTransport, nranks);
-  //if (rank == 0) dumpMatrixTvalue(connectValue, nranks);
-
-  // Get my rings
-  int nrings;
-  int* prev, *next, *treeIn, *treeOut;
-  NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
-  comm->nThreads = getDefaultThreads();
-  NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
-  TRACE(NCCL_INIT, "rank %d nranks %d - BUILD %d RINGS", rank, nranks, nrings);
-  assert(nrings <= MAXCHANNELS);
-  free(connectTransport);
-  free(connectValue);
-
-  // AllGather3 - begin
-  struct {
-    int nThreads;
-    int nrings;
-    int cudaCompCap;
-    int prev[MAXCHANNELS];
-    int next[MAXCHANNELS];
-  } *allGather3Data;
-
-  NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
-  allGather3Data[rank].nThreads = comm->nThreads;
-  allGather3Data[rank].nrings = nrings;
-  allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
-  for (int r=0; r<nrings; r++) {
-    allGather3Data[rank].prev[r] = *(prev+r*nranks+rank);
-    allGather3Data[rank].next[r] = *(next+r*nranks+rank);
-  }
-  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
-
-  // Find max nThreads
-  for (int i=0; i<nranks; i++)
-    comm->nThreads = std::max(allGather3Data[i].nThreads, comm->nThreads);
-
-  // Determine the minimum CUDA Compute capability of all GPUs
-  int myCompCap = allGather3Data[rank].cudaCompCap;
-  int minCompCap = myCompCap;
-  for (int i = 0; i < nranks; i++)
-    minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
-
-  // Determine thread threshold across all GPUs
-  int nnodes = 0;
-  for (int r=0; r<nranks; r++) nnodes += treeIn[r];
-  comm->threadThreshold = ncclThreadThreshold(minCompCap, nnodes);
-
-  // Find min nrings across ranks
-  for (int i=0; i<nranks; i++)
-    nrings = std::min(allGather3Data[i].nrings, nrings);
-  comm->nChannels = nrings;
-
-  // Unpack the per ring prev/next arrays
-  for (int i = 0; i < nranks; i++) {
-    for (int r = 0; r < nrings; r++) {
-      prev[r*nranks+i] = allGather3Data[i].prev[r];
-      next[r*nranks+i] = allGather3Data[i].next[r];
-    }
-  }
-  free(allGather3Data);
-  // AllGather3 - end
-
-  int *rings;
-  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
-  NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
-  free(prev);
-  free(next);
-  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d RINGS", rank, nranks, nrings);
-
-  // Connect with prev/next for each ring
-  struct ncclConnect *connect;
-  NCCLCHECK(ncclCalloc(&connect, 2));
-  for (int r=0; r<nrings; r++) {
-    struct ncclChannel* channel = comm->channels+r;
-    NCCLCHECK(setupChannel(comm, r, rank, nranks, rings+r*nranks, treeIn+r*nranks));
-    NCCLCHECK(p2pSetup(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
-    NCCLCHECK(p2pSetup(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up));
-    NCCLCHECK(p2pSetup(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down));
-  }
-  if (comm->treeThreshold > 0) {
-    char line[1024];
-    line[0]='\0';
-    for (int c=0; c<nrings; c++) {
-      struct ncclTree* tree = &comm->channels[c].tree;
-      snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d/%d/%d",
-          c, tree->up, rank, tree->down[0], tree->down[1], tree->down[2]);
-    }
-    line[1023] = '\0';
-    INFO(NCCL_INIT, "Trees%s", line);
-  }
-  if (rank == 0) {
-    char treeline[64];
-    snprintf(treeline, 64, "enabled up to size %ld", comm->treeThreshold);
-    INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees %s", comm->nThreads, minCompCap,
-       comm->treeThreshold == 0 ? "disabled" :
-       comm->treeThreshold == 0x7fffffffffffffff ? "enabled for all sizes" :
-       treeline);
-  }
-
-  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, nrings);
-  free(connect);
-  free(rings);
-  free(treeIn);
-  free(treeOut);
-
-  // Compute intra ranks (using AllGather1 data)
-  int intraRank0 = -1, intraRank = -1, intraRanks = 0;
-  for (int i = 0; i < nranks; i++) {
-    if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
-        (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
-      if (intraRanks == 0) intraRank0 = i;
-      if (i == rank) intraRank = intraRanks;
-      intraRanks++;
-    }
-  }
-  TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-        rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
-  if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
-    WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-         rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
-    return ncclInternalError;
-  }
-  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
-
-  // Done with AllGather1 data
-  free(allGather1Data);
-
-  if (nnodes) NCCLCHECK(transportCreateProxy(comm));
-
-  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
-  return ncclSuccess;
-}
-
-static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
-  CPU_ZERO_S(sizeof(cpu_set_t), mask);
-  char* cudaPath;
-  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
-  char path[PATH_MAX];
-  strncpy(path, cudaPath, PATH_MAX-1);
-  snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus");
-  path[PATH_MAX-1] = '\0';
-  int fd;
-  SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
-  char affinityStr[sizeof(cpu_set_t)*2];
-  int r = read(fd, affinityStr, sizeof(cpu_set_t)*2);
-  if (r > 0)
-    NCCLCHECK(ncclStrToCpuset(affinityStr, mask));
-  close(fd);
-  free(cudaPath);
-  return ncclSuccess;
-}
-
-static ncclResult_t setCpuAffinity(int cudaDev) {
-  // Work within the enveloppe we were provided
-  cpu_set_t mask;
-  SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
-
-  // Find the subpart that is local to our GPU
-  cpu_set_t gpuMask;
-  NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
-  cpu_set_t finalMask;
-  CPU_AND(&finalMask, &mask, &gpuMask);
-
-  // If those are not disjoint, try to stay local
-  if (CPU_COUNT(&finalMask)) {
-    char affinityStr[sizeof(cpu_set_t)*2];
-    NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
-    INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr);
-    SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
-  cpu_set_t affinitySave;
-  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
-  NCCLCHECK(wrapNvmlSymbols());
-  NCCLCHECK(wrapNvmlInit());
-
-  // Make sure all host memory allocation are close to the GPU
-  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  NCCLCHECK(setCpuAffinity(cudaDev));
-  ncclResult_t res;
-
-  NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
-  NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
-  NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
-
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
-
-  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->nvmlDev);
-
-  return ncclSuccess;
-cleanup:
-  *newcomm = NULL;
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  return res;
-}
-
-NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
-ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
-  char* env = getenv("NCCL_COMM_ID");
-  if (env && myrank == 0) {
-    NCCLCHECK(bootstrapCreateRoot(&commId, true));
-  }
-
-  NCCLCHECK(ncclInit());
-  if (myrank == 0) showVersion();
-
-  // Make sure the CUDA runtime is initialized.
-  CUDACHECK(cudaFree(NULL));
-
-  NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
-  if (nranks < 1 || myrank < 0 || myrank >= nranks) {
-    WARN("Invalid rank requested : %d/%d", myrank, nranks);
-    return ncclInvalidArgument;
-  }
-
-  if (ncclAsyncMode()) {
-    int cudaDev;
-    CUDACHECK(cudaGetDevice(&cudaDev));
-    return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
-  } else {
-    return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
-  }
-}
-
-static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
-  struct ncclPeerInfo* allInfo;
-  NCCLCHECK(ncclCalloc(&allInfo, nranks));
-  for (int rank=0; rank<nranks; rank++) {
-    CUDACHECK(cudaSetDevice(devs[rank]));
-    NCCLCHECK(fillInfo(allInfo+rank, rank));
-  }
-
-  int* connectTransport;
-  ncclTvalue_t* connectValue;
-  NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
-  NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
-  for (int rank=0; rank<nranks; rank++)
-    NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
-
-  int* prev, *prevFinal, *next, *nextFinal, *treeIn, *treeOut;
-  NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
-  int nrings = MAXCHANNELS;
-  int nthreads=0;
-  int myCompCap = ncclCudaCompCap();
-  int minCompCap = myCompCap;
-  for (int rank=0; rank<nranks; rank++) {
-    CUDACHECK(cudaSetDevice(devs[rank]));
-    int nringsRank;
-    int nthreadsRank = getDefaultThreads();
-    myCompCap = ncclCudaCompCap();
-    NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
-    nrings = std::min(nrings, nringsRank);
-    nthreads = std::max(nthreads, nthreadsRank);
-    minCompCap = std::min(minCompCap, myCompCap);
-    for (int ring=0; ring<nrings; ring++) {
-      int index = ring*nranks+rank;
-      prevFinal[index] = prev[index];
-      nextFinal[index] = next[index];
-    }
-  }
-  free(connectTransport);
-  free(connectValue);
-  free(prev);
-  free(next);
-
-  INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees disabled", nthreads, minCompCap);
-
-  int* rings;
-  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
-  NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
-  free(prevFinal);
-  free(nextFinal);
-
-  // Determine thread threshold across all GPUs
-  int threadThreshold = ncclThreadThreshold(minCompCap, 0);
-
-  for (int rank=0; rank<nranks; rank++) {
-    comms[rank]->nChannels = nrings;
-    comms[rank]->nThreads = nthreads;
-    comms[rank]->threadThreshold = threadThreshold;
-  }
-
-  for (int r=0; r<nrings; r++) {
-    struct ncclConnect connect[2*nranks];
-    int* ringRanks = rings+r*nranks;
-    for (int rank=0; rank<nranks; rank++) {
-      CUDACHECK(cudaSetDevice(devs[rank]));
-      struct ncclChannel* channel = comms[rank]->channels+r;
-      struct ncclRing *ring = &channel->ring;
-      NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn));
-      // Make sure we don't use trees, we cannot use them with initAll
-      comms[rank]->treeThreshold = 0;
-      int prev = channel->ring.prev = ring->userRanks[nranks-1];
-      int next = channel->ring.next = ring->userRanks[1];
-      struct ncclConnector* recv = &channel->peers[prev].recv;
-      struct ncclConnector* send = &channel->peers[next].send;
-      NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+rank*2+0, recv, channel->buffSize, channel->id));
-      NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id));
-    }
-    for (int rank=0; rank<nranks; rank++) {
-      CUDACHECK(cudaSetDevice(devs[rank]));
-      struct ncclChannel* channel = comms[rank]->channels+r;
-      struct ncclRing *ring = &channel->ring;
-      struct ncclConnector* recv = &channel->peers[ring->prev].recv;
-      struct ncclConnector* send = &channel->peers[ring->next].send;
-      NCCLCHECK(recv->transportComm->connect(connect+ring->prev*2+1, recv));
-      NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
-    }
-  }
-  free(allInfo);
-  free(rings);
-  free(treeIn);
-  free(treeOut);
-  return ncclSuccess;
-}
-
-
-NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
-ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
-  NCCLCHECK(ncclInit());
-  NCCLCHECK(wrapNvmlSymbols());
-  NCCLCHECK(wrapNvmlInit());
-  showVersion();
-
-  INFO(NCCL_INIT,"nranks %d", ndev);
-
-  NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
-  if (ndev < 1) {
-    WARN("Invalid device count requested : %d", ndev);
-    return ncclInvalidArgument;
-  }
-
-  ncclResult_t res;
-  int savedDevice;
-  int rank, cudaDev;
-  ncclComm_t comm = NULL;
-  int ncclDevList[ndev];
-  for (int i=0; i<ndev; i++) {
-    ncclDevList[i] = devlist ? devlist[i] : i;
-  }
-
-  cudaGetDevice(&savedDevice);
-
-  for(rank=0; rank<ndev; ++rank)
-    comms[rank] = NULL;
-
-  cpu_set_t affinitySave;
-  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
-  for (rank=0; rank<ndev; ++rank) {
-    cudaDev = ncclDevList[rank];
-    CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
-
-    NCCLCHECK(setCpuAffinity(cudaDev));
-
-    NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
-    comms[rank] = comm;
-
-    NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup);
-  }
-
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
-  NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup);
-
-  for(rank=0; rank<ndev; ++rank) {
-    cudaDev = ncclDevList[rank];
-    CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
-    NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
-  }
-
-  res = ncclSuccess;
-  goto final;
-
-cleanup:
-  for(rank=0; rank<ndev; ++rank) {
-    if(comms[rank] != NULL) {
-      commFree(comms[rank]);
-    }
-  }
-
-final:
-  if(wrapNvmlShutdown() != ncclSuccess)
-    INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
-  cudaSetDevice(savedDevice);
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  return res;
-}
-
-
-static ncclResult_t commDestroy(ncclComm_t comm) {
-  int savedDevice;
-  CUDACHECK(cudaGetDevice(&savedDevice));
-  int commDevice = comm->cudaDev;
-  int rank = comm->rank;
-
-  if (savedDevice != commDevice) {
-    CUDACHECK(cudaSetDevice(commDevice));
-  }
-
-  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError);
-
-  CUDACHECK(cudaStreamSynchronize(comm->groupStream));
-  NCCLCHECK(transportDestroyProxy(comm));
-  NCCLCHECK(commFree(comm));
-
-  if (savedDevice != commDevice)
-    CUDACHECK(cudaSetDevice(savedDevice));
-
-  INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
-
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
-ncclResult_t ncclCommDestroy(ncclComm_t comm) {
-  if (comm == NULL)
-    return ncclSuccess;
-
-  return commDestroy(comm);
-}
-
-NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
-ncclResult_t ncclCommAbort(ncclComm_t comm) {
-  if (comm == NULL)
-    return ncclSuccess;
-
-  // Ask anything that might still be running on the device to quit
-  *comm->abortFlag = 1;
-
-  return commDestroy(comm);
-}
-
-NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
-const char* ncclGetErrorString(ncclResult_t code) {
-  switch (code) {
-    case ncclSuccess                : return "no error";
-    case ncclUnhandledCudaError     : return "unhandled cuda error";
-    case ncclSystemError            : return "unhandled system error";
-    case ncclInternalError          : return "internal error";
-    case ncclInvalidArgument        : return "invalid argument";
-    case ncclInvalidUsage           : return "invalid usage";
-    default                         : return "unknown result code";
-  }
-}
-
-NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
-ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
-  NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
-  NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
-
-  // Check device reported error
-  static ncclDevError_t printedDevErr = ncclDevSuccess;
-  switch(*comm->fatalDevError) {
-    case ncclDevSuccess :
-      break;
-    case ncclDevAssertedMismatch :
-      if (printedDevErr != ncclDevAssertedMismatch) {
-        WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
-        printedDevErr = ncclDevAssertedMismatch;
-      }
-      if (comm->fatalError == ncclSuccess) {
-        comm->fatalError = ncclInvalidUsage;
-      }
-      break;
-    case ncclDevSuspectedMismatch :
-      if (printedDevErr != ncclDevSuspectedMismatch) {
-        WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
-        printedDevErr = ncclDevSuspectedMismatch;
-      }
-      break;
-    default:
-      WARN("Unknown device error %d", *comm->fatalDevError);
-      return ncclInternalError;
-  }
-  *asyncError = comm->fatalError;
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
-ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
-  NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
-  NCCLCHECK(PtrCheck(count, "CommCount", "count"));
-  *count = comm->nRanks;
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
-ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
-  NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
-  NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
-  *devid = comm->cudaDev;
-  return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
-ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
-  NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
-  NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
-  *rank = comm->rank;
-  return ncclSuccess;
-}
diff --git a/src/misc/checks.cu b/src/misc/argcheck.cc
index a07e577..67931f8 100644
--- a/src/misc/checks.cu
+++ b/src/misc/argcheck.cc
@@ -1,10 +1,11 @@
 /*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "checks.h"
+#include "argcheck.h"
+#include "comm.h"
 
 static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
   cudaPointerAttributes attr;
diff --git a/src/misc/ibvwrap.cu b/src/misc/ibvwrap.cc
index 7ac3431..f47c141 100644
--- a/src/misc/ibvwrap.cu
+++ b/src/misc/ibvwrap.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
diff --git a/src/misc/nvmlwrap.cu b/src/misc/nvmlwrap.cc
index 635f332..34ed0aa 100644
--- a/src/misc/nvmlwrap.cu
+++ b/src/misc/nvmlwrap.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -16,6 +16,7 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
 static nvmlReturn_t (*nvmlInternalShutdown)(void);
 static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
 static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
+static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
 static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
 static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
@@ -23,7 +24,10 @@ static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t dev
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
     nvmlNvLinkCapability_t capability, unsigned int *capResult);
 static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
+static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor);
 
+// Used to make the NVML library calls thread safe
+pthread_mutex_t nvmlLock = PTHREAD_MUTEX_INITIALIZER;
 
 ncclResult_t wrapNvmlSymbols(void) {
   if (nvmlState == nvmlInitialized)
@@ -70,12 +74,14 @@ ncclResult_t wrapNvmlSymbols(void) {
   LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
+  LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByIndex", nvmlInternalDeviceGetHandleByIndex);
   LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
+  LOAD_SYM(nvmlhandle, "nvmlDeviceGetCudaComputeCapability", nvmlInternalDeviceGetCudaComputeCapability);
 
   nvmlState = nvmlInitialized;
   return ncclSuccess;
@@ -85,6 +91,7 @@ teardown:
   nvmlInternalShutdown = NULL;
   nvmlInternalDeviceGetHandleByPciBusId = NULL;
   nvmlInternalDeviceGetIndex = NULL;
+  nvmlInternalDeviceGetHandleByIndex = NULL;
   nvmlInternalDeviceGetPciInfo = NULL;
   nvmlInternalDeviceGetMinorNumber = NULL;
   nvmlInternalDeviceGetNvLinkState = NULL;
@@ -130,7 +137,8 @@ ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_
     WARN("lib wrapper not initialized.");
     return ncclInternalError;
   }
-  nvmlReturn_t ret = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device), ret);
   if (ret != NVML_SUCCESS) {
     WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ",
         nvmlInternalErrorString(ret));
@@ -144,7 +152,8 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
     WARN("lib wrapper not initialized.");
     return ncclInternalError;
   }
-  nvmlReturn_t ret = nvmlInternalDeviceGetIndex(device, index);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetIndex(device, index), ret);
   if (ret != NVML_SUCCESS) {
     WARN("nvmlDeviceGetIndex() failed: %s ",
         nvmlInternalErrorString(ret));
@@ -153,12 +162,28 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
   return ncclSuccess;
 }
 
+ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) {
+  if (nvmlInternalDeviceGetHandleByIndex == NULL) {
+    WARN("lib wrapper not initialized.");
+    return ncclInternalError;
+  }
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetHandleByIndex(index, device), ret);
+  if (ret != NVML_SUCCESS) {
+    WARN("nvmlDeviceGetHandleByIndex() failed: %s ",
+        nvmlInternalErrorString(ret));
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
+
 ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
   if (nvmlInternalDeviceGetPciInfo == NULL) {
     WARN("lib wrapper not initialized.");
     return ncclInternalError;
   }
-  nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetPciInfo(device, pci), ret);
   if (ret != NVML_SUCCESS) {
     WARN("nvmlDeviceGetPciInfo() failed: %s ",
         nvmlInternalErrorString(ret));
@@ -172,7 +197,8 @@ ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* min
     WARN("lib wrapper not initialized.");
     return ncclInternalError;
   }
-  nvmlReturn_t ret = nvmlInternalDeviceGetMinorNumber(device, minorNumber);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetMinorNumber(device, minorNumber), ret);
   if (ret != NVML_SUCCESS) {
     WARN("nvmlDeviceGetMinorNumber() failed: %s ",
         nvmlInternalErrorString(ret));
@@ -186,7 +212,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
     /* Do not warn, this symbol is optional. */
     return ncclInternalError;
   }
-  nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkState(device, link, isActive), ret);
   if (ret != NVML_SUCCESS) {
     if (ret != NVML_ERROR_NOT_SUPPORTED)
       INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
@@ -201,7 +228,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
     /* Do not warn, this symbol is optional. */
     return ncclInternalError;
   }
-  nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci), ret);
   if (ret != NVML_SUCCESS) {
     if (ret != NVML_ERROR_NOT_SUPPORTED)
       INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
@@ -217,7 +245,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
     /* Do not warn, this symbol is optional. */
     return ncclInternalError;
   }
-  nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult);
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult), ret);
   if (ret != NVML_SUCCESS) {
     if (ret != NVML_ERROR_NOT_SUPPORTED)
       INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
@@ -226,4 +255,19 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
   }
   return ncclSuccess;
 }
+
+ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { // wraps the loaded NVML symbol; fills *major / *minor
+  if (nvmlInternalDeviceGetCudaComputeCapability == NULL) { // guard the pointer actually called below, not the optional NvLink one
+    WARN("lib wrapper not initialized.");
+    return ncclInternalError;
+  }
+  nvmlReturn_t ret;
+  NVMLLOCKCALL(nvmlInternalDeviceGetCudaComputeCapability(device, major, minor), ret);
+  if (ret != NVML_SUCCESS) {
+    WARN("nvmlDeviceGetCudaComputeCapability() failed: %s ",
+        nvmlInternalErrorString(ret));
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
 #endif
diff --git a/src/misc/rings.cu b/src/misc/rings.cu
deleted file mode 100644
index a7b122c..0000000
--- a/src/misc/rings.cu
+++ /dev/null
@@ -1,385 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "param.h"
-
-#define NCCL_MAX_SCORE 7
-
-/* Parse user defined rings. Format is like :
- * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
- * Rings with a non-matching number of ranks are ignored so we can provide
- * rings for multiple cases.
- */
-#define MAX_ENV_RANKS 512
-static ncclResult_t parseRings(const char* str, int* nringsRet, int nranks, int* prev, int* next) {
-  int ranks[MAX_ENV_RANKS];
-  int nrings = 0;
-  int rank = 0;
-  int offset = 0;
-  int status = 0; // 0 : between numbers, 1 : inside number
-  do {
-    int digit = str[offset] - '0';
-    if (digit >= 0 && digit <= 9) {
-      if (status == 0) {
-        ranks[rank] = digit;
-        status = 1;
-      } else {
-        ranks[rank] = ranks[rank]*10+digit;
-      }
-    } else {
-      if (status == 1) {
-        rank++;
-        if (rank == MAX_ENV_RANKS) goto end;
-      }
-      status = 0;
-      if (str[offset] == '|' || str[offset] == '\0') {
-        int prevRank = ranks[rank-1];
-        // Ignore rings if nranks doesn't match
-        if (rank != nranks) goto newring;
-
-        for (int r=0; r<nranks; r++) {
-          int rank = ranks[r];
-          // Ignore rings with ranks out of bounds
-          if (rank < 0 || rank >= nranks) goto newring;
-          // Ignore rings with duplicate ranks
-          for (int i=0; i<r; i++)
-            if (ranks[i] == rank) goto newring;
-
-          next[nrings*nranks+prevRank] = rank;
-          prev[nrings*nranks+rank] = prevRank;
-          prevRank = rank;
-        }
-        nrings++;
-newring:
-        rank = 0;
-      }
-    }
-  } while (str[offset++] != 0);
-end:
-  *nringsRet = nrings;
-  return ncclSuccess;
-}
-
-/*
- * Ring creation algorithm
- *
- * First, we establish hierarchical coordinates depending on the way ranks can
- * communicate. After fillCoords, we have for each rank a unique 3-int array
- * {   node, pci_domain,   rank } corresponding to the three transports :
- * { 2[NET],     1[SHM], 0[P2P] }.
- * Also, we renumber ranks (to indexes) based on their growing coordinates.
- *
- * Then, we ask transports to connect groups together. We start with net, then
- * shm, then p2p. We maintain two arrays, prev and next, where values are equal
- * to -1 when ranks are not yet connected, and a rank otherwise. We never
- * connect ranks outside our group, meaning that on 4 nodes of 2 sockets of 4
- * ranks, if we are rank 13, we should see something like (provided we have a
- * single net interface, hence a single ring) :
- *
- * Connecting all nodes                                <13>
- * 2[NET] : prev 31 -1 -1 -1 -1 -1 -1 -1  7 -1 -1 -1 -1 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
- *          next -1 -1 -1 -1 -1 -1 -1  8 -1 -1 -1 -1 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1  0
- *
- * Connecting P2P domains with shared memory           <13>
- * 1[SHM] : prev 31 -1 -1 -1 -1 -1 -1 -1  7 -1 -1 -1 11 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
- *          next -1 -1 -1 -1 -1 -1 -1  8 -1 -1 -1 12 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1  0
- *
- * Connecting ranks (only inside the P2P domain)       <13>
- * 0[P2P] : prev 31 -1 -1 -1 -1 -1 -1 -1  7 -1 -1 -1 11 12 13 14 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
- *          next -1 -1 -1 -1 -1 -1 -1  8 -1 -1 -1 12 13 14 15 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1  0
- *
- * Hence, when we ask a transport to connect groups, we provide it with a subview of the ranks (except for net
- * which always sees the full world). That way, P2P can bruteforce all combinations inside the node without
- * risking to explode in terms of combinations, and we scale better.
- *
- * Finally, we loop over Network scores to try to create rings with high scores (=locality) and decrease until
- * we get at least one ring.
- */
-
-static void recIsConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
-  connected[rank] = 1;
-  for (int r=0; r<nranks; r++) {
-    if (connected[r] == 0 && matrix[rank*nranks+r] == transport) {
-      recIsConnected(r, connected, nranks, matrix, transport);
-    }
-  }
-}
-
-static void isConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
-  for (int r=0; r<nranks; r++) connected[r] = 0;
-  recIsConnected(rank, connected, nranks, matrix, transport);
-}
-
-#define NEW_IDX(rank) do { \
-  rankToIdx[rank] = idx; \
-  idxToRank[idx] = rank; \
-  for (int t=0; t<NTRANSPORTS; t++) coords[rank*NTRANSPORTS+t] = current[t]; \
-  idx++; \
-} while (0)
-
-int findConnected(int rank, int* matrix, int nranks, int transport, int* coords) {
-  for (int r=0; r<nranks; r++) {
-    if (coords[r*NTRANSPORTS] == -1 && matrix[rank*nranks+r] == transport) return r;
-  }
-  return -1;
-}
-
-static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankToIdx, int* idxToRank) {
-  int current[NTRANSPORTS];
-  int* p2pConnected;
-  NCCLCHECK(ncclCalloc(&p2pConnected, nranks));
-  for (int i=0; i<NTRANSPORTS; i++) current[i] = 0;
-  int curRank = 0, idx = 0;
-  while (1) {
-    // P2P is handled separately as there is no level below it and we need to
-    // cover the case of being connected to another GPU indirectly.
-    // So we detect all GPUs in the same P2P domain once and add them all at
-    // once.
-    isConnected(curRank, p2pConnected, nranks, matrix, 0);
-    for (int r=0; r<nranks; r++) {
-      if (p2pConnected[r]) {
-        NEW_IDX(r);
-        curRank = r;
-        current[0]++;
-      }
-    }
-    current[0] = 0;
-
-    if (idx == nranks) {
-      free(p2pConnected);
-      return ncclSuccess;
-    }
-
-    // Find next group, either connected through SHM or NET.
-    int rank;
-    int transport = 1;
-    while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) {
-      current[transport] = 0;
-      transport++;
-      if (transport == NTRANSPORTS) {
-        WARN("Error : Could not find transport to connect next group\n");
-        free(p2pConnected);
-        return ncclInternalError; }
-    }
-    curRank = rank;
-    current[transport]++;
-  }
-}
-
-NCCL_PARAM(MinNrings, "MIN_NRINGS", 0);
-NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0);
-
-/* Users can force the number of threads with an environment variable */
-NCCL_PARAM(Nthreads, "NTHREADS", -2);
-ncclResult_t getEnvThreads(int* nthreads) {
-  int64_t nt = ncclParamNthreads();
-  if (nt != -2)
-    *nthreads = nt;
-  return ncclSuccess;
-}
-
-static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) {
-  if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS;
-  for (int r=nrings; r<newNrings; r++) {
-    for (int i=0; i<nranks; i++) {
-      a[r*nranks+i] = a[(r-nrings)*nranks+i];
-      b[r*nranks+i] = b[(r-nrings)*nranks+i];
-      c[r*nranks+i] = c[(r-nrings)*nranks+i];
-      d[r*nranks+i] = d[(r-nrings)*nranks+i];
-    }
-  }
-  return newNrings;
-}
-/* Main ring creation function */
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) {
-  *nrings = 0;
-
-  if (nranks == 1) return ncclSuccess;
-
-  char* str = getenv("NCCL_RINGS");
-  if (str && strlen(str)>0) {
-    int ret = parseRings(str, nrings, nranks, prev, next);
-    if (ret == ncclSuccess && *nrings > 0) {
-      if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
-      NCCLCHECK(getEnvThreads(nthreads));
-      for (int r = 0; r<*nrings; r++) {
-        for (int i = 0; i<nranks; i++) {
-          if (transports[i*nranks+prev[i]] == 2) treeIn[i] = 1;
-          if (transports[i*nranks+next[i]] == 2) treeOut[i] = 1;
-        }
-      }
-      return ncclSuccess;
-    }
-    if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
-    *nrings = 0;
-  }
-
-  // Compute hierarchical topology groups, indexes, and rank<->index tables
-  int* coords, *globalIdxToRank, *globalRankToIdx;
-  NCCLCHECK(ncclCalloc(&coords, nranks*NTRANSPORTS));
-  for (int i=0; i<nranks*NTRANSPORTS; i++) coords[i] = -1;
-  NCCLCHECK(ncclCalloc(&globalIdxToRank, nranks));
-  NCCLCHECK(ncclCalloc(&globalRankToIdx, nranks));
-
-  NCCLCHECK(fillCoords(nranks, transports, coords, globalRankToIdx, globalIdxToRank));
-
-  // Start with a high score, then decrease until we find rings
-  int minScore = NCCL_MAX_SCORE;
-  int nringsTmp;
-  int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
-  NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS));
-  NCCLCHECK(ncclCalloc(&idxToRank, nranks));
-  NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
-  NCCLCHECK(ncclCalloc(&groups, nranks));
-  NCCLCHECK(ncclCalloc(&subgroups, nranks));
-
-  int nThreads;
-  do {
-    nThreads = *nthreads;
-    for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1;
-    nringsTmp = MAXCHANNELS;
-    // Loop over transports to connect groups
-    for (int t=NTRANSPORTS-1; t>=0; t--) {
-      for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;
-
-      int nidx = 0;
-      for (int i=0; i<nranks; i++) {
-        // Extract only ranks in the same local area as rank
-        // We need to extract them in the topological order, hence we iterate over indexes, not ranks
-        int r = globalIdxToRank[i];
-        int sameLocal = 1;
-        for (int tr = NTRANSPORTS-1; tr > t; tr--) if (coords[r*NTRANSPORTS+tr] != coords[rank*NTRANSPORTS+tr]) sameLocal = 0;
-        if (!sameLocal) continue;
-
-        groups[nidx] = coords[r*NTRANSPORTS+t];
-        subgroups[nidx] = t ? coords[r*NTRANSPORTS+t-1] : nidx;
-        rankToIdx[r] = nidx;
-        idxToRank[nidx] = r;
-        nidx++;
-      }
-
-      int ngroups = groups[nidx-1] + 1; // Coords should be ordered
-
-      ncclTvalue_t* subvalues;
-      int *subprev, *subnext;
-      NCCLCHECK(ncclCalloc(&subvalues, nidx*nidx));
-      NCCLCHECK(ncclCalloc(&subprev, nidx*nringsTmp));
-      NCCLCHECK(ncclCalloc(&subnext, nidx*nringsTmp));
-      if (ngroups > 1) {
-        /* Extract subvalues */
-        for (int i=0; i<nidx; i++) {
-          for (int j=0; j<nidx; j++) {
-            if (transports[idxToRank[i]*nranks+idxToRank[j]] == t)
-              subvalues[i*nidx+j] = values[idxToRank[i]*nranks+idxToRank[j]];
-            else
-              subvalues[i*nidx+j] = 0;
-          }
-        }
-        /* Extract subprev/subnext */
-        for (int i=0; i<nidx*nringsTmp; i++) {
-          subprev[i] = subnext[i] = -1;
-        }
-        for (int r=0; r<nringsTmp; r++) {
-          int start = -1, end = -1;
-          for (int i=0; i<nranks; i++) {
-            if (rankToIdx[i] == -1) continue;
-            if (prevTmp[r*nranks+i] != -1) start = i;
-            if (nextTmp[r*nranks+i] != -1) end = i;
-          }
-          if (start != -1 && end != -1) {
-            subprev[r*nidx+rankToIdx[start]] = rankToIdx[end];
-            subnext[r*nidx+rankToIdx[end]] = rankToIdx[start];
-          }
-        }
-        /* Get rings */
-        NCCLCHECK(ncclTransports[t].getRings(nidx, groups, subgroups, subvalues, &nringsTmp, subprev, subnext, minScore, &nThreads));
-        /* Merge subprev/subnext into prev/next */
-        for (int r=0; r<nringsTmp; r++) {
-          for (int i=0; i<nidx; i++) {
-            if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
-            if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
-            if (t == NTRANSPORTS-1) {
-              // Save node-level masters for trees
-              treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
-              treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
-            }
-          }
-        }
-        //for (int r=0; r<nringsTmp; r++) {
-        //printf("[%d] [%d] [%d] [%d] Prev ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", prevTmp[r*nranks+i]); printf("\n");
-        //printf("[%d] [%d] [%d] [%d] Next ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", nextTmp[r*nranks+i]); printf("\n");
-        //}
-      }
-      free(subvalues);
-      free(subprev);
-      free(subnext);
-      if (nringsTmp == 0) break;
-    }
-    minScore--;
-    if (nringsTmp > *nrings) {
-      *nrings = nringsTmp;
-      for (int i=0; i<nranks*(*nrings); i++) {
-        prev[i] = prevTmp[i];
-        next[i] = nextTmp[i];
-      }
-    }
-  } while (nringsTmp == 0 && minScore);
-
-  free(coords);
-  free(globalRankToIdx);
-  free(globalIdxToRank);
-  free(prevTmp);
-  free(nextTmp);
-  free(idxToRank);
-  free(rankToIdx);
-  free(groups);
-  free(subgroups);
-
-  *nthreads = nThreads;
-
-  /* Duplicate the rings in case of multinode+NVLink */
-  int nnodes = 0;
-  for (int r=0; r<nranks; r++) nnodes += treeIn[r];
-  int nvlink;
-  NCCLCHECK(ncclNvlinkGpu(&nvlink));
-  if (nnodes > 1 && nvlink) {
-    *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut);
-  }
-
-  if (*nrings == 0) {
-    WARN("Could not create rings, falling back on simple ring");
-    *nrings = 1;
-    prev[rank] = (rank-1+nranks) % nranks;
-    next[rank] = (rank+1)%nranks;
-  }
-
-  int maxNrings = ncclParamMaxNrings();
-  int minNrings = ncclParamMinNrings();
-  if (maxNrings > 0 && minNrings > maxNrings) {
-    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
-    minNrings = 0;
-  }
-  if (minNrings > MAXCHANNELS) {
-    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS);
-    minNrings = MAXCHANNELS;
-  }
-  if (maxNrings > 0 && maxNrings <= *nrings) {
-    if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
-    *nrings = maxNrings;
-  } else {
-    int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
-    if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
-    if (minNrings > 0 && minNrings > *nrings) {
-      if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
-      *nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut);
-    }
-  }
-
-  NCCLCHECK(getEnvThreads(nthreads));
-  return ncclSuccess;
-}
diff --git a/src/misc/utils.cc b/src/misc/utils.cc
new file mode 100644
index 0000000..5158529
--- /dev/null
+++ b/src/misc/utils.cc
@@ -0,0 +1,194 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "utils.h"
+#include "core.h"
+
+#include "nvmlwrap.h"
+
+// Get current Compute Capability
+int ncclCudaCompCap() {
+  int cudaDev;
+  if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
+  int ccMajor, ccMinor;
+  if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
+  if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0;
+  return ccMajor*10+ccMinor;
+}
+
+ncclResult_t int64ToBusId(int64_t id, char* busId) {
+  sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
+  return ncclSuccess;
+}
+
+ncclResult_t busIdToInt64(char* busId, int64_t* id) {
+  const int size = strlen(busId);
+  char* hexStr;
+  NCCLCHECK(ncclCalloc(&hexStr, size+1)); // +1 : room for '\0' when busId has no separators
+  int hexOffset = 0;
+  for (int i=0; i<size; i++) {
+    char c = busId[i];
+    if (c == '.' || c == ':') continue; // skip PCI field separators
+    if ((c >= '0' && c <= '9') ||
+        (c >= 'A' && c <= 'F') ||
+        (c >= 'a' && c <= 'f')) {
+      hexStr[hexOffset++] = busId[i];
+    } else break; // stop at the first non-hex character
+  }
+  hexStr[hexOffset] = '\0';
+  *id = strtol(hexStr, NULL, 16);
+  free(hexStr);
+  return ncclSuccess;
+}
+
+// Get the PCI bus ID of a logical cudaDev index, packed into an int64
+ncclResult_t getBusId(int cudaDev, int64_t *busId) {
+  // On most systems, the PCI bus ID comes back as in the 0000:00:00.0
+  // format. Still need to allocate proper space in case PCI domain goes
+  // higher.
+  char busIdStr[] = "00000000:00:00.0";
+  CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
+  NCCLCHECK(busIdToInt64(busIdStr, busId));
+  return ncclSuccess;
+}
+
+ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
+  if (gethostname(hostname, maxlen) != 0) {
+    strncpy(hostname, "unknown", maxlen);
+    return ncclSystemError;
+  }
+  int i = 0;
+  while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++;
+  hostname[i] = '\0';
+  return ncclSuccess;
+}
+
+uint64_t getHash(const char* string, int n) {
+  // Based on DJB2, result = result * 33 + char
+  uint64_t result = 5381;
+  for (int c = 0; c < n; c++) {
+    result = ((result << 5) + result) + string[c];
+  }
+  return result;
+}
+
+/* Generate a hash of the unique identifying string for this host
+ * that will be unique for both bare-metal and container instances
+ * Equivalent of a hash of;
+ *
+ * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
+ *
+ * This string can be overridden by using the NCCL_HOSTID env var.
+ */
+#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
+uint64_t getHostHash(void) {
+  char hostHash[1024];
+  char *hostId;
+
+  // Fall back is the full hostname if something fails
+  (void) getHostName(hostHash, sizeof(hostHash), '\0');
+  int offset = strlen(hostHash);
+
+  if ((hostId = getenv("NCCL_HOSTID")) != NULL) {
+    strncpy(hostHash, hostId, sizeof(hostHash));
+  } else {
+    FILE *file = fopen(HOSTID_FILE, "r");
+    if (file != NULL) {
+      char *p;
+      if (fscanf(file, "%ms", &p) == 1) {
+        strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
+        free(p);
+      }
+      fclose(file); // only close a stream that was actually opened
+    }
+  }
+
+  // Make sure the string is terminated
+  hostHash[sizeof(hostHash)-1]='\0';
+
+  TRACE(NCCL_INIT,"unique hostname '%s'", hostHash);
+
+  return getHash(hostHash, strlen(hostHash));
+}
+
+/* Generate a hash of the unique identifying string for this process
+ * that will be unique for both bare-metal and container instances
+ * Equivalent of a hash of;
+ *
+ * $$ $(readlink /proc/self/ns/pid)
+ */
+uint64_t getPidHash(void) {
+  char pname[1024];
+  // Start off with our pid ($$)
+  sprintf(pname, "%ld", (long) getpid());
+  int plen = strlen(pname);
+  int len = readlink("/proc/self/ns/pid", pname+plen, sizeof(pname)-1-plen);
+  if (len < 0) len = 0;
+
+  pname[plen+len]='\0';
+  TRACE(NCCL_INIT,"unique PID '%s'", pname);
+
+  return getHash(pname, strlen(pname));
+}
+
+int parseStringList(const char* string, struct netIf* ifList, int maxList) {
+  if (!string) return 0;
+
+  const char* ptr = string;
+
+  int ifNum = 0;
+  int ifC = 0;
+  char c;
+  do {
+    c = *ptr;
+    if (c == ':') {
+      if (ifC > 0) {
+        ifList[ifNum].prefix[ifC] = '\0';
+        ifList[ifNum].port = atoi(ptr+1);
+        ifNum++; ifC = 0;
+      }
+      while (c != ',' && c != '\0') c = *(++ptr);
+    } else if (c == ',' || c == '\0') {
+      if (ifC > 0) {
+        ifList[ifNum].prefix[ifC] = '\0';
+        ifList[ifNum].port = -1;
+        ifNum++; ifC = 0;
+      }
+    } else {
+      ifList[ifNum].prefix[ifC] = c;
+      ifC++;
+    }
+    ptr++;
+  } while (ifNum < maxList && c);
+  return ifNum;
+}
+
+static bool matchIf(const char* string, const char* ref, bool matchExact) {
+  // Make sure to include '\0' in the exact case
+  int matchLen = matchExact ? strlen(string) + 1 : strlen(ref);
+  return strncmp(string, ref, matchLen) == 0;
+}
+
+static bool matchPort(const int port1, const int port2) {
+  if (port1 == -1) return true;
+  if (port2 == -1) return true;
+  if (port1 == port2) return true;
+  return false;
+}
+
+
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) {
+  // Make an exception for the case where no user list is defined
+  if (listSize == 0) return true;
+
+  for (int i=0; i<listSize; i++) {
+    if (matchIf(string, ifList[i].prefix, matchExact)
+        && matchPort(port, ifList[i].port)) {
+      return true;
+    }
+  }
+  return false;
+}
diff --git a/src/misc/utils.cu b/src/misc/utils.cu
deleted file mode 100644
index c618e71..0000000
--- a/src/misc/utils.cu
+++ /dev/null
@@ -1,203 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "utils.h"
-#include "debug.h"
-#include "nccl_net.h"
-#include <unistd.h>
-#include <string.h>
-#include <stdarg.h>
-
-#include "nvmlwrap.h"
-#include "core.h"
-
-// Convert a logical cudaDev index to the NVML device minor number
-ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
-  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-  nvmlDevice_t nvmlDevice;
-  unsigned int dev;
-  *nvmlDev = -1;
-  CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
-  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice));
-  NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev));
-
-  *nvmlDev = dev;
-
-  return ncclSuccess;
-}
-
-ncclResult_t getHostName(char* hostname, int maxlen) {
-  if (gethostname(hostname, maxlen) != 0) {
-    strncpy(hostname, "unknown", maxlen);
-    return ncclSystemError;
-  }
-  int i = 0;
-  while ((hostname[i] != '.') && (hostname[i] != '\0') && (i < maxlen-1)) i++;
-  hostname[i] = '\0';
-  return ncclSuccess;
-}
-
-/* Common logging function used by the INFO, WARN and TRACE macros
- * Also exported to the dynamically loadable Net transport modules so
- * they can share the debugging mechanisms and output files
- */
-void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
-  if (ncclDebugLevel <= NCCL_LOG_NONE) return;
-
-  char hostname[1024];
-  getHostName(hostname, 1024);
-  int cudaDev;
-  cudaGetDevice(&cudaDev);
-
-  char buffer[1024];
-  size_t len = 0;
-  pthread_mutex_lock(&ncclDebugOutputLock);
-  if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
-    len = snprintf(buffer, sizeof(buffer),
-                   "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
-  else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
-    len = snprintf(buffer, sizeof(buffer),
-                   "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
-#ifdef ENABLE_TRACE
-  else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
-    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
-    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
-    len = snprintf(buffer, sizeof(buffer),
-                   "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
-  }
-#endif
-  if (len) {
-    va_list vargs;
-    va_start(vargs, fmt);
-    (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
-    va_end(vargs);
-    fprintf(ncclDebugFile,"%s\n", buffer);
-    fflush(ncclDebugFile);
-  }
-  pthread_mutex_unlock(&ncclDebugOutputLock);
-
-  // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
-  if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
-    fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
-            hostname, getpid(), gettid(), cudaDev, filefunc, line);
-    abort();
-  }
-}
-
-uint64_t getHash(const char* string) {
-  // Based on DJB2, result = result * 33 + char
-  uint64_t result = 5381;
-  for (int c = 0; string[c] != '\0'; c++) {
-    result = ((result << 5) + result) + string[c];
-  }
-  return result;
-}
-
-/* Generate a hash of the unique identifying string for this host
- * that will be unique for both bare-metal and container instances
- * Equivalent of a hash of;
- *
- * $(hostname) $(readlink /proc/self/ns/uts) $(readlink /proc/self/ns/mnt)
- */
-uint64_t getHostHash(void) {
-  char uname[1024];
-  // Start off with the hostname
-  (void) getHostName(uname, sizeof(uname));
-  int offset = strlen(uname);
-  int len;
-  // $(readlink /proc/self/ns/uts)
-  len = readlink("/proc/self/ns/uts", uname+offset, sizeof(uname)-1-offset);
-  if (len < 0) len = 0;
-  offset += len;
-  // $(readlink /proc/self/ns/mnt)
-  len = readlink("/proc/self/ns/mnt", uname+offset, sizeof(uname)-1-offset);
-  if (len < 0) len = 0;
-  offset += len;
-  // Trailing '\0'
-  uname[offset]='\0';
-  TRACE(NCCL_INIT,"unique hostname '%s'", uname);
-
-  return getHash(uname);
-}
-
-/* Generate a hash of the unique identifying string for this process
- * that will be unique for both bare-metal and container instances
- * Equivalent of a hash of;
- *
- * $$ $(readlink /proc/self/ns/pid)
- */
-uint64_t getPidHash(void) {
-  char pname[1024];
-  // Start off with our pid ($$)
-  sprintf(pname, "%ld", (long) getpid());
-  int plen = strlen(pname);
-  int len = readlink("/proc/self/ns/pid", pname+plen, sizeof(pname)-1-plen);
-  if (len < 0) len = 0;
-
-  pname[plen+len]='\0';
-  TRACE(NCCL_INIT,"unique PID '%s'", pname);
-
-  return getHash(pname);
-}
-
-int parseStringList(const char* string, struct netIf* ifList, int maxList) {
-  if (!string) return 0;
-
-  const char* ptr = string;
-  // Ignore "^" prefix, will be detected outside of this function
-  if (ptr[0] == '^') ptr++;
-
-  int ifNum = 0;
-  int ifC = 0;
-  char c;
-  do {
-    c = *ptr;
-    if (c == ':') {
-      if (ifC > 0) {
-        ifList[ifNum].prefix[ifC] = '\0';
-        ifList[ifNum].port = atoi(ptr+1);
-        ifNum++; ifC = 0;
-      }
-      while (c != ',' && c != '\0') c = *(++ptr);
-    } else if (c == ',' || c == '\0') {
-      if (ifC > 0) {
-        ifList[ifNum].prefix[ifC] = '\0';
-        ifList[ifNum].port = -1;
-        ifNum++; ifC = 0;
-      }
-    } else {
-      ifList[ifNum].prefix[ifC] = c;
-      ifC++;
-    }
-    ptr++;
-  } while (ifNum < maxList && c);
-  return ifNum;
-}
-
-static bool matchPrefix(const char* string, const char* prefix) {
-  return (strncmp(string, prefix, strlen(prefix)) == 0);
-}
-
-static bool matchPort(const int port1, const int port2) {
-  if (port1 == -1) return true;
-  if (port2 == -1) return true;
-  if (port1 == port2) return true;
-  return false;
-}
-
-
-bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize) {
-  // Make an exception for the case where no user list is defined
-  if (listSize == 0) return true;
-
-  for (int i=0; i<listSize; i++) {
-    if (matchPrefix(string, ifList[i].prefix)
-        && matchPort(port, ifList[i].port)) {
-      return true;
-    }
-  }
-  return false;
-}
diff --git a/src/nccl.h.in b/src/nccl.h.in
index 985274e..f07e0a4 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -41,7 +41,7 @@ typedef enum { ncclSuccess                 =  0,
  * This integer is coded with the MAJOR, MINOR and PATCH level of the
  * NCCL library
  */
-ncclResult_t ncclGetVersion(int *version);
+ncclResult_t  ncclGetVersion(int *version);
 ncclResult_t pncclGetVersion(int *version);
 
 /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
@@ -244,7 +244,8 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou
  * Start a group call. All subsequent calls to NCCL may not block due to
  * inter-CPU synchronization.
  */
-ncclResult_t ncclGroupStart();
+ncclResult_t  ncclGroupStart();
+ncclResult_t pncclGroupStart();
 
 /*
  * Group End
@@ -252,7 +253,8 @@ ncclResult_t ncclGroupStart();
  * End a group call. Wait for all calls since ncclGroupStart to complete
  * before returning.
  */
-ncclResult_t ncclGroupEnd();
+ncclResult_t  ncclGroupEnd();
+ncclResult_t pncclGroupEnd();
 
 #ifdef __cplusplus
 } // end extern "C"
diff --git a/src/nccl.pc.in b/src/nccl.pc.in
new file mode 100755
index 0000000..0d98494
--- /dev/null
+++ b/src/nccl.pc.in
@@ -0,0 +1,10 @@
+prefix=${nccl:Prefix}
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: nccl
+Description: Optimized primitives for collective multi-GPU communication
+Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
+Libs: -L${libdir} -lnccl
+Cflags: -I${includedir}
diff --git a/src/transport.cu b/src/transport.cc
index 1436a5b..4059849 100644
--- a/src/transport.cu
+++ b/src/transport.cc
@@ -4,7 +4,8 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
+#include "comm.h"
+#include "info.h"
 
 extern struct ncclTransport p2pTransport;
 extern struct ncclTransport shmTransport;
@@ -119,13 +120,13 @@ ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int r
   }
   if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
     // Tree up
-    struct ncclTree* tree = &args->channel->tree;
+    struct ncclTree* tree = &args->channel->treeUp;
     for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
     NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
   }
   if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
     // Tree down
-    struct ncclTree* tree = &args->channel->tree;
+    struct ncclTree* tree = &args->channel->treeDn;
     for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
     NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
   }
@@ -157,7 +158,9 @@ void* persistentThread(void *comm_) {
       }
     } while (op == NULL);
     op->idle = 0;
-    if (op->state != ncclProxyOpNone) ret = op->progress(op);
+    // opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
+    // yet and might be cancelled before they even start. Hold off on those.
+    if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op);
     if (ret != ncclSuccess) {
       comm->fatalError = ret;
       INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
diff --git a/src/transport/net.cu b/src/transport/net.cc
index 06a6e23..87fc9ce 100644
--- a/src/transport/net.cu
+++ b/src/transport/net.cc
@@ -4,39 +4,9 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "core.h"
-#include "transport.h"
-#include "nvmlwrap.h"
+#include "comm.h"
 #include "net.h"
-#include "param.h"
-#include "topo.h"
-#include <cuda_runtime.h>
-#include <assert.h>
-
-#define NET_MAX_IFS 16
-#define NET_MAX_GPUS 32
-
-// Cache GPU-NIC distances to avoid re-computing them
-#define NET_TVALUE_UNKNOWN 0ULL
-static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN };
-static int ncclNetNDev;
-
-// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit)
-#define NET_BITS_PER_IF 3
-#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)
-static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
-static ncclTvalue_t getTvalue(short* distances, int ndev) {
-  ncclTvalue_t tvalue = 0;
-  for (int d=0; d<ndev; d++) {
-    int score = 1 + PATH_SOC - distances[d];
-    // Keep 3 bits of score info per dev
-    tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
-  }
-  return tvalue;
-}
-static int getScore(ncclTvalue_t tvalue, int dev) {
-  return (tvalue >> (dev*NET_BITS_PER_IF)) & NET_BITS_PER_IF_MASK;
-}
+#include "graph.h"
 
 struct netConnectInfo {
   ncclNetHandle_t netHandle;
@@ -53,6 +23,7 @@ struct netSendResources {
   int buffSize;
   void* mhandle;
   void* llMhandle;
+  void* ll128Mhandle;
   struct ncclRecvMem* devRecvMem;
   uint64_t step;
   uint64_t llLastCleaning;
@@ -70,224 +41,61 @@ struct netRecvResources {
   int buffSize;
   void* mhandle;
   void* llMhandle;
+  void* ll128Mhandle;
   struct ncclRecvMem* devRecvMem;
   uint64_t step;
   uint64_t llLastCleaning;
 };
 
-static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
-  char* cudaPath = NULL;
-  char* nicPath = NULL;
-  ncclResult_t err;
-  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
-  err = ncclNetPciPath(dev, &nicPath);
-  *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
-  if (nicPath) free(nicPath);
-  if (cudaPath) free(cudaPath);
-  return ncclSuccess;
-}
-
-static ncclResult_t netDevices(int* ndev, short** distances) {
-  NCCLCHECK(ncclNetDevices(ndev));
-  if (*ndev == 0) {
-    WARN("Error : Network returned 0 device");
-    return ncclSystemError;
-  }
-  if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS;
-
-  *distances = (short*)malloc(*ndev*sizeof(short));
-  if (*distances == NULL) return ncclSystemError;
-
-  // Find distance with current GPU
-  int cudaDev, nvmlDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
-  char line[1024];
-  sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName());
-  for (int d=0; d<*ndev; d++) {
-    NCCLCHECK(netDistance(cudaDev, d, *distances+d));
-    sprintf(line+strlen(line), " %s", pathDists[(*distances)[d]]);
-  }
-  INFO(NCCL_INIT|NCCL_NET, "%s", line);
-  return ncclSuccess;
-}
-
-/* Determine if we can communicate with the peer */
-ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
-  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  ret[0] = ncclNetTvalues[cudaDev];
-  if (ret[0] == NET_TVALUE_UNKNOWN) {
-    if (cudaDev >= NET_MAX_GPUS) {
-      WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS);
-      return ncclInternalError;
-    }
-    int nDev;
-    short* distances;
-    NCCLCHECK(netDevices(&nDev, &distances));
-    ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev);
-    ncclNetNDev = nDev;
-    free(distances);
-  }
-  return ncclSuccess;
-}
-
-static inline int groupBestStart(int nranks, int* groups, int group, ncclTvalue_t* values, int card, int minScore) {
-  int bestRank = -1;
-  int bestScore = 0;
-  for (int rank=0; rank<nranks; rank++) {
-    if (groups[rank] != group) continue;
-    for (int i=0; i<nranks; i++) {
-      ncclTvalue_t netValue = values[rank*nranks+i];
-      if (netValue != 0) {
-        ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
-        if (score >= minScore && score > bestScore) {
-          bestScore = score;
-          bestRank = rank;
-        }
-        // All other values should be the same, stop here for this rank
-        break;
-      }
-    }
-  }
-  return bestRank;
-}
-static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroups, int startSubGroup, int startRank, ncclTvalue_t* values, int card, int minScore) {
-  // For the last rank, we don't need the absolute best score, just to be within minScore.
-  for (int rank=nranks-1; rank>=0; rank--) {
-    if (groups[rank] != group) continue;
-    if (startSubGroup != -1 && startSubGroup == subgroups[rank]) continue;
-    if (startRank == rank) continue;
-    for (int i=0; i<nranks; i++) {
-      ncclTvalue_t netValue = values[rank*nranks+i];
-      if (netValue != 0) {
-        ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
-        if (score >= minScore) {
-          return rank;
-        }
-        // All other values should be the same, stop here for this rank
-        break;
-      }
-    }
-  }
-  return -1;
-}
-
-
-ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
-  int nGroups = groups[nranks-1] + 1;
-  int cardUsed[NET_MAX_IFS*nGroups];
-  for (int c=0; c<NET_MAX_IFS*nGroups; c++) cardUsed[c] = 0;
-
-  for (int ring = 0; ring<*nringsRet; ring++) {
-    int starts[nGroups];
-    int ends[nGroups];
-    for (int group = 0; group<nGroups; group++) {
-      int nranksInGroup = 0;
-      int nsubGroups = 0;
-      for (int rank=0; rank<nranks; rank++) if (groups[rank] == group) {
-          nranksInGroup++;
-          nsubGroups = std::max(subgroups[rank], nsubGroups);
-        }
-      starts[group] = ends[group] = -1;
-      // Receive on the rank closest to the NIC
-      for (int card=0; card<NET_MAX_IFS; card++) {
-        if (cardUsed[group*NET_MAX_IFS+card] == 1) continue;
-        int start = groupBestStart(nranks, groups, group, values, card, minScore);
-        // Send from any rank, but best on a different subgroup and close to the NIC also.
-        int end = (nranksInGroup == 1) ? start
-            : groupBestEnd(nranks, groups, group, subgroups, nsubGroups ? subgroups[start] : -1, start, values, card, minScore);
-        //printf("Ring %d, Minscore %d, Card %d, group %d, start = %d, end = %d\n", ring, minScore, card, group, start, end);
-        if (start != -1 && end != -1) {
-          cardUsed[group*NET_MAX_IFS+card] = 1;
-          starts[group] = start;
-          ends[group] = end;
-          break;
-        }
-      }
-      if (starts[group] == -1 || ends[group] == -1) {
-        *nringsRet = ring;
-        return ncclSuccess;
-      }
-    }
-    // Link groups together
-    for (int group = 0; group<nGroups; group++) {
-      int nextGroup = (group+1)%nGroups;
-      next[ring*nranks+ends[group]] = starts[nextGroup];
-      prev[ring*nranks+starts[nextGroup]] = ends[group];
-    }
-  }
+/* Determine if two peers can communicate with NET */
+ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+  *ret = 1;
   return ncclSuccess;
 }
 
-int getDev(int cudaDev, int ringId) {
-  ncclTvalue_t tvalues = ncclNetTvalues[cudaDev];
-
-  int dev = 0;
-  int maxScore = 0;
-  for (int d=0; d<ncclNetNDev; d++) if (getScore(tvalues,d) > maxScore) maxScore = getScore(tvalues,d);
-  int skip = ringId+1;
-  while (skip) {
-    for (int d=0; d<ncclNetNDev; d++) {
-      if (getScore(tvalues, d) == maxScore) {
-        skip--;
-        if (skip == 0) { dev = d; goto end; }
-      }
-    }
-  }
-end:
-  return dev;
-}
-
 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
 NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
 
-static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
+static ncclResult_t netGetGdrSupport(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr) {
   *useGdr = 0;
 
-  int cudaDev, nvmlDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
-
   if (read) { // For reads (sends) only enable under certain conditions
     int gdrReadParam = ncclParamNetGdrRead();
     if (gdrReadParam == 0) return ncclSuccess;
     if (gdrReadParam < 0) {
        int nvlink;
-       NCCLCHECK(ncclNvlinkGpu(&nvlink));
+       NCCLCHECK(ncclTopoHasNvlink(topo, busId, &nvlink));
        if (!nvlink) return ncclSuccess;
     }
   }
 
   // Check if we are close enough that it makes sense to enable GDR
   int netGdrLevel = ncclParamNetGdrLevel();
-  short distance;
-  NCCLCHECK(netDistance(cudaDev, dev, &distance));
+  int distance;
+  NCCLCHECK(ncclTopoNetDistance(topo, busId, netDev, &distance));
   if (distance >= netGdrLevel) {
-    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel);
+    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), busId, netDev, distance, netGdrLevel);
     return ncclSuccess;
   }
 
   // Finally, check if the NIC supports it
   int flags;
-  NCCLCHECK(ncclNetPtrSupport(dev, &flags));
+  NCCLCHECK(ncclNetPtrSupport(netDev, &flags));
   if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
   *useGdr = 1;
-  INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read);
+  INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d < %d), read %d", ncclNetName(), busId, netDev, distance, netGdrLevel, read);
   return ncclSuccess;
 }
 
 /* Determine if we will use this transport for this peer and return connect
  * information for this peer */
-ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
   struct netSendResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   send->transportResources = resources;
 
-  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  resources->netDev = getDev(cudaDev, channelId);
-  NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr));
+  NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev));
+  NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
 
   int sendSize = sizeof(struct ncclSendMem);
   NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -299,20 +107,18 @@ ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
   NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
   resources->buffSize = buffSize;
 
-  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev,
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
       resources->useGdr ? "/GDRDMA" : "");
   return ncclSuccess;
 }
 
-ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
   struct netRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
 
-  int cudaDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  resources->netDev = getDev(cudaDev, channelId);
-  NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr));
+  NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev));
+  NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
 
   int sendSize = sizeof(struct ncclSendMem);
   NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -324,7 +130,7 @@ ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
   NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
   resources->buffSize = buffSize;
 
-  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
       resources->useGdr ? "/GDRDMA" : "");
   struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
   NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
@@ -339,6 +145,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
   send->conn.buff = recvMem->buff;
   send->conn.llBuff = resources->devHostRecvMem->llBuff;
+  send->conn.ll128Buff = recvMem->ll128Buff;
 
   // Head/Tail/Opcount/Fifos are always on host
   send->conn.tail = &resources->devHostRecvMem->tail;
@@ -356,6 +163,8 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
         resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
   NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff,
         NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle));
+  NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
+        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
 
   return ncclSuccess;
 }
@@ -369,6 +178,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
   recv->conn.buff = recvMem->buff;
   recv->conn.llBuff = recvMem->llBuff;
+  recv->conn.ll128Buff = recvMem->ll128Buff;
 
   // Head/Tail/Opcount are always on host
   recv->conn.tail = &resources->devHostRecvMem->tail;
@@ -384,6 +194,8 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
         resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
   NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE,
         resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle));
+  NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
+        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
 
   return ncclSuccess;
 }
@@ -393,6 +205,7 @@ ncclResult_t netSendFree(void* transportResources) {
   NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
   NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle));
   NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
+  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->ll128Mhandle));
   NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
   if (resources->useGdr)
     CUDACHECK(cudaFree(resources->devRecvMem));
@@ -406,6 +219,7 @@ ncclResult_t netRecvFree(void* transportResources) {
   NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
   NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle));
   NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
+  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->ll128Mhandle));
   NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
   if (resources->useGdr)
     CUDACHECK(cudaFree(resources->devRecvMem));
@@ -432,11 +246,44 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
     if (args->head < args->end) {
       if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
         volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
-        if (args->llMode) {
+        volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
+        if (args->protocol == NCCL_PROTO_LL128) {
+          int stepSize = NCCL_LL128_BUFF_SIZE/NCCL_STEPS;
+          if (args->tail < *recvTail) {
+            int buffSlot = args->tail%NCCL_STEPS;
+            if (sizesFifo[buffSlot] != -1) {
+              struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+              char* localBuff = (char*)localMem->ll128Buff;
+              int ready = resources->useGdr;
+              if (!ready) {
+                // When data is in sysmem, we need to wait until all flags are correct since the GPU only
+                // called threadfence()
+                uint64_t flag = args->tail + 1;
+                int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
+                volatile uint64_t* lines = (volatile uint64_t*)(localBuff+buffSlot*stepSize);
+                ready = 1;
+                for (int i=0; i<nFifoLines; i++) {
+                  if (lines[i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS] != flag) { ready = 0; break; }
+                }
+              }
+              if (ready) {
+                // Send through network
+                NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], resources->ll128Mhandle, args->requests+buffSlot));
+                if (args->requests[buffSlot] != NULL) {
+                  sizesFifo[buffSlot] = -1;
+                  // Make sure size is reset to -1 before we update the head.
+                  __sync_synchronize();
+                  args->tail += args->sliceSteps;
+                  args->idle = 0;
+                }
+              }
+            }
+          }
+        } else if (args->protocol == NCCL_PROTO_LL) {
           int buffSlot = args->tail%NCCL_STEPS;
           int size = sizesFifo[buffSlot];
           if (size != -1) {
-            uint32_t flag = args->tail + 1;
+            uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
             int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
             size = nFifoLines * sizeof(union ncclLLFifoLine);
             union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
@@ -457,18 +304,20 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
               }
             }
           }
-        } else if (args->tail < resources->hostRecvMem->tail) {
-          struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+        } else if (args->tail < *recvTail) {
           int stepSize = args->channel->buffSize/NCCL_STEPS;
+          struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
           // Send through network
           int buffSlot = args->tail%NCCL_STEPS;
-          NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
-          if (args->requests[buffSlot] != NULL) {
-            sizesFifo[buffSlot] = -1;
-            // Make sure size is reset to zero before we update the head.
-            __sync_synchronize();
-            args->tail += args->sliceSteps;
-            args->idle = 0;
+          if (sizesFifo[buffSlot] != -1) {
+            NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
+            if (args->requests[buffSlot] != NULL) {
+              sizesFifo[buffSlot] = -1;
+              // Make sure size is reset to -1 before we update the head.
+              __sync_synchronize();
+              args->tail += args->sliceSteps;
+              args->idle = 0;
+            }
           }
         }
       }
@@ -486,18 +335,8 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
     if (args->head == args->end) {
       resources->step = args->end;
       args->idle = 0;
-      args->state = ncclProxyOpDone;
-    }
-  }
-  if (args->state == ncclProxyOpDone) {
-    union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff;
-    if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step;
-      resources->step += NCCL_STEPS;
-      resources->hostSendMem->head = resources->step;
-      resources->llLastCleaning = resources->step;
+      args->state = ncclProxyOpNone;
     }
-    args->state = ncclProxyOpNone;
   }
   return ncclSuccess;
 }
@@ -517,12 +356,13 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
   }
   if (args->state == ncclProxyOpProgress) {
     args->idle = 1;
-    int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
+    int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : args->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
     if (args->head < args->end) {
       struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
-      char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
-      void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
-      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
+      char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)localMem->llBuff : args->protocol == NCCL_PROTO_LL128 ? (char*)localMem->ll128Buff : localMem->buff;
+      void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : args->protocol == NCCL_PROTO_LL128 ? resources->ll128Mhandle : resources->mhandle;
+      volatile uint64_t* sendHead = &resources->hostSendMem->head;
+      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
         int buffSlot = args->tail%NCCL_STEPS;
         int sliceSize = stepSize * args->sliceSteps;
         NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
@@ -537,7 +377,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
         NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
         if (done) {
           args->head += args->sliceSteps;
-          if (args->llMode == 0) {
+          if (args->protocol == NCCL_PROTO_SIMPLE) {
             if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
             resources->hostRecvMem->tail = args->head;
           }
@@ -548,16 +388,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
     if (args->head == args->end) {
       resources->step = args->end;
       args->idle = 0;
-      args->state = ncclProxyOpDone;
-    }
-  }
-  if (args->state == ncclProxyOpDone) {
-    if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      resources->step += NCCL_STEPS;
-      while (resources->hostSendMem->head < resources->step);
-      resources->llLastCleaning = resources->step;
+      args->state = ncclProxyOpNone;
     }
-    args->state = ncclProxyOpNone;
   }
   return ncclSuccess;
 }
@@ -565,7 +397,6 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
 struct ncclTransport netTransport = {
   "NET",
   netCanConnect,
-  netGetRings,
   { netSendSetup, netSendConnect, netSendFree, netSendProxy },
   { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy }
 };
diff --git a/src/transport/net_ib.cu b/src/transport/net_ib.cc
index f7c574b..0d5307c 100644
--- a/src/transport/net_ib.cu
+++ b/src/transport/net_ib.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,7 +8,7 @@
 #include "core.h"
 #include "socket.h"
 #include "net.h"
-#include "topo.h"
+#include "graph.h"
 #include "utils.h"
 #include "param.h"
 
@@ -107,18 +107,22 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
       char* userIbEnv = getenv("NCCL_IB_HCA");
       struct netIf userIfs[MAX_IB_DEVS];
       bool searchNot = userIbEnv && userIbEnv[0] == '^';
+      if (searchNot) userIbEnv++;
+      bool searchExact = userIbEnv && userIbEnv[0] == '=';
+      if (searchExact) userIbEnv++;
       int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
 
       if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError;
 
-      for (int d=0; d<nIbDevs; d++) {
+      for (int d=0; d<nIbDevs && ncclNIbDevs<MAX_IB_DEVS; d++) {
         struct ibv_context * context;
         if (ncclSuccess != wrap_ibv_open_device(&context, devices[d]) || context == NULL) {
           WARN("NET/IB : Unable to open device %s", devices[d]->name);
           continue;
         }
-        int found = 0;
+        int nPorts = 0;
         struct ibv_device_attr devAttr;
+        memset(&devAttr, 0, sizeof(devAttr));
         if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
           WARN("NET/IB : Unable to query device %s", devices[d]->name);
           if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
@@ -135,7 +139,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
               && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
 
           // check against user specified HCAs/ports
-          if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
+          if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs, searchExact) ^ searchNot)) {
             continue;
           }
           TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
@@ -146,10 +150,10 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
           ncclIbDevs[ncclNIbDevs].context = context;
           strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
           ncclNIbDevs++;
-          found++;
+          nPorts++;
           pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
         }
-        if (found == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
+        if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
       }
       if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
     }
@@ -197,32 +201,14 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
     moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
   }
   if (moduleLoaded == 0) return ncclSystemError;
-  ncclResult_t ret = ncclSystemError;
-  void* ptr;
-  if (cudaMalloc(&ptr, sizeof(int)) == cudaSuccess) {
-    struct ibv_mr* mr;
-    struct ibv_pd* pd;
-    if (wrap_ibv_alloc_pd(&pd, ncclIbDevs[ibDev].context) == ncclSuccess) {
-      if ((mr = wrap_direct_ibv_reg_mr(pd, ptr, sizeof(int), IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)) != NULL) {
-        ret = ncclSuccess;
-        wrap_ibv_dereg_mr(mr);
-      }
-      wrap_ibv_dealloc_pd(pd);
-    }
-    cudaFree(ptr);
-  }
-  return ret;
+  return ncclSuccess;
 }
 
 ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
   *supportedTypes = NCCL_PTR_HOST;
 
-  int cudaDev, nvmlDev;
-  CUDACHECK(cudaGetDevice(&cudaDev));
-  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
-
   if (ncclIbGdrSupport(dev) != ncclSuccess) {
-    INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d '%s' (no module or not supported by GPU)", cudaDev, nvmlDev, dev, ncclIbDevs[dev].devName);
+    INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
     return ncclSuccess;
   }
   *supportedTypes |= NCCL_PTR_CUDA;
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
new file mode 100644
index 0000000..1b1fc4f
--- /dev/null
+++ b/src/transport/net_socket.cc
@@ -0,0 +1,503 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "core.h"
+#include "socket.h"
+#include "net.h"
+#include "param.h"
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <poll.h>
+#include <limits.h>
+#include <fcntl.h>
+
+/* Init functions */
+static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
+static union socketAddress ncclNetIfAddrs[MAX_IFS];
+static int ncclNetIfs = -1;
+pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
+
+// One-time discovery of the usable network interfaces; serialized by ncclSocketLock. logFunction is unused here.
+ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
+  if (ncclNetIfs == -1) {
+    pthread_mutex_lock(&ncclSocketLock);
+    if (ncclNetIfs == -1) {
+      ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
+      if (ncclNetIfs <= 0) {
+        WARN("NET/Socket : no interface found");
+        pthread_mutex_unlock(&ncclSocketLock); // never return with the lock held
+        return ncclInternalError;
+      }
+      char line[1024] = "";
+      char addrline[1024];
+      for (int i=0; i<ncclNetIfs; i++) {
+        snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE,
+            socketToString(&ncclNetIfAddrs[i].sa, addrline));
+      }
+      line[1023] = '\0';
+      INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
+    }
+    pthread_mutex_unlock(&ncclSocketLock);
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) { // reports NCCL_PTR_HOST as the only supported pointer type, whatever `dev` is
+  *supportedTypes = NCCL_PTR_HOST;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketDevices(int* ndev) { // reports the interface count cached in ncclNetIfs (still -1 if ncclSocketInit has not run)
+  *ndev = ncclNetIfs;
+  return ncclSuccess;
+}
+
+// Resolve /sys/class/net/<ifname>/device for interface `dev`; on success *path holds the buffer returned by realpath().
+ncclResult_t ncclSocketPciPath(int dev, char** path) {
+  char sysfsLink[PATH_MAX];
+  snprintf(sysfsLink, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+  char* resolved = realpath(sysfsLink, NULL);
+  *path = resolved;
+  if (resolved != NULL) return ncclSuccess;
+  INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", sysfsLink);
+  return ncclSystemError;
+}
+
+ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) { // copy the cached address of interface `dev` into *addr
+  if (dev < 0 || dev >= ncclNetIfs) return ncclInternalError; // a negative dev would read before ncclNetIfAddrs[0]
+  memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
+  return ncclSuccess;
+}
+
+/* Communication functions */
+
+#define MAX_SOCKETS 64
+#define MAX_THREADS 16
+#define MAX_REQUESTS 128
+#define MAX_QUEUE_LEN MAX_REQUESTS
+#define MIN_CHUNKSIZE (64*1024)
+
+NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
+NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);
+
+struct ncclSocketHandle { // filled in by ncclSocketListen, read back by ncclSocketConnect
+  union socketAddress connectAddr; // address given to createListenSocket() and later to connectAddress()
+  int nSocks;   // data-socket count chosen by the listening side
+  int nThreads; // helper-thread count chosen by the listening side
+};
+
+struct ncclSocketTask { // one chunk of a request, progressed by a helper thread on a single data socket
+  int op;     // NCCL_SOCKET_SEND or NCCL_SOCKET_RECV, copied from the request
+  void* data; // start of this chunk inside the request buffer
+  int size;   // chunk length
+  int fd;     // data socket the chunk is moved on
+  int offset; // progress cursor updated by socketProgress(); the task is complete when offset == size
+  int used;   // 0 = slot free, 1 = slot owned by a request
+  ncclResult_t result; // status of the last socketProgress() call, inspected by ncclSocketTest
+};
+
+struct ncclSocketRequest { // one user-visible send/recv, returned as the opaque request of Isend/Irecv
+  int op;     // NCCL_SOCKET_SEND or NCCL_SOCKET_RECV
+  void* data; // user buffer
+  int size;   // requested size; replaced by the exchanged size in ncclSocketTest
+  int ctrlFd; // control socket of the owning comm
+  int offset; // progress cursor, only used when the request is moved without subtasks
+  int used;   // 0 = free, 1 = allocated (size not exchanged yet), 2 = size exchanged
+  struct ncclSocketComm* comm; // owning communicator
+  struct ncclSocketTask* tasks[MAX_SOCKETS]; // subtasks this request was split into
+  int nSubs;  // number of valid entries in tasks[]
+};
+
+struct ncclSocketTaskQueue { // per-helper-thread ring of task slots
+  int next; // slot the next task will be placed in; wraps modulo MAX_QUEUE_LEN
+  struct ncclSocketTask* tasks; // MAX_QUEUE_LEN slots, allocated on first use in ncclSocketGetTask
+};
+
+enum threadState {start, stop}; // start is set whenever a task is queued; stop (set by ncclSocketClose) makes the helper thread exit
+
+struct ncclSocketThreadResources { // everything one helper thread needs; passed as its start argument
+  struct ncclSocketTaskQueue threadTaskQueue; // tasks assigned to this thread
+  enum threadState state;      // stop request flag, written under threadLock
+  struct ncclSocketComm* comm; // owning communicator
+  pthread_mutex_t threadLock;  // protects the wait on threadCond
+  pthread_cond_t  threadCond;  // signalled when a task is queued or stop is requested
+};
+
+struct ncclSocketListenComm { // state kept between ncclSocketListen and ncclSocketAccept
+  int fd;       // listening socket, -1 until created
+  int nSocks;   // data-socket count every accepted comm will use
+  int nThreads; // helper-thread count every accepted comm will use
+};
+
+struct ncclSocketComm { // one send or receive communicator
+  int ctrlFd;            // control socket: carries the per-message size, and the payload when no subtasks are used
+  int fds[MAX_SOCKETS];  // data sockets, -1 while unopened
+  int nSocks;            // number of data sockets in use
+  int nThreads;          // number of helper threads
+  int nextFd;            // index into fds[] used by the next task (advanced modulo nSocks)
+  struct ncclSocketRequest requests[MAX_REQUESTS]; // request slots handed out by ncclSocketGetRequest
+  pthread_t helperThread[MAX_THREADS];             // threads started on demand in ncclSocketGetTask
+  struct ncclSocketThreadResources threadResources[MAX_THREADS]; // per-thread queue, lock and condition variable
+};
+
+// Helper-thread main loop: progresses every in-flight task of its queue, sleeps on threadCond when idle, exits on stop or on a socket error.
+void* persistentSocketThread(void *args_) {
+  struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_;
+  volatile enum threadState* state = &resource->state;
+  struct ncclSocketTaskQueue* myQueue = &resource->threadTaskQueue;
+  int nSocksPerThread = resource->comm->nSocks / resource->comm->nThreads;
+  while (1) {
+    int idle = 1;
+    int mark = myQueue->next; // mark newest task seen
+    for (int i=0; i<MAX_QUEUE_LEN; i+=nSocksPerThread) {
+      int repeat;
+      do {
+        repeat = 0;
+        for (int j=0; j<nSocksPerThread && i+j<MAX_QUEUE_LEN; j++) { // last group may be partial: never read past the queue
+          struct ncclSocketTask* r = myQueue->tasks+i+j;
+          if (r != NULL && r->used == 1 && r->offset < r->size) {
+            r->result = socketProgress(r->op, r->fd, r->data, r->size, &r->offset);
+            if (r->result != ncclSuccess) {
+              WARN("NET/Socket : socket progress error");
+              return NULL;
+            }
+            idle = 0;
+            if (r->offset < r->size) repeat = 1;
+          }
+        }
+      } while (repeat);
+    }
+    if (idle) {
+      pthread_mutex_lock(&resource->threadLock);
+      while (mark == myQueue->next && *state != stop) { // no new tasks, wait
+        pthread_cond_wait(&resource->threadCond, &resource->threadLock);
+      }
+      pthread_mutex_unlock(&resource->threadLock);
+    }
+    if (*state == stop) return NULL;
+  }
+}
+
+// Choose the data-socket count (*ns) and helper-thread count (*nt) for interface `dev`: explicit parameters win, -2 (unset) falls back to defaults keyed on the NIC vendor id.
+ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
+  int nSocksPerThread = ncclParamSocketNsocksPerThread();
+  int nThreads = ncclParamSocketNthreads();
+  if (nThreads > MAX_THREADS) {
+    WARN("NET/Socket : NCCL_SOCKET_NTHREADS is greater than the maximum allowed, setting to %d", MAX_THREADS);
+    nThreads = MAX_THREADS;
+  }
+  if (nThreads == -2 || nSocksPerThread == -2) {
+    // Auto-detection
+    int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
+    char vendorPath[PATH_MAX];
+    snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+    char* rPath = realpath(vendorPath, NULL);
+    int fd = (rPath == NULL) ? -1 : open(rPath, O_RDONLY); // realpath() returns NULL when the path does not resolve; never hand NULL to open()
+    free(rPath);
+    if (fd == -1) {
+      // Could not find device vendor: handled silently, hence TRACE rather than INFO.
+      TRACE(NCCL_NET, "Open of %s failed : %s\n", vendorPath, strerror(errno));
+      goto end;
+    }
+    char vendor[7];
+    strncpy(vendor, "0x0000", 7);
+    int len;
+    SYSCHECKVAL(read(fd, vendor, 6), "read", len);
+    SYSCHECK(close(fd), "close");
+    if (strcmp(vendor, "0x1d0f") == 0) { // AWS
+      autoNt = 2;
+      autoNs = 8;
+    } else if (strcmp(vendor, "0x1ae0") == 0) { // GCP
+      autoNt = 4;
+      autoNs = 1;
+    }
+end:
+    if (nThreads == -2) nThreads = autoNt;
+    if (nSocksPerThread == -2) nSocksPerThread = autoNs;
+  }
+  int nSocks = nSocksPerThread * nThreads;
+  if (nSocks > MAX_SOCKETS) {
+    nSocksPerThread = MAX_SOCKETS/nThreads;
+    WARN("NET/Socket : the total number of sockets is greater than the maximum allowed, setting NCCL_NSOCKS_PERTHREAD to %d", nSocksPerThread);
+    nSocks = nSocksPerThread * nThreads;
+  }
+  *ns = nSocks;
+  *nt = nThreads;
+  if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) { // allocate a listen comm whose descriptor is marked unopened
+  NCCLCHECK(ncclCalloc(comm, 1));
+  (*comm)->fd = -1; // -1 = no listening socket yet (checked in ncclSocketCloseListen)
+  return ncclSuccess;
+}
+
+// Allocate a comm and mark the control socket and every data-socket slot as unopened (-1).
+ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {
+  NCCLCHECK(ncclCalloc(comm, 1));
+  struct ncclSocketComm* fresh = *comm;
+  fresh->nextFd = 0;
+  fresh->ctrlFd = -1;
+  for (int* slot = fresh->fds; slot != fresh->fds+MAX_SOCKETS; slot++) *slot = -1;
+  return ncclSuccess;
+}
+
+// Create a listening socket for interface `dev` and publish its address plus the
+// socket/thread layout through opaqueHandle, for ncclSocketConnect to read.
+ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
+  static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
+  if (dev < 0) return ncclInternalError; // data transfer socket is based on specified dev
+  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+  struct ncclSocketListenComm* lComm;
+  NCCLCHECK(ncclSocketNewListenComm(&lComm));
+  NCCLCHECK(GetSocketAddr(dev, &handle->connectAddr));
+  NCCLCHECK(createListenSocket(&lComm->fd, &handle->connectAddr));
+  NCCLCHECK(ncclSocketGetNsockNthread(dev, &lComm->nSocks, &lComm->nThreads));
+  handle->nSocks = lComm->nSocks;
+  handle->nThreads = lComm->nThreads;
+  *listenComm = lComm;
+  return ncclSuccess;
+}
+
+// Open nSocks data connections plus one control connection to the address in opaqueHandle.
+// Every connection first sends its own index; index nSocks designates the control socket.
+ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
+  if (dev < 0) return ncclInternalError; // data transfer socket is based on specified dev
+  struct ncclSocketComm* sComm;
+  NCCLCHECK(ncclSocketNewComm(&sComm));
+  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+  sComm->nSocks = handle->nSocks;
+  sComm->nThreads = handle->nThreads;
+  for (int idx=0; idx<sComm->nSocks+1; idx++) {
+    int newFd, sent=0;
+    NCCLCHECK(connectAddress(&newFd, &handle->connectAddr));
+    NCCLCHECK(socketWait(NCCL_SOCKET_SEND, newFd, &idx, sizeof(int), &sent));
+    int* slot = (idx == sComm->nSocks) ? &sComm->ctrlFd : sComm->fds+idx;
+    *slot = newFd;
+  }
+  *sendComm = sComm;
+  return ncclSuccess;
+}
+
+// Accept nSocks data connections plus the control connection; each one announces its slot index, which is read off the wire and therefore range-checked.
+ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
+  struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm;
+  struct ncclSocketComm* rComm;
+  NCCLCHECK(ncclSocketNewComm(&rComm));
+  rComm->nSocks = lComm->nSocks;
+  rComm->nThreads = lComm->nThreads;
+  for (int i=0; i<rComm->nSocks+1; i++) {
+    int tmpFd, sendSockIdx, offset=0;
+    SYSCHECKVAL(accept(lComm->fd, NULL, NULL), "accept", tmpFd); // the peer address is not needed
+    NCCLCHECK(socketWait(NCCL_SOCKET_RECV, tmpFd, &sendSockIdx, sizeof(int), &offset));
+    if (sendSockIdx < 0 || sendSockIdx > rComm->nSocks) { WARN("NET/Socket : invalid socket index %d received", sendSockIdx); close(tmpFd); return ncclInternalError; }
+    if (sendSockIdx == rComm->nSocks) rComm->ctrlFd = tmpFd;
+    else rComm->fds[sendSockIdx] = tmpFd;
+  }
+  *recvComm = rComm;
+  return ncclSuccess;
+}
+
+// Claim the first free slot of comm->requests for an `op` of `size` bytes at `data`; fails when all MAX_REQUESTS slots are busy.
+ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketRequest** req) {
+  for (int slot=0; slot<MAX_REQUESTS; slot++) {
+    struct ncclSocketRequest* cand = &comm->requests[slot];
+    if (cand->used != 0) continue; // still owned by an in-flight request
+    cand->op = op;
+    cand->data = data;
+    cand->size = size;
+    cand->ctrlFd = comm->ctrlFd;
+    cand->comm = comm;
+    cand->nSubs = 0;
+    cand->used = 1; // size not exchanged yet
+    *req = cand;
+    return ncclSuccess;
+  }
+  WARN("NET/Socket : unable to allocate requests");
+  return ncclInternalError;
+}
+
+ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) { // queue one chunk on the next data socket and wake the helper thread that owns it
+  int tid = comm->nextFd % comm->nThreads; // helper thread serving socket nextFd
+  struct ncclSocketThreadResources* res = comm->threadResources+tid;
+  struct ncclSocketTaskQueue* queue = &res->threadTaskQueue;
+  // First task for this thread: allocate its queue, set up lock/condvar and start the helper thread
+  if (queue->tasks == NULL) {
+    NCCLCHECK(ncclCalloc(&queue->tasks, MAX_QUEUE_LEN));
+    queue->next = 0;
+    res->comm = comm;
+    pthread_mutex_init(&res->threadLock, NULL);
+    pthread_cond_init(&res->threadCond, NULL);
+    pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res); // NOTE(review): return value is not checked
+  }
+  struct ncclSocketTask* r = queue->tasks+queue->next;
+  if (r->used == 0) {
+    r->op = op;
+    r->data = data;
+    r->size = size;
+    r->fd = comm->fds[comm->nextFd];
+    r->offset = 0;
+    r->result = ncclSuccess;
+    comm->nextFd = (comm->nextFd + 1) % comm->nSocks; // rotate over the data sockets
+    r->used = 1;
+    *req = r;
+    pthread_mutex_lock(&res->threadLock);
+    queue->next = (queue->next+1)%MAX_QUEUE_LEN; // the helper thread compares this against its mark to detect new work
+    res->state = start;
+    pthread_cond_signal(&res->threadCond);
+    pthread_mutex_unlock(&res->threadLock);
+    return ncclSuccess;
+  }
+  WARN("NET/Socket : unable to allocate subtasks"); // the slot at queue->next is still in use
+  return ncclInternalError;
+}
+
+ncclResult_t ncclSocketTest(void* request, int* done, int* size) { // poll one request: sets *done (and *size when non-NULL) once it has completed
+  *done = 0;
+  struct ncclSocketRequest *r = (struct ncclSocketRequest*)request;
+  if (r == NULL) {
+    WARN("NET/Socket : test called with NULL request");
+    return ncclInternalError;
+  }
+  if (r->used == 1) { /* phase 1: exchange the message size over the control socket */
+    int data = r->size;
+    int offset = 0;
+    NCCLCHECK(socketProgress(r->op, r->ctrlFd, &data, sizeof(int), &offset));
+
+    if (offset == 0) return ncclSuccess; /* Not ready -- retry later */
+
+    // Not sure we could ever receive less than 4 bytes, but just in case ...
+    if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->ctrlFd, &data, sizeof(int), &offset));
+
+    // Check size is less or equal to the size provided by the user
+    if (r->op == NCCL_SOCKET_RECV && data > r->size) {
+      WARN("NET/Socket : message truncated : receiving %d bytes instead of %d", data, r->size);
+      return ncclInternalError;
+    }
+    r->size = data;
+    r->offset = 0;
+    r->used = 2; // done exchanging size
+    // divide into subtasks
+    int chunkOffset = 0, i = 0;
+    if (r->comm->nSocks > 0) {
+      int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks)); // at most nSocks chunks, none smaller than MIN_CHUNKSIZE except possibly the last
+      while (chunkOffset < r->size) {
+        int chunkSize = std::min(taskSize, r->size-chunkOffset);
+        NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
+        chunkOffset += chunkSize;
+      }
+    }
+    r->nSubs = i;
+  }
+  if (r->used == 2) { // phase 2: size already exchanged, wait for the payload
+    if (r->nSubs > 0) {
+      int nCompleted = 0;
+      for (int i=0; i<r->nSubs; i++) {
+        struct ncclSocketTask* sub = r->tasks[i];
+        if (sub->result != ncclSuccess) return sub->result; // error reported by the helper thread
+        if (sub->offset == sub->size) nCompleted++;
+      }
+      if (nCompleted == r->nSubs) {
+        if (size) *size = r->size;
+        *done = 1;
+        r->used = 0;
+        for (int i=0; i<r->nSubs; i++) { // hand the task slots back to their queues
+          struct ncclSocketTask* sub = r->tasks[i];
+          sub->used = 0;
+        }
+      }
+    } else { // no subtasks (no data sockets or empty message): move the payload over the control socket from the calling thread
+      if (r->offset < r->size) {
+        NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->data, r->size, &r->offset));
+      }
+      if (r->offset == r->size) {
+        if (size) *size = r->size;
+        *done = 1;
+        r->used = 0;
+      }
+    }
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { // nothing is registered; only NCCL_PTR_HOST is accepted and *mhandle is left untouched
+  return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
+}
+ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } // no-op counterpart of ncclSocketRegMr
+
+ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { // only reserves a request slot; the transfer itself is driven by ncclSocketTest
+  struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
+  NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclSocketRequest**)request));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { // only reserves a request slot; the transfer itself is driven by ncclSocketTest
+  struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
+  NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data, size, (struct ncclSocketRequest**)request));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) {
+  // CUDA pointers are not supported by this transport, so a flush is never expected: any call is reported as an error
+  return ncclInternalError;
+}
+
+// Close the listening descriptor (if one was opened) and free the listen comm; a NULL comm is a no-op.
+ncclResult_t ncclSocketCloseListen(void* opaqueComm) {
+  struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)opaqueComm;
+  if (lComm == NULL) return ncclSuccess;
+  if (lComm->fd != -1) close(lComm->fd);
+  free(lComm);
+  return ncclSuccess;
+}
+
+// Ask every started helper thread to stop and join it, free the task queues, close all descriptors and free the comm; NULL is a no-op.
+ncclResult_t ncclSocketClose(void* opaqueComm) {
+  struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm;
+  if (comm == NULL) return ncclSuccess;
+  for (int t=0; t<comm->nThreads; t++) {
+    struct ncclSocketThreadResources* res = &comm->threadResources[t];
+    if (comm->helperThread[t]) {
+      pthread_mutex_lock(&res->threadLock);
+      res->state = stop;
+      pthread_cond_signal(&res->threadCond);
+      pthread_mutex_unlock(&res->threadLock);
+      pthread_join(comm->helperThread[t], NULL);
+    }
+    free(res->threadTaskQueue.tasks);
+  }
+  if (comm->ctrlFd != -1) close(comm->ctrlFd);
+  for (int s=0; s<comm->nSocks; s++) {
+    if (comm->fds[s] != -1) close(comm->fds[s]);
+  }
+  free(comm);
+  return ncclSuccess;
+}
+
+ncclNet_t ncclNetSocket = { // positional initializer: entry order must follow the field order of ncclNet_t (declared outside this file)
+  "Socket",
+  ncclSocketInit,
+  ncclSocketDevices,
+  ncclSocketPciPath,
+  ncclSocketPtrSupport,
+  ncclSocketListen,
+  ncclSocketConnect,
+  ncclSocketAccept,
+  ncclSocketRegMr,
+  ncclSocketDeregMr,
+  ncclSocketIsend,
+  ncclSocketIrecv,
+  ncclSocketFlush,
+  ncclSocketTest,
+  ncclSocketClose, // NOTE(review): listed twice, presumably for the send-side and recv-side close slots; confirm against ncclNet_t
+  ncclSocketClose,
+  ncclSocketCloseListen
+};
diff --git a/src/transport/net_socket.cu b/src/transport/net_socket.cu
deleted file mode 100644
index 0464b43..0000000
--- a/src/transport/net_socket.cu
+++ /dev/null
@@ -1,267 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "nccl.h"
-#include "core.h"
-#include "socket.h"
-#include "net.h"
-
-#include <assert.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <poll.h>
-#include <limits.h>
-
-/* Init functions */
-static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
-static union socketAddress ncclNetIfAddrs[MAX_IFS];
-static int ncclNetIfs = -1;
-pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
-
-ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
-  if (ncclNetIfs == -1) {
-    pthread_mutex_lock(&ncclSocketLock);
-    if (ncclNetIfs == -1) {
-      ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
-      if (ncclNetIfs <= 0) {
-        WARN("NET/Socket : no interface found");
-        return ncclInternalError;
-      } else {
-        char line[1024];
-        char addrline[1024];
-        line[0] = '\0';
-        for (int i=0; i<ncclNetIfs; i++) {
-          snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE,
-              socketToString(&ncclNetIfAddrs[i].sa, addrline));
-        }
-        line[1023] = '\0';
-        INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
-      }
-    }
-    pthread_mutex_unlock(&ncclSocketLock);
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
-  *supportedTypes = NCCL_PTR_HOST;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketDevices(int* ndev) {
-  *ndev = ncclNetIfs;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketPciPath(int dev, char** path) {
-  char devicepath[PATH_MAX];
-  snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
-  *path = realpath(devicepath, NULL);
-  if (*path == NULL) {
-    INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
-  if (dev >= ncclNetIfs) return ncclInternalError;
-  memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
-  return ncclSuccess;
-}
-
-/* Communication functions */
-
-struct ncclSocketHandle {
-  union socketAddress connectAddr;
-};
-
-struct ncclSocketRequest {
-  int op;
-  void* data;
-  int size;
-  int fd;
-  int offset;
-  int used;
-};
-
-struct ncclSocketReqs {
-  struct ncclSocketRequest* requests;
-};
-
-struct ncclSocketComm {
-  int fd;
-  struct ncclSocketReqs reqs;
-};
-
-ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {
-  NCCLCHECK(ncclCalloc(comm, 1));
-  (*comm)->fd = -1;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str) {
-  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
-  NCCLCHECK(GetSocketAddrFromString(&(handle->connectAddr), str));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
-  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
-  static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
-  // if dev >= 0, listen based on dev
-  if (dev >= 0) {
-    NCCLCHECK(GetSocketAddr(dev, &(handle->connectAddr)));
-  } else if (dev == findSubnetIf) {
-    // handle stores a remote address
-    // need to find a local addr that is in the same network as the remote addr
-    union socketAddress localAddr;
-    char ifName[MAX_IF_NAME_SIZE];
-    if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
-      WARN("NET/Socket : No usable listening interface found");
-      return ncclSystemError;
-    }
-    // pass the local address back
-    memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr));
-  } // Otherwise, handle stores a local address
-  struct ncclSocketComm* comm;
-  NCCLCHECK(ncclSocketNewComm(&comm));
-  NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
-  *listenComm = comm;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
-  struct ncclSocketComm* comm;
-  NCCLCHECK(ncclSocketNewComm(&comm));
-  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
-  NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));
-  *sendComm = comm;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
-  struct ncclSocketComm* lComm = (struct ncclSocketComm*)listenComm;
-  struct ncclSocketComm* rComm;
-  NCCLCHECK(ncclSocketNewComm(&rComm));
-  struct sockaddr_in sockaddr;
-  socklen_t socklen = sizeof(struct sockaddr_in);
-  SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
-  *recvComm = rComm;
-  return ncclSuccess;
-}
-
-#define MAX_REQUESTS 128
-
-ncclResult_t ncclSocketGetRequest(struct ncclSocketReqs* reqs, int op, void* data, int size, int fd, struct ncclSocketRequest** req) {
-  if (reqs->requests == NULL) {
-    NCCLCHECK(ncclCalloc(&reqs->requests, MAX_REQUESTS));
-  }
-  for (int i=0; i<MAX_REQUESTS; i++) {
-    struct ncclSocketRequest* r = reqs->requests+i;
-    if (r->used == 0) {
-      r->op = op;
-      r->data = data;
-      r->size = size;
-      r->fd = fd;
-      r->offset = -1;
-      r->used = 1;
-      *req = r;
-      return ncclSuccess;
-    }
-  }
-  WARN("Socket : unable to allocate requests");
-  return ncclInternalError;
-}
-
-ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
-  *done = 0;
-  struct ncclSocketRequest *r = (struct ncclSocketRequest*)request;
-  if (r == NULL) {
-    WARN("NET/Socket : test called with NULL request");
-    return ncclInternalError;
-  }
-  if (r->offset == -1) { /* try to send/recv size */
-    int data = r->size;
-    int offset = 0;
-    NCCLCHECK(socketProgress(r->op, r->fd, &data, sizeof(int), &offset));
-
-    if (offset == 0) return ncclSuccess; /* Not ready -- retry later */
-
-    // Not sure we could ever receive less than 4 bytes, but just in case ...
-    if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->fd, &data, sizeof(int), &offset));
-
-    // Check size is less or equal to the size provided by the user
-    if (r->op == NCCL_SOCKET_RECV && data > r->size) {
-      WARN("NET/Socket : message truncated : receiving %d bytes instead of %d", data, r->size);
-      return ncclInternalError;
-    }
-    r->size = data;
-    r->offset = 0;
-  }
-  if (r->offset < r->size) {
-    NCCLCHECK(socketProgress(r->op, r->fd, r->data, r->size, &r->offset));
-  }
-  if (r->offset == r->size) {
-    if (size) *size = r->size;
-    *done = 1;
-    r->used = 0;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
-  return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
-}
-ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
-
-ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
-  struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
-  NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_SEND, data, size, comm->fd, (struct ncclSocketRequest**)request));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
-  struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
-  NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_RECV, data, size, comm->fd, (struct ncclSocketRequest**)request));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) {
-  // We don't support CUDA pointers, so we don't need a flush operation
-  return ncclInternalError;
-}
-
-ncclResult_t ncclSocketClose(void* opaqueComm) {
-  struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm;
-  if (comm) {
-    free(comm->reqs.requests);
-    close(comm->fd);
-    free(comm);
-  }
-  return ncclSuccess;
-}
-
-ncclNet_t ncclNetSocket = {
-  "Socket",
-  ncclSocketInit,
-  ncclSocketDevices,
-  ncclSocketPciPath,
-  ncclSocketPtrSupport,
-  ncclSocketListen,
-  ncclSocketConnect,
-  ncclSocketAccept,
-  ncclSocketRegMr,
-  ncclSocketDeregMr,
-  ncclSocketIsend,
-  ncclSocketIrecv,
-  ncclSocketFlush,
-  ncclSocketTest,
-  ncclSocketClose,
-  ncclSocketClose,
-  ncclSocketClose
-};
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
new file mode 100644
index 0000000..0cc92f3
--- /dev/null
+++ b/src/transport/p2p.cc
@@ -0,0 +1,312 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "graph.h"
+#include "utils.h"
+
+struct p2pConnectInfo { // connection blob produced by the setup functions and copied into ncclConnect
+  int direct; // 1: peers share a pidHash and directPtr is valid; 0: devIpc is valid
+  union {
+    void* directPtr;           // device pointer usable as-is by the peer
+    cudaIpcMemHandle_t devIpc; // handle obtained from cudaIpcGetMemHandle
+  };
+};
+
+struct p2pSendResources { // stored in send->transportResources by p2pSendSetup
+  struct ncclSendMem* devMem; // device buffer allocated with ncclCudaCalloc in p2pSendSetup
+  void* ipcPtr; // not written in p2pSendSetup; presumably set when the peer's IPC handle is opened (TODO confirm)
+};
+
+struct p2pRecvResources { // stored in recv->transportResources by p2pRecvSetup
+  struct ncclRecvMem* devMem; // device buffer allocated with ncclCudaCalloc in p2pRecvSetup
+  void* ipcPtr; // not written in p2pRecvSetup as far as visible; presumably set at connect time (TODO confirm)
+};
+
+#include <sys/types.h>
+
+NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
+NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
+
+/* Convert a PCI busId into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES).
+ * Returns -1 when the device is not visible to this process or when a CUDA query / bus id parse fails. */
+static int busIdToCudaDev(int64_t busId) {
+  int ndev;
+  if (cudaGetDeviceCount(&ndev) != cudaSuccess) return -1;
+  for (int i = 0; i < ndev; i++) {
+    char devBusIdStr[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+    if (cudaDeviceGetPCIBusId(devBusIdStr, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess) return -1;
+    int64_t devBusId;
+    // Not NCCLCHECK: this function returns a device index, so an error code must not leak out as one
+    if (busIdToInt64(devBusIdStr, &devBusId) != ncclSuccess) return -1;
+    if (busId == devBusId) return i;
+  }
+  // BusId was not found in our locally visible CUDA devices
+  return -1;
+}
+
+/* Determine if two peers can communicate through p2p: *ret is 1 if so, 0 otherwise; an error is only returned when a topology query fails */
+ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+  int cpuCount;
+  NCCLCHECK(ncclTopoCpuCount(topo, &cpuCount));
+  // Do not use P2P across sockets by default (provided CUDA permits it).
+  // When we are on a single socket, don't even use P2P through the CPU as
+  // it should be able to sustain two flows to sysmem faster than PCI P2P.
+  int p2pLevel = cpuCount == 1 ? PATH_PHB : PATH_NODE;
+  if (ncclParamP2pDisable() == 1) p2pLevel = 0;
+  if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel(); // an explicit P2P_LEVEL takes precedence over P2P_DISABLE
+
+  // Default answer is "no P2P"; the checks below turn it on
+  *ret = 0;
+
+  if (p2pLevel == 0) return ncclSuccess;
+
+  // Rule out different nodes
+  if (info1->hostHash != info2->hostHash) return ncclSuccess;
+
+  // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+  int cudaDev1 = busIdToCudaDev(info1->busId);
+  int cudaDev2 = busIdToCudaDev(info2->busId);
+  if (cudaDev1 == -1 || cudaDev2 == -1) {
+    // Peer's CUDA device is not visible in this process
+#if CUDART_VERSION >= 10010
+    // But in CUDA 10.1 we can still communicate with 'invisible' devices
+    TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %lx and %lx", info1->busId, info2->busId);
+    // Check for NVLink/NVswitch including P2P access
+    int nvlink;
+    NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
+    if (nvlink > 0) {
+      *ret = 1;
+      return ncclSuccess;
+    }
+#endif
+    return ncclSuccess;
+  }
+
+  TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%lx] and [%d=%lx]", cudaDev1, info1->busId, cudaDev2, info2->busId);
+
+  // Do not detect topology if we're on the same GPU. Note this is not really supported.
+  if (cudaDev1 == cudaDev2) {
+    *ret = 1;
+    return ncclSuccess;
+  }
+
+  // See if CUDA can do P2P
+  int p2p;
+  if (cudaDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != cudaSuccess) {
+    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)",
+         cudaDev1, info1->busId, cudaDev2, info2->busId);
+    return ncclSuccess; // a failed query is treated as "no P2P", not as an error
+  }
+  if (p2p == 0) return ncclSuccess;
+
+  // Check for NVLink/NVswitch
+  int nvlink;
+  NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
+  if (nvlink > 0) {
+    *ret = 1;
+    return ncclSuccess;
+  }
+
+  // Finally compute the PCI distance and compare with the p2pLevel.
+  int distance;
+  NCCLCHECK(ncclTopoGpuDistance(topo, info1->busId, info2->busId, &distance));
+  if (distance < p2pLevel) {
+    *ret = 1;
+  }
+  return ncclSuccess;
+}
+
+#define TRACE_DUMP_IPC(DEVIPC)                                                             \
+  do { /* traces the first eight unsigned longs of DEVIPC; its only use in p2pSendSetup is commented out */ \
+    unsigned long *devIpc = (unsigned long *) (DEVIPC);                                    \
+    TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[0], devIpc[1], devIpc[2], devIpc[3]); \
+    TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
+  } while (0)
+
+/* Send: allocate the device-side send buffer and fill connectInfo with either a direct pointer (same process) or a CUDA IPC handle (other process) for the peer to connect to me */
+ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+    struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+
+  struct p2pSendResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  send->transportResources = resources;
+  int sendSize = sizeof(struct ncclSendMem);
+  ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
+  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
+
+  struct p2pConnectInfo info;
+  if (myInfo->pidHash == peerInfo->pidHash) { // equal pidHash: hand over the raw device pointer
+    info.direct = 1;
+    info.directPtr = resources->devMem;
+    if (myInfo->cudaDev == peerInfo->cudaDev) {
+      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+      return ncclInternalError; // sending to the same device is rejected here
+    } else {
+      // Enable P2P access
+      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
+      if (err == cudaErrorPeerAccessAlreadyEnabled) {
+        cudaGetLastError(); // already enabled is fine: clear the recorded CUDA error
+      } else if (err != cudaSuccess) {
+        WARN("failed to peer with device %d(=%lx): %d %s",
+             peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
+        return ncclInternalError;
+      }
+      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer",
+          channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+    }
+  } else {
+    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+    int peerCudaDev = busIdToCudaDev(peerInfo->busId); // only used in the warning below
+    info.direct = 0;
+    // Export the send buffer as a CUDA IPC handle for the peer process
+    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
+    if (err != cudaSuccess) {
+      WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
+           myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
+      return ncclInternalError;
+    }
+    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC",
+        channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+    //TRACE_DUMP_IPC(&info.devIpc);
+  }
+  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
+  memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
+  return ncclSuccess;
+}
+
+/* Recv: allocate this rank's ncclRecvMem (header plus buffSize bytes) and describe in connectInfo how the peer reaches it: a raw pointer when both pidHash values match, a CUDA IPC handle otherwise. */
+ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+    struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
+  // topo and graph are never read here. Returns ncclSuccess, or ncclInternalError on the CUDA failures below (NCCLCHECK presumably propagates allocation errors -- macro not visible here).
+  struct p2pRecvResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  recv->transportResources = resources; // attached before the fallible steps; the error returns below do not free it
+  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; // everything that precedes buff in ncclRecvMem, then buffSize bytes
+  ALIGN_SIZE(recvSize, CUDA_IPC_MIN); // presumably rounds recvSize up to a multiple of CUDA_IPC_MIN; macro not visible here -- confirm
+  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
+
+  struct p2pConnectInfo info; // not zeroed: only the members assigned below hold defined values
+  if (myInfo->pidHash == peerInfo->pidHash) {
+    info.direct = 1;
+    info.directPtr = resources->devMem; // the peer uses this address as-is (p2pSendConnect casts it to ncclRecvMem*)
+    if (myInfo->cudaDev == peerInfo->cudaDev) {
+      TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
+    } else {
+      // Enable P2P access from the current device to the peer's device
+      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
+      if (err == cudaErrorPeerAccessAlreadyEnabled) {
+        cudaGetLastError(); // "already enabled" is not a failure; the call is made to clear the recorded CUDA error
+      } else if (err != cudaSuccess) {
+        WARN("failed to peer with device %d(=%lx): %d %s",
+             peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
+        return ncclInternalError;
+      }
+      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+    }
+  } else {
+    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES); only used in the WARN below
+    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
+    info.direct = 0;
+    // Export an IPC handle for devMem; the handle is opened on the peer side in p2pSendConnect (cudaIpcOpenMemHandle)
+    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
+    if (err != cudaSuccess) {
+      WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
+           myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
+      return ncclInternalError;
+    }
+    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+    //TRACE_DUMP_IPC(&info.devIpc);
+  }
+  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); // guarantees the memcpy below fits in *connectInfo
+  memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo)); // copies the whole struct, including union bytes that were not assigned
+  return ncclSuccess;
+}
+
+/* Connect/Send to this peer: connectInfo holds the p2pConnectInfo describing the peer's ncclRecvMem; fills send->conn with pointers into that remote structure and into our own ncclSendMem. */
+static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+  struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
+  struct ncclRecvMem* remDevMem; // the peer's receive-side memory, reached directly or through IPC
+  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+  if (info->direct) {
+    remDevMem = (struct ncclRecvMem*)(info->directPtr);
+    send->conn.direct = 1; // only assigned on this branch; the IPC branch leaves conn.direct untouched
+  } else {
+    //TRACE_DUMP_IPC(&info->devIpc);
+    cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+    remDevMem = (struct ncclRecvMem*)resources->ipcPtr; // assigned before the check, but not used if err is set (we return first)
+    if (err != cudaSuccess) {
+      WARN("failed to open CUDA IPC handle : %d %s",
+          err, cudaGetErrorString(err));
+      return ncclUnhandledCudaError;
+    }
+  }
+  // Data buffers, tail and opCountRem refer to the peer's ncclRecvMem; head, ptrExchange and opCountLoc refer to our own ncclSendMem.
+  send->conn.buff = remDevMem->buff;
+  send->conn.llBuff = remDevMem->llBuff;
+  send->conn.ll128Buff = remDevMem->ll128Buff;
+  send->conn.tail = &remDevMem->tail;
+  send->conn.opCountRem = &remDevMem->opCount;
+  send->conn.head = &resources->devMem->head;
+  send->conn.ptrExchange = &resources->devMem->ptrExchange;
+  send->conn.opCountLoc = &resources->devMem->opCount;
+  return ncclSuccess;
+}
+
+/* Connect/Recv from this peer: connectInfo holds the p2pConnectInfo describing the peer's ncclSendMem; fills recv->conn with pointers into our own ncclRecvMem and into that remote structure. */
+ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+  struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
+  struct ncclSendMem* remDevMem; // the peer's send-side memory, reached directly or through IPC
+  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+  if (info->direct) {
+    remDevMem = (struct ncclSendMem*)(info->directPtr);
+    recv->conn.direct = 1; // only assigned on this branch; the IPC branch leaves conn.direct untouched
+    recv->conn.ptrExchange = &remDevMem->ptrExchange; // likewise only set for the direct case
+  } else {
+    //TRACE_DUMP_IPC(&info->devIpc);
+    cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+    remDevMem = (struct ncclSendMem*)resources->ipcPtr; // assigned before the check, but not used if err is set (we return first)
+    if (err != cudaSuccess) {
+      WARN("failed to open CUDA IPC handle : %d %s",
+          err, cudaGetErrorString(err));
+      return ncclUnhandledCudaError;
+    }
+  }
+  // Data buffers, tail and opCountLoc refer to our own ncclRecvMem; head and opCountRem refer to the peer's ncclSendMem.
+  recv->conn.buff = resources->devMem->buff;
+  recv->conn.llBuff = resources->devMem->llBuff;
+  recv->conn.ll128Buff = resources->devMem->ll128Buff;
+  recv->conn.tail = &resources->devMem->tail;
+  recv->conn.opCountLoc = &resources->devMem->opCount;
+  recv->conn.head = &remDevMem->head;
+  recv->conn.opCountRem = &remDevMem->opCount;
+  return ncclSuccess;
+}
+
+ncclResult_t p2pSendFree(void* resources) { // Releases a struct p2pSendResources: closes the IPC mapping if one was opened, frees devMem, then frees the struct itself.
+  struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
+  if (sendRes->ipcPtr) // ipcPtr is only written by cudaIpcOpenMemHandle in p2pSendConnect's IPC branch
+    CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
+  CUDACHECK(cudaFree(sendRes->devMem)); // NOTE(review): if CUDACHECK returns early on failure (macro not visible here), the releases after it are skipped -- confirm
+  free(sendRes);
+  return ncclSuccess;
+}
+
+ncclResult_t p2pRecvFree(void* resources) { // Releases a struct p2pRecvResources: closes the IPC mapping if one was opened, frees devMem, then frees the struct itself.
+  struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
+  if (recvRes->ipcPtr) // ipcPtr is only written by cudaIpcOpenMemHandle in p2pRecvConnect's IPC branch
+    CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
+  CUDACHECK(cudaFree(recvRes->devMem)); // NOTE(review): if CUDACHECK returns early on failure (macro not visible here), the releases after it are skipped -- confirm
+  free(recvRes);
+  return ncclSuccess;
+}
+
+struct ncclTransport p2pTransport = { // Entry-point table for the P2P transport; slot meanings are fixed by struct ncclTransport, which is declared elsewhere
+  "P2P", // presumably the transport's name -- confirm against struct ncclTransport
+  p2pCanConnect,
+  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL }, // send side: setup, connect, free; fourth slot left NULL
+  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL } // recv side: setup, connect, free; fourth slot left NULL
+};
diff --git a/src/transport/p2p.cu b/src/transport/p2p.cu
deleted file mode 100644
index 9f3e0b6..0000000
--- a/src/transport/p2p.cu
+++ /dev/null
@@ -1,620 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "utils.h"
-#include "topo.h"
-#include "transport.h"
-#include "param.h"
-#include <unistd.h>
-#include <cuda_runtime.h>
-#include <ctype.h>
-#include "nvlink.h"
-
-struct p2pConnectInfo {
-  int direct;
-  union {
-    void* directPtr;
-    cudaIpcMemHandle_t devIpc;
-  };
-};
-
-struct p2pSendResources {
-  struct ncclSendMem* devMem;
-  void* ipcPtr;
-};
-
-struct p2pRecvResources {
-  struct ncclRecvMem* devMem;
-  void* ipcPtr;
-};
-
-#include <sys/types.h>
-
-NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
-NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
-
-/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
-static int busIdToCudaDev(const char* busId) {
-  int ndev;
-  if (cudaGetDeviceCount(&ndev) != cudaSuccess)
-    return -1;
-  for (int i = 0; i < ndev; i++) {
-    char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-    if (cudaDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess)
-      return -1;
-    if (strcmp(busId, devBusId) == 0) {
-      return i;
-    }
-  }
-  // BusId was not found in our locally visible CUDA devices
-  return -1;
-}
-
-/* Determine if we can communicate with the peer through p2p */
-ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
-  // Do not use P2P across root complexes by default (provided CUDA permits it)
-  int p2pLevel = PATH_SOC;
-  if (ncclParamP2pDisable() == 1) p2pLevel = 0;
-  if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
-
-  *ret = 0;
-
-  if (p2pLevel == 0) return ncclSuccess;
-
-  // Rule out different nodes
-  if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess;
-
-  // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
-  int peerCudaDev = busIdToCudaDev(peerInfo->busId);
-  if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process
-
-  TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
-
-  // Do not detect topology if we're on the same GPU. Note this is not really supported.
-  if (myInfo->cudaDev == peerCudaDev) {
-    *ret = 1 + PATH_SOC;
-    return ncclSuccess;
-  }
-
-  // See if CUDA can do P2P
-  int p2p;
-  if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != cudaSuccess) {
-    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)",
-         myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
-    return ncclSuccess;
-  }
-  if (p2p == 0) return ncclSuccess;
-
-  // Check for NVLink/NVswitch
-  int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
-  if (nvlinkp2p > 0) {
-    *ret = nvlinkp2p;
-    return ncclSuccess;
-  }
-
-  // Finally compute the PCI distance and compare with the p2pLevel.
-  char* myPath;
-  char* peerPath;
-  ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath);
-  ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath);
-  if (err1 == ncclSuccess && err2 == ncclSuccess) {
-    int distance = pciDistance(myPath, peerPath);
-    if (distance < p2pLevel) {
-      *ret = 1 + PATH_SOC - distance;
-    }
-  }
-  if (err1 == ncclSuccess) free(myPath);
-  if (err2 == ncclSuccess) free(peerPath);
-  return ncclSuccess;
-}
-
-static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
-  int nrings = 0;
-  ncclTvalue_t* line = matrix+current*n;
-  inTheRing[current] = 1;
-  int currentStep = (currentRing+1)*n-remaining;
-  rings[currentStep-1] = current;
-  if (remaining == 0) {
-    int looprank = rings[currentRing*n];
-    if (line[looprank] > 0) {
-      if (currentRing+1 == nRingsMax) {
-        nrings = 1;
-      } else {
-        line[looprank]--;
-        for (int i=0; i<n; i++) inTheRing[i] = 0;
-        if (connect) {
-          // First two slots are already set and we need to respect those constraints
-          inTheRing[rings[currentStep]] = 1;
-          nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, rings[currentStep+1], n-2, connect);
-        } else {
-          rings[(currentRing+1)*n] = 0;
-          nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, 0, n-1, connect);
-        }
-        line[looprank]++;
-        for (int i=0; i<n; i++) inTheRing[i] = 1;
-      }
-    }
-  } else {
-    int ringsSave[nRingsMax*n];
-    int maxStep = 0;
-    for (int i=0; i<n; i++) {
-      if (inTheRing[i] == 0 && line[i] > 0) {
-        line[i]--;
-        int nr = computeRingsRec(matrix, n, rings, currentRing, nRingsMax, inTheRing, i, remaining-1, connect);
-        if (nr > nrings) {
-          nrings = nr;
-          maxStep = (nr+currentRing)*n;
-          ringsSave[currentStep] = i;
-          // Save the rest of the rings
-          for (int r=currentStep+1; r<maxStep; r++) {
-            ringsSave[r] = rings[r];
-          }
-          if (nrings + currentRing == nRingsMax) {
-            // We found an optimal solution. Let's stop there.
-            break;
-          }
-        }
-        line[i]++;
-      }
-    }
-    for (int r=currentStep; r<maxStep; r++) {
-      rings[r] = ringsSave[r];
-    }
-  }
-  inTheRing[current] = 0;
-  return nrings;
-}
-
-static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) {
-  if (nrings == 0) return 0;
-  // Copy rings by dup times
-  if (newNrings > MAXCHANNELS) {
-    newNrings = MAXCHANNELS;
-  }
-  for (int r=nrings; r<newNrings; r++) {
-    for (int i=0; i<nranks; i++) rings[r*nranks+i] = rings[(r%nrings)*nranks+i];
-  }
-  return newNrings;
-}
-
-int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nringsMax, int connect) {
-  int* inTheRing = (int*)malloc(sizeof(int)*nranks);
-  if (inTheRing == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*nranks); return 0; }
-  for (int i=0; i<nranks; i++) inTheRing[i] = 0;
-  int nrings;
-  if (connect) {
-    inTheRing[rings[0]] = 1;
-    nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, rings[1], nranks-2, connect);
-  } else {
-    rings[0] = 0;
-    nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, 0, nranks-1, connect);
-  }
-  free(inTheRing);
-  return nrings;
-}
-
-static inline int findConnect(int nranks, int* ranks) {
-  for (int i = 0; i<nranks; i++) {
-    if (ranks[i] != -1) return i;
-  }
-  return -1;
-}
-
-int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) {
-  if (nrings == 0) return 0;
-  if (nrings > MAXCHANNELS) {
-    WARN("Max rings reached, limiting to %d", MAXCHANNELS);
-    nrings = MAXCHANNELS;
-  }
-  // Find existing constraints / connections
-  int connect = 0;
-  for (int r=0; r<nrings; r++) {
-    int start = findConnect(nranks, prev+r*nranks);
-    int end = findConnect(nranks, next+r*nranks);
-    if (start != -1 && end != -1) {
-      rings[r*nranks] = end;
-      rings[r*nranks+1] = start;
-      connect = 1;
-    }
-  }
-
-  // Compute rings
-  ncclTvalue_t* matrix = (ncclTvalue_t*)malloc(sizeof(ncclTvalue_t)*nranks*nranks);
-  if (matrix == NULL) { WARN("malloc of %ld bytes failed", sizeof(ncclTvalue_t)*nranks*nranks); return 0; }
-  for (int i=0; i<nranks; i++) for (int j=0; j<nranks; j++)
-      matrix[i*nranks+j] = oversubscribe ? values[i*nranks+j]/CONNECT_NVLINK*2 : values[i*nranks+j]/CONNECT_NVLINK ;
-
-  int compNrings = p2pComputeRingsNvLink(matrix, nranks, rings, nrings, connect);
-
-  free(matrix);
-
-  if (oversubscribe || connect) return compNrings;
-
-  if (compNrings && compNrings < nrings && nranks <= 4) {
-    // Try to oversubscribe to get a better result
-    int *rings2 = (int *)malloc(sizeof(int)*MAXCHANNELS*nranks);
-    if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXCHANNELS*nranks); return 0; }
-    for (int i=0; i<MAXCHANNELS*nranks; i++) rings2[i] = -1;
-    int nThreads = *nthreads;
-    int compNrings2 = p2pComputeRingsNvLink(values, nranks, rings2, nrings, prev, next, 1, &nThreads);
-    if (compNrings2 > compNrings*2) {
-      // Oversubscription worked.
-      for (int i=0; i<compNrings2*nranks; i++) rings[i] = rings2[i];
-      compNrings = compNrings2;
-    }
-    free(rings2);
-  }
-
-  // Duplicate the rings for direct NVLink
-  compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
-
-  return compNrings;
-}
-
-int p2pComputeRingsSeqConnect(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
-  int nrings = nringsStart;
-  int connect = 0;
-  for (int r=0; r<nrings; r++) {
-    int start = findConnect(nranks, prev+r*nranks);
-    int end = findConnect(nranks, next+r*nranks);
-    if (start != -1 && end != -1) {
-      rings[r*nranks] = end;
-      rings[r*nranks+1] = start;
-      int cur = start;
-      for (int i=2; i<nranks; i++) {
-        int next = (cur+1) % nranks;
-        while (next == end || next == start) next = (next+1) % nranks;
-        if (values[cur*nranks+next] < minScore) {
-          return 0;
-        }
-        rings[r*nranks+i] = next;
-        cur = next;
-      }
-      connect = 1;
-    } else {
-      if (connect == 1 && r > 0) {
-        WARN("Connecting rings but did not find start/end for ring %d. Disabling other rings.", r);
-        return r;
-      } else {
-        return 0;
-      }
-    }
-  }
-  return nrings;
-}
-
-int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
-  for (int r=0; r<nringsStart; r++) {
-    for (int i=0; i<nranks; i++) {
-      rings[r*nranks+i] = i;
-    }
-  }
-  return nringsStart;
-}
-
-static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
-  for (int score = PATH_SOC+1; score >= minScore; score--) {
-    int best = -1;
-    int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end
-    for (int n = 0; n < nranks; n++) {
-      if (inRing[n]) continue;
-      if (values[rank*nranks+n] == score) {
-        if (end == -1) return n;
-        if (values[end*nranks+n] < worst_end_score) {
-          best = n;
-          worst_end_score = values[end*nranks+n];
-        }
-      }
-    }
-    if (best != -1) return best;
-  }
-  return -1;
-}
-
-int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int minScore) {
-  int connect = 0;
-  for (int r=0; r<nrings; r++) {
-    int start = findConnect(nranks, prev+r*nranks);
-    int end = findConnect(nranks, next+r*nranks);
-
-    int inRing[nranks];
-    for (int i=0; i<nranks; i++) inRing[i] = 0;
-
-    if (start == -1 && end == -1) {
-      if (connect == 1 && r > 0) {
-        WARN("Connecting ring %d : did not find start/end. Disabling other rings.", r);
-        return r;
-      }
-      end = 0;
-      inRing[end] = 1;
-      start = findClosestPci(values, inRing, end, -1, nranks, minScore);
-      if (start == -1) return r;
-    } else if (start == -1 || end == -1) {
-      WARN("Connecting ring %d : inconsistent start/end. Disabling other rings.", r);
-      return r;
-    } else {
-      connect = 1;
-    }
-    rings[r*nranks] = end;
-    rings[r*nranks+1] = start;
-    inRing[start] = inRing[end] = 1;
-    int cur = start;
-    for (int i=2; i<nranks; i++) {
-      int next = findClosestPci(values, inRing, cur, end, nranks, minScore);
-      if (next == -1) return r;
-
-      inRing[next] = 1;
-      rings[r*nranks+i] = next;
-      cur = next;
-    }
-    // Check the loop is closing
-    inRing[end] = 0;
-    if (findClosestPci(values, inRing, cur, end, nranks, minScore) != end) return r;
-
-    if (connect == 0) return 1;
-  }
-  return nrings;
-}
-
-ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
-  if (*nringsRet == 0) return ncclSuccess;
-  int *rings;
-  NCCLCHECK(ncclCalloc(&rings, MAXCHANNELS*nranks));
-  for (int i=0; i<MAXCHANNELS*nranks; i++) rings[i] = -1;
-  int nrings = *nringsRet;
-
-  // NVswitch
-  int nvswitchLinks = 0;
-  int directLinks = 0;
-  for (int rank=0; rank<nranks; rank++) {
-    for (int j=1; j<nranks; j++) {
-      int i = (rank + j) % nranks;
-      ncclTvalue_t links = values[rank*nranks+i]/CONNECT_NVSWITCH;
-      if (j>1 && links != nvswitchLinks) {
-        WARN("Internal error : NVswitch links mismatch");
-        return ncclInternalError;
-      }
-      nvswitchLinks = links;
-    }
-  }
-  if (nvswitchLinks) {
-    // NVSwitch : Connect existing rings
-    int nringsConnected = p2pComputeRingsSeqConnect(values, nranks, rings, nrings, prev, next, minScore, nthreads);
-    if (nringsConnected > 0) {
-      nrings = nringsConnected;
-    } else {
-      nrings = std::min(nrings, nvswitchLinks); // NVSwitch: Limit rings to number of NVLinks
-      // Or create new ones
-      nrings = p2pComputeRingsSeqNew(values, nranks, rings, nrings, prev, next, minScore, nthreads);
-      // And duplicate them
-      nrings = copyRings(nranks, rings, nrings, nrings*2);
-    }
-    goto end;
-  }
-
-  // point-to-point NVLink
-  for (int rank=0; rank<nranks; rank++) {
-    int links = 0;
-    for (int i=0; i<nranks; i++) {
-      ncclTvalue_t val = values[rank*nranks+i];
-      if (val >= CONNECT_NVSWITCH) continue;
-      links += val/CONNECT_NVLINK;
-    }
-    if (rank == 0) directLinks = links;
-    else directLinks =  std::min(directLinks, links);
-  }
-  if (directLinks > 0) {
-    // NVLink : Connect rings or create new ones
-    nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
-    goto end;
-  }
-
-  // PCIe or QPI : Connect rings or create new ones
-  nrings = p2pComputeRingsPci(values, nranks, rings, *nringsRet, prev, next, minScore);
-
-end:
-  *nringsRet = nrings;
-  for (int ring = 0; ring<nrings; ring++) {
-    for (int index=0; index<nranks; index++) {
-      int prevIndex = (index - 1 + nranks) % nranks;
-      int nextIndex = (index + 1) % nranks;
-      int curRank = rings[ring*nranks+index];
-      int prevRank = rings[ring*nranks+prevIndex];
-      int nextRank = rings[ring*nranks+nextIndex];
-      if (prev[ring*nranks+curRank] == -1) prev[ring*nranks+curRank] = prevRank;
-      if (next[ring*nranks+curRank] == -1) next[ring*nranks+curRank] = nextRank;
-    }
-  }
-
-  free(rings);
-  return ncclSuccess;
-}
-
-#define TRACE_DUMP_IPC(DEVIPC)                                                             \
-  do {                                                                                     \
-    unsigned long *devIpc = (unsigned long *) (DEVIPC);                                    \
-    TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[0], devIpc[1], devIpc[2], devIpc[3]); \
-    TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
-  } while (0)
-
-/* Send: Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
-    struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
-
-  struct p2pSendResources* resources;
-  NCCLCHECK(ncclCalloc(&resources, 1));
-  send->transportResources = resources;
-  const int sendSize = sizeof(struct ncclSendMem);
-  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
-
-  struct p2pConnectInfo info;
-  if (myInfo->pidHash == peerInfo->pidHash) {
-    info.direct = 1;
-    info.directPtr = resources->devMem;
-    if (myInfo->cudaDev == peerInfo->cudaDev) {
-      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank);
-    } else {
-      // Enable P2P access
-      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
-      if (err == cudaErrorPeerAccessAlreadyEnabled) {
-        cudaGetLastError();
-      } else if (err != cudaSuccess) {
-        WARN("failed to peer with device %d(=%d): %d %s",
-             peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
-        return ncclInternalError;
-      }
-      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
-          channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
-    }
-  } else {
-    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
-    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
-    info.direct = 0;
-    // Map IPC and enable P2P access
-    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
-    if (err != cudaSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
-           myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
-      return ncclInternalError;
-    }
-    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
-        channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
-    //TRACE_DUMP_IPC(&info.devIpc);
-  }
-  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
-  memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
-  return ncclSuccess;
-}
-
-/* Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
-    struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
-
-  struct p2pRecvResources* resources;
-  NCCLCHECK(ncclCalloc(&resources, 1));
-  recv->transportResources = resources;
-  const int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
-  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
-
-  struct p2pConnectInfo info;
-  if (myInfo->pidHash == peerInfo->pidHash) {
-    info.direct = 1;
-    info.directPtr = resources->devMem;
-    if (myInfo->cudaDev == peerInfo->cudaDev) {
-      TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
-    } else {
-      // Enable P2P access
-      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
-      if (err == cudaErrorPeerAccessAlreadyEnabled) {
-        cudaGetLastError();
-      } else if (err != cudaSuccess) {
-        WARN("failed to peer with device %d(=%d): %d %s",
-             peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
-        return ncclInternalError;
-      }
-      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
-    }
-  } else {
-    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
-    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
-    info.direct = 0;
-    // Map IPC and enable P2P access
-    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
-    if (err != cudaSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
-           myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
-      return ncclInternalError;
-    }
-    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
-    //TRACE_DUMP_IPC(&info.devIpc);
-  }
-  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
-  memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
-  return ncclSuccess;
-}
-
-/* Connect/Send to this peer */
-static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
-  struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
-  struct ncclRecvMem* remDevMem;
-  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
-  if (info->direct) {
-    remDevMem = (struct ncclRecvMem*)(info->directPtr);
-    send->conn.direct = 1;
-  } else {
-    //TRACE_DUMP_IPC(&info->devIpc);
-    cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
-    remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
-    if (err != cudaSuccess) {
-      WARN("failed to open CUDA IPC handle : %d %s",
-          err, cudaGetErrorString(err));
-      return ncclUnhandledCudaError;
-    }
-  }
-
-  send->conn.buff = remDevMem->buff;
-  send->conn.llBuff = remDevMem->llBuff;
-  send->conn.tail = &remDevMem->tail;
-  send->conn.opCountRem = &remDevMem->opCount;
-  send->conn.head = &resources->devMem->head;
-  send->conn.ptrExchange = &resources->devMem->ptrExchange;
-  send->conn.opCountLoc = &resources->devMem->opCount;
-  return ncclSuccess;
-}
-
-/* Connect/Recv from this peer */
-ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
-  struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
-  struct ncclSendMem* remDevMem;
-  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
-  if (info->direct) {
-    remDevMem = (struct ncclSendMem*)(info->directPtr);
-    recv->conn.direct = 1;
-    recv->conn.ptrExchange = &remDevMem->ptrExchange;
-  } else {
-    //TRACE_DUMP_IPC(&info->devIpc);
-    cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
-    remDevMem = (struct ncclSendMem*)resources->ipcPtr;
-    if (err != cudaSuccess) {
-      WARN("failed to open CUDA IPC handle : %d %s",
-          err, cudaGetErrorString(err));
-      return ncclUnhandledCudaError;
-    }
-  }
-
-  recv->conn.buff = resources->devMem->buff;
-  recv->conn.llBuff = resources->devMem->llBuff;
-  recv->conn.tail = &resources->devMem->tail;
-  recv->conn.opCountLoc = &resources->devMem->opCount;
-  recv->conn.head = &remDevMem->head;
-  recv->conn.opCountRem = &remDevMem->opCount;
-  return ncclSuccess;
-}
-
-ncclResult_t p2pSendFree(void* resources) {
-  struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
-  if (sendRes->ipcPtr)
-    CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
-  CUDACHECK(cudaFree(sendRes->devMem));
-  return ncclSuccess;
-}
-
-ncclResult_t p2pRecvFree(void* resources) {
-  struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
-  if (recvRes->ipcPtr)
-    CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
-  CUDACHECK(cudaFree(recvRes->devMem));
-  return ncclSuccess;
-}
-
-struct ncclTransport p2pTransport = {
-  "P2P",
-  p2pCanConnect,
-  p2pGetRings,
-  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
-  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
-};
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
new file mode 100644
index 0000000..60f16c8
--- /dev/null
+++ b/src/transport/shm.cc
@@ -0,0 +1,175 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "shm.h"
+
+struct shmConnectInfo {  // Handshake record that the Setup functions memcpy into struct ncclConnect
+  uint64_t pidHash;  // creator's myInfo->pidHash; first component of the segment name
+  int id;            // channelId given to Setup; second component of the segment name
+  int sendRank;      // rank of the sending end of this connection
+  int recvRank;      // rank of the receiving end of this connection
+  int shmSize;       // size the creator passed to shmOpen; the peer reuses it when opening the segment
+};
+
+struct shmSendResources {  // Sender-side per-connection state, stored in send->transportResources
+  int remShmSize;                     // info->shmSize received from the peer (set in shmSendConnect)
+  struct ncclRecvMem* remHostMem;     // first pointer filled by shmOpen for the peer's "nccl-shm-recv-" segment
+  struct ncclRecvMem* devRemHostMem;  // second pointer filled by the same shmOpen; source of send->conn.buff/llBuff/ll128Buff/tail/opCountRem
+  int shmSize;                        // sizeof(struct ncclSendMem), set in shmSendSetup
+  struct ncclSendMem* hostMem;        // first pointer filled by shmOpen for my own "nccl-shm-send-" segment
+  struct ncclSendMem* devHostMem;     // second pointer filled by the same shmOpen; source of send->conn.head/opCountLoc
+};
+
+struct shmRecvResources {  // Receiver-side per-connection state, stored in recv->transportResources
+  int remShmSize;                     // info->shmSize received from the peer (set in shmRecvConnect)
+  struct ncclSendMem* remHostMem;     // first pointer filled by shmOpen for the peer's "nccl-shm-send-" segment
+  struct ncclSendMem* devRemHostMem;  // second pointer filled by the same shmOpen; source of recv->conn.head/opCountRem
+  int shmSize;                        // offsetof(struct ncclRecvMem, buff)+buffSize, set in shmRecvSetup
+  struct ncclRecvMem* hostMem;        // first pointer filled by shmOpen for my own "nccl-shm-recv-" segment
+  struct ncclRecvMem* devHostMem;     // second pointer filled by the same shmOpen; source of recv->conn.buff/llBuff/ll128Buff/tail/opCountLoc
+};
+
+NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);  // presumably defines ncclParamShmDisable() (macro not visible here); shmCanConnect reports 0 when that returns 1
+
+/* Determine whether two peers can communicate with SHM */
+ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+  // Default answer is "no"; only the final assignment flips it to 1.
+  *ret = 0;
+  if (ncclParamShmDisable() == 1) return ncclSuccess;
+
+  // Same host?
+  TRACE(NCCL_INIT|NCCL_SHM, "peer1 hostHash %lx peer2 hostHash %lx", info1->hostHash, info2->hostHash);
+  const bool sameHost = (info1->hostHash == info2->hostHash);
+  if (!sameHost) return ncclSuccess;
+
+  // Common /dev/shm (between containers) ?
+  TRACE(NCCL_INIT|NCCL_SHM, "peer1 shmDev %lx peer2 shmDev %lx", info1->shmDev, info2->shmDev);
+  const bool sameShmDev = (info1->shmDev == info2->shmDev);
+  if (!sameShmDev) return ncclSuccess;
+
+  *ret = 1;
+  return ncclSuccess;
+}
+
+#define MAX_SHM_NAME_LEN 1024  // capacity of the stack buffers that hold the formatted segment names
+
+/* Create my send-side segment and return, in connectInfo, what the peer needs to open it */
+ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+  // Allocate the per-connection state and attach it to the connector.
+  struct shmSendResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  send->transportResources = resources;
+
+  struct shmConnectInfo info;
+  info.id = channelId;
+  info.pidHash = myInfo->pidHash;
+  info.sendRank = myInfo->rank;
+  info.recvRank = peerInfo->rank;
+
+  // Bounded formatting: the name can never be written past the end of shmName.
+  char shmName[MAX_SHM_NAME_LEN];
+  snprintf(shmName, sizeof(shmName), "nccl-shm-send-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
+  info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
+  NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+  static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
+  memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
+  return ncclSuccess;
+}
+
+ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+  // Receive-side counterpart of shmSendSetup: open my segment and describe it in connectInfo.
+  struct shmRecvResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  recv->transportResources = resources;
+
+  struct shmConnectInfo info;
+  info.id = channelId;
+  info.pidHash = myInfo->pidHash;
+  info.sendRank = peerInfo->rank;
+  info.recvRank = myInfo->rank;
+
+  char shmName[MAX_SHM_NAME_LEN];
+  snprintf(shmName, sizeof(shmName), "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
+  info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize;  // ncclRecvMem up to buff, plus buffSize bytes
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
+  NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+  static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
+  memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
+  return ncclSuccess;
+}
+
+/* Connect to this peer: open the segment described in connectInfo and fill send->conn */
+ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+  // connectInfo is expected to hold the record built by the peer's shmRecvSetup (same "nccl-shm-recv-" name)
+  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
+  struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
+
+  char shmName[MAX_SHM_NAME_LEN];
+  snprintf(shmName, sizeof(shmName), "nccl-shm-recv-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
+  resources->remShmSize = info->shmSize;
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
+  NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+  // Remove the file to ensure proper clean-up
+  NCCLCHECK(shmUnlink(shmName));
+
+  // Data buffers, tail and the remote opCount come from the peer's segment opened above
+  send->conn.buff = resources->devRemHostMem->buff;
+  send->conn.llBuff = resources->devRemHostMem->llBuff;
+  send->conn.ll128Buff = resources->devRemHostMem->ll128Buff;
+  send->conn.tail = &resources->devRemHostMem->tail;
+  send->conn.opCountRem = &resources->devRemHostMem->opCount;
+
+  send->conn.head = &resources->devHostMem->head;  // my own segment, opened in shmSendSetup
+  send->conn.opCountLoc = &resources->devHostMem->opCount;
+  return ncclSuccess;
+}
+
+ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+  // Open the "nccl-shm-send-" segment described in connectInfo and fill recv->conn
+  struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
+  struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
+
+  char shmName[MAX_SHM_NAME_LEN];
+  snprintf(shmName, sizeof(shmName), "nccl-shm-send-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
+  resources->remShmSize = info->shmSize;
+  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
+  NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+  NCCLCHECK(shmUnlink(shmName));  // Remove the file to ensure proper clean-up
+  recv->conn.head = &resources->devRemHostMem->head;  // head and remote opCount come from the peer's segment
+  recv->conn.opCountRem = &resources->devRemHostMem->opCount;
+
+  recv->conn.buff = resources->devHostMem->buff;  // everything below comes from my own segment, opened in shmRecvSetup
+  recv->conn.llBuff = resources->devHostMem->llBuff;
+  recv->conn.ll128Buff = resources->devHostMem->ll128Buff;
+  recv->conn.tail = &resources->devHostMem->tail;
+  recv->conn.opCountLoc = &resources->devHostMem->opCount;
+  return ncclSuccess;
+}
+
+ncclResult_t shmSendFree(void* transportResources) {  // Close both segments of a send connection, then free its state
+  struct shmSendResources* resources = (struct shmSendResources*)transportResources;
+  NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));  // my own segment (opened in shmSendSetup)
+  NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));  // peer's segment (opened in shmSendConnect)
+  free(resources);  // NOTE(review): skipped if NCCLCHECK returns early on a failed close — confirm against the macro definition
+  return ncclSuccess;
+}
+
+ncclResult_t shmRecvFree(void* transportResources) {  // Close both segments of a recv connection, then free its state
+  struct shmRecvResources* resources = (struct shmRecvResources*)transportResources;
+  NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));  // my own segment (opened in shmRecvSetup)
+  NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));  // peer's segment (opened in shmRecvConnect)
+  free(resources);  // NOTE(review): skipped if NCCLCHECK returns early on a failed close — confirm against the macro definition
+  return ncclSuccess;
+}
+
+struct ncclTransport shmTransport = {  // Table tying the SHM callbacks together; field names are in struct ncclTransport (not visible here)
+  "SHM",  // presumably the transport's display name
+  shmCanConnect,  // connectivity test
+  { shmSendSetup, shmSendConnect, shmSendFree, NULL },  // send-side setup / connect / free; fourth slot left NULL
+  { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL }  // recv-side setup / connect / free; fourth slot left NULL
+};
diff --git a/src/transport/shm.cu b/src/transport/shm.cu
deleted file mode 100644
index 56e0242..0000000
--- a/src/transport/shm.cu
+++ /dev/null
@@ -1,245 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "utils.h"
-#include "transport.h"
-#include "param.h"
-#include "shm.h"
-#include <unistd.h>
-#include <cuda_runtime.h>
-
-struct shmSendConnectInfo {
-  uint64_t pidHash;
-  int id;
-  int rank;
-  int shmSize;
-};
-
-struct shmRecvConnectInfo {
-  uint64_t pidHash;
-  int id;
-  int rank;
-  int shmSize;
-};
-
-struct shmSendResources {
-  int remShmSize;
-  struct ncclRecvMem* remHostMem;
-  struct ncclRecvMem* devRemHostMem;
-  int shmSize;
-  struct ncclSendMem* hostMem;
-  struct ncclSendMem* devHostMem;
-};
-
-struct shmRecvResources {
-  int remShmSize;
-  struct ncclSendMem* remHostMem;
-  struct ncclSendMem* devRemHostMem;
-  int shmSize;
-  struct ncclRecvMem* hostMem;
-  struct ncclRecvMem* devHostMem;
-};
-
-NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
-
-/* Determine if we can communicate with the peer */
-ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
-  *ret = ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) ? 0 : 1;
-  return ncclSuccess;
-}
-
-static inline int groupFirst(int nranks, int* groups, int group, int rankToAvoid) {
-  for (int rank = 0; rank<nranks; rank++) {
-    if ((groups[rank] == group) && (rank != rankToAvoid)) return rank;
-  }
-  return -1;
-}
-
-static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid) {
-  for (int rank = nranks-1; rank>=0; rank--) {
-    if ((groups[rank] == group) && (rank != rankToAvoid)) return rank;
-  }
-  return -1;
-}
-
-ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
-  if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
-  int nGroups = groups[nranks-1] + 1;
-  int starts[nGroups];
-  int ends[nGroups];
-  for (int ring = 0; ring<*nringsRet; ring++) {
-    int startGroup = -1, endGroup = -1;
-    for (int group = 0; group<nGroups; group++) {
-      int start = -1;
-      int end = -1;
-      int nranksInGroup = 0;
-      for (int rank=0; rank<nranks; rank++) {
-        if (groups[rank] != group) continue;
-        nranksInGroup++;
-        if (prev[ring*nranks+rank] != -1) {
-          if (start != -1) {
-            WARN("Multiple starts found in group");
-          }
-          start = rank;
-          startGroup = group;
-        }
-        if (next[ring*nranks+rank] != -1) {
-          if (end != -1) {
-            WARN("Multiple ends found in group");
-          }
-          end = rank;
-          endGroup = group;
-        }
-      }
-      if (nranksInGroup == 1) {
-        start = end = groupFirst(nranks, groups, group, -1);
-      } else {
-        if (start == -1)
-          start = groupFirst(nranks, groups, group, end);
-        if (end == -1)
-          end = groupLast(nranks, groups, group, start);
-      }
-      if (start == -1 || end == -1) {
-        *nringsRet = ring;
-        return ncclSuccess;
-      }
-      starts[group] = start;
-      ends[group] = end;
-    }
-    if (endGroup == -1 || startGroup == -1) {
-      startGroup = 0;
-      endGroup = nGroups-1;
-      // Close the loop
-      next[ring*nranks+ends[endGroup]] = starts[startGroup];
-      prev[ring*nranks+starts[startGroup]] = ends[endGroup];
-    }
-    int group = startGroup;
-    for (int i=0; i<nGroups-2; i++) {
-      int nextGroup = (group+1)%nGroups;
-      if (nextGroup == endGroup) nextGroup = (nextGroup+1)%nGroups;
-      next[ring*nranks+ends[group]] = starts[nextGroup];
-      prev[ring*nranks+starts[nextGroup]] = ends[group];
-      group = nextGroup;
-    }
-    // Connect with the last
-    next[ring*nranks+ends[group]] = starts[endGroup];
-    prev[ring*nranks+starts[endGroup]] = ends[group];
-  }
-  return ncclSuccess;
-}
-
-#define MAX_SHM_NAME_LEN 1024
-
-/* Create and return connect structures for this peer to connect to me */
-ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
-
-  struct shmSendResources* resources;
-  NCCLCHECK(ncclCalloc(&resources, 1));
-  send->transportResources = resources;
-
-  struct shmRecvConnectInfo info;
-  char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
-  info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
-  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
-  NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
-
-  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
-  info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
-  static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
-  memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
-  return ncclSuccess;
-}
-
-ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
-  struct shmRecvResources* resources;
-  NCCLCHECK(ncclCalloc(&resources, 1));
-  recv->transportResources = resources;
-
-  struct shmSendConnectInfo info;
-
-  char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
-  info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize;
-  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
-  NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
-
-  info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
-  static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
-  memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo));
-  return ncclSuccess;
-}
-
-/* Connect to this peer */
-ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
-  // Setup device pointers
-  struct shmSendConnectInfo* info = (struct shmSendConnectInfo*)connectInfo;
-  struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
-
-  char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank);
-  resources->remShmSize = info->shmSize;
-  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
-  NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
-  // Remove the file to ensure proper clean-up
-  NCCLCHECK(shmUnlink(shmName));
-
-  send->transportResources = resources;
-  send->conn.buff = resources->devRemHostMem->buff;
-  send->conn.llBuff = resources->devRemHostMem->llBuff;
-  send->conn.tail = &resources->devRemHostMem->tail;
-  send->conn.opCountRem = &resources->devRemHostMem->opCount;
-
-  send->conn.head = &resources->devHostMem->head;
-  send->conn.opCountLoc = &resources->devHostMem->opCount;
-  return ncclSuccess;
-}
-
-ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
-  // Setup device pointers
-  struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
-  struct shmRecvConnectInfo* info = (struct shmRecvConnectInfo*)connectInfo;
-
-  char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank);
-  resources->remShmSize = info->shmSize;
-  TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
-  NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
-  NCCLCHECK(shmUnlink(shmName));
-  recv->conn.head = &resources->devRemHostMem->head;
-  recv->conn.opCountRem = &resources->devRemHostMem->opCount;
-
-  recv->conn.buff = resources->devHostMem->buff;
-  recv->conn.llBuff = resources->devHostMem->llBuff;
-  recv->conn.tail = &resources->devHostMem->tail;
-  recv->conn.opCountLoc = &resources->devHostMem->opCount;
-  return ncclSuccess;
-}
-
-ncclResult_t shmSendFree(void* transportResources) {
-  struct shmSendResources* resources = (struct shmSendResources*)transportResources;
-  NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
-  NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
-  free(resources);
-  return ncclSuccess;
-}
-
-ncclResult_t shmRecvFree(void* transportResources) {
-  struct shmRecvResources* resources = (struct shmRecvResources*)transportResources;
-  NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
-  NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
-  free(resources);
-  return ncclSuccess;
-}
-
-struct ncclTransport shmTransport = {
-  "SHM",
-  shmCanConnect,
-  shmGetRings,
-  { shmSendSetup, shmSendConnect, shmSendFree, NULL },
-  { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL }
-};
author	Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com>	2020-01-07 02:48:51 +0300
committer	Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com>	2020-01-07 02:49:13 +0300
commit	7bed5ac058c3ca3c4617d133f3240b6981f95428 (patch)
tree	49e7c6b2e08790b5d7b9eda124ce6cd130c87f1f
parent	8e3a3f7c5b520babff49cec54a866fa3eda3a3b6 (diff)
parent	3899f6e0f219b0cbf58537f791b0dd104b377750 (diff)