github.com/marian-nmt/nccl.git
author     Sylvain Jeaugey <sjeaugey@nvidia.com>    2018-09-25 02:06:59 +0300
committer  Sylvain Jeaugey <sjeaugey@nvidia.com>    2018-09-26 00:12:01 +0300
commit     f93fe9bfd94884cec2ba711897222e0df5569a53 (patch)
tree       78b91eed1abfbaa3346b85bffe0c0ef9d9fb32bf
parent     286916a1a37ca1fe8cd43e280f5c42ec29569fc5 (diff)
tag        v2.3.5-5 (version 2.3.5-5)
Add support for inter-node communication using sockets and InfiniBand/RoCE. Improve latency. Add support for aggregation. Improve LL/regular tuning. Remove tests as those are now at github.com/nvidia/nccl-tests .
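As context for the new inter-node transports, the sketch below (not part of this commit) shows how a one-rank-per-process application would typically initialize NCCL and run an all-reduce that can span nodes over sockets or InfiniBand. The helper name, the GPU-per-rank mapping, and the out-of-band exchange of the unique id are assumptions, not code from this repository.

```c
#include <cuda_runtime.h>
#include <nccl.h>

/* Hypothetical helper: one process per GPU/rank. `rank` and `nranks` come
 * from the launcher (e.g. MPI), and `id` was created on rank 0 with
 * ncclGetUniqueId() and exchanged out of band by the application. */
void init_and_allreduce(int rank, int nranks, ncclUniqueId id,
                        float* d_buf, size_t count) {
  ncclComm_t comm;
  cudaStream_t stream;

  cudaSetDevice(rank % 8);          /* assumed: 8 GPUs per node */
  cudaStreamCreate(&stream);

  ncclCommInitRank(&comm, nranks, id, rank);

  /* In-place all-reduce; NCCL selects P2P, shared memory, TCP sockets or
   * InfiniBand verbs for each hop depending on the detected topology. */
  ncclAllReduce(d_buf, d_buf, count, ncclFloat, ncclSum, comm, stream);

  cudaStreamSynchronize(stream);
  ncclCommDestroy(comm);
}
```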
-rw-r--r--  .gitignore | 2
-rw-r--r--  LICENSE.txt | 2
-rw-r--r--  Makefile | 260
-rw-r--r--  README.md | 130
-rw-r--r--  debian/changelog.in | 5
l---------  debian/copyright | 1
-rw-r--r--  debian/libnccl-dev.install | 2
-rw-r--r--  debian/libnccl-dev.manpages | 1
-rw-r--r--  debian/libnccl1.install.in | 2
-rw-r--r--  debian/nccl.7 | 139
-rw-r--r--  debian/shlibs.local.in | 1
-rw-r--r--  fortran/Makefile | 81
-rw-r--r--  fortran/src/cudafor.f90 | 171
-rw-r--r--  fortran/src/ncclfor.f90 | 312
-rw-r--r--  fortran/test/allgather_arr_out.f90 | 162
-rw-r--r--  fortran/test/allgather_ptr_out.f90 | 171
-rw-r--r--  fortran/test/allreduce_arr_out.f90 | 165
-rw-r--r--  fortran/test/allreduce_ptr_out.f90 | 166
-rw-r--r--  fortran/test/broadcast_arr.f90 | 137
-rw-r--r--  fortran/test/broadcast_ptr.f90 | 142
-rw-r--r--  fortran/test/reduce_arr_out.f90 | 164
-rw-r--r--  fortran/test/reduce_ptr_out.f90 | 165
-rw-r--r--  fortran/test/reducescatter_arr_out.f90 | 165
-rw-r--r--  fortran/test/reducescatter_ptr_out.f90 | 174
-rw-r--r--  makefiles/common.mk | 87
-rw-r--r--  makefiles/formatting.mk | 33
-rw-r--r--  makefiles/version.mk | 6
-rw-r--r--  pkg/Makefile | 26
-rw-r--r--  pkg/debian/.gitignore (renamed from debian/.gitignore) | 0
-rw-r--r--  pkg/debian/Makefile | 58
-rw-r--r--  pkg/debian/changelog.in | 5
-rw-r--r--  pkg/debian/compat (renamed from debian/compat) | 0
-rw-r--r--  pkg/debian/control.in (renamed from debian/control.in) | 22
l---------  pkg/debian/copyright | 1
-rw-r--r--  pkg/debian/gbp.conf | 9
-rw-r--r--  pkg/debian/libnccl-dev.install.in | 3
-rw-r--r--  pkg/debian/libnccl2.install.in | 2
-rwxr-xr-x  pkg/debian/rules (renamed from debian/rules) | 0
-rw-r--r--  pkg/debian/source/format (renamed from debian/source/format) | 0
-rw-r--r--  pkg/redhat/Makefile | 62
-rw-r--r--  pkg/redhat/nccl.spec.in | 73
-rw-r--r--  pkg/srctxz/Makefile | 39
-rw-r--r--  pkg/srctxz/create_srctxz.sh.in | 34
-rw-r--r--  pkg/txz/Makefile | 43
-rw-r--r--  pkg/txz/create_txz.sh.in | 24
-rw-r--r--  src/Makefile | 106
-rw-r--r--  src/all_gather.cu | 202
-rw-r--r--  src/all_reduce.cu | 234
-rw-r--r--  src/bootstrap.cu | 292
-rw-r--r--  src/broadcast.cu | 164
-rw-r--r--  src/collectives/all_gather.cu | 32
-rw-r--r--  src/collectives/all_reduce.cu | 32
-rw-r--r--  src/collectives/broadcast.cu | 42
-rw-r--r--  src/collectives/collectives.h | 66
-rw-r--r--  src/collectives/device/Makefile | 86
-rw-r--r--  src/collectives/device/all_gather.cu | 15
-rw-r--r--  src/collectives/device/all_gather.h | 269
-rw-r--r--  src/collectives/device/all_reduce.cu | 21
-rw-r--r--  src/collectives/device/all_reduce.h | 332
-rw-r--r--  src/collectives/device/broadcast.cu | 15
-rw-r--r--  src/collectives/device/broadcast.h | 228
-rw-r--r--  src/collectives/device/common.h | 90
-rw-r--r--  src/collectives/device/common_kernel.h | 372
-rw-r--r--  src/collectives/device/functions.cu | 64
-rw-r--r--  src/collectives/device/ll_kernel.h | 154
-rw-r--r--  src/collectives/device/primitives.h | 226
-rw-r--r--  src/collectives/device/reduce.cu | 21
-rw-r--r--  src/collectives/device/reduce.h | 190
-rw-r--r--  src/collectives/device/reduce_kernel.h | 364
-rw-r--r--  src/collectives/device/reduce_scatter.cu | 21
-rw-r--r--  src/collectives/device/reduce_scatter.h | 217
-rw-r--r--  src/collectives/reduce.cu | 33
-rw-r--r--  src/collectives/reduce_scatter.cu | 32
-rw-r--r--  src/common_coll.h | 115
-rw-r--r--  src/common_kernel.h | 362
-rw-r--r--  src/copy_kernel.h | 55
-rw-r--r--  src/core.cu | 1019
-rw-r--r--  src/core.h | 162
-rw-r--r--  src/enqueue.h | 112
-rw-r--r--  src/include/bootstrap.h | 18
-rw-r--r--  src/include/common_coll.h | 195
-rw-r--r--  src/include/core.h | 385
-rw-r--r--  src/include/debug.h | 179
-rw-r--r--  src/include/enqueue.h | 26
-rw-r--r--  src/include/group.h | 24
-rw-r--r--  src/include/ibvwrap.h | 1109
-rw-r--r--  src/include/nccl_net.h | 64
-rw-r--r--  src/include/net.h | 40
-rw-r--r--  src/include/nvlink.h | 155
-rw-r--r--  src/include/nvmlwrap.h | 149
-rw-r--r--  src/include/param.h | 81
-rw-r--r--  src/include/ring.h | 14
-rw-r--r--  src/include/rings.h | 17
-rw-r--r--  src/include/shm.h | 76
-rw-r--r--  src/include/socket.h | 401
-rw-r--r--  src/include/topo.h | 83
-rw-r--r--  src/include/transport.h | 113
-rw-r--r--  src/include/utils.h | 25
-rw-r--r--  src/init.cu | 815
-rw-r--r--  src/libwrap.cu | 155
-rw-r--r--  src/libwrap.h | 54
-rw-r--r--  src/misc/enqueue.cu | 248
-rw-r--r--  src/misc/group.cu | 198
-rw-r--r--  src/misc/ibvwrap.cu | 290
-rw-r--r--  src/misc/nvmlwrap.cu | 248
-rw-r--r--  src/misc/rings.cu | 355
-rw-r--r--  src/misc/utils.cu | 129
-rw-r--r--  src/nccl.h | 203
-rw-r--r--  src/nccl.h.in | 251
-rw-r--r--  src/primitives.h | 206
-rw-r--r--  src/reduce.cu | 148
-rw-r--r--  src/reduce_kernel.h | 309
-rw-r--r--  src/reduce_scatter.cu | 165
-rw-r--r--  src/ring.cu | 70
-rw-r--r--  src/transport.cu | 187
-rw-r--r--  src/transport/net.cu | 521
-rw-r--r--  src/transport/net_ib.cu | 926
-rw-r--r--  src/transport/net_socket.cu | 239
-rw-r--r--  src/transport/p2p.cu | 611
-rw-r--r--  src/transport/shm.cu | 270
-rw-r--r--  test/include/test_utilities.h | 438
-rw-r--r--  test/mpi/mpi_test.cu | 93
-rw-r--r--  test/single/all_gather_scan.cu | 239
-rw-r--r--  test/single/all_gather_test.cu | 235
-rw-r--r--  test/single/all_reduce_scan.cu | 247
-rw-r--r--  test/single/all_reduce_test.cu | 301
-rw-r--r--  test/single/broadcast_scan.cu | 232
-rw-r--r--  test/single/broadcast_test.cu | 235
-rw-r--r--  test/single/reduce_scan.cu | 238
-rw-r--r--  test/single/reduce_scatter_scan.cu | 249
-rw-r--r--  test/single/reduce_scatter_test.cu | 285
-rw-r--r--  test/single/reduce_test.cu | 299
132 files changed, 12422 insertions(+), 9413 deletions(-)
diff --git a/.gitignore b/.gitignore
index 34a07c2..87ad949 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
/build
+*.gcov
+/coverage/
diff --git a/LICENSE.txt b/LICENSE.txt
index c7efd73..3593a7a 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,5 +1,5 @@
- Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
+ Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
diff --git a/Makefile b/Makefile
index c37b7f7..482f40e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,236 +1,30 @@
#
-# Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
#
-# See LICENCE.txt for license information
+# See LICENSE.txt for license information
#
-
-CUDA_HOME ?= /usr/local/cuda
-PREFIX ?= /usr/local
-VERBOSE ?= 0
-KEEP ?= 0
-DEBUG ?= 0
-PROFAPI ?= 0
-BUILDDIR ?= build
-BUILDDIR := $(abspath $(BUILDDIR))
-
-CUDA_LIB ?= $(CUDA_HOME)/lib64
-CUDA_INC ?= $(CUDA_HOME)/include
-NVCC ?= $(CUDA_HOME)/bin/nvcc
-
-NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
- -gencode=arch=compute_50,code=sm_50 \
- -gencode=arch=compute_52,code=sm_52 \
- -gencode=arch=compute_60,code=sm_60\
- -gencode=arch=compute_61,code=sm_61 \
- -gencode=arch=compute_60,code=compute_60
-
-CXXFLAGS := -I$(CUDA_INC) -fPIC -fvisibility=hidden
-NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -maxrregcount 96
-# Use addprefix so that we can specify more than one path
-LDFLAGS := $(addprefix -L,${CUDA_LIB}) -lcudart -lrt
-
-ifeq ($(DEBUG), 0)
-NVCUFLAGS += -O3
-CXXFLAGS += -O3
-else
-NVCUFLAGS += -O0 -G
-CXXFLAGS += -O0 -g -ggdb3
-endif
-
-ifneq ($(VERBOSE), 0)
-NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
-CXXFLAGS += -Wall -Wextra
-else
-.SILENT:
-endif
-
-ifneq ($(KEEP), 0)
-NVCUFLAGS += -keep
-endif
-
-ifneq ($(PROFAPI), 0)
-CXXFLAGS += -DPROFAPI
-endif
-
-NCCL_MAJOR := 1
-NCCL_MINOR := 3
-NCCL_PATCH := 5
-CXXFLAGS += -DNCCL_MAJOR=$(NCCL_MAJOR) -DNCCL_MINOR=$(NCCL_MINOR) -DNCCL_PATCH=$(NCCL_PATCH)
-
-CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
-CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
-CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
-CXXFLAGS += -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR)
-
-.PHONY : all lib staticlib clean test mpitest install deb debian debclean forlib fortest forclean
-.DEFAULT : all
-
-INCEXPORTS := nccl.h
-LIBSRCFILES := libwrap.cu core.cu all_gather.cu all_reduce.cu broadcast.cu reduce.cu reduce_scatter.cu
-LIBNAME := libnccl.so
-STATICLIBNAME := libnccl_static.a
-
-INCDIR := $(BUILDDIR)/include
-LIBDIR := $(BUILDDIR)/lib
-OBJDIR := $(BUILDDIR)/obj
-
-INCTARGETS := $(patsubst %, $(INCDIR)/%, $(INCEXPORTS))
-LIBSONAME := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
-LIBTARGET := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
-STATICLIBTARGET := $(STATICLIBNAME)
-LIBLINK := $(patsubst lib%.so, -l%, $(LIBNAME))
-LIBOBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(filter %.cu, $(LIBSRCFILES)))
-DEPFILES := $(patsubst %.o, %.d, $(LIBOBJ)) $(patsubst %, %.d, $(TESTBINS)) $(patsubst %, %.d, $(MPITESTBINS))
-
-all : lib staticlib
-
-lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
-
-staticlib : $(INCTARGETS) $(LIBDIR)/$(STATICLIBTARGET)
-
--include $(DEPFILES)
-
-$(LIBDIR)/$(LIBTARGET) : $(LIBOBJ)
- @printf "Linking %-35s > %s\n" $(LIBTARGET) $@
- mkdir -p $(LIBDIR)
- $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LDFLAGS) $(LIBOBJ)
- ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
- ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
-
-$(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
- @printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
- mkdir -p $(LIBDIR)
- ar cr $@ $(LIBOBJ)
-
-$(INCDIR)/%.h : src/%.h
- @printf "Grabbing %-35s > %s\n" $< $@
- mkdir -p $(INCDIR)
- cp -f $< $@
-
-$(OBJDIR)/%.o : src/%.cu
- @printf "Compiling %-35s > %s\n" $< $@
- mkdir -p $(OBJDIR)
- $(NVCC) -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
- @$(NVCC) -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
- @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
- @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
- sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
- @rm -f $(@:%.o=%.d.tmp)
-
-clean :
- rm -rf $(BUILDDIR)
-
-install : lib
- mkdir -p $(PREFIX)/lib
- mkdir -p $(PREFIX)/include
- cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
- cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
-
-
-#### TESTS ####
-
-TEST_ONLY ?= 0
-
-# Tests depend on lib, except in TEST_ONLY mode.
-ifeq ($(TEST_ONLY), 0)
-TSTDEP = $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
-endif
-
-NCCL_LIB ?= $(LIBDIR)
-NCCL_INC ?= $(INCDIR)
-
-MPI_HOME ?= /usr
-MPI_INC ?= $(MPI_HOME)/include
-MPI_LIB ?= $(MPI_HOME)/lib
-MPIFLAGS := -I$(MPI_INC) -L$(MPI_LIB) -lmpi
-
-TESTS := all_gather_test all_gather_scan \
- all_reduce_test all_reduce_scan \
- broadcast_test broadcast_scan \
- reduce_test reduce_scan \
- reduce_scatter_test reduce_scatter_scan
-MPITESTS := mpi_test
-
-TSTINC := -I$(NCCL_INC) -Itest/include
-TSTLIB := -L$(NCCL_LIB) $(LIBLINK) $(LDFLAGS)
-TSTDIR := $(BUILDDIR)/test/single
-MPITSTDIR := $(BUILDDIR)/test/mpi
-TESTBINS := $(patsubst %, $(TSTDIR)/%, $(TESTS))
-MPITESTBINS:= $(patsubst %, $(MPITSTDIR)/%, $(MPITESTS))
-
-test : $(TESTBINS)
-
-$(TSTDIR)/% : test/single/%.cu test/include/*.h $(TSTDEP)
- @printf "Building %-35s > %s\n" $< $@
- mkdir -p $(TSTDIR)
- $(NVCC) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt
- @$(NVCC) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcuda -lcurand -lnvToolsExt > $(@:%=%.d.tmp)
- @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
- @sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
- sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
- @rm -f $(@:%=%.d.tmp)
-
-mpitest : $(MPITESTBINS)
-
-$(MPITSTDIR)/% : test/mpi/%.cu $(TSTDEP)
- @printf "Building %-35s > %s\n" $< $@
- mkdir -p $(MPITSTDIR)
- $(NVCC) $(MPIFLAGS) $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" -o $@ $< $(TSTLIB) -lcurand
- @$(NVCC) $(MPIFLAGS) -M $(TSTINC) $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< $(TSTLIB) -lcurand > $(@:%=%.d.tmp)
- @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%=%.d.tmp) > $(@:%=%.d)
- @sed -e 's/.*://' -e 's/\\$$//' < $(@:%=%.d.tmp) | fmt -1 | \
- sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%=%.d)
- @rm -f $(@:%=%.d.tmp)
-
-#### PACKAGING ####
-
-DEBIANDIR := $(BUILDDIR)/debian
-
-DEBGEN_IN := $(shell (cd debian ; ls *.in))
-DEBGEN := $(DEBGEN_IN:.in=)
-DEBFILES := compat copyright libnccl-dev.install libnccl-dev.manpages nccl.7 rules $(DEBGEN)
-DEBTARGETS := $(patsubst %, $(DEBIANDIR)/%, $(DEBFILES))
-
-DEB_REVISION ?= 1
-DEB_TIMESTAMP := $(shell date -R)
-DEB_ARCH ?= amd64
-
-debian : $(DEBTARGETS)
-
-deb : lib debian
- @printf "Building Debian package\n"
- (cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
- mkdir -p $(BUILDDIR)/deb/
- mv $(BUILDDIR)/../libnccl*.deb $(BUILDDIR)/deb/
-
-debclean :
- rm -Rf $(DEBIANDIR)
-
-$(DEBIANDIR)/% : debian/%.in
- @printf "Generating %-35s > %s\n" $< $@
- sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
- -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
- -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
- -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
- -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
- -e "s/\$${deb:Revision}/$(DEB_REVISION)/g" \
- -e "s/\$${deb:Timestamp}/$(DEB_TIMESTAMP)/g" \
- -e "s/\$${deb:Arch}/$(DEB_ARCH)/g" \
- $< > $@
-
-$(DEBIANDIR)/% : debian/%
- @printf "Grabbing %-35s > %s\n" $< $@
- mkdir -p $(DEBIANDIR)
- cp -f $< $@
-
-#### FORTRAN BINDINGS ####
-
-export NCCL_MAJOR NCCL_MINOR NCCL_PATCH CUDA_MAJOR CUDA_MINOR LIBLINK CUDA_LIB BUILDDIR
-
-forlib : lib
- $(MAKE) -C fortran lib
-fortest : forlib
- $(MAKE) -C fortran test
-forclean :
- $(MAKE) -C fortran clean
-
+.PHONY : all clean
+
+default : src.build
+BUILDDIR ?= $(abspath ./build)
+ABSBUILDDIR := $(abspath $(BUILDDIR))
+TARGETS := src pkg
+clean: ${TARGETS:%=%.clean}
+test.build: src.build
+LICENSE_FILES := LICENSE.txt
+LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%)
+lic: $(LICENSE_TARGETS)
+
+${BUILDDIR}/%.txt: %.txt
+ @printf "Copying %-35s > %s\n" $< $@
+ mkdir -p ${BUILDDIR}
+ cp $< $@
+
+src.%:
+ ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR}
+
+pkg.%:
+ ${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}
+
+pkg.debian.prep: lic
+pkg.txz.prep: lic
diff --git a/README.md b/README.md
index 17b9546..47ae0c4 100644
--- a/README.md
+++ b/README.md
@@ -1,128 +1,84 @@
-**IMPORTANT NOTE**
-
-**NCCL1 is no longer maintained/updated and has been replaced by NCCL2, available at**
-
-**http://developer.nvidia.com/nccl.**
-
# NCCL
Optimized primitives for collective multi-GPU communication.
## Introduction
-NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines, such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single- or multi-process (e.g., MPI) applications.
-[This blog post](https://devblogs.nvidia.com/parallelforall/fast-multi-gpu-collectives-nccl/) provides details on NCCL functionality, goals, and performance.
+NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
+
+For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html).
## What's inside
-At present, the library implements the following collectives:
+At present, the library implements the following collective operations:
+
- all-reduce
- all-gather
- reduce-scatter
- reduce
- broadcast
-These collectives are implemented using ring algorithms and have been optimized primarily for throughput. For best performance, small collectives should be batched into larger operations whenever possible. Small test binaries demonstrating how to use each of the above collectives are also provided.
+These operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
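As an illustration of aggregation through the API (an aside, not part of the README change itself), here is a minimal sketch that groups one all-reduce per local GPU so the calls can be issued together; the communicators, streams, and device buffers are assumed to be set up elsewhere:

```c
#include <cuda_runtime.h>
#include <nccl.h>

/* Sketch: aggregate one all-reduce per local communicator into a single
 * group. comms/streams/sendbuff/recvbuff are assumed to exist already. */
void grouped_allreduce(int ngpus, ncclComm_t* comms, cudaStream_t* streams,
                       float** sendbuff, float** recvbuff, size_t count) {
  ncclGroupStart();
  for (int i = 0; i < ngpus; i++) {
    ncclAllReduce(sendbuff[i], recvbuff[i], count, ncclFloat, ncclSum,
                  comms[i], streams[i]);
  }
  ncclGroupEnd();  /* all aggregated operations are issued here */
}
```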
## Requirements
-NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. Best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
-
-Note: NCCL may also work with CUDA 6.5, but this is an untested configuration.
+NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. For PCIe based platforms, best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
-## Build & run
+## Build
-To build the library and tests.
+To build the library :
```shell
$ cd nccl
-$ make CUDA_HOME=<cuda install path> test
+$ make -j src.build
```
-Test binaries are located in the subdirectories nccl/build/test/{single,mpi}.
+If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with :
```shell
-$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./build/lib
-$ ./build/test/single/all_reduce_test
-Error: must specify at least data size in bytes!
-
-Tests nccl AllReduce with user supplied arguments.
- Usage: all_reduce_test <data size in bytes> [number of GPUs] [GPU 0] [GPU 1] ...
-
-$ ./build/test/single/all_reduce_test 10000000
-# Using devices
-# Device 0 -> 0 [0x0a] GeForce GTX TITAN X
-# Device 1 -> 1 [0x09] GeForce GTX TITAN X
-# Device 2 -> 2 [0x06] GeForce GTX TITAN X
-# Device 3 -> 3 [0x05] GeForce GTX TITAN X
-
-# out-of-place in-place
-# bytes N type op time algbw busbw res time algbw busbw res
- 10000000 10000000 char sum 1.628 6.14 9.21 0e+00 1.932 5.18 7.77 0e+00
- 10000000 10000000 char prod 1.629 6.14 9.21 0e+00 1.643 6.09 9.13 0e+00
- 10000000 10000000 char max 1.621 6.17 9.25 0e+00 1.634 6.12 9.18 0e+00
- 10000000 10000000 char min 1.633 6.12 9.19 0e+00 1.637 6.11 9.17 0e+00
- 10000000 2500000 int sum 1.611 6.21 9.31 0e+00 1.626 6.15 9.23 0e+00
- 10000000 2500000 int prod 1.613 6.20 9.30 0e+00 1.629 6.14 9.21 0e+00
- 10000000 2500000 int max 1.619 6.18 9.26 0e+00 1.627 6.15 9.22 0e+00
- 10000000 2500000 int min 1.619 6.18 9.27 0e+00 1.624 6.16 9.24 0e+00
- 10000000 5000000 half sum 1.617 6.18 9.28 4e-03 1.636 6.11 9.17 4e-03
- 10000000 5000000 half prod 1.618 6.18 9.27 1e-03 1.657 6.03 9.05 1e-03
- 10000000 5000000 half max 1.608 6.22 9.33 0e+00 1.621 6.17 9.25 0e+00
- 10000000 5000000 half min 1.610 6.21 9.32 0e+00 1.627 6.15 9.22 0e+00
- 10000000 2500000 float sum 1.618 6.18 9.27 5e-07 1.622 6.17 9.25 5e-07
- 10000000 2500000 float prod 1.614 6.20 9.29 1e-07 1.628 6.14 9.21 1e-07
- 10000000 2500000 float max 1.616 6.19 9.28 0e+00 1.633 6.12 9.19 0e+00
- 10000000 2500000 float min 1.613 6.20 9.30 0e+00 1.628 6.14 9.21 0e+00
- 10000000 1250000 double sum 1.629 6.14 9.21 0e+00 1.628 6.14 9.21 0e+00
- 10000000 1250000 double prod 1.619 6.18 9.26 2e-16 1.628 6.14 9.21 2e-16
- 10000000 1250000 double max 1.613 6.20 9.30 0e+00 1.630 6.13 9.20 0e+00
- 10000000 1250000 double min 1.622 6.16 9.25 0e+00 1.623 6.16 9.24 0e+00
+$ make src.build CUDA_HOME=<path to cuda install>
```
-To install, run `make PREFIX=<install dir> install` and add `<instal dir>/lib` to your `LD_LIBRARY_PATH`.
+NCCL will be compiled and installed in `build/` unless `BUILDDIR` is set.
-## Usage
+By default, NCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform :
+```shell
+$ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70"
+```
-NCCL follows the MPI collectives API fairly closely. Before any collectives can be called, a communicator object must be initialized on each GPU. On a single-process machine, all GPUs can be conveniently initialized using `ncclCommInitAll`. For multi-process applications (e.g., with MPI), `ncclCommInitRank` must be called for each GPU. Internally `ncclCommInitRank` invokes a synchronization among all GPUs, so these calls must be invoked in different host threads (or processes) for each GPU. A brief single-process example follows, for an MPI example see test/mpi/mpi_test.cu. For details about the API see nccl.h.
+## Install
-```c
-#include <nccl.h>
+To install NCCL on the system, create a package then install it as root.
-typedef struct {
- double* sendBuff;
- double* recvBuff;
- int size;
- cudaStream_t stream;
-} PerThreadData;
+Debian/Ubuntu :
+```shell
+$ make pkg.debian.build
+$ ls build/pkg/deb/
+```
-int main(int argc, char* argv[])
-{
- int nGPUs;
- cudaGetDeviceCount(&nGPUs);
- ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nGPUs);
- ncclCommInitAll(comms, nGPUs); // initialize communicator
- // One communicator per process
+RedHat/CentOS :
+```shell
+$ make pkg.redhat.build
+$ ls build/pkg/rpm/
+```
- PerThreadData* data;
+OS-agnostic tarball :
+```shell
+$ make pkg.txz.build
+$ ls build/pkg/txz/
+```
- ... // Allocate data and issue work to each GPU's
- // perDevStream to populate the sendBuffs.
+## Tests
- for(int i=0; i<nGPUs; ++i) {
- cudaSetDevice(i); // Correct device must be set
- // prior to each collective call.
- ncclAllReduce(data[i].sendBuff, data[i].recvBuff, size,
- ncclDouble, ncclSum, comms[i], data[i].stream);
- }
+Tests for NCCL are maintained separately at https://github.com/nvidia/nccl-tests.
- ... // Issue work into data[*].stream to consume buffers, etc.
-}
+```shell
+$ git clone https://github.com/NVIDIA/nccl-tests.git
+$ cd nccl-tests
+$ make
+$ ./build/allreduce_perf -b 8 -e 256M -f 2 -g <ngpus>
```
-## Copyright and License
-
-NCCL is provided under the [BSD licence](LICENSE.txt). All source code and
-accompanying documentation is copyright (c) 2015-2016, NVIDIA CORPORATION. All
-rights reserved.
+## Copyright
+All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
diff --git a/debian/changelog.in b/debian/changelog.in
deleted file mode 100644
index ad569a0..0000000
--- a/debian/changelog.in
+++ /dev/null
@@ -1,5 +0,0 @@
-nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}-${deb:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
-
- * Automatic Debian package from build
-
- -- cudatools <cudatools@nvidia.com> ${deb:Timestamp}
diff --git a/debian/copyright b/debian/copyright
deleted file mode 120000
index 4ab4373..0000000
--- a/debian/copyright
+++ /dev/null
@@ -1 +0,0 @@
-../LICENSE.txt \ No newline at end of file
diff --git a/debian/libnccl-dev.install b/debian/libnccl-dev.install
deleted file mode 100644
index 90299a0..0000000
--- a/debian/libnccl-dev.install
+++ /dev/null
@@ -1,2 +0,0 @@
-include/nccl.h usr/include
-lib/libnccl.so /usr/lib/x86_64-linux-gnu
diff --git a/debian/libnccl-dev.manpages b/debian/libnccl-dev.manpages
deleted file mode 100644
index 4bfc2cb..0000000
--- a/debian/libnccl-dev.manpages
+++ /dev/null
@@ -1 +0,0 @@
-debian/nccl.7
diff --git a/debian/libnccl1.install.in b/debian/libnccl1.install.in
deleted file mode 100644
index 73b4c0a..0000000
--- a/debian/libnccl1.install.in
+++ /dev/null
@@ -1,2 +0,0 @@
-lib/libnccl.so.${nccl:Major} /usr/lib/x86_64-linux-gnu
-lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/x86_64-linux-gnu
diff --git a/debian/nccl.7 b/debian/nccl.7
deleted file mode 100644
index 0cb5601..0000000
--- a/debian/nccl.7
+++ /dev/null
@@ -1,139 +0,0 @@
-.TH NCCL
-.SH NAME
-.PP
-nccl \- Optimized primitives for collective multi\-GPU communication.
-
-.SH Introduction
-.PP
-NCCL (pronounced "Nickel") is a stand\-alone library of standard collective communication routines, such as all\-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. NCCL supports an arbitrary number of GPUs installed in a single node and can be used in either single\- or multi\-process (e.g., MPI) applications.
-
-.SH What's inside
-.PP
-At present, the library implements the following collectives:
-\- all\-reduce
-\- all\-gather
-\- reduce\-scatter
-\- reduce
-\- broadcast
-
-.PP
-These collectives are implemented using ring algorithms and have been optimized primarily for throughput. For best performance, small collectives should be batched into larger operations whenever possible. Small test binaries demonstrating how to use each of the above collectives are also provided.
-
-.SH Requirements
-.PP
-NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. Best performance is achieved when all GPUs are located on a common PCIe root complex, but multi\-socket configurations are also supported.
-
-.PP
-Note: NCCL may also work with CUDA 6.5, but this is an untested configuration.
-
-.SH Build & run
-.PP
-To build the library and tests.
-
-.PP
-.RS
-
-.nf
-$ cd nccl
-$ make CUDA\_HOME=<cuda install path> test
-
-.fi
-.RE
-
-.PP
-Test binaries are located in the subdirectories nccl/build/test and nccl/build/mpitest.
-
-.PP
-.RS
-
-.nf
-$ export LD\_LIBRARY\_PATH=$LD\_LIBRARY\_PATH:./build/lib
-$ ./build/test/all\_reduce\_test
-Error: must specify at least data size in bytes!
-
-Tests nccl AllReduce with user supplied arguments.
- Usage: all\_reduce\_test <data size in bytes> [number of GPUs] [GPU 0] [GPU 1] ...
-
-$ ./build/test/all\_reduce\_test 10000000
-# Using devices
-# Device 0 \-> 0 [0x0a] GeForce GTX TITAN X
-# Device 1 \-> 1 [0x09] GeForce GTX TITAN X
-# Device 2 \-> 2 [0x06] GeForce GTX TITAN X
-# Device 3 \-> 3 [0x05] GeForce GTX TITAN X
-
-# out\-of\-place in\-place
-# bytes N type op time algbw busbw res time algbw busbw res
- 10000000 10000000 char sum 1.628 6.14 9.21 0e+00 1.932 5.18 7.77 0e+00
- 10000000 10000000 char prod 1.629 6.14 9.21 0e+00 1.643 6.09 9.13 0e+00
- 10000000 10000000 char max 1.621 6.17 9.25 0e+00 1.634 6.12 9.18 0e+00
- 10000000 10000000 char min 1.633 6.12 9.19 0e+00 1.637 6.11 9.17 0e+00
- 10000000 2500000 int sum 1.611 6.21 9.31 0e+00 1.626 6.15 9.23 0e+00
- 10000000 2500000 int prod 1.613 6.20 9.30 0e+00 1.629 6.14 9.21 0e+00
- 10000000 2500000 int max 1.619 6.18 9.26 0e+00 1.627 6.15 9.22 0e+00
- 10000000 2500000 int min 1.619 6.18 9.27 0e+00 1.624 6.16 9.24 0e+00
- 10000000 5000000 half sum 1.617 6.18 9.28 4e\-03 1.636 6.11 9.17 4e\-03
- 10000000 5000000 half prod 1.618 6.18 9.27 1e\-03 1.657 6.03 9.05 1e\-03
- 10000000 5000000 half max 1.608 6.22 9.33 0e+00 1.621 6.17 9.25 0e+00
- 10000000 5000000 half min 1.610 6.21 9.32 0e+00 1.627 6.15 9.22 0e+00
- 10000000 2500000 float sum 1.618 6.18 9.27 5e\-07 1.622 6.17 9.25 5e\-07
- 10000000 2500000 float prod 1.614 6.20 9.29 1e\-07 1.628 6.14 9.21 1e\-07
- 10000000 2500000 float max 1.616 6.19 9.28 0e+00 1.633 6.12 9.19 0e+00
- 10000000 2500000 float min 1.613 6.20 9.30 0e+00 1.628 6.14 9.21 0e+00
- 10000000 1250000 double sum 1.629 6.14 9.21 0e+00 1.628 6.14 9.21 0e+00
- 10000000 1250000 double prod 1.619 6.18 9.26 2e\-16 1.628 6.14 9.21 2e\-16
- 10000000 1250000 double max 1.613 6.20 9.30 0e+00 1.630 6.13 9.20 0e+00
- 10000000 1250000 double min 1.622 6.16 9.25 0e+00 1.623 6.16 9.24 0e+00
-
-.fi
-.RE
-
-.PP
-To install, run \fB\fCmake PREFIX=<install dir> install\fR and add \fB\fC<instal dir>/lib\fR to your \fB\fCLD\_LIBRARY\_PATH\fR.
-
-.SH Usage
-.PP
-NCCL follows the MPI collectives API fairly closely. Before any collectives can be called, a communicator object must be initialized on each GPU. On a single\-process machine, all GPUs can be conveniently initialized using \fB\fCncclCommInitAll\fR. For multi\-process applications (e.g., with MPI), \fB\fCncclCommInitRank\fR must be called for each GPU. Internally \fB\fCncclCommInitRank\fR invokes a synchronization among all GPUs, so these calls must be invoked in different host threads (or processes) for each GPU. A brief single\-process example follows, for an MPI example see src/mpi\_test.cu. For details about the API see nccl.h.
-
-.PP
-.RS
-
-.nf
-#include <nccl.h>
-
-typedef struct \{
- double* sendBuff;
- double* recvBuff;
- int size;
- cudaStream\_t stream;
-\} PerThreadData;
-
-int main(int argc, char* argv[])
-\{
- int nGPUs;
- cudaGetDeviceCount(\&nGPUs);
- ncclComm\_t* comms = (ncclComm\_t*)malloc(sizeof(ncclComm\_t)*nGPUs);
- ncclCommInitAll(comms, nGPUs); // initialize communicator
- // One communicator per process
-
- PerThreadData* data;
-
- ... // Allocate data and issue work to each GPU's
- // perDevStream to populate the sendBuffs.
-
- for(int i=0; i<nGPUs; ++i) \{
- cudaSetDevice(i); // Correct device must be set
- // prior to each collective call.
- ncclAllReduce(data[i].sendBuff, data[i].recvBuff, size,
- ncclDouble, ncclSum, comms[i], data[i].stream);
- \}
-
- ... // Issue work into data[*].stream to consume buffers, etc.
-\}
-
-.fi
-.RE
-
-.SH Copyright
-.PP
-All source code and accompanying documentation is copyright (c) 2015\-2016, NVIDIA CORPORATION. All
-rights reserved.
diff --git a/debian/shlibs.local.in b/debian/shlibs.local.in
deleted file mode 100644
index 82505da..0000000
--- a/debian/shlibs.local.in
+++ /dev/null
@@ -1 +0,0 @@
-libcudart ${cuda:Major}.${cuda:Minor} cuda-cudart-${cuda:Major}-${cuda:Minor}
diff --git a/fortran/Makefile b/fortran/Makefile
deleted file mode 100644
index b60b016..0000000
--- a/fortran/Makefile
+++ /dev/null
@@ -1,81 +0,0 @@
-FC := gfortran
-FCNAME := $(notdir $(FC))
-
-BUILDDIR ?= ../build
-INCDIR := $(BUILDDIR)/include
-LIBDIR := $(BUILDDIR)/lib
-OBJDIR := $(BUILDDIR)/obj
-
-LIBNAME := libncclfor.so
-LIBSONAME := $(patsubst %,%.$(NCCL_MAJOR),$(LIBNAME))
-LIBTARGET := $(patsubst %,%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH),$(LIBNAME))
-LIBLINK += $(patsubst lib%.so,-l%,$(LIBNAME))
-
-LIBCUDAFOR := libcudafor.so
-
-ifneq ($(filter pgf%, $(FCNAME)), )
-# PGI compiler (pgfortran, pgf90, pgf95)
-FCMODFLAGS := -module $(INCDIR)
-FCPREFLAGS := -Mpreprocess
-FCCUDAFLAGS := -Mcuda,cuda$(CUDA_MAJOR).$(CUDA_MINOR)
-FCFLAGS := -fast -O3
-else
-# non-PGI compilers do not have CUDA support, compile our own CUDA lib
-CUDAFORDEP := $(LIBDIR)/$(LIBCUDAFOR)
-CUDALINK := -L$(CUDA_LIB) -lcudart
-CUDAFORLINK := -lcudafor
-ifeq ($(FCNAME), gfortran)
-FCMODFLAGS := -J$(INCDIR)
-FCPREFLAGS += -cpp
-FCFLAGS += -ffree-line-length-none
-else ifeq ($(FCNAME), ifort)
-FCMODFLAGS := -module $(INCDIR)
-FCPREFLAGS += -fpp
-endif
-endif
-
-ifeq ($(VERBOSE), 0)
-.SILENT:
-endif
-
-lib: $(CUDAFORDEP)
- $(MAKE) $(LIBDIR)/$(LIBTARGET)
-
-$(LIBDIR)/$(LIBTARGET): $(OBJDIR)/ncclfor.o
- @printf "Linking %-35s > %s\n" $(LIBTARGET) $@
- mkdir -p $(LIBDIR)
- $(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) $< -o $(LIBDIR)/$(LIBTARGET)
- ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
- ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
-
-$(LIBDIR)/$(LIBCUDAFOR): $(OBJDIR)/cudafor.o
- @printf "Linking %-35s > %s\n" $(LIBCUDAFOR) $@
- mkdir -p $(LIBDIR)
- $(FC) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBCUDAFOR) $< -o $(LIBDIR)/$(LIBCUDAFOR)
-
-$(OBJDIR)/%.o: src/%.f90
- @printf "Building %-35s > %s\n" $< $@
- mkdir -p $(OBJDIR)
- mkdir -p $(INCDIR)
- $(FC) -c $(FCMODFLAGS) $(FCPREFLAGS) -fPIC $(FCCUDAFLAGS) $(FCFLAGS) $< -o $@
-
-TESTS := reduce_ptr_out allreduce_ptr_out reducescatter_ptr_out broadcast_ptr allgather_ptr_out
-ifneq ($(filter pgf%, $(FCNAME)), )
-TESTS += reduce_arr_out allreduce_arr_out reducescatter_arr_out broadcast_arr allgather_arr_out
-endif
-
-TESTDIR := $(BUILDDIR)/test/fortran
-TESTBINS := $(patsubst %,$(TESTDIR)/%,$(TESTS))
-
-test: lib $(TESTBINS)
-
-$(TESTDIR)/%: test/%.f90 lib
- @printf "Building %-35s > %s\n" $< $@
- @mkdir -p $(TESTDIR)
- $(FC) $(FCCUDAFLAGS) $(FCFLAGS) $< $(CUDALINK) -I$(INCDIR) -L$(LIBDIR) $(CUDAFORLINK) $(LIBLINK) -o $@
-
-clean:
- rm -f $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(LIBSONAME) $(LIBDIR)/$(LIBNAME)
- rm -f $(LIBDIR)/$(LIBCUDAFOR) $(OBJDIR)/*for.o $(INCDIR)/*.mod
- rm -rf $(TESTDIR)/
-
diff --git a/fortran/src/cudafor.f90 b/fortran/src/cudafor.f90
deleted file mode 100644
index 4ecd0f4..0000000
--- a/fortran/src/cudafor.f90
+++ /dev/null
@@ -1,171 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-#ifndef _CUDA
-
-!Start cudaFor module
-module cudaFor
-use iso_c_binding
-implicit none
-private
-public :: c_devptr
-public :: cudaMemcpyKind, &
- cudaMemcpyHostToHost, &
- cudaMemcpyHostToDevice, &
- cudaMemcpyDeviceToHost, &
- cudaMemcpyDeviceToDevice, &
- cudaMemcpyDefault
-public :: cuda_stream_kind
-public :: cudaGetDeviceCount
-public :: cudaSetDevice
-public :: cudaMalloc
-public :: cudaMemcpy
-public :: cudaFree
-public :: cudaStreamCreate
-public :: cudaStreamSynchronize
-public :: cudaStreamDestroy
-
-!Start types
-
-!Start c_devptr
-type, bind(c) :: c_devptr
-type(c_ptr) :: member
-end type c_devptr
-!End c_devptr
-
-!Start cudaMemcpyKind
-type, bind(c) :: cudaMemcpyKind
-integer(c_int) :: member
-end type cudaMemcpyKind
-
-type(cudaMemcpyKind), parameter :: cudaMemcpyHostToHost = cudaMemcpyKind(0), &
- cudaMemcpyHostToDevice = cudaMemcpyKind(1), &
- cudaMemcpyDeviceToHost = cudaMemcpyKind(2), &
- cudaMemcpyDeviceToDevice = cudaMemcpyKind(3), &
- cudaMemcpyDefault = cudaMemcpyKind(4)
-!End cudaMemcpyKind
-
-!Start cuda_stream_kind
-integer(c_intptr_t), parameter :: cuda_stream_kind = c_intptr_t
-!End cuda_stream_kind
-
-!End types
-
-!Start interfaces
-
-!Start cudaGetDeviceCount
-interface cudaGetDeviceCount
-integer(c_int) function cudaGetDeviceCount(count) bind(c, name = "cudaGetDeviceCount")
-import :: c_int
-implicit none
-integer(c_int) :: count
-end function cudaGetDeviceCount
-end interface cudaGetDeviceCount
-!End cudaGetDeviceCount
-
-!Start cudaSetDevice
-interface cudaSetDevice
-integer(c_int) function cudaSetDevice(device) bind(c, name = "cudaSetDevice")
-import :: c_int
-implicit none
-integer(c_int), value :: device
-end function cudaSetDevice
-end interface cudaSetDevice
-!End cudaSetDevice
-
-!Start cudaMalloc
-interface cudaMalloc
-integer(c_int) function cudaMalloc(devPtr, size) bind(c, name = "cudaMalloc")
-import :: c_int, c_size_t
-import :: c_devptr
-implicit none
-type(c_devptr) :: devPtr
-integer(c_size_t), value :: size
-end function cudaMalloc
-end interface cudaMalloc
-!End cudaMalloc
-
-!Start cudaMemcpy
-interface cudaMemcpy
-
-!Start cudaMemcpyH2D
-integer(c_int) function cudaMemcpyH2D(dst, src, count, kind) bind(c, name = "cudaMemcpy")
-import :: c_ptr, c_int, c_size_t
-import :: c_devptr, cudaMemcpyKind
-implicit none
-type(c_devptr), value :: dst
-type(c_ptr), value :: src
-integer(c_size_t), value :: count
-type(cudaMemcpyKind), value :: kind
-end function cudaMemcpyH2D
-!End cudaMemcpyH2D
-
-!Start cudaMemcpyD2H
-integer(c_int) function cudaMemcpyD2H(dst, src, count, kind) bind(c, name = "cudaMemcpy")
-import :: c_ptr, c_int, c_size_t
-import :: c_devptr, cudaMemcpyKind
-implicit none
-type(c_ptr), value :: dst
-type(c_devptr), value :: src
-integer(c_size_t), value :: count
-type(cudaMemcpyKind), value :: kind
-end function cudaMemcpyD2H
-!End cudaMemcpyD2H
-
-end interface cudaMemcpy
-!End cudaMemcpy
-
-!Start cudaFree
-interface cudaFree
-integer(c_int) function cudaFree(devPtr) bind(c, name = "cudaFree")
-import :: c_int
-import :: c_devptr
-implicit none
-type(c_devptr), value :: devPtr
-end function cudaFree
-end interface cudaFree
-!End cudaFree
-
-!Start cudaStreamCreate
-interface cudaStreamCreate
-integer(c_int) function cudaStreamCreate(pStream) bind(c, name = "cudaStreamCreate")
-import :: c_int
-import :: cuda_stream_kind
-implicit none
-integer(cuda_stream_kind) :: pStream
-end function cudaStreamCreate
-end interface cudaStreamCreate
-!End cudaStreamCreate
-
-!Start cudaStreamSynchronize
-interface cudaStreamSynchronize
-integer(c_int) function cudaStreamSynchronize(stream) bind(c, name = "cudaStreamSynchronize")
-import :: c_int
-import :: cuda_stream_kind
-implicit none
-integer(cuda_stream_kind), value :: stream
-end function cudaStreamSynchronize
-end interface cudaStreamSynchronize
-!End cudaStreamSynchronize
-
-!Start cudaStreamDestroy
-interface cudaStreamDestroy
-integer(c_int) function cudaStreamDestroy(stream) bind(c, name = "cudaStreamDestroy")
-import :: c_int
-import :: cuda_stream_kind
-implicit none
-integer(cuda_stream_kind), value :: stream
-end function cudaStreamDestroy
-end interface cudaStreamDestroy
-!End cudaStreamDestroy
-
-!End interfaces
-
-end module cudaFor
-!End cudaFor module
-
-#endif
diff --git a/fortran/src/ncclfor.f90 b/fortran/src/ncclfor.f90
deleted file mode 100644
index 2ed4d3d..0000000
--- a/fortran/src/ncclfor.f90
+++ /dev/null
@@ -1,312 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-!Start defines
-#define NCCL_UNIQUE_ID_BYTES 128
-!End defines
-
-!Start ncclFor module
-module ncclFor
-use iso_c_binding
-use cudaFor
-implicit none
-private
-public :: ncclUniqueId
-public :: ncclComm
-public :: ncclResult, &
- ncclSuccess, &
- ncclUnhandledCudaError, &
- ncclSystemError, &
- ncclInternalError, &
- ncclInvalidDevicePointer, &
- ncclInvalidRank, &
- ncclUnsupportedDeviceCount, &
- ncclDeviceNotFound, &
- ncclInvalidDeviceIndex, &
- ncclLibWrapperNotSet, &
- ncclCudaMallocFailed, &
- ncclRankMismatch, &
- ncclInvalidArgument, &
- ncclInvalidType, &
- ncclInvalidOperation, &
- nccl_NUM_RESULTS
-public :: ncclDataType, &
- ncclChar, &
- ncclInt, &
-#ifdef CUDA_HAS_HALF
- ncclHalf, &
-#endif
- ncclFloat, &
- ncclDouble, &
- ncclInt64, &
- ncclUInt64, &
- nccl_NUM_TYPES
-public :: ncclRedOp, &
- ncclSum, &
- ncclProd, &
- ncclMax, &
- ncclMin, &
- nccl_NUM_OPS
-public :: ncclGetUniqueId
-public :: ncclCommInitRank
-public :: ncclCommInitAll
-public :: ncclCommCuDevice
-public :: ncclCommUserRank
-public :: ncclCommCount
-public :: ncclCommDestroy
-public :: ncclReduce
-public :: ncclAllReduce
-public :: ncclReduceScatter
-public :: ncclBcast
-public :: ncclAllGather
-
-!Start types
-
-!Start ncclUniqueId
-type, bind(c) :: ncclUniqueId
-character(c_char) :: internal(NCCL_UNIQUE_ID_BYTES)
-end type ncclUniqueId
-!End ncclUniqueId
-
-!Start ncclComm
-type, bind(c) :: ncclComm
-type(c_ptr) :: member
-end type ncclComm
-!End ncclComm
-
-!Start ncclResult
-type, bind(c) :: ncclResult
-integer(c_int) :: member
-end type ncclResult
-
-type(ncclResult), parameter :: ncclSuccess = ncclResult( 0), &
- ncclUnhandledCudaError = ncclResult( 1), &
- ncclSystemError = ncclResult( 2), &
- ncclInternalError = ncclResult( 3), &
- ncclInvalidDevicePointer = ncclResult( 4), &
- ncclInvalidRank = ncclResult( 5), &
- ncclUnsupportedDeviceCount = ncclResult( 6), &
- ncclDeviceNotFound = ncclResult( 7), &
- ncclInvalidDeviceIndex = ncclResult( 8), &
- ncclLibWrapperNotSet = ncclResult( 9), &
- ncclCudaMallocFailed = ncclResult(10), &
- ncclRankMismatch = ncclResult(11), &
- ncclInvalidArgument = ncclResult(12), &
- ncclInvalidType = ncclResult(13), &
- ncclInvalidOperation = ncclResult(14), &
- nccl_NUM_RESULTS = ncclResult(15)
-!End ncclResult
-
-!Start ncclDataType
-type, bind(c) :: ncclDataType
-integer(c_int) :: member
-end type ncclDataType
-
-type(ncclDataType), parameter :: ncclChar = ncclDataType(0), &
- ncclInt = ncclDataType(1), &
-#ifdef CUDA_HAS_HALF
- ncclHalf = ncclDataType(2), &
-#endif
- ncclFloat = ncclDataType(3), &
- ncclDouble = ncclDataType(4), &
- ncclInt64 = ncclDataType(5), &
- ncclUInt64 = ncclDataType(6), &
- nccl_NUM_TYPES = ncclDataType(7)
-!End ncclDataType
-
-!Start ncclRedOp
-type, bind(c) :: ncclRedOp
-integer(c_int) :: member
-end type ncclRedOp
-
-type(ncclRedOp), parameter :: ncclSum = ncclRedOp(0), &
- ncclProd = ncclRedOp(1), &
- ncclMax = ncclRedOp(2), &
- ncclMin = ncclRedOp(3), &
- nccl_NUM_OPS = ncclRedOp(4)
-!End ncclRedOp
-
-!End types
-
-!Start interfaces
-
-!Start ncclGetUniqueId
-interface ncclGetUniqueId
-type(ncclResult) function ncclGetUniqueId(uniqueId) bind(c, name = 'ncclGetUniqueId')
-import :: ncclResult, ncclUniqueId
-implicit none
-type(ncclUniqueId) :: uniqueId
-end function ncclGetUniqueId
-end interface ncclGetUniqueId
-!End ncclGetUniqueId
-
-!Start ncclCommInitRank
-interface ncclCommInitRank
-type(ncclResult) function ncclCommInitRank(comm, ndev, commId, rank) bind(c, name = 'ncclCommInitRank')
-import :: c_int
-import :: ncclResult, ncclUniqueId, ncclComm
-implicit none
-type(ncclComm) :: comm(*)
-integer(c_int), value :: ndev
-type(ncclUniqueId), value :: commId
-integer(c_int), value :: rank
-end function ncclCommInitRank
-end interface ncclCommInitRank
-!End ncclCommInitRank
-
-!Start ncclCommInitAll
-interface ncclCommInitAll
-type(ncclResult) function ncclCommInitAll(comm, ndev, devlist) bind(c, name = 'ncclCommInitAll')
-import :: c_int
-import :: ncclResult, ncclComm
-implicit none
-type(ncclComm) :: comm(*)
-integer(c_int), value :: ndev
-integer(c_int) :: devlist(*)
-end function ncclCommInitAll
-end interface ncclCommInitAll
-!End ncclCommInitAll
-
-!Start ncclCommCuDevice
-interface ncclCommCuDevice
-type(ncclResult) function ncclCommCuDevice(comm, devid) bind(c, name = 'ncclCommCuDevice')
-import :: c_int
-import :: ncclResult, ncclComm
-implicit none
-type(ncclComm), value :: comm
-integer(c_int) :: devid
-end function ncclCommCuDevice
-end interface ncclCommCuDevice
-!End ncclCommCuDevice
-
-!Start ncclCommUserRank
-interface ncclCommUserRank
-type(ncclResult) function ncclCommUserRank(comm, rank) bind(c, name = 'ncclCommUserRank')
-import :: c_int
-import :: ncclResult, ncclComm
-implicit none
-type(ncclComm), value :: comm
-integer(c_int) :: rank
-end function ncclCommUserRank
-end interface ncclCommUserRank
-!End ncclCommUserRank
-
-!Start ncclCommCount
-interface ncclCommCount
-type(ncclResult) function ncclCommCount(comm, count) bind(c, name = 'ncclCommCount')
-import :: c_int
-import :: ncclResult, ncclComm
-implicit none
-type(ncclComm), value :: comm
-integer(c_int) :: count
-end function ncclCommCount
-end interface ncclCommCount
-!End ncclCommCount
-
-!Start ncclCommDestroy
-interface ncclCommDestroy
-subroutine ncclCommDestroy(comm) bind(c, name = 'ncclCommDestroy')
-import :: ncclComm
-implicit none
-type(ncclComm), value :: comm
-end subroutine ncclCommDestroy
-end interface ncclCommDestroy
-!End ncclCommDestroy
-
-!Start ncclReduce
-interface ncclReduce
-type(ncclResult) function ncclReduce(sendbuff, recvbuff, count, datatype, op, root, comm, stream) bind(c, name = 'ncclReduce')
-import :: c_int
-import :: c_devptr, cuda_stream_kind
-import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
-implicit none
-type(c_devptr), value :: sendbuff
-type(c_devptr), value :: recvbuff
-integer(c_int), value :: count
-type(ncclDataType), value :: datatype
-type(ncclRedOp), value :: op
-integer(c_int), value :: root
-type(ncclComm), value :: comm
-integer(cuda_stream_kind), value :: stream
-end function ncclReduce
-end interface ncclReduce
-!End ncclReduce
-
-!Start ncclAllReduce
-interface ncclAllReduce
-type(ncclResult) function ncclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, stream) bind(c, name = 'ncclAllReduce')
-import :: c_int
-import :: c_devptr, cuda_stream_kind
-import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
-implicit none
-type(c_devptr), value :: sendbuff
-type(c_devptr), value :: recvbuff
-integer(c_int), value :: count
-type(ncclDataType), value :: datatype
-type(ncclRedOp), value :: op
-type(ncclComm), value :: comm
-integer(cuda_stream_kind), value :: stream
-end function ncclAllReduce
-end interface ncclAllReduce
-!End ncclAllReduce
-
-!Start ncclReduceScatter
-interface ncclReduceScatter
-type(ncclResult) function ncclReduceScatter(sendbuff, recvbuff, recvcount, datatype, op, comm, stream) bind(c, name = 'ncclReduceScatter')
-import :: c_int
-import :: c_devptr, cuda_stream_kind
-import :: ncclResult, ncclComm, ncclDataType, ncclRedOp
-implicit none
-type(c_devptr), value :: sendbuff
-type(c_devptr), value :: recvbuff
-integer(c_int), value :: recvcount
-type(ncclDataType), value :: datatype
-type(ncclRedOp), value :: op
-type(ncclComm), value :: comm
-integer(cuda_stream_kind), value :: stream
-end function ncclReduceScatter
-end interface ncclReduceScatter
-!End ncclReduceScatter
-
-!Start ncclBcast
-interface ncclBcast
-type(ncclResult) function ncclBcast(buff, count, datatype, root, comm, stream) bind(c, name = 'ncclBcast')
-import :: c_int
-import :: c_devptr, cuda_stream_kind
-import :: ncclResult, ncclComm, ncclDataType
-implicit none
-type(c_devptr), value :: buff
-integer(c_int), value :: count
-type(ncclDataType), value :: datatype
-integer(c_int), value :: root
-type(ncclComm), value :: comm
-integer(cuda_stream_kind), value :: stream
-end function ncclBcast
-end interface ncclBcast
-!End ncclBcast
-
-!Start ncclAllGather
-interface ncclAllGather
-type(ncclResult) function ncclAllGather(sendbuff, count, datatype, recvbuff, comm, stream) bind(c, name = 'ncclAllGather')
-import :: c_int
-import :: c_devptr, cuda_stream_kind
-import :: ncclResult, ncclComm, ncclDataType
-implicit none
-type(c_devptr), value :: sendbuff
-integer(c_int), value :: count
-type(ncclDataType), value :: datatype
-type(c_devptr), value :: recvbuff
-type(ncclComm), value :: comm
-integer(cuda_stream_kind), value :: stream
-end function ncclAllGather
-end interface ncclAllGather
-!End ncclAllGather
-
-!End interfaces
-
-end module ncclFor
-!End nccl module
diff --git a/fortran/test/allgather_arr_out.f90 b/fortran/test/allgather_arr_out.f90
deleted file mode 100644
index 17fbf7a..0000000
--- a/fortran/test/allgather_arr_out.f90
+++ /dev/null
@@ -1,162 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev
-type(ncclDataType) :: dataType
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable :: hostBuff(:, :)
-real(real32), allocatable, device :: sendBuff(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-real(real32), allocatable, device :: recvBuff(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
- nEl = 2621440
-
-! nDev = 2
- stat = cudaGetDeviceCount(nDev)
-
- dataType = ncclFloat
-
- allocate(comm(nDev))
- allocate(devList(nDev))
-
- do i = 1, nDev
- devList(i) = i - 1
- end do
-
- res = ncclCommInitAll(comm, nDev, devList)
-
- do i = 1, nDev
- res = ncclCommCuDevice(comm(i), cudaDev)
- res = ncclCommUserRank(comm(i), rank)
- end do
-
- allocate(stream(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamCreate(stream(i))
- end do
-
- call date_and_time(values = time)
- call random_seed(size = i)
- allocate(seed(i))
- call random_seed(get = seed)
- seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
- call random_seed(put = seed)
-
- allocate(hostBuff(nEl * nDev, nDev + 1))
-
- call random_number(hostBuff)
-
- print "(a)", "before allgather:"
- do i = 1, nDev
- err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
- end do
-
- allocate(sendBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- allocate(sendBuff(nEl))
- sendBuffPtr(i) = c_devloc(sendBuff)
- sendBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
- end do
-
- allocate(recvBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- allocate(recvBuff(nEl * nDev))
- recvBuffPtr(i) = c_devloc(recvBuff)
- recvBuff = hostBuff(:, i)
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamSynchronize(stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
- hostBuff(:, i) = recvBuff
- end do
-
- print "(a)", ""
- print "(a)", "after allgather:"
- do i = 1, nDev
- err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
- hostBuff((i - 1) * nEl + 1:i * nEl, 1) = sendBuff
- end do
-
- err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
- print "(a)", ""
- print "(a, e11.4e2)", "maximum error in sendbuff = ", err
- print "(a)", ""
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl * nDev])
- deallocate(recvBuff)
- end do
-
- deallocate(recvBuffPtr)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
- deallocate(sendBuff)
- end do
-
- deallocate(sendBuffPtr)
-
- deallocate(hostBuff)
-
- deallocate(seed)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamDestroy(stream(i))
- end do
-
- deallocate(stream)
-
- do i = 1, nDev
- call ncclCommDestroy(comm(i))
- end do
-
- deallocate(devList)
- deallocate(comm)
-
-end program test
diff --git a/fortran/test/allgather_ptr_out.f90 b/fortran/test/allgather_ptr_out.f90
deleted file mode 100644
index f7d1962..0000000
--- a/fortran/test/allgather_ptr_out.f90
+++ /dev/null
@@ -1,171 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev
-type(ncclDataType) :: dataType
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable, target :: hostBuff(:, :)
-type(c_ptr), allocatable :: hostBuffPtr(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
- nEl = 2621440
-
-! nDev = 2
- stat = cudaGetDeviceCount(nDev)
-
- dataType = ncclFloat
-
- allocate(comm(nDev))
- allocate(devList(nDev))
-
- do i = 1, nDev
- devList(i) = i - 1
- end do
-
- res = ncclCommInitAll(comm, nDev, devList)
-
- do i = 1, nDev
- res = ncclCommCuDevice(comm(i), cudaDev)
- res = ncclCommUserRank(comm(i), rank)
- end do
-
- allocate(stream(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamCreate(stream(i))
- end do
-
- call date_and_time(values = time)
- call random_seed(size = i)
- allocate(seed(i))
- call random_seed(get = seed)
- seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
- call random_seed(put = seed)
-
- allocate(hostBuff(nEl * nDev, nDev + 1))
-
- call random_number(hostBuff)
-
- print "(a)", "before allgather:"
- do i = 1, nDev
- err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
- end do
-
- allocate(hostBuffPtr(nDev))
-
- do i = 1, nDev
- hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
- end do
-
- allocate(sendBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
- stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
- end do
-
- do i = 1, nDev
- hostBuffPtr(i) = c_loc(hostBuff(1, i))
- end do
-
- allocate(recvBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
- stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- res = ncclAllGather(sendBuffPtr(i), nEl, dataType, recvBuffPtr(i), comm(i), stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamSynchronize(stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
- end do
-
- print "(a)", ""
- print "(a)", "after allgather:"
- do i = 1, nDev
- err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs sendbuff = ", err
- end do
-
- do i = 1, nDev
- hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, 1))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
- end do
-
- err = maxval(abs(hostBuff(:, 1) / hostBuff(:, nDev + 1) - 1.0_real32))
- print "(a)", ""
- print "(a, e11.4e2)", "maximum error in sendbuff = ", err
- print "(a)", ""
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaFree(recvBuffPtr(i))
- end do
-
- deallocate(recvBuffPtr)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaFree(sendBuffPtr(i))
- end do
-
- deallocate(sendBuffPtr)
-
- deallocate(hostBuffPtr)
-
- deallocate(hostBuff)
-
- deallocate(seed)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamDestroy(stream(i))
- end do
-
- deallocate(stream)
-
- do i = 1, nDev
- call ncclCommDestroy(comm(i))
- end do
-
- deallocate(devList)
- deallocate(comm)
-
-end program test
diff --git a/fortran/test/allreduce_arr_out.f90 b/fortran/test/allreduce_arr_out.f90
deleted file mode 100644
index 50c1b64..0000000
--- a/fortran/test/allreduce_arr_out.f90
+++ /dev/null
@@ -1,165 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev
-type(ncclDataType) :: dataType
-type(ncclRedOp) :: redOp
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable :: hostBuff(:, :)
-real(real32), allocatable, device :: sendBuff(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-real(real32), allocatable, device :: recvBuff(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
- nEl = 2621440
-
-! nDev = 2
- stat = cudaGetDeviceCount(nDev)
-
- dataType = ncclFloat
- redOp = ncclProd
-
- allocate(comm(nDev))
- allocate(devList(nDev))
-
- do i = 1, nDev
- devList(i) = i - 1
- end do
-
- res = ncclCommInitAll(comm, nDev, devList)
-
- do i = 1, nDev
- res = ncclCommCuDevice(comm(i), cudaDev)
- res = ncclCommUserRank(comm(i), rank)
- end do
-
- allocate(stream(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamCreate(stream(i))
- end do
-
- call date_and_time(values = time)
- call random_seed(size = i)
- allocate(seed(i))
- call random_seed(get = seed)
- seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
- call random_seed(put = seed)
-
- allocate(hostBuff(nEl, nDev + 2))
-
- call random_number(hostBuff(:, 1:nDev + 1))
-
- hostBuff(:, nDev + 2) = hostBuff(:, 1)
- do i = 2, nDev
- hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
- end do
-
- print "(a)", "before allreduce:"
- do i = 1, nDev
- err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
- end do
-
- allocate(sendBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- allocate(sendBuff(nEl))
- sendBuffPtr(i) = c_devloc(sendBuff)
- sendBuff = hostBuff(:, i)
- end do
-
- allocate(recvBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- allocate(recvBuff(nEl))
- recvBuffPtr(i) = c_devloc(recvBuff)
- recvBuff = hostBuff(:, i)
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamSynchronize(stream(i))
- end do
-
- print "(a)", ""
- print "(a)", "after allreduce:"
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
- hostBuff(:, nDev + 1) = recvBuff
- err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
- end do
-
- print "(a)", ""
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
- hostBuff(:, nDev + 1) = sendBuff
- err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
- end do
- print "(a)", ""
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
- deallocate(recvBuff)
- end do
-
- deallocate(recvBuffPtr)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
- deallocate(sendBuff)
- end do
-
- deallocate(sendBuffPtr)
-
- deallocate(hostBuff)
-
- deallocate(seed)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamDestroy(stream(i))
- end do
-
- deallocate(stream)
-
- do i = 1, nDev
- call ncclCommDestroy(comm(i))
- end do
-
- deallocate(devList)
- deallocate(comm)
-
-end program test
diff --git a/fortran/test/allreduce_ptr_out.f90 b/fortran/test/allreduce_ptr_out.f90
deleted file mode 100644
index 2c1248f..0000000
--- a/fortran/test/allreduce_ptr_out.f90
+++ /dev/null
@@ -1,166 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev
-type(ncclDataType) :: dataType
-type(ncclRedOp) :: redOp
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable, target :: hostBuff(:, :)
-type(c_ptr), allocatable :: hostBuffPtr(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
- nEl = 2621440
-
-! nDev = 2
- stat = cudaGetDeviceCount(nDev)
-
- dataType = ncclFloat
- redOp = ncclProd
-
- allocate(comm(nDev))
- allocate(devList(nDev))
-
- do i = 1, nDev
- devList(i) = i - 1
- end do
-
- res = ncclCommInitAll(comm, nDev, devList)
-
- do i = 1, nDev
- res = ncclCommCuDevice(comm(i), cudaDev)
- res = ncclCommUserRank(comm(i), rank)
- end do
-
- allocate(stream(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamCreate(stream(i))
- end do
-
- call date_and_time(values = time)
- call random_seed(size = i)
- allocate(seed(i))
- call random_seed(get = seed)
- seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
- call random_seed(put = seed)
-
- allocate(hostBuff(nEl, nDev + 2))
-
- call random_number(hostBuff(:, 1:nDev + 1))
-
- hostBuff(:, nDev + 2) = hostBuff(:, 1)
- do i = 2, nDev
- hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
- end do
-
- print "(a)", "before allreduce:"
- do i = 1, nDev
- err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 2) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
- end do
-
- allocate(hostBuffPtr(nDev + 1))
-
- do i = 1, nDev + 1
- hostBuffPtr(i) = c_loc(hostBuff(1, i))
- end do
-
- allocate(sendBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
- stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
- end do
-
- allocate(recvBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
- stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- res = ncclAllReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamSynchronize(stream(i))
- end do
-
- print "(a)", ""
- print "(a)", "after allreduce:"
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
- err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
- end do
-
- print "(a)", ""
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
- err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
- end do
- print "(a)", ""
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaFree(recvBuffPtr(i))
- end do
-
- deallocate(recvBuffPtr)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaFree(sendBuffPtr(i))
- end do
-
- deallocate(sendBuffPtr)
-
- deallocate(hostBuffPtr)
-
- deallocate(hostBuff)
-
- deallocate(seed)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamDestroy(stream(i))
- end do
-
- deallocate(stream)
-
- do i = 1, nDev
- call ncclCommDestroy(comm(i))
- end do
-
- deallocate(devList)
- deallocate(comm)
-
-end program test
diff --git a/fortran/test/broadcast_arr.f90 b/fortran/test/broadcast_arr.f90
deleted file mode 100644
index 867fa1a..0000000
--- a/fortran/test/broadcast_arr.f90
+++ /dev/null
@@ -1,137 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev, root
-type(ncclDataType) :: dataType
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable :: hostBuff(:, :)
-real(real32), allocatable, device :: devBuff(:)
-type(c_devptr), allocatable :: devBuffPtr(:)
-
- nEl = 2621440
-
-! nDev = 2
-! root = 0
- stat = cudaGetDeviceCount(nDev)
- root = nDev - 1
-
- dataType = ncclFloat
-
- allocate(comm(nDev))
- allocate(devList(nDev))
-
- do i = 1, nDev
- devList(i) = i - 1
- end do
-
- res = ncclCommInitAll(comm, nDev, devList)
-
- do i = 1, nDev
- res = ncclCommCuDevice(comm(i), cudaDev)
- res = ncclCommUserRank(comm(i), rank)
- end do
-
- allocate(stream(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamCreate(stream(i))
- end do
-
- call date_and_time(values = time)
- call random_seed(size = i)
- allocate(seed(i))
- call random_seed(get = seed)
- seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
- call random_seed(put = seed)
-
- allocate(hostBuff(nEl, nDev + 1))
-
- call random_number(hostBuff(:, 1:nDev))
-
- hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
-
- print "(a)", "before broadcast:"
- do i = 1, nDev
- err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
- print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
- end do
-
- allocate(devBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- allocate(devBuff(nEl))
- devBuffPtr(i) = c_devloc(devBuff)
- devBuff = hostBuff(:, i)
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamSynchronize(stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
- hostBuff(:, i) = devBuff
- end do
-
- print "(a)", ""
- print "(a)", "after broadcast:"
- do i = 1, nDev
- err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
- print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
- end do
- print "(a)", ""
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(devBuffPtr(i), devBuff, [nEl])
- deallocate(devBuff)
- end do
-
- deallocate(devBuffPtr)
-
- deallocate(hostBuff)
-
- deallocate(seed)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamDestroy(stream(i))
- end do
-
- deallocate(stream)
-
- do i = 1, nDev
- call ncclCommDestroy(comm(i))
- end do
-
- deallocate(devList)
- deallocate(comm)
-
-end program test
diff --git a/fortran/test/broadcast_ptr.f90 b/fortran/test/broadcast_ptr.f90
deleted file mode 100644
index 963afee..0000000
--- a/fortran/test/broadcast_ptr.f90
+++ /dev/null
@@ -1,142 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev, root
-type(ncclDataType) :: dataType
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable, target :: hostBuff(:, :)
-type(c_ptr), allocatable :: hostBuffPtr(:)
-type(c_devptr), allocatable :: devBuffPtr(:)
-
- nEl = 2621440
-
-! nDev = 2
-! root = 0
- stat = cudaGetDeviceCount(nDev)
- root = nDev - 1
-
- dataType = ncclFloat
-
- allocate(comm(nDev))
- allocate(devList(nDev))
-
- do i = 1, nDev
- devList(i) = i - 1
- end do
-
- res = ncclCommInitAll(comm, nDev, devList)
-
- do i = 1, nDev
- res = ncclCommCuDevice(comm(i), cudaDev)
- res = ncclCommUserRank(comm(i), rank)
- end do
-
- allocate(stream(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamCreate(stream(i))
- end do
-
- call date_and_time(values = time)
- call random_seed(size = i)
- allocate(seed(i))
- call random_seed(get = seed)
- seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
- call random_seed(put = seed)
-
- allocate(hostBuff(nEl, nDev + 1))
-
- call random_number(hostBuff(:, 1:nDev))
-
- hostBuff(:, nDev + 1) = hostBuff(:, root + 1)
-
- print "(a)", "before broadcast:"
- do i = 1, nDev
- err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
- print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
- end do
-
- allocate(hostBuffPtr(nDev))
-
- do i = 1, nDev
- hostBuffPtr(i) = c_loc(hostBuff(1, i))
- end do
-
- allocate(devBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMalloc(devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
- stat = cudaMemcpy(devBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- res = ncclBcast(devBuffPtr(i), nEl, dataType, root, comm(i), stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamSynchronize(stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMemcpy(hostBuffPtr(i), devBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
- end do
-
- print "(a)", ""
- print "(a)", "after broadcast:"
- do i = 1, nDev
- err = maxval(abs(hostBuff(:, i) / hostBuff(:, nDev + 1) - 1.0_real32))
- print "(a, i2.2, a, i2.2, a, e11.4e2)", "maximum error of rank ", i - 1, " vs root (rank ", root,") = ", err
- end do
- print "(a)", ""
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaFree(devBuffPtr(i))
- end do
-
- deallocate(devBuffPtr)
-
- deallocate(hostBuffPtr)
-
- deallocate(hostBuff)
-
- deallocate(seed)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamDestroy(stream(i))
- end do
-
- deallocate(stream)
-
- do i = 1, nDev
- call ncclCommDestroy(comm(i))
- end do
-
- deallocate(devList)
- deallocate(comm)
-
-end program test
diff --git a/fortran/test/reduce_arr_out.f90 b/fortran/test/reduce_arr_out.f90
deleted file mode 100644
index 17e41b4..0000000
--- a/fortran/test/reduce_arr_out.f90
+++ /dev/null
@@ -1,164 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev, root
-type(ncclDataType) :: dataType
-type(ncclRedOp) :: redOp
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable :: hostBuff(:, :)
-real(real32), allocatable, device :: sendBuff(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-real(real32), allocatable, device :: recvBuff(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
- nEl = 2621440
-
-! nDev = 2
-! root = 0
- stat = cudaGetDeviceCount(nDev)
- root = nDev - 1
-
- dataType = ncclFloat
- redOp = ncclProd
-
- allocate(comm(nDev))
- allocate(devList(nDev))
-
- do i = 1, nDev
- devList(i) = i - 1
- end do
-
- res = ncclCommInitAll(comm, nDev, devList)
-
- do i = 1, nDev
- res = ncclCommCuDevice(comm(i), cudaDev)
- res = ncclCommUserRank(comm(i), rank)
- end do
-
- allocate(stream(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamCreate(stream(i))
- end do
-
- call date_and_time(values = time)
- call random_seed(size = i)
- allocate(seed(i))
- call random_seed(get = seed)
- seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
- call random_seed(put = seed)
-
- allocate(hostBuff(nEl, nDev + 2))
-
- call random_number(hostBuff(:, 1:nDev + 1))
-
- hostBuff(:, nDev + 2) = hostBuff(:, 1)
- do i = 2, nDev
- hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
- end do
-
- print "(a)", "before reduce:"
- err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
-
- allocate(sendBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- allocate(sendBuff(nEl))
- sendBuffPtr(i) = c_devloc(sendBuff)
- sendBuff = hostBuff(:, i)
- end do
-
- allocate(recvBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- allocate(recvBuff(nEl))
- recvBuffPtr(i) = c_devloc(recvBuff)
- recvBuff = hostBuff(:, i)
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamSynchronize(stream(i))
- end do
-
- stat = cudaSetDevice(devList(root + 1))
- call c_f_pointer(recvBuffPtr(root + 1), recvBuff, [nEl])
- hostBuff(:, nDev + 1) = recvBuff
-
- print "(a)", ""
- print "(a)", "after reduce:"
- err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
-
- print "(a)", ""
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
- hostBuff(:, nDev + 1) = sendBuff
- err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
- end do
- print "(a)", ""
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
- deallocate(recvBuff)
- end do
-
- deallocate(recvBuffPtr)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl])
- deallocate(sendBuff)
- end do
-
- deallocate(sendBuffPtr)
-
- deallocate(hostBuff)
-
- deallocate(seed)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamDestroy(stream(i))
- end do
-
- deallocate(stream)
-
- do i = 1, nDev
- call ncclCommDestroy(comm(i))
- end do
-
- deallocate(devList)
- deallocate(comm)
-
-end program test
diff --git a/fortran/test/reduce_ptr_out.f90 b/fortran/test/reduce_ptr_out.f90
deleted file mode 100644
index 777f8ea..0000000
--- a/fortran/test/reduce_ptr_out.f90
+++ /dev/null
@@ -1,165 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev, root
-type(ncclDataType) :: dataType
-type(ncclRedOp) :: redOp
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable, target :: hostBuff(:, :)
-type(c_ptr), allocatable :: hostBuffPtr(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
- nEl = 2621440
-
-! nDev = 2
-! root = 0
- stat = cudaGetDeviceCount(nDev)
- root = nDev - 1
-
- dataType = ncclFloat
- redOp = ncclProd
-
- allocate(comm(nDev))
- allocate(devList(nDev))
-
- do i = 1, nDev
- devList(i) = i - 1
- end do
-
- res = ncclCommInitAll(comm, nDev, devList)
-
- do i = 1, nDev
- res = ncclCommCuDevice(comm(i), cudaDev)
- res = ncclCommUserRank(comm(i), rank)
- end do
-
- allocate(stream(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamCreate(stream(i))
- end do
-
- call date_and_time(values = time)
- call random_seed(size = i)
- allocate(seed(i))
- call random_seed(get = seed)
- seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
- call random_seed(put = seed)
-
- allocate(hostBuff(nEl, nDev + 2))
-
- call random_number(hostBuff(:, 1:nDev + 1))
-
- hostBuff(:, nDev + 2) = hostBuff(:, 1)
- do i = 2, nDev
- hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
- end do
-
- print "(a)", "before reduce:"
- err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
-
- allocate(hostBuffPtr(nDev + 1))
-
- do i = 1, nDev + 1
- hostBuffPtr(i) = c_loc(hostBuff(1, i))
- end do
-
- allocate(sendBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
- stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
- end do
-
- allocate(recvBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
- stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- res = ncclReduce(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, root, comm(i), stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamSynchronize(stream(i))
- end do
-
- stat = cudaSetDevice(devList(root + 1))
- stat = cudaMemcpy(hostBuffPtr(nDev + 1), recvBuffPtr(root + 1), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
-
- print "(a)", ""
- print "(a)", "after reduce:"
- err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, nDev + 2) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from root (rank ", root,") = ", err
-
- print "(a)", ""
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMemcpy(hostBuffPtr(nDev + 1), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
- err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
- end do
- print "(a)", ""
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaFree(recvBuffPtr(i))
- end do
-
- deallocate(recvBuffPtr)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaFree(sendBuffPtr(i))
- end do
-
- deallocate(sendBuffPtr)
-
- deallocate(hostBuffPtr)
-
- deallocate(hostBuff)
-
- deallocate(seed)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamDestroy(stream(i))
- end do
-
- deallocate(stream)
-
- do i = 1, nDev
- call ncclCommDestroy(comm(i))
- end do
-
- deallocate(devList)
- deallocate(comm)
-
-end program test
diff --git a/fortran/test/reducescatter_arr_out.f90 b/fortran/test/reducescatter_arr_out.f90
deleted file mode 100644
index 6a976da..0000000
--- a/fortran/test/reducescatter_arr_out.f90
+++ /dev/null
@@ -1,165 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev
-type(ncclDataType) :: dataType
-type(ncclRedOp) :: redOp
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable :: hostBuff(:, :)
-real(real32), allocatable, device :: sendBuff(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-real(real32), allocatable, device :: recvBuff(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
- nEl = 2621440
-
-! nDev = 2
- stat = cudaGetDeviceCount(nDev)
-
- dataType = ncclFloat
- redOp = ncclProd
-
- allocate(comm(nDev))
- allocate(devList(nDev))
-
- do i = 1, nDev
- devList(i) = i - 1
- end do
-
- res = ncclCommInitAll(comm, nDev, devList)
-
- do i = 1, nDev
- res = ncclCommCuDevice(comm(i), cudaDev)
- res = ncclCommUserRank(comm(i), rank)
- end do
-
- allocate(stream(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamCreate(stream(i))
- end do
-
- call date_and_time(values = time)
- call random_seed(size = i)
- allocate(seed(i))
- call random_seed(get = seed)
- seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
- call random_seed(put = seed)
-
- allocate(hostBuff(nEl * nDev, nDev + 2))
-
- call random_number(hostBuff(:, 1:nDev + 1))
-
- hostBuff(:, nDev + 2) = hostBuff(:, 1)
- do i = 2, nDev
- hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
- end do
-
- print "(a)", "before reducescatter:"
- do i = 1, nDev
- err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
- end do
-
- allocate(sendBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- allocate(sendBuff(nEl * nDev))
- sendBuffPtr(i) = c_devloc(sendBuff)
- sendBuff = hostBuff(:, i)
- end do
-
- allocate(recvBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- allocate(recvBuff(nEl))
- recvBuffPtr(i) = c_devloc(recvBuff)
- recvBuff = hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1)
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamSynchronize(stream(i))
- end do
-
- print "(a)", ""
- print "(a)", "after reducescatter:"
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
- hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) = recvBuff
- err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
- end do
-
- print "(a)", ""
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
- hostBuff(:, nDev + 1) = sendBuff
- err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
- end do
- print "(a)", ""
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(recvBuffPtr(i), recvBuff, [nEl])
- deallocate(recvBuff)
- end do
-
- deallocate(recvBuffPtr)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- call c_f_pointer(sendBuffPtr(i), sendBuff, [nEl * nDev])
- deallocate(sendBuff)
- end do
-
- deallocate(sendBuffPtr)
-
- deallocate(hostBuff)
-
- deallocate(seed)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamDestroy(stream(i))
- end do
-
- deallocate(stream)
-
- do i = 1, nDev
- call ncclCommDestroy(comm(i))
- end do
-
- deallocate(devList)
- deallocate(comm)
-
-end program test
diff --git a/fortran/test/reducescatter_ptr_out.f90 b/fortran/test/reducescatter_ptr_out.f90
deleted file mode 100644
index 9df35bf..0000000
--- a/fortran/test/reducescatter_ptr_out.f90
+++ /dev/null
@@ -1,174 +0,0 @@
-!*************************************************************************
-!* Copyright (c) 2016 Research Computing Services (RCS), University of
-!* Cambridge. All rights reserved.
-!*
-!* See LICENSE.txt for license information
-!*************************************************************************
-
-program test
-use iso_c_binding
-use iso_fortran_env
-use cudaFor
-use ncclFor
-implicit none
-integer(int32) :: stat, i
-real(real32) :: err
-integer(int32) :: nEl, nDev
-type(ncclDataType) :: dataType
-type(ncclRedOp) :: redOp
-type(ncclComm), allocatable :: comm(:)
-integer(int32), allocatable :: devList(:)
-type(ncclResult) :: res
-integer(int32) :: cudaDev, rank
-integer(cuda_stream_kind), allocatable :: stream(:)
-integer(int32) :: time(8)
-integer(int32), allocatable :: seed(:)
-real(real32), allocatable, target :: hostBuff(:, :)
-type(c_ptr), allocatable :: hostBuffPtr(:)
-type(c_devptr), allocatable :: sendBuffPtr(:)
-type(c_devptr), allocatable :: recvBuffPtr(:)
-
- nEl = 2621440
-
-! nDev = 2
- stat = cudaGetDeviceCount(nDev)
-
- dataType = ncclFloat
- redOp = ncclProd
-
- allocate(comm(nDev))
- allocate(devList(nDev))
-
- do i = 1, nDev
- devList(i) = i - 1
- end do
-
- res = ncclCommInitAll(comm, nDev, devList)
-
- do i = 1, nDev
- res = ncclCommCuDevice(comm(i), cudaDev)
- res = ncclCommUserRank(comm(i), rank)
- end do
-
- allocate(stream(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamCreate(stream(i))
- end do
-
- call date_and_time(values = time)
- call random_seed(size = i)
- allocate(seed(i))
- call random_seed(get = seed)
- seed = 60 * 60 * 1000 * time(5) + 60 * 1000 * time(6) + 1000 * time(7) + time(8) - seed
- call random_seed(put = seed)
-
- allocate(hostBuff(nEl * nDev, nDev + 2))
-
- call random_number(hostBuff(:, 1:nDev + 1))
-
- hostBuff(:, nDev + 2) = hostBuff(:, 1)
- do i = 2, nDev
- hostBuff(:, nDev + 2) = hostBuff(:, nDev + 2) * hostBuff(:, i)
- end do
-
- print "(a)", "before reducescatter:"
- do i = 1, nDev
- err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
- end do
-
- allocate(hostBuffPtr(nDev + 1))
-
- do i = 1, nDev + 1
- hostBuffPtr(i) = c_loc(hostBuff(1, i))
- end do
-
- allocate(sendBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMalloc(sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev)
- stat = cudaMemcpy(sendBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyHostToDevice)
- end do
-
- do i = 1, nDev
- hostBuffPtr(i) = c_loc(hostBuff((i - 1) * nEl + 1, nDev + 1))
- end do
-
- allocate(recvBuffPtr(nDev))
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMalloc(recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)))
- stat = cudaMemcpy(recvBuffPtr(i), hostBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyHostToDevice)
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- res = ncclReduceScatter(sendBuffPtr(i), recvBuffPtr(i), nEl, dataType, redOp, comm(i), stream(i))
- end do
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamSynchronize(stream(i))
- end do
-
- print "(a)", ""
- print "(a)", "after reduceScatter:"
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMemcpy(hostBuffPtr(i), recvBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)), cudaMemcpyDeviceToHost)
- err = maxval(abs(hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 1) / hostBuff((i - 1) * nEl + 1:i * nEl, nDev + 2) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in recvbuff from rank ", i - 1," = ", err
- end do
-
- do i = 1, nDev + 1
- hostBuffPtr(i) = c_loc(hostBuff(1, nDev + 1))
- end do
-
- print "(a)", ""
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaMemcpy(hostBuffPtr(i), sendBuffPtr(i), nEl * c_sizeof(hostBuff(1, 1)) * nDev, cudaMemcpyDeviceToHost)
- err = maxval(abs(hostBuff(:, nDev + 1) / hostBuff(:, i) - 1.0_real32))
- print "(a, i2.2, a, e11.4e2)", "maximum error in sendbuff of rank ", i - 1," = ", err
- end do
- print "(a)", ""
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaFree(recvBuffPtr(i))
- end do
-
- deallocate(recvBuffPtr)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaFree(sendBuffPtr(i))
- end do
-
- deallocate(sendBuffPtr)
-
- deallocate(hostBuffPtr)
-
- deallocate(hostBuff)
-
- deallocate(seed)
-
- do i = 1, nDev
- stat = cudaSetDevice(devList(i))
- stat = cudaStreamDestroy(stream(i))
- end do
-
- deallocate(stream)
-
- do i = 1, nDev
- call ncclCommDestroy(comm(i))
- end do
-
- deallocate(devList)
- deallocate(comm)
-
-end program test
diff --git a/makefiles/common.mk b/makefiles/common.mk
new file mode 100644
index 0000000..83a2a39
--- /dev/null
+++ b/makefiles/common.mk
@@ -0,0 +1,87 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+CUDA_HOME ?= /usr/local/cuda
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+KEEP ?= 0
+DEBUG ?= 0
+TRACE ?= 0
+PROFAPI ?= 0
+
+NVCC = $(CUDA_HOME)/bin/nvcc
+
+CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
+CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
+CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
+#$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR})
+
+
+# Better define NVCC_GENCODE in your environment to the minimal set
+# of archs to reduce compile time.
+CUDA8_GENCODE = -gencode=arch=compute_30,code=sm_30 \
+ -gencode=arch=compute_35,code=sm_35 \
+ -gencode=arch=compute_50,code=sm_50 \
+ -gencode=arch=compute_60,code=sm_60 \
+ -gencode=arch=compute_61,code=sm_61
+CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
+
+CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
+CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
+
+# Include Volta support if we're using CUDA9 or above
+ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0)
+ NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
+else
+ NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
+endif
+#$(info NVCC_GENCODE is ${NVCC_GENCODE})
+
+CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
+CXXFLAGS += -Wall -Wno-sign-compare
+NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
+# Use addprefix so that we can specify more than one path
+NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
+
+########## GCOV ##########
+GCOV ?= 0 # disabled by default.
+GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # coverage only when both GCOV and DEBUG are non-zero
+CXXFLAGS += ${GCOV_FLAGS}
+NVCUFLAGS += ${GCOV_FLAGS:%=-Xcompiler %}
+LDFLAGS += ${GCOV_FLAGS}
+NVLDFLAGS += ${GCOV_FLAGS:%=-Xcompiler %}
+# $(warning GCOV_FLAGS=${GCOV_FLAGS})
+########## GCOV ##########
+
+ifeq ($(DEBUG), 0)
+NVCUFLAGS += -O3
+CXXFLAGS += -O3 -g
+else
+NVCUFLAGS += -O0 -G -g
+CXXFLAGS += -O0 -g -ggdb3
+endif
+
+ifneq ($(VERBOSE), 0)
+NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
+CXXFLAGS += -Wall -Wextra
+else
+.SILENT:
+endif
+
+ifneq ($(TRACE), 0)
+CXXFLAGS += -DENABLE_TRACE
+endif
+
+ifneq ($(KEEP), 0)
+NVCUFLAGS += -keep
+endif
+
+ifneq ($(PROFAPI), 0)
+CXXFLAGS += -DPROFAPI
+endif
diff --git a/makefiles/formatting.mk b/makefiles/formatting.mk
new file mode 100644
index 0000000..4a4ab88
--- /dev/null
+++ b/makefiles/formatting.mk
@@ -0,0 +1,33 @@
+#
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting
+# As this file defines a new target (format), it should only be included after the definition of the
+# default target.
+
+ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none
+ASTYLEDIR := $(BUILDDIR)/contrib
+ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz
+ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle
+ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/
+ASTYLEVER := 3.1
+ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz"
+
+$(ASTYLEDIR) :
+ @mkdir -p $(ASTYLEDIR)
+
+$(ASTYLETAR) : $(ASTYLEDIR)
+ @wget -q -O $(ASTYLETAR) $(ASTYLEURL)
+
+$(ASTYLEBLD) : $(ASTYLETAR)
+ @cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR)
+
+$(ASTYLEBIN) : $(ASTYLEBLD)
+ ${MAKE} -C $(ASTYLEBLD)
+
+.PHONY : format
+format : $(ASTYLEBIN)
+ @$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT)
diff --git a/makefiles/version.mk b/makefiles/version.mk
new file mode 100644
index 0000000..c411280
--- /dev/null
+++ b/makefiles/version.mk
@@ -0,0 +1,6 @@
+##### version
+NCCL_MAJOR := 2
+NCCL_MINOR := 3
+NCCL_PATCH := 5
+NCCL_SUFFIX :=
+PKG_REVISION := 5
diff --git a/pkg/Makefile b/pkg/Makefile
new file mode 100644
index 0000000..04b23da
--- /dev/null
+++ b/pkg/Makefile
@@ -0,0 +1,26 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+.PHONY : all clean
+
+default : build
+build : debian.build txz.build
+
+BUILDDIR ?= $(abspath ../build)
+ABSBUILDDIR := $(abspath $(BUILDDIR))
+TARGETS := debian txz
+all: ${TARGETS:%=%.build}
+prep: ${TARGETS:%=%.prep}
+build: ${TARGETS:%=%.build}
+clean: ${TARGETS:%=%.clean}
+
+%.prep:
+ ${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR}
+
+%.build:
+ ${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR}
+
+%.clean:
+ ${MAKE} -C $* clean
diff --git a/debian/.gitignore b/pkg/debian/.gitignore
index 1e97a9f..1e97a9f 100644
--- a/debian/.gitignore
+++ b/pkg/debian/.gitignore
diff --git a/pkg/debian/Makefile b/pkg/debian/Makefile
new file mode 100644
index 0000000..439635f
--- /dev/null
+++ b/pkg/debian/Makefile
@@ -0,0 +1,58 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../makefiles/common.mk
+include ../../makefiles/version.mk
+BUILDDIR ?= $(abspath ../../build)
+DEBPREPDIR := $(BUILDDIR)/debian
+PKGDIR := $(BUILDDIR)/pkg/deb/
+
+DEBGEN_IN := $(wildcard *.in)
+DEBGEN := $(DEBGEN_IN:.in=)
+DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN)
+DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))
+
+PKG_TIMESTAMP := $(shell date -R)
+ARCH := $(shell uname -m)
+PKG_ARCH ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g")
+PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch)
+ifeq ($(PKG_MULTIARCH),)
+# Hardwire the PKG_MULTIARCH directory, as the RHEL6 distribution-agnostic compiler (gcc 4.8.3) doesn't set it
+PKG_MULTIARCH := $(ARCH)-linux-gnu
+endif
+
+prep : $(DEBTARGETS)
+ $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
+
+build : prep
+ $(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
+ @printf "Building Debian package\n"
+ (cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
+ mkdir -p $(PKGDIR)
+ mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/
+
+clean:
+ rm -Rf $(DEBPREPDIR) $(PKGDIR)
+
+$(DEBPREPDIR)/% : %.in
+ @printf "Generating %-35s > %s\n" $< $@
+ mkdir -p $(DEBPREPDIR)
+ sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+ -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
+ -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
+ -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
+ -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
+ -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
+ -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
+ $< > $@
+
+$(DEBPREPDIR)/% : %
+ @printf "Grabbing %-35s > %s\n" $< $@
+ mkdir -p $(DEBPREPDIR)
+ cp -f $< $@
diff --git a/pkg/debian/changelog.in b/pkg/debian/changelog.in
new file mode 100644
index 0000000..f280ae3
--- /dev/null
+++ b/pkg/debian/changelog.in
@@ -0,0 +1,5 @@
+nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
+
+ * Automatic Debian package from build
+
+ -- cudatools <cudatools@nvidia.com> ${pkg:Timestamp}
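As a worked example of the template substitution performed by pkg/debian/Makefile: with the values from makefiles/version.mk (2.3.5, package revision 5) and, hypothetically, a CUDA 9.2 toolchain, the first line of the generated changelog would read "nccl (2.3.5-5+cuda9.2) trusty; urgency=medium", and ${pkg:Timestamp} is replaced by the date -R output captured at build time.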
diff --git a/debian/compat b/pkg/debian/compat
index ec63514..ec63514 100644
--- a/debian/compat
+++ b/pkg/debian/compat
diff --git a/debian/control.in b/pkg/debian/control.in
index e5ca48e..c8f5266 100644
--- a/debian/control.in
+++ b/pkg/debian/control.in
@@ -7,22 +7,24 @@ Standards-Version: 3.9.5
Package: libnccl${nccl:Major}
Section: libs
-Architecture: ${deb:Arch}
+Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}
Description: NVIDIA Collectives Communication Library (NCCL) Runtime
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
- communication routines for GPUs, such as all-gather, reduce, broadcast, etc.,
- that have been optimized to achieve high bandwidth over PCIe. NCCL supports up
- to eight GPUs and can be used in either single- or multi-process (e.g., MPI)
- applications.
+ communication routines for GPUs, implementing all-reduce, all-gather, reduce,
+ broadcast, and reduce-scatter.
+ It has been optimized to achieve high bandwidth on any platform using PCIe,
+ NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
+ sockets.
Package: libnccl-dev
Section: libdevel
-Architecture: ${deb:Arch}
+Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
Description: NVIDIA Collectives Communication Library (NCCL) Development Files
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
- communication routines for GPUs, such as all-gather, reduce, broadcast, etc.,
- that have been optimized to achieve high bandwidth over PCIe. NCCL supports up
- to eight GPUs and can be used in either single- or multi-process (e.g., MPI)
- applications.
+ communication routines for GPUs, implementing all-reduce, all-gather, reduce,
+ broadcast, and reduce-scatter.
+ It has been optimized to achieve high bandwidth on any platform using PCIe,
+ NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
+ sockets.
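The description above lists the collectives NCCL exposes through its C API. For reference only, here is a hypothetical sketch (not part of this change) of a single-process, multi-GPU all-reduce against that API, assuming NCCL 2.x headers and at least one CUDA device; error checking is omitted for brevity.

/*
 * Hypothetical usage sketch: single-process, multi-GPU all-reduce with the
 * NCCL C API. Not part of this commit; error checking omitted for brevity.
 */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <nccl.h>

int main(void) {
  int nDev = 0;
  cudaGetDeviceCount(&nDev);

  ncclComm_t   *comms   = (ncclComm_t *)  malloc(nDev * sizeof(ncclComm_t));
  cudaStream_t *streams = (cudaStream_t *)malloc(nDev * sizeof(cudaStream_t));
  float       **sendbuf = (float **)      malloc(nDev * sizeof(float *));
  float       **recvbuf = (float **)      malloc(nDev * sizeof(float *));
  const size_t count = 1 << 20;  /* elements per device */

  /* One communicator per visible device, all owned by this process;
     a NULL device list means devices 0..nDev-1. */
  ncclCommInitAll(comms, nDev, NULL);

  for (int i = 0; i < nDev; i++) {
    cudaSetDevice(i);
    cudaStreamCreate(&streams[i]);
    cudaMalloc((void **)&sendbuf[i], count * sizeof(float));
    cudaMalloc((void **)&recvbuf[i], count * sizeof(float));
    cudaMemset(sendbuf[i], 0, count * sizeof(float));
  }

  /* Group the per-device calls so a single thread can drive all devices. */
  ncclGroupStart();
  for (int i = 0; i < nDev; i++)
    ncclAllReduce(sendbuf[i], recvbuf[i], count, ncclFloat, ncclSum,
                  comms[i], streams[i]);
  ncclGroupEnd();

  /* Wait for completion on every device. */
  for (int i = 0; i < nDev; i++) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
  }

  /* Release device resources and communicators. */
  for (int i = 0; i < nDev; i++) {
    cudaSetDevice(i);
    cudaFree(sendbuf[i]);
    cudaFree(recvbuf[i]);
    cudaStreamDestroy(streams[i]);
    ncclCommDestroy(comms[i]);
  }
  free(sendbuf); free(recvbuf); free(streams); free(comms);
  printf("all-reduce done on %d device(s)\n", nDev);
  return 0;
}

The removed fortran/test programs earlier in this diff follow the same pattern: create one communicator and stream per device, launch the collective on every device, synchronize the streams, then verify the result on the host.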
diff --git a/pkg/debian/copyright b/pkg/debian/copyright
new file mode 120000
index 0000000..1ef648f
--- /dev/null
+++ b/pkg/debian/copyright
@@ -0,0 +1 @@
+../../LICENSE.txt \ No newline at end of file
diff --git a/pkg/debian/gbp.conf b/pkg/debian/gbp.conf
new file mode 100644
index 0000000..9a9d7ca
--- /dev/null
+++ b/pkg/debian/gbp.conf
@@ -0,0 +1,9 @@
+[DEFAULT]
+debian-branch = master
+upstream-branch = master
+
+ignore-new = True
+
+[git-buildpackage]
+
+no-purge = True
diff --git a/pkg/debian/libnccl-dev.install.in b/pkg/debian/libnccl-dev.install.in
new file mode 100644
index 0000000..9cedf3e
--- /dev/null
+++ b/pkg/debian/libnccl-dev.install.in
@@ -0,0 +1,3 @@
+include/nccl.h /usr/include
+lib/libnccl.so /usr/lib/${pkg:MultiArch}
+lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
diff --git a/pkg/debian/libnccl2.install.in b/pkg/debian/libnccl2.install.in
new file mode 100644
index 0000000..7c5367c
--- /dev/null
+++ b/pkg/debian/libnccl2.install.in
@@ -0,0 +1,2 @@
+lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch}
+lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch}
diff --git a/debian/rules b/pkg/debian/rules
index 23b90a9..23b90a9 100755
--- a/debian/rules
+++ b/pkg/debian/rules
diff --git a/debian/source/format b/pkg/debian/source/format
index 89ae9db..89ae9db 100644
--- a/debian/source/format
+++ b/pkg/debian/source/format
diff --git a/pkg/redhat/Makefile b/pkg/redhat/Makefile
new file mode 100644
index 0000000..ffcc973
--- /dev/null
+++ b/pkg/redhat/Makefile
@@ -0,0 +1,62 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../makefiles/common.mk
+include ../../makefiles/version.mk
+BUILDDIR ?= $(abspath ../../build)
+RPMPREPDIR := $(BUILDDIR)/redhat
+PKGDIR := $(BUILDDIR)/pkg/rpm/
+
+RPMGEN_IN := $(wildcard *.in)
+RPMGEN := $(RPMGEN_IN:.in=)
+RPMFILES := $(RPMGEN)
+RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES))
+
+PKG_TIMESTAMP := $(shell date -R)
+ARCH := $(shell uname -m)
+PKG_ARCH ?= $(shell uname -m)
+PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch)
+ifeq ($(PKG_MULTIARCH),)
+# Hardwire the PKG_MULTIARCH directory, as the RHEL6 distribution-agnostic compiler (gcc 4.8.3) doesn't set it
+PKG_MULTIARCH := $(ARCH)-linux-gnu
+endif
+
+prep : $(RPMTARGETS)
+ $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
+
+build : prep
+ $(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
+ $(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR)
+ @printf "Building Redhat package\n"
+ mkdir -p $(PKGDIR)
+ rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \
+ --define "_rpmdir $(PKGDIR)" \
+ --define "_builddir $(PKGDIR)/build/" \
+ --define "_buildrootdir $(PKGDIR)/buildroot/" \
+ -bb $(BUILDDIR)/redhat/nccl.spec
+
+clean:
+ rm -Rf $(RPMPREPDIR) $(PKGDIR)
+
+$(RPMPREPDIR)/% : %.in
+ @printf "Generating %-35s > %s\n" $< $@
+ mkdir -p $(RPMPREPDIR)
+ sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+ -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
+ -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
+ -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
+ -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
+ -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
+ -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
+ $< > $@
+
+$(RPMPREPDIR)/% : %
+ @printf "Grabbing %-35s > %s\n" $< $@
+ mkdir -p $(RPMPREPDIR)
+ cp -f $< $@
diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in
new file mode 100644
index 0000000..65a2c60
--- /dev/null
+++ b/pkg/redhat/nccl.spec.in
@@ -0,0 +1,73 @@
+Name: libnccl
+Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
+Release: ${pkg:Revision}
+Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
+
+Group: Development/Libraries
+License: BSD
+URL: http://developer.nvidia.com/nccl
+Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz
+
+%description
+NCCL (pronounced "Nickel") is a stand-alone library of standard collective
+communication routines for GPUs, implementing all-reduce, all-gather, reduce,
+broadcast, and reduce-scatter.
+It has been optimized to achieve high bandwidth on any platform using PCIe,
+NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
+sockets.
+
+%package devel
+Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
+Group: Development/Libraries
+%description devel
+NCCL development files
+
+%package static
+Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
+Group: Development/Libraries
+%description static
+NCCL static library
+
+%define debug_package %{nil}
+
+%prep
+%setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q
+
+%build
+
+%install
+rm -rf $RPM_BUILD_ROOT
+install -m 755 -d $RPM_BUILD_ROOT
+install -m 755 -d $RPM_BUILD_ROOT/%{_libdir}
+install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}
+ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major}
+
+# devel
+install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
+install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
+ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
+
+# static
+install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir}
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files devel
+%doc LICENSE.txt
+%defattr(-,root,root,-)
+%{_includedir}/nccl.h
+%{_libdir}/libnccl.so
+
+%files static
+%doc LICENSE.txt
+%defattr(-,root,root,-)
+%{_libdir}/libnccl_static.a
+
+%files
+%doc LICENSE.txt
+%defattr(-,root,root,-)
+%{_libdir}/libnccl.so.${nccl:Major}
+%{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch}
+
+%changelog
diff --git a/pkg/srctxz/Makefile b/pkg/srctxz/Makefile
new file mode 100644
index 0000000..1cb7c06
--- /dev/null
+++ b/pkg/srctxz/Makefile
@@ -0,0 +1,39 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../makefiles/common.mk
+include ../../makefiles/version.mk
+BUILDDIR ?= $(abspath ../../build)
+TXZPREPDIR := $(BUILDDIR)/srctxz
+PKGDIR := $(BUILDDIR)/pkg/srctxz/
+
+TXZGEN_IN := $(wildcard *.in)
+TXZGEN := $(TXZGEN_IN:.in=)
+TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
+
+PKG_REVISION ?= 3
+PKG_ARCH := $(shell uname -m)
+
+prep: $(TXZTARGETS)
+
+build: prep
+ $(MAKE) -C ../../src clean
+ @printf "Building source tar.xz package\n"
+ (cd $(BUILDDIR); bash srctxz/create_srctxz.sh)
+ mkdir -p $(PKGDIR)
+ mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)
+
+clean:
+ rm -Rf $(TXZPREPDIR) $(PKGDIR)
+
+$(TXZPREPDIR)/% : %.in
+ @printf "Generating %-35s > %s\n" $< $@
+ mkdir -p $(TXZPREPDIR)
+ sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+ $< > $@
diff --git a/pkg/srctxz/create_srctxz.sh.in b/pkg/srctxz/create_srctxz.sh.in
new file mode 100644
index 0000000..0b8e6d2
--- /dev/null
+++ b/pkg/srctxz/create_srctxz.sh.in
@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# To run from $BUILDDIR/
+
+cd ..
+NCCLDIR=`basename $PWD`
+
+echo "Checking for unclean directory ..."
+git clean -x -i
+echo "Clean done."
+echo "Checking for uncommited files ..."
+if [ "`git status -s | wc -l`" != "0" ]; then
+ git status -s
+ echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)"
+ read
+fi
+
+cd ..
+NCCL_MAJOR=${nccl:Major}
+NCCL_MINOR=${nccl:Minor}
+NCCL_PATCH=${nccl:Patch}
+NCCL_SUFFIX=${nccl:Suffix}
+
+NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}"
+
+tar --exclude build \
+ --exclude ".git*" \
+ --exclude pkg/srctxz \
+ --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR
diff --git a/pkg/txz/Makefile b/pkg/txz/Makefile
new file mode 100644
index 0000000..fa587ef
--- /dev/null
+++ b/pkg/txz/Makefile
@@ -0,0 +1,43 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../makefiles/common.mk
+include ../../makefiles/version.mk
+BUILDDIR ?= $(abspath ../../build)
+TXZPREPDIR := $(BUILDDIR)/txz
+PKGDIR := $(BUILDDIR)/pkg/txz/
+
+TXZGEN_IN := $(wildcard *.in)
+TXZGEN := $(TXZGEN_IN:.in=)
+TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
+
+PKG_ARCH := $(shell uname -m)
+
+prep: $(TXZTARGETS)
+ $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
+
+build: prep
+ $(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
+ @printf "Building tar.xz package\n"
+ (cd $(BUILDDIR); bash txz/create_txz.sh)
+ mkdir -p $(PKGDIR)
+ mv $(BUILDDIR)/../nccl*.txz $(PKGDIR)
+
+clean:
+ rm -Rf $(TXZPREPDIR) $(PKGDIR)
+
+$(TXZPREPDIR)/% : %.in
+ @printf "Generating %-35s > %s\n" $< $@
+ mkdir -p $(TXZPREPDIR)
+ sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+ -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
+ -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
+ -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
+ -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
+ $< > $@
diff --git a/pkg/txz/create_txz.sh.in b/pkg/txz/create_txz.sh.in
new file mode 100644
index 0000000..73922e0
--- /dev/null
+++ b/pkg/txz/create_txz.sh.in
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# To run from $BUILDDIR/
+
+BUILDDIR=`basename $PWD`
+
+cd ..
+NCCL_MAJOR=${nccl:Major}
+NCCL_MINOR=${nccl:Minor}
+NCCL_PATCH=${nccl:Patch}
+NCCL_SUFFIX=${nccl:Suffix}
+CUDA_MAJOR=${cuda:Major}
+CUDA_MINOR=${cuda:Minor}
+PKG_REVISION=${pkg:Revision}
+PKG_ARCH=${pkg:Arch}
+
+NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}"
+
+tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt
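With the same version values and, for illustration, a CUDA 9.2 build on an x86_64 host, the archive produced by this script would be named nccl_2.3.5-5+cuda9.2_x86_64.txz, which matches the Source0 entry expected by pkg/redhat/nccl.spec.in.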
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..c0b674d
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,106 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../makefiles/common.mk
+include ../makefiles/version.mk
+
+##### src files
+INCEXPORTS := nccl.h
+LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
+ misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
+ transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
+ collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
+
+##### lib files
+LIBNAME := libnccl.so
+STATICLIBNAME := libnccl_static.a
+##### dirs
+BUILDDIR ?= $(abspath ../build)
+INCDIR := $(BUILDDIR)/include
+LIBDIR := $(BUILDDIR)/lib
+OBJDIR := $(BUILDDIR)/obj
+##### target files
+INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
+LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
+LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
+STATICLIBTARGET := $(STATICLIBNAME)
+LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
+DEPFILES := $(LIBOBJ:%.o=%.d)
+LDFLAGS += -L${CUDA_LIB} -lcudart_static -lrt
+
+DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a
+
+
+##### rules
+build : lib staticlib
+
+lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
+
+staticlib : $(LIBDIR)/$(STATICLIBTARGET)
+
+devicelib: nccl.h
+ $(MAKE) -C collectives/device
+
+-include $(DEPFILES)
+$(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
+
+nccl.h : nccl.h.in
+# NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
+ @$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
+ printf "Generating %-35s > %s\n" $< $@
+ sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+ -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
+ $< > $@
+
+$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib
+ @printf "Linking %-35s > %s\n" $(LIBTARGET) $@
+ mkdir -p $(LIBDIR)
+ $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
+ ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
+ ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
+
+$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
+ @printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
+ mkdir -p $(LIBDIR)
+ $(eval TMP := $(shell mktemp -d))
+ cp $(LIBOBJ) $(TMP)
+ cd $(TMP) && ar x $(DEVICELIB) && cd -
+ ar cr $@ $(LIBOBJ) $(TMP)/*.o
+ rm -Rf $(TMP)
+
+$(INCDIR)/%.h : %.h
+ @printf "Grabbing %-35s > %s\n" $< $@
+ mkdir -p $(INCDIR)
+ cp -f $< $@
+
+$(OBJDIR)/%.o : %.cu
+ @printf "Compiling %-35s > %s\n" $< $@
+ mkdir -p `dirname $@`
+ $(NVCC) -I. -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
+ @$(NVCC) -I. -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
+ @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
+ @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
+ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
+ @rm -f $(@:%.o=%.d.tmp)
+
+clean :
+ rm -rf ${INCDIR} ${LIBDIR} ${OBJDIR} nccl.h
+ $(MAKE) -C collectives/device clean
+
+install : lib
+ mkdir -p $(PREFIX)/lib
+ mkdir -p $(PREFIX)/include
+ cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
+ cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
+
+FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
+# Note that formatting.mk defines a new target, so to avoid overriding the default target it
+# shouldn't be included at the top. It also uses the above definition of FILESTOFORMAT as well
+# as the BUILDDIR variable.
+include ../makefiles/formatting.mk
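
A side note on the nccl.h rule above: the generated NCCL_VERSION integer follows the commented formula X*1000 + Y*100 + Z, which is what the printf "%d%d%02d" invocation produces for single-digit minor versions. A minimal C sketch of the same encoding (the 2/3/5 values are illustrative only, not read from version.mk):

    #include <stdio.h>

    int main(void) {
      int major = 2, minor = 3, patch = 5;               /* illustrative values only */
      int version = major * 1000 + minor * 100 + patch;  /* same formula as the Makefile comment */
      printf("%d%d%02d == %d\n", major, minor, patch, version);  /* prints "2305 == 2305" */
      return 0;
    }
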
diff --git a/src/all_gather.cu b/src/all_gather.cu
deleted file mode 100644
index cb36b71..0000000
--- a/src/all_gather.cu
+++ /dev/null
@@ -1,202 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "primitives.h"
-
-#define NUM_SUBSTEPS 2
-#define NUM_BUFCHUNKS 2
-
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
- step++; \
- poffset = noffset; \
- noffset += sliceSize; \
- if (noffset == buffSize) noffset = 0;
-
-#define ALIGN_SIZE(size, align) \
- size = ((size + (align) - 1) / (align)) * (align);
-
-template<int THREADS, int UNROLL, class FUNC, typename T>
-__launch_bounds__(THREADS+WARP_SIZE, 1)
-__global__ void AllGatherKernel(const KernelArgs<T> args) {
- const int tid = threadIdx.x;
- __shared__ T* sharedNextOutput;
- __shared__ DevRing<T> ring;
- bool pushrecv = args.pushrecv;
-
- LoadRing<THREADS>(args.ring, &ring);
- __syncthreads();
-
- if (tid == 0) {
- WaitFlag prevCommOp(ring.prevOpCounter, 0);
- WaitFlag nextCommOp(ring.nextOpCounter, 0);
- prevCommOp.wait(args.opIndex);
- nextCommOp.wait(args.opIndex);
- if (pushrecv) {
- *ring.sendPtrToPrev = (T*)args.ThisOutput;
- Wait([=] {
- return *ring.recvPtrFromNext != nullptr;
- });
- sharedNextOutput = *ring.recvPtrFromNext;
- *ring.recvPtrFromNext = nullptr;
- }
- }
- __syncthreads();
-
- WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS);
- PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS);
- PostFlag postReadyToNext(ring.sendFlagToNext, 0);
-
- typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T> Prims;
-
- const int size = args.N;
- const int nranks = args.nRanks;
- const int buffSize = args.buffSize / sizeof(T);
- const int sliceSize = buffSize / NUM_BUFCHUNKS;
-
- int step = 0;
- int poffset, noffset = 0;
-
- // Compute pointers
- const T * __restrict__ thisInput = args.ThisInput;
- T * __restrict__ thisOutput = args.ThisOutput;
- T * __restrict__ prevInput = ring.recvBuffer;
- T * __restrict__ nextOutput = ring.sendBuffer;
-
- for (int chunkOffset = 0; chunkOffset < size; chunkOffset += sliceSize) {
- /////////////// begin AllGather steps ///////////////
- int offset;
- int maxOffset = size-chunkOffset;
- int rankDest;
-
- // step 0: push data to next GPU
- rankDest = ring.userRank[0];
- offset = chunkOffset + rankDest * size;
-
- if (thisInput == thisOutput) {
- Prims::Copy(
- thisInput + offset,
- pushrecv ? sharedNextOutput + offset : nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
- } else {
- Prims::DoubleCopy(
- thisInput + chunkOffset,
- thisOutput + offset,
- pushrecv ? sharedNextOutput + offset : nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
- }
-
- NEXT_STEP; // Increases step, poffset, noffset
-
- // k-2 steps: copy to next GPU
- if (pushrecv) {
- for (int j=1; j<nranks-1; ++j) {
- rankDest = ring.userRank[nranks-j];
- offset = chunkOffset + rankDest * size;
-
- Prims::Copy(
- thisOutput + offset,
- sharedNextOutput + offset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
- } else {
- for (int j=1; j<nranks-1; ++j) {
- rankDest = ring.userRank[nranks-j];
- offset = chunkOffset + rankDest * size;
-
- Prims::DoubleCopy(
- prevInput + poffset,
- thisOutput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
-
- // Make final copy from buffer to dest.
- rankDest = ring.userRank[1];
- offset = chunkOffset + rankDest * size;
-
- // Here we need to copy from buffer to this output.
- Prims::Copy(
- prevInput + poffset,
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
- }
-
- // wait for the last data to be pushed to us
- if (tid == 0) {
- // Wait for last update from next then reset the flag
- waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
- *ring.recvFlagFromNext = 0;
-
- // Wait for last update from prev then reset the flag
- waitReadyFromPrev.wait(NUM_SUBSTEPS*(step+1));
- *ring.recvFlagFromPrev = 0;
-
- incrementOpCounter(&args);
- }
-}
-
-#define THREADS 512
-#define UNROLL 8
-
-template<class FUNC, typename T>
-ncclResult_t RingAllGather(const void* sendbuff, void* recvbuff,
- const int count, ncclComm* comm, cudaStream_t stream) {
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError);
- } else {
- KernelArgs<T> args;
- ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm);
- LAUNCH_KERNEL(AllGatherKernel, THREADS, UNROLL, FUNC, T, args, stream);
- }
-
- return ncclSuccess;
-}
-
-template<typename T, template<typename> class RedOp>
-class AllGather {
- public:
- static ncclResult_t entry(const void* sendbuff, void* recvbuff,
- int count, int /*root*/, ncclComm* comm, cudaStream_t stream) {
- return RingAllGather<RedOp<T>, T>(sendbuff, recvbuff, count, comm, stream);
- }
-};
-
-NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, int count, ncclDataType_t datatype,
- void* recvbuff, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
- void* recvbuff, ncclComm_t comm, cudaStream_t stream) {
- NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, "AllGather"));
- return enqueue<AllGather, FuncNull>(sendbuff, recvbuff, count, datatype, 0, comm, stream);
-}
-
diff --git a/src/all_reduce.cu b/src/all_reduce.cu
deleted file mode 100644
index 2f38d6e..0000000
--- a/src/all_reduce.cu
+++ /dev/null
@@ -1,234 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "primitives.h"
-
-#define NUM_SUBSTEPS 2
-#define NUM_BUFCHUNKS 2
-
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
- step++; \
- poffset = noffset; \
- noffset += sliceSize; \
- if (noffset == buffSize) noffset = 0;
-
-#define ALIGN_SIZE(size, align) \
- size = ((size + (align) - 1) / (align)) * (align);
-
-template<int THREADS, int UNROLL, class FUNC, typename T>
-__launch_bounds__(THREADS+WARP_SIZE, 1)
-__global__ void AllReduceKernel(const KernelArgs<T> args) {
- const int tid = threadIdx.x;
- __shared__ T* sharedNextOutput;
- __shared__ DevRing<T> ring;
- bool pushrecv = args.pushrecv;
-
- LoadRing<THREADS>(args.ring, &ring);
- __syncthreads();
-
- if (tid == 0) {
- WaitFlag prevCommOp(ring.prevOpCounter, 0);
- WaitFlag nextCommOp(ring.nextOpCounter, 0);
- prevCommOp.wait(args.opIndex);
- nextCommOp.wait(args.opIndex);
- if (pushrecv) {
- *ring.sendPtrToPrev = (T*)args.ThisOutput;
- Wait([=] {
- return *ring.recvPtrFromNext != nullptr;
- });
- sharedNextOutput = *ring.recvPtrFromNext;
- *ring.recvPtrFromNext = nullptr;
- }
- }
- __syncthreads();
-
- WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS);
- PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS);
- PostFlag postReadyToNext(ring.sendFlagToNext, 0);
-
- typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T, FUNC> Prims;
-
- const int size = args.N;
- const int nranks = args.nRanks;
- const int buffSize = args.buffSize / sizeof(T);
- const int sliceSize = buffSize / NUM_BUFCHUNKS;
-
- int step = 0;
- int poffset, noffset = 0;
-
- // Compute pointers
- const T * __restrict__ thisInput = args.ThisInput;
- T * __restrict__ thisOutput = args.ThisOutput;
- T * __restrict__ prevInput = ring.recvBuffer;
- T * __restrict__ nextOutput = ring.sendBuffer;
-
- for (int chunkOffset = 0; chunkOffset < size; chunkOffset += nranks*sliceSize) {
- /////////////// begin AllReduce steps ///////////////
- int offset;
- int maxOffset;
- int slice;
- int chunkSize = min(sliceSize, DIVUP(size-chunkOffset,nranks));
- ALIGN_SIZE(chunkSize, THREADS*UNROLL);
-
- // step 0: push data to next GPU
- slice = ring.userRank[nranks-1];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- Prims::Copy(
- thisInput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP; // Increases step, poffset, noffset
-
- // k-2 steps: reduce and copy to next GPU
- for (int j=2; j<nranks; ++j) {
- slice = ring.userRank[nranks-j];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- Prims::Reduce(
- prevInput + poffset,
- thisInput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
-
- // step k-1: reduce this buffer and data, which will produce the final
- // result that we store in this data and push to the next GPU
- slice = ring.userRank[0];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- Prims::ReduceCopy(
- prevInput + poffset,
- thisInput + offset,
- pushrecv ? (sharedNextOutput + offset) : (nextOutput + noffset),
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
-
- if (pushrecv) {
- // k-2 steps: copy result to next GPU
- for (int j=1; j<nranks-1; ++j) {
- slice = ring.userRank[nranks - j];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- Prims::Copy(
- thisOutput + offset,
- sharedNextOutput + offset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
- } else {
- // k-2 steps: copy result to next GPU
- for (int j=1; j<nranks-1; ++j) {
- slice = ring.userRank[nranks - j];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- Prims::DoubleCopy(
- prevInput + poffset,
- thisOutput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
-
- // Make final copy from buffer to dest.
- slice = ring.userRank[1];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- // Here we need to copy from buffer to this output.
- Prims::Copy(
- prevInput + poffset,
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
- }
-
- // wait for the last data to be pushed to us
- if (tid == 0) {
- // Wait for last update from next then reset the flag
- waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
- *ring.recvFlagFromNext = 0;
-
- // Wait for last update from prev then reset the flag
- waitReadyFromPrev.wait(NUM_SUBSTEPS*(step+1));
- *ring.recvFlagFromPrev = 0;
-
- incrementOpCounter(&args);
- }
-}
-
-#define THREADS 512
-#define UNROLL 8
-
-template<class FUNC, typename T>
-ncclResult_t RingAllReduce(const void* sendbuff, void* recvbuff,
- const int count, ncclComm* comm, cudaStream_t stream) {
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError);
- } else {
- KernelArgs<T> args;
- ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm);
- LAUNCH_KERNEL(AllReduceKernel, THREADS, UNROLL, FUNC, T, args, stream);
- }
-
- return ncclSuccess;
-}
-
-template<typename T, template <typename> class RedOp>
-class AllReduce {
- public:
- static ncclResult_t entry(const void* sendbuff, void* recvbuff,
- int count, int /*root*/, ncclComm* comm, cudaStream_t stream) {
- return RingAllReduce<RedOp<T>, T>(sendbuff, recvbuff, count, comm, stream);
- }
-};
-
-NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, int count,
- ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
- ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
- NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, datatype, op, 0, comm, "AllReduce"));
- return enqueue<AllReduce>(sendbuff, recvbuff, count, datatype, op, 0, comm, stream);
-}
-
diff --git a/src/bootstrap.cu b/src/bootstrap.cu
new file mode 100644
index 0000000..bfe525a
--- /dev/null
+++ b/src/bootstrap.cu
@@ -0,0 +1,292 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "core.h"
+#include "utils.h"
+#include "bootstrap.h"
+#include "net.h"
+#include <unistd.h>
+#include <sys/types.h>
+
+// Always use sockets for bootstrap
+ncclNet_t* ncclBootstrapNet = &ncclNetSocket;
+
+static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t bootstrapCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
+
+// Blocking send/recv helpers for bootstrap, built from the async isend/irecv + test calls, using host pointers.
+static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) {
+ void* request;
+ NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &request));
+ int done = 0;
+ while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
+ return ncclSuccess;
+}
+static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) {
+ void* request;
+ NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &request));
+ int done = 0;
+ while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
+ return ncclSuccess;
+}
+
+struct extId {
+ ncclNetHandle_t extHandle;
+ void* extListenComm;
+ uint64_t hostHash;
+ pid_t pid;
+ int fd;
+ pthread_t bootstrapThread;
+};
+
+struct bootstrapOp {
+ int op;
+ int size;
+};
+
+struct extInfo {
+ int rank;
+ int nranks;
+ ncclNetHandle_t extHandle;
+};
+
+enum {
+ BOOTSTRAP_ALLGATHER = 1,
+ BOOTSTRAP_RINGEXCHANGE,
+};
+
+#include <sys/resource.h>
+
+static ncclResult_t setFilesLimit() {
+ struct rlimit filesLimit;
+ SYSCHECK(getrlimit(RLIMIT_NOFILE, &filesLimit), "getrlimit");
+ filesLimit.rlim_cur = filesLimit.rlim_max;
+ SYSCHECK(setrlimit(RLIMIT_NOFILE, &filesLimit), "setrlimit");
+ return ncclSuccess;
+}
+
+static void *bootstrapRoot(void* commId) {
+ struct extInfo info;
+ struct extId* id = (struct extId*)commId;
+ struct bootstrapOp bop;
+ void **extSendComm = NULL;
+ void **extRecvComm = NULL;
+ int size, alloc_size = 0;
+ char* data = NULL;
+ ncclResult_t res;
+ setFilesLimit();
+
+ /* Receive addresses from all ranks */
+ int nranks = 0, c = 0;
+ do {
+ void* tmpRecvComm;
+ NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpRecvComm), res, out);
+ NCCLCHECKGOTO(bootstrapRecv(tmpRecvComm, &info, sizeof(info)), res, out);
+ if (!c) {
+ extSendComm = (void**)calloc(info.nranks, sizeof(void*));
+ extRecvComm = (void**)calloc(info.nranks, sizeof(void*));
+ if (extSendComm == NULL || extRecvComm == NULL) {
+ WARN("Bootstrap thread : failed to allocate memory");
+ goto out;
+ }
+ nranks = info.nranks;
+ }
+
+ if (nranks != info.nranks) {
+ WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks);
+ goto out;
+ }
+
+ extRecvComm[info.rank] = tmpRecvComm;
+ NCCLCHECKGOTO(bootstrapConnect(0, info.extHandle, extSendComm+info.rank), res, out);
+ c++;
+ } while (c < nranks);
+
+ do {
+ NCCLCHECKGOTO(bootstrapRecv(extRecvComm[0], &bop, sizeof(struct bootstrapOp)), res, out);
+ if (bop.size == -1) {
+ break;
+ } else {
+ size = bop.size;
+ if (size*nranks*2 > alloc_size) {
+ if (data) { free(data); data = NULL; }
+ NCCLCHECKGOTO(ncclCalloc(&data, size*nranks*2), res, out);
+ alloc_size = size*nranks*2;
+ }
+ }
+
+ if (bop.op == BOOTSTRAP_ALLGATHER) {
+ for (int r=0; r<nranks; r++) {
+ NCCLCHECKGOTO(bootstrapRecv(extRecvComm[r], data+size*r, size), res, out);
+ }
+
+ for (int r=0; r<nranks; r++) {
+ NCCLCHECKGOTO(bootstrapSend(extSendComm[r], data, size*nranks), res, out);
+ }
+ } else if (bop.op == BOOTSTRAP_RINGEXCHANGE) {
+ // Receive from all and build total table
+ for (int r=0; r<nranks; r++) {
+ NCCLCHECKGOTO(bootstrapRecv(extRecvComm[r], data+r*2*size, 2*size), res, out);
+ }
+
+ // Get prev/next request from everyone and answer.
+ for (int r=0; r<nranks; r++) {
+ int offset;
+ NCCLCHECKGOTO(bootstrapRecv(extRecvComm[r], &offset, sizeof(int)), res, out);
+ NCCLCHECKGOTO(bootstrapSend(extSendComm[r], data+offset, size), res, out);
+ NCCLCHECKGOTO(bootstrapRecv(extRecvComm[r], &offset, sizeof(int)), res, out);
+ NCCLCHECKGOTO(bootstrapSend(extSendComm[r], data+offset, size), res, out);
+ }
+ } else {
+ WARN("Bootstrap Root : invalid op type received %d", bop.op);
+ break;
+ }
+ } while (1);
+
+out:
+ bootstrapCloseListen(id->extListenComm);
+ for (int r=0; r<nranks; r++) {
+ if (extSendComm[r]) bootstrapCloseSend(extSendComm[r]);
+ if (extRecvComm[r]) bootstrapCloseRecv(extRecvComm[r]);
+ }
+ free(commId);
+ if (data) free(data);
+ if (extSendComm) free(extSendComm);
+ if (extRecvComm) free(extRecvComm);
+ return NULL;
+}
+
+ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
+ struct extId* id = (struct extId*)commId;
+ id->hostHash = getHostHash();
+ NCCLCHECK(bootstrapListen(idFromEnv ? dontCareIf : 0, &id->extHandle, &id->extListenComm));
+ ncclUniqueId* threadIdCopy;
+ NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
+ memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
+ pthread_create(&id->bootstrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
+ return ncclSuccess;
+}
+
+ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
+ static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
+ extId* id = (extId*)out;
+
+ char* env = getenv("NCCL_COMM_ID");
+ if (env) {
+ if (ncclSocketCreateHandle(&id->extHandle, env) != 0) {
+ WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
+ return ncclInvalidArgument;
+ }
+ id->pid = -1;
+ } else {
+ id->pid = getpid();
+ NCCLCHECK(bootstrapCreateRoot(out, false));
+ }
+
+ return ncclSuccess;
+}
+
+struct extState {
+ void* extRecvComm;
+ void* extSendComm;
+ int rank;
+ int nranks;
+};
+
+ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
+ struct extId* id = (struct extId*)commId;
+ bool idFromEnv = id->pid < 0;
+ struct extState* state;
+ NCCLCHECK(ncclCalloc(&state, 1));
+ state->rank = rank;
+ state->nranks = nranks;
+ *commState = state;
+
+ struct extInfo info;
+ info.rank = rank;
+ info.nranks = nranks;
+ void* tmpListenComm;
+ // Pass the remote address to listen via info
+ if (idFromEnv) {
+ memcpy(&info.extHandle, &id->extHandle, sizeof(ncclNetHandle_t));
+ }
+ // listen will return the local address via info ('findSubnetIf' indicates that the net device is unknown)
+ int dev = idFromEnv ? findSubnetIf : 0;
+ NCCLCHECK(bootstrapListen(dev, &info.extHandle, &tmpListenComm));
+ NCCLCHECK(bootstrapConnect(dev, id->extHandle, &state->extSendComm));
+ NCCLCHECK(bootstrapSend(state->extSendComm, &info, sizeof(info)));
+ NCCLCHECK(bootstrapAccept(tmpListenComm, &state->extRecvComm));
+ NCCLCHECK(bootstrapCloseListen(tmpListenComm));
+
+ return ncclSuccess;
+}
+
+ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
+ struct extState* state = (struct extState*)commState;
+ char* data = (char*)allData;
+ struct bootstrapOp bop;
+
+ bop.op = BOOTSTRAP_ALLGATHER;
+ bop.size = size;
+
+ if (!state->rank) {
+ NCCLCHECK(bootstrapSend(state->extSendComm, &bop, sizeof(struct bootstrapOp)));
+ }
+
+ NCCLCHECK(bootstrapSend(state->extSendComm, data+state->rank*size, size));
+ NCCLCHECK(bootstrapRecv(state->extRecvComm, data, size*state->nranks));
+
+ return ncclSuccess;
+}
+
+ncclResult_t bootstrapRingExchange(void* commState, void* prevNextData, int prev, int next, int size) {
+ struct extState* state = (struct extState*)commState;
+ char* mydata = (char*)prevNextData;
+ int prev_offset = prev*2*size+size, next_offset = next*2*size;
+
+ struct bootstrapOp bop;
+ bop.op = BOOTSTRAP_RINGEXCHANGE;
+ bop.size = size;
+
+ if (!state->rank) {
+ NCCLCHECK(bootstrapSend(state->extSendComm, &bop, sizeof(struct bootstrapOp)));
+ }
+
+ // Send data to root
+ NCCLCHECK(bootstrapSend(state->extSendComm, mydata, 2*size));
+
+ // Request prev and next data from the root and receive them
+ NCCLCHECK(bootstrapSend(state->extSendComm, &prev_offset, sizeof(int)));
+ NCCLCHECK(bootstrapRecv(state->extRecvComm, mydata, size));
+ NCCLCHECK(bootstrapSend(state->extSendComm, &next_offset, sizeof(int)));
+ NCCLCHECK(bootstrapRecv(state->extRecvComm, mydata+size, size));
+
+
+ return ncclSuccess;
+}
+
+ncclResult_t bootstrapClose(void* commState) {
+ struct extState* state = (struct extState*)commState;
+ struct bootstrapOp bop;
+ bop.size = -1;
+
+ if (!state->rank) {
+ NCCLCHECK(bootstrapSend(state->extSendComm, &bop, sizeof(struct bootstrapOp)));
+ }
+
+ NCCLCHECK(bootstrapCloseSend(state->extSendComm));
+ NCCLCHECK(bootstrapCloseRecv(state->extRecvComm));
+
+ free(state);
+
+ return ncclSuccess;
+}
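
For orientation, a hedged usage sketch of the bootstrap API defined above. It assumes the usual NCCL flow in which one process obtains the ncclUniqueId (bootstrapGetUniqueId) and the application distributes it to all ranks out of band; the slot size and payload are hypothetical, only the bootstrap* signatures come from this file:

    #include <stdio.h>
    #include <stdlib.h>
    /* nccl.h, core.h and bootstrap.h are assumed to be on the include path, providing
     * ncclUniqueId, ncclResult_t, NCCLCHECK and the bootstrap* functions above. */

    ncclResult_t exampleBootstrapExchange(ncclUniqueId* id, int rank, int nranks) {
      void* state;
      NCCLCHECK(bootstrapInit(id, rank, nranks, &state));

      const int slotSize = 64;                            /* hypothetical per-rank payload size */
      char* all = (char*)malloc(nranks * slotSize);
      snprintf(all + rank * slotSize, slotSize, "rank %d ready", rank);

      /* Each rank contributes the slot at all + rank*slotSize; after the call,
       * every rank holds all nranks slots in rank order. */
      NCCLCHECK(bootstrapAllGather(state, all, slotSize));

      free(all);
      NCCLCHECK(bootstrapClose(state));
      return ncclSuccess;
    }
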
diff --git a/src/broadcast.cu b/src/broadcast.cu
deleted file mode 100644
index 3a7cb11..0000000
--- a/src/broadcast.cu
+++ /dev/null
@@ -1,164 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "primitives.h"
-
-#define NUM_SUBSTEPS 4
-#define NUM_BUFCHUNKS 2
-
-// Increase Step and boffset for buffer sync
-#define NEXT_STEP \
- step++; \
- boffset += sliceSize; \
- if (boffset == buffSize) boffset = 0;
-
-#define ALIGN_SIZE(size, align) \
- size = ((size + (align) - 1) / (align)) * (align);
-
-template<int THREADS, int UNROLL, class FUNC, typename T>
-__launch_bounds__(THREADS+WARP_SIZE, 1)
-__global__ void BroadcastKernel(const KernelArgs<T> args) {
- const int tid = threadIdx.x;
- __shared__ T* sharedNextOutput;
- __shared__ DevRing<T> ring;
- bool pushrecv = args.pushrecv;
-
- LoadRing<THREADS>(args.ring, &ring);
- __syncthreads();
-
- if (tid == 0) {
- WaitFlag prevCommOp(ring.prevOpCounter, 0);
- WaitFlag nextCommOp(ring.nextOpCounter, 0);
- prevCommOp.wait(args.opIndex);
- nextCommOp.wait(args.opIndex);
- if (pushrecv) {
- *ring.sendPtrToPrev = (T*)args.ThisOutput;
- Wait([=] {
- return *ring.recvPtrFromNext != nullptr;
- });
- sharedNextOutput = *ring.recvPtrFromNext;
- *ring.recvPtrFromNext = nullptr;
- }
- }
- __syncthreads();
-
- WaitFlag waitDoneFromNext(ring.recvFlagFromNext, (1-NUM_BUFCHUNKS)*NUM_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, 0);
- PostFlag postDoneToPrev(ring.sendFlagToPrev, 0);
- PostFlag postReadyToNext(ring.sendFlagToNext, 0);
-
- typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T> Prims;
-
- const int size = args.N;
- const int rank = ring.userRank[0];
- const int nextRank = ring.userRank[1];
- const int root = args.root;
- const int buffSize = args.buffSize / sizeof(T);
- const int sliceSize = buffSize / NUM_BUFCHUNKS;
-
- int step = 0;
- int boffset = 0;
-
- // Compute pointers
- const T * __restrict__ thisInput = args.ThisInput;
- T * __restrict__ thisOutput = args.ThisOutput;
- T * __restrict__ prevInput = ring.recvBuffer;
- T * __restrict__ nextOutput = ring.sendBuffer;
-
- for (int offset = 0; offset < size; offset += sliceSize) {
- int maxOffset = size-offset;
- if (rank == root) {
- Prims::Copy(
- thisInput + offset,
- pushrecv ? sharedNextOutput + offset : nextOutput + boffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
- } else if (nextRank == root) {
- if (pushrecv) maxOffset = 0; // Only wait for signals
- Prims::Copy(
- prevInput + boffset,
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
- } else {
- if (pushrecv) {
- Prims::Copy(
- thisOutput + offset,
- sharedNextOutput + offset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
- } else {
- Prims::DoubleCopy(
- prevInput + boffset,
- thisOutput + offset,
- nextOutput + boffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
- }
- }
- NEXT_STEP; // Increases step, boffset
- }
-
- // wait for the last data to be pushed to us
- if (tid == 0) {
- if (nextRank != root) {
- // Wait for last update from next then reset the flag
- waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
- *ring.recvFlagFromNext = 0;
- }
-
- if (rank != root) {
- // reset the flag
- *ring.recvFlagFromPrev = 0;
- }
-
- incrementOpCounter(&args);
- }
-}
-
-#define THREADS 256
-#define UNROLL 8
-
-template<class FUNC, typename T>
-ncclResult_t RingBroadcast(void* buff, const int count, const int root,
- ncclComm* comm, cudaStream_t stream) {
- if (comm->nRanks != 1) {
- KernelArgs<T> args;
- ArgsSetup(&args, buff, buff, root, count, comm);
- LAUNCH_KERNEL(BroadcastKernel, THREADS, UNROLL, FUNC, T, args, stream);
- }
-
- return ncclSuccess;
-}
-
-template<typename T, template<typename> class RedOp>
-class Broadcast {
- public:
- static ncclResult_t entry(const void* sendbuff, void* recvbuff,
- int count, int root, ncclComm* comm, cudaStream_t stream) {
- return RingBroadcast<RedOp<T>, T>(recvbuff, count, root, comm, stream);
- }
-};
-
-NCCL_API(ncclResult_t, ncclBcast, void* buff, int count, ncclDataType_t datatype, int root,
- ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
- ncclComm_t comm, cudaStream_t stream) {
- NCCLCHECK(ArgsCheck(buff, buff, count, datatype, ncclSum, root, comm, "Bcast"));
- return enqueue<Broadcast, FuncNull>(nullptr, buff, count, datatype, root, comm, stream);
-}
-
diff --git a/src/collectives/all_gather.cu b/src/collectives/all_gather.cu
new file mode 100644
index 0000000..fab262d
--- /dev/null
+++ b/src/collectives/all_gather.cu
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "common_coll.h"
+#include "enqueue.h"
+#include "collectives.h"
+
+ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+ size_t nbytes = count*ncclTypeSize(datatype);
+ INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, comm, comm->nRanks, stream);
+ if (comm->nRanks == 1) {
+ if (sendbuff != recvbuff)
+ CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
+ } else {
+ NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
+ NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes*comm->nRanks, 1));
+ }
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
+ return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype,
+ ncclSum, 0, comm, stream);
+}
diff --git a/src/collectives/all_reduce.cu b/src/collectives/all_reduce.cu
new file mode 100644
index 0000000..cca9886
--- /dev/null
+++ b/src/collectives/all_reduce.cu
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "common_coll.h"
+#include "enqueue.h"
+#include "collectives.h"
+
+ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+ size_t nbytes = count*ncclTypeSize(datatype);
+ INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, comm, comm->nRanks, stream);
+ if (comm->nRanks == 1) {
+ if (sendbuff != recvbuff)
+ CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
+ } else {
+ NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, nbytes, proxyPatternRing, comm));
+ NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, comm->nRanks));
+ }
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
+ return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype,
+ op, 0, comm, stream);
+}
diff --git a/src/collectives/broadcast.cu b/src/collectives/broadcast.cu
new file mode 100644
index 0000000..fe079b0
--- /dev/null
+++ b/src/collectives/broadcast.cu
@@ -0,0 +1,42 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "common_coll.h"
+#include "enqueue.h"
+#include "collectives.h"
+
+ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
+ ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+ size_t nbytes = count*ncclTypeSize(datatype);
+ INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, root, comm, comm->nRanks, stream);
+ if (comm->nRanks == 1) {
+ if (sendbuff != recvbuff)
+ CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
+ } else {
+ NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, nbytes, proxyPatternFrom(root), comm));
+ NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes, 1));
+ }
+
+ return ncclSuccess;
+}
+
+/* Deprecated original "in place" function, similar to MPI */
+NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream) {
+ return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype,
+ ncclSum, root, comm, stream);
+}
+
+NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream) {
+ return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype,
+ ncclSum, root, comm, stream);
+}
diff --git a/src/collectives/collectives.h b/src/collectives/collectives.h
new file mode 100644
index 0000000..4a5cb7a
--- /dev/null
+++ b/src/collectives/collectives.h
@@ -0,0 +1,66 @@
+/*************************************************************************
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_COLLECTIVES_H_
+#define NCCL_COLLECTIVES_H_
+
+typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
+
+#define FUNC_INDEX(coll, redop, dtype, ll) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
+
+#define NCCL_COLL_NAME(coll, op, dtype) \
+ coll##_##op##_##dtype
+
+#define NCCL_KERN_NAME(coll, op, dtype) \
+ coll##Kernel_##op##_##dtype
+
+/* Declare all collective operations */
+#define DECL_COLL4(coll, op, dtype) \
+ extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
+ extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll); \
+
+#define DECL_COLL3(coll, op, dtype) \
+ DECL_COLL4(coll##LL, op, dtype) \
+ DECL_COLL4(coll, op, dtype)
+
+#define DECL_COLL2(coll, op) \
+ DECL_COLL3(coll, op, i8) \
+ DECL_COLL3(coll, op, u8) \
+ DECL_COLL3(coll, op, i32) \
+ DECL_COLL3(coll, op, u32) \
+ DECL_COLL3(coll, op, i64) \
+ DECL_COLL3(coll, op, u64) \
+ DECL_COLL3(coll, op, f16) \
+ DECL_COLL3(coll, op, f32) \
+ DECL_COLL3(coll, op, f64)
+
+#define DECL_COLL(coll) \
+ DECL_COLL2(coll, sum) \
+ DECL_COLL2(coll, prod) \
+ DECL_COLL2(coll, min) \
+ DECL_COLL2(coll, max)
+
+#define DECL_ALL_COLLS \
+ DECL_COLL2(ncclBroadcast, copy) \
+ DECL_COLL(ncclReduce) \
+ DECL_COLL2(ncclAllGather, copy) \
+ DECL_COLL(ncclReduceScatter) \
+ DECL_COLL(ncclAllReduce) \
+
+DECL_ALL_COLLS
+
+#define ALLREDUCE_SUBSTEPS 2
+#define ALLREDUCE_BUFCHUNKS 2
+#define ALLGATHER_SUBSTEPS 2
+#define ALLGATHER_BUFCHUNKS 2
+#define REDUCESCATTER_SUBSTEPS 2
+#define REDUCESCATTER_BUFCHUNKS 2
+#define BROADCAST_SUBSTEPS 8
+#define BROADCAST_BUFCHUNKS 2
+#define REDUCE_SUBSTEPS 8
+#define REDUCE_BUFCHUNKS 2
+
+#endif
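
A small worked example of the FUNC_INDEX flattening above. The exact values of ncclNumOps and ncclNumTypes are not defined in this header; 4 ops and 9 types are assumed here because that is what the DECL_COLL/DECL_COLL2 lists enumerate, and the enum positions used below follow the same ordering:

    #include <stdio.h>

    enum { kNumOps = 4, kNumTypes = 9 };   /* assumption, mirrors the DECL_* lists above */

    /* Same flattening as FUNC_INDEX(coll, redop, dtype, ll). */
    static int funcIndex(int coll, int redop, int dtype, int ll) {
      return ((coll * kNumOps + redop) * kNumTypes + dtype) * 2 + ll;
    }

    int main(void) {
      /* e.g. ncclCollAllReduce (4), sum (0), f32 (7 in the list above), LL variant (1) */
      printf("%d\n", funcIndex(4, 0, 7, 1));   /* ((4*4+0)*9+7)*2+1 = 303 */
      return 0;
    }
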
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
new file mode 100644
index 0000000..ccea8f5
--- /dev/null
+++ b/src/collectives/device/Makefile
@@ -0,0 +1,86 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../../makefiles/common.mk
+include ../../../makefiles/version.mk
+
+BUILDDIR ?= $(abspath ../../../build)
+OBJDIR := $(BUILDDIR)/obj/collectives/device
+
+LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
+
+LIBOBJ := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \
+ $(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \
+ $(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \
+ $(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \
+ $(OBJDIR)/functions.o
+
+LIBSRCFILES += functions.cu
+
+DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
+DEPENDFILES := $(DEPFILES:%.d=%.dep)
+STATICLIB := $(OBJDIR)/colldevice.a
+DEVOBJ := $(OBJDIR)/devlink.o
+
+NVCUFLAGS += -I. -I.. -I../.. -I../../include --compiler-options "-fPIC -fvisibility=hidden"
+
+
+all: $(STATICLIB)
+
+# Dummy rule so that the extra dependency (%.dep) files are preserved by make
+all_deps: $(DEPENDFILES)
+
+-include $(DEPFILES)
+
+$(STATICLIB): $(LIBOBJ) $(DEVOBJ)
+ @printf "Archiving %-35s > %s\n" objects $@
+ ar cr $@ $^
+
+# We do not want make to rebuild the *.d files when running 'make clean'.
+# Therefore we only provide rules for the .dep files; each such rule produces
+# both the .dep and the .d file. Only the .d files are included, while the
+# .dep files keep track of what needs to be regenerated.
+$(OBJDIR)/%.dep : %.cu
+ @mkdir -p $(OBJDIR)
+ @$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp
+ @sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@
+ @sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \
+ sed -e 's/^ *//' -e 's/$$/:/' >> $@
+ @rm -f $@.tmp
+ @cp $@ $(@:.dep=.d)
+
+# Compiled kernels and collectives with relocatable device code ...
+$(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
+ @printf "Compiling %-35s > %s\n" $< $@
+ mkdir -p `dirname $@`
+ $(NVCC) $(NVCUFLAGS) -dc $< -o $@
+
+$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep
+ @printf "Compiling %-35s > %s\n" $< $@
+ mkdir -p `dirname $@`
+ $(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@
+
+$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep
+ @printf "Compiling %-35s > %s\n" $< $@
+ mkdir -p `dirname $@`
+ $(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@
+
+$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep
+ @printf "Compiling %-35s > %s\n" $< $@
+ mkdir -p `dirname $@`
+ $(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@
+
+$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep
+ @printf "Compiling %-35s > %s\n" $< $@
+ mkdir -p `dirname $@`
+ $(NVCC) -DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@
+
+# ... and create the device-side linked object with all those.
+$(DEVOBJ) : $(LIBOBJ)
+ $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
+
+clean:
+ rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(STATICLIB) test
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu
new file mode 100644
index 0000000..0f572ce
--- /dev/null
+++ b/src/collectives/device/all_gather.cu
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "common.h"
+#include "all_gather.h"
+#include "collectives.h"
+
+#define UNROLL 4
+
+#if NCCL_OP == 0
+IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
+#endif
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
new file mode 100644
index 0000000..a30e575
--- /dev/null
+++ b/src/collectives/device/all_gather.h
@@ -0,0 +1,269 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "primitives.h"
+#include "collectives.h"
+
+// Increase Step and poffset/noffset for buffer sync
+#define NEXT_STEP \
+ step++; \
+ poffset = noffset; \
+ noffset += sliceSize; \
+ if (noffset == buffSize) noffset = 0;
+
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = blockDim.x - 1;
+ const int bid = args->bid;
+ __shared__ T* sharedNextOutput;
+ struct ncclComm* comm = args->comm;
+ struct ncclRing* ring = comm->rings+blockIdx.x;
+ int prevdirect = ring->recv.conn.direct;
+ int nextdirect = ring->send.conn.direct;
+
+ WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
+ WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
+ PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
+ PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
+
+ typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
+
+ const ssize_t size = args->N;
+ const int nranks = comm->nRanks;
+ const int buffSize = ring->buffSize / sizeof(T);
+ const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS;
+ const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+
+ if (tid == 0) {
+ // Update in case we skipped some collectives
+ *ring->recv.conn.opCount = args->opCount;
+ // Wait for next to be ready
+ WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
+ waitOpCountNext.wait(args->opCount);
+ if (prevdirect) {
+ *ring->recv.conn.ptrExchange = args->ThisOutput;
+ }
+ if (nextdirect) {
+ void* volatile* ptr = &(ring->devMemSend->ptrExchange);
+ while (*ptr == nullptr);
+ sharedNextOutput = (T*)*ptr;
+ *ptr = nullptr;
+ }
+ }
+ __syncthreads();
+
+ uint64_t step = 0ULL;
+ int poffset, noffset = 0;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
+ T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
+ ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+ /////////////// begin AllGather steps ///////////////
+ ssize_t offset;
+ int maxOffset = min(chunkSize, size-chunkOffset);
+ int rankDest;
+
+ // step 0: push data to next GPU
+ rankDest = ring->devUserRanks[0];
+ offset = chunkOffset + rankDest * size;
+
+ if (thisInput + chunkOffset == thisOutput + offset) { // In place
+ Prims::Copy(tid, nthreads,
+ thisInput + chunkOffset,
+ nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext,
+ postReadyToNext);
+ } else {
+ Prims::DoubleCopy(tid, nthreads,
+ thisInput + chunkOffset,
+ thisOutput + offset,
+ nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext,
+ postReadyToNext);
+ }
+
+ NEXT_STEP; // Increases step, poffset, noffset
+
+ // k-2 steps: copy to next GPU
+ if (prevdirect) {
+ for (int j=1; j<nranks-1; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + rankDest * size;
+
+ Prims::Copy(tid, nthreads,
+ thisOutput + offset,
+ nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext, waitReadyFromPrev,
+ postReadyToNext, postDoneToPrev);
+
+ NEXT_STEP;
+ }
+ Prims::Copy(tid, nthreads,
+ NULL,
+ NULL,
+ 0, 0,
+ step,
+ waitReadyFromPrev,
+ postDoneToPrev);
+ } else {
+ for (int j=1; j<nranks-1; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + rankDest * size;
+
+ Prims::DoubleCopy(tid, nthreads,
+ prevInput + poffset,
+ thisOutput + offset,
+ nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext, waitReadyFromPrev,
+ postReadyToNext, postDoneToPrev);
+
+ NEXT_STEP;
+ }
+
+ // Make final copy from buffer to dest.
+ rankDest = ring->devUserRanks[1];
+ offset = chunkOffset + rankDest * size;
+
+ // Here we need to copy from buffer to this output.
+ Prims::Copy(tid, nthreads,
+ prevInput + poffset,
+ thisOutput + offset,
+ sliceSize, maxOffset,
+ step,
+ waitReadyFromPrev,
+ postDoneToPrev);
+ }
+ }
+
+ if (tid == 0) {
+ waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS));
+ *ring->send.conn.head = 0ULL;
+ *ring->recv.conn.tail = 0ULL;
+ __threadfence_system();
+ *ring->recv.conn.opCount = args->opCount+1;
+ }
+}
+
+#include "ll_kernel.h"
+
+#define NEXT_STEP_LL \
+ poffset = noffset; \
+ pflag = nflag; \
+ noffset += NCCL_LL_SLICE_LINES; \
+ if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
+ nflag++; \
+ step++;
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int bid = args->bid;
+ const int llNthreads = args->nThreads;
+ struct ncclComm* comm = args->comm;
+ struct ncclRing* ring = comm->rings+blockIdx.x;
+ volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
+ volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
+ volatile int * sizesFifo = ring->send.conn.llFifo;
+ uint64_t sendHead = sendHeadPtr[0];
+
+ typedef LLPrimitives<T, FUNC> LL;
+
+ const ssize_t size = args->N;
+ //const int rank = comm->rank;
+ const int nranks = comm->nRanks;
+ ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nRings*chunkSize;
+
+ uint64_t step = ring->send.conn.llStep;
+ uint32_t pflag, nflag = step + 1;
+ int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
+ union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ if (size-gridOffset < loopSize) {
+ chunkSize = args->lastChunkSize;
+ }
+ ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+ /////////////// begin AllGather steps ///////////////
+ ssize_t offset;
+ int maxOffset = min(chunkSize, size-chunkOffset);
+ int rankDest;
+
+ // step 0: push data to next GPU
+ rankDest = ring->devUserRanks[0];
+ offset = chunkOffset + rankDest * size;
+
+ WAIT_NEXT;
+ if (thisInput + chunkOffset == thisOutput + offset) { // In place
+ LL::ReduceCopy(
+ thisInput + chunkOffset,
+ nextOutput + noffset,
+ maxOffset, nflag, llNthreads);
+ } else {
+ LL::ReduceCopy(
+ thisInput + chunkOffset,
+ thisOutput + offset,
+ nextOutput + noffset,
+ maxOffset, nflag, llNthreads);
+ }
+ POST_SIZE;
+
+ NEXT_STEP_LL;
+
+ // k-2 steps: copy to next GPU
+ for (int j=1; j<nranks-1; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + rankDest * size;
+
+ WAIT_NEXT;
+ LL::ReduceCopy(
+ prevInput + poffset,
+ thisOutput + offset,
+ nextOutput + noffset,
+ maxOffset, pflag, nflag, llNthreads);
+ POST_SIZE;
+ ACK_PREV;
+
+ NEXT_STEP_LL;
+ }
+
+ // step k-1: final store
+ rankDest = ring->devUserRanks[1];
+ offset = chunkOffset + rankDest * size;
+
+ LL::ReduceCopy(
+ prevInput + poffset,
+ thisOutput + offset,
+ maxOffset, pflag, llNthreads);
+ ACK_PREV;
+ }
+
+ FIFO_CLEANING_AND_SAVE_STEP(nflag);
+}
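
One detail worth illustrating from the kernel above: the NEXT_STEP macro ping-pongs the staging offset between the two buffer chunks (ALLGATHER_BUFCHUNKS == 2), with poffset trailing noffset by one step. A tiny host-side simulation with illustrative sizes:

    #include <stdio.h>

    int main(void) {
      int buffSize = 8;                 /* illustrative element count of the staging buffer */
      int sliceSize = buffSize / 2;     /* two buffer chunks, as with ALLGATHER_BUFCHUNKS */
      int step = 0, poffset = 0, noffset = 0;
      for (int i = 0; i < 5; ++i) {
        /* body of NEXT_STEP */
        step++;
        poffset = noffset;
        noffset += sliceSize;
        if (noffset == buffSize) noffset = 0;
        printf("after step %d: poffset=%d noffset=%d\n", step, poffset, noffset);
      }
      return 0;                         /* offsets alternate between 0 and 4 */
    }
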
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
new file mode 100644
index 0000000..caa1479
--- /dev/null
+++ b/src/collectives/device/all_reduce.cu
@@ -0,0 +1,21 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "common.h"
+#include "all_reduce.h"
+#include "collectives.h"
+
+#define UNROLL 4
+
+#if NCCL_OP == 0
+IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum);
+#elif NCCL_OP == 1
+IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
+#elif NCCL_OP == 2
+IMPL_COLL2(ncclAllReduce, min, FuncMin, ncclCollAllReduce, ncclMin);
+#elif NCCL_OP == 3
+IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax);
+#endif
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
new file mode 100644
index 0000000..d7abc64
--- /dev/null
+++ b/src/collectives/device/all_reduce.h
@@ -0,0 +1,332 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "primitives.h"
+#include "collectives.h"
+
+// Increase Step and poffset/noffset for buffer sync
+#define NEXT_STEP \
+ step++; \
+ poffset = noffset; \
+ noffset += sliceSize; \
+ if (noffset == buffSize) noffset = 0;
+
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = blockDim.x - 1;
+ const int bid = args->bid;
+ __shared__ T* sharedNextOutput;
+ struct ncclComm* comm = args->comm;
+ struct ncclRing* ring = comm->rings+blockIdx.x;
+ int prevdirect = ring->recv.conn.direct;
+ int nextdirect = ring->send.conn.direct;
+
+ WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
+ WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
+ PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
+ PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
+
+ typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
+
+ const ssize_t size = args->N;
+ //const int rank = comm->rank;
+ const int nranks = comm->nRanks;
+ const int buffSize = ring->buffSize / sizeof(T);
+ const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS;
+ const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+
+ if (tid == 0) {
+ // Update in case we skipped some collectives
+ *ring->recv.conn.opCount = args->opCount;
+ // Wait for next to be ready
+ WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
+ waitOpCountNext.wait(args->opCount);
+ if (prevdirect) {
+ *ring->recv.conn.ptrExchange = args->ThisOutput;
+ }
+ if (nextdirect) {
+ void* volatile* ptr = &(ring->devMemSend->ptrExchange);
+ while (*ptr == nullptr);
+ sharedNextOutput = (T*)*ptr;
+ *ptr = nullptr;
+ }
+ }
+ __syncthreads();
+
+ uint64_t step = 0ULL;
+ int poffset, noffset = 0;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
+ T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
+ int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings));
+ ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
+
+ /////////////// begin AllReduce steps ///////////////
+ ssize_t offset;
+ int maxOffset;
+ int slice;
+
+ // step 0: push data to next GPU
+ slice = ring->devUserRanks[nranks-1];
+ offset = chunkOffset + slice * chunkSize;
+ maxOffset = min(chunkSize, size-offset);
+
+ Prims::Copy(tid, nthreads,
+ thisInput + offset,
+ nextOutput + noffset,
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext,
+ postReadyToNext);
+
+ NEXT_STEP; // Increases step, poffset, noffset
+
+ // k-2 steps: reduce and copy to next GPU
+ for (int j=2; j<nranks; ++j) {
+ slice = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + slice * chunkSize;
+ maxOffset = min(chunkSize, size-offset);
+
+ Prims::Reduce(tid, nthreads,
+ prevInput + poffset,
+ thisInput + offset,
+ nextOutput + noffset,
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext, waitReadyFromPrev,
+ postReadyToNext, postDoneToPrev);
+
+ NEXT_STEP;
+ }
+
+ // step k-1: reduce this buffer and data, which will produce the final
+ // result that we store in this data and push to the next GPU
+ slice = ring->devUserRanks[0];
+ offset = chunkOffset + slice * chunkSize;
+ maxOffset = min(chunkSize, size-offset);
+
+ Prims::ReduceCopy(tid, nthreads,
+ prevInput + poffset,
+ thisInput + offset,
+ nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
+ thisOutput + offset,
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext, waitReadyFromPrev,
+ postReadyToNext, postDoneToPrev);
+
+ NEXT_STEP;
+
+ // k-2 steps: copy to next GPU
+ if (prevdirect) {
+ for (int j=1; j<nranks-1; ++j) {
+ slice = ring->devUserRanks[nranks - j];
+ offset = chunkOffset + slice * chunkSize;
+ maxOffset = min(chunkSize, size-offset);
+
+ Prims::Copy(tid, nthreads,
+ thisOutput + offset,
+ nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext, waitReadyFromPrev,
+ postReadyToNext, postDoneToPrev);
+
+ NEXT_STEP;
+ }
+ Prims::Copy(tid, nthreads,
+ NULL,
+ NULL,
+ 0, 0,
+ step,
+ waitReadyFromPrev,
+ postDoneToPrev);
+ } else {
+ for (int j=1; j<nranks-1; ++j) {
+ slice = ring->devUserRanks[nranks - j];
+ offset = chunkOffset + slice * chunkSize;
+ maxOffset = min(chunkSize, size-offset);
+
+ Prims::DoubleCopy(tid, nthreads,
+ prevInput + poffset,
+ thisOutput + offset,
+ nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext, waitReadyFromPrev,
+ postReadyToNext, postDoneToPrev);
+
+ NEXT_STEP;
+ }
+
+ // Make final copy from buffer to dest.
+ slice = ring->devUserRanks[1];
+ offset = chunkOffset + slice * chunkSize;
+ maxOffset = min(chunkSize, size-offset);
+
+ // Here we need to copy from buffer to this output.
+ Prims::Copy(tid, nthreads,
+ prevInput + poffset,
+ thisOutput + offset,
+ sliceSize, maxOffset,
+ step,
+ waitReadyFromPrev,
+ postDoneToPrev);
+ }
+ }
+
+ if (tid == 0) {
+ // Wait for next to have consumed all data before we reset the flag
+ waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS));
+ *ring->send.conn.head = 0ULL;
+ *ring->recv.conn.tail = 0ULL;
+ __threadfence_system();
+ *ring->recv.conn.opCount = args->opCount+1;
+ }
+}
+
+#include "ll_kernel.h"
+
+#define NEXT_STEP_LL \
+ poffset = noffset; \
+ pflag = nflag; \
+ noffset += NCCL_LL_SLICE_LINES; \
+ if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
+ nflag++; \
+ step++;
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int bid = args->bid;
+ const int llNthreads = args->nThreads;
+ struct ncclComm* comm = args->comm;
+ struct ncclRing* ring = comm->rings+blockIdx.x;
+ volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
+ volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
+ volatile int * sizesFifo = ring->send.conn.llFifo;
+ uint64_t sendHead = sendHeadPtr[0];
+
+ typedef LLPrimitives<T, FUNC> LL;
+
+ const ssize_t size = args->N;
+ //const int rank = comm->rank;
+ const int nranks = comm->nRanks;
+ ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nRings*nranks*chunkSize;
+
+ uint64_t step = ring->send.conn.llStep;
+ uint32_t pflag, nflag = step + 1;
+ int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
+ union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ if (size-gridOffset < loopSize) {
+ chunkSize = args->lastChunkSize;
+ }
+ ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
+
+ /////////////// begin AllReduce steps ///////////////
+ ssize_t offset;
+ int maxOffset;
+ int slice;
+
+ // step 0: push data to next GPU
+ slice = ring->devUserRanks[nranks-1];
+ offset = chunkOffset + slice * chunkSize;
+ maxOffset = min(chunkSize, size-offset);
+
+ WAIT_NEXT;
+ LL::ReduceCopy(
+ thisInput + offset,
+ nextOutput + noffset,
+ maxOffset, nflag, llNthreads);
+ POST_SIZE;
+
+ NEXT_STEP_LL;
+
+ // k-2 steps: reduce and copy to next GPU
+ for (int j=2; j<nranks; ++j) {
+ slice = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + slice * chunkSize;
+ maxOffset = min(chunkSize, size-offset);
+
+ WAIT_NEXT;
+ LL::ReduceCopy(
+ thisInput + offset,
+ prevInput + poffset,
+ nextOutput + noffset,
+ maxOffset, pflag, nflag, llNthreads);
+ POST_SIZE;
+ ACK_PREV;
+
+ NEXT_STEP_LL;
+ }
+
+    // step k-1: reduce the buffer with this rank's data, producing the final
+    // result, which we store in this rank's output and push to the next GPU
+ slice = ring->devUserRanks[0];
+ offset = chunkOffset + slice * chunkSize;
+ maxOffset = min(chunkSize, size-offset);
+
+ WAIT_NEXT;
+ LL::ReduceCopy(
+ thisInput + offset,
+ prevInput + poffset,
+ thisOutput + offset,
+ nextOutput + noffset,
+ maxOffset, pflag, nflag, llNthreads);
+ POST_SIZE;
+ ACK_PREV;
+
+ NEXT_STEP_LL;
+
+ // k-2 steps: copy to next GPU
+ for (int j=1; j<nranks-1; ++j) {
+ slice = ring->devUserRanks[nranks - j];
+ offset = chunkOffset + slice * chunkSize;
+ maxOffset = min(chunkSize, size-offset);
+
+ WAIT_NEXT;
+ LL::ReduceCopy(
+ prevInput + poffset,
+ thisOutput + offset,
+ nextOutput + noffset,
+ maxOffset, pflag, nflag, llNthreads);
+ POST_SIZE;
+ ACK_PREV;
+
+ NEXT_STEP_LL;
+ }
+
+ // Make final copy from buffer to dest.
+ slice = ring->devUserRanks[1];
+ offset = chunkOffset + slice * chunkSize;
+ maxOffset = min(chunkSize, size-offset);
+
+ // Here we need to copy from buffer to this output.
+ LL::ReduceCopy(
+ prevInput + poffset,
+ thisOutput + offset,
+ maxOffset, pflag, llNthreads);
+ ACK_PREV;
+ }
+
+ FIFO_CLEANING_AND_SAVE_STEP(nflag);
+}
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu
new file mode 100644
index 0000000..4125de4
--- /dev/null
+++ b/src/collectives/device/broadcast.cu
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "common.h"
+#include "broadcast.h"
+#include "collectives.h"
+
+#define UNROLL 4
+
+#if NCCL_OP == 0
+IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
+#endif
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
new file mode 100644
index 0000000..c2f6d00
--- /dev/null
+++ b/src/collectives/device/broadcast.h
@@ -0,0 +1,228 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "primitives.h"
+#include "collectives.h"
+
+// Increase Step and boffset for buffer sync
+#define NEXT_STEP \
+ step++; \
+ boffset += sliceSize; \
+ if (boffset == buffSize) boffset = 0;
+
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = blockDim.x - 1;
+ const int bid = args->bid;
+ __shared__ T* sharedNextOutput;
+ struct ncclComm* comm = args->comm;
+ struct ncclRing* ring = comm->rings+blockIdx.x;
+ int prevdirect = ring->recv.conn.direct;
+ int nextdirect = ring->send.conn.direct;
+
+ WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
+ WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
+ PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
+ PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS);
+
+ typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;
+
+ const ssize_t size = args->N;
+ const int buffSize = ring->buffSize / sizeof(T);
+ const int sliceSize = buffSize / BROADCAST_BUFCHUNKS;
+ const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+ const int rank = ring->devUserRanks[0];
+ const int nextRank = ring->devUserRanks[1];
+ const int root = args->root;
+
+ if (tid == 0) {
+ // Update in case we skipped some collectives
+ *ring->recv.conn.opCount = args->opCount;
+ if (nextRank != root) {
+ // Wait for next to be ready
+ WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
+ waitOpCountNext.wait(args->opCount);
+ }
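+  // In direct mode, peers exchange output pointers through ptrExchange so the
+  // previous rank can write straight into this rank's output buffer.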
+ if (rank != root && prevdirect) {
+ *ring->recv.conn.ptrExchange = args->ThisOutput;
+ }
+ if (nextRank != root && nextdirect) {
+ void* volatile* ptr = &(ring->devMemSend->ptrExchange);
+ while (*ptr == nullptr);
+ sharedNextOutput = (T*)*ptr;
+ *ptr = nullptr;
+ }
+ }
+ __syncthreads();
+
+ uint64_t step = 0ULL;
+ int boffset = 0;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
+ T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
+ ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int maxOffset = min(chunkSize, size-offset);
+
+ if (rank == root) {
+ if (thisInput == thisOutput) {
+ Prims::Copy(tid, nthreads,
+ thisInput + offset,
+ nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext,
+ postReadyToNext);
+ } else {
+ Prims::DoubleCopy(tid, nthreads,
+ thisInput + offset,
+ thisOutput + offset,
+ nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext,
+ postReadyToNext);
+ }
+ } else if (nextRank == root) {
+ if (prevdirect) maxOffset = 0; // Only wait for signals
+ Prims::Copy(tid, nthreads,
+ prevInput + boffset,
+ thisOutput + offset,
+ sliceSize, maxOffset,
+ step,
+ waitReadyFromPrev,
+ postDoneToPrev);
+ } else {
+ if (prevdirect) {
+ Prims::Copy(tid, nthreads,
+ thisOutput + offset,
+ nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext, waitReadyFromPrev,
+ postReadyToNext, postDoneToPrev);
+ } else {
+ Prims::DoubleCopy(tid, nthreads,
+ prevInput + boffset,
+ thisOutput + offset,
+ nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext, waitReadyFromPrev,
+ postReadyToNext, postDoneToPrev);
+ }
+ }
+ NEXT_STEP; // Increases step, boffset
+ }
+
+ if (tid == 0) {
+ if (nextRank != root) {
+ // Wait for next to have consumed data before resetting the flag
+ waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1));
+ *ring->send.conn.head = 0ULL;
+ }
+ *ring->recv.conn.tail = 0ULL;
+ __threadfence_system();
+ *ring->recv.conn.opCount = args->opCount+1;
+ }
+}
+
+#include "ll_kernel.h"
+
+#define NEXT_STEP_LL \
+ boffset += NCCL_LL_SLICE_LINES; \
+ if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
+ flag++; \
+ step++;
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int bid = args->bid;
+ const int llNthreads = args->nThreads;
+ struct ncclComm* comm = args->comm;
+ struct ncclRing* ring = comm->rings+blockIdx.x;
+ volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
+ volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
+ volatile int * sizesFifo = ring->send.conn.llFifo;
+ uint64_t sendHead = sendHeadPtr[0];
+ const int rank = comm->rank;
+ const int nextRank = ring->devUserRanks[1];
+ const int root = args->root;
+
+ typedef LLPrimitives<T, FUNC> LL;
+
+ const ssize_t size = args->N;
+ ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nRings*chunkSize;
+
+ uint64_t step = ring->send.conn.llStep;
+ uint32_t flag = step + 1;
+ int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
+ union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ if (size-gridOffset < loopSize) {
+ chunkSize = args->lastChunkSize;
+ }
+ ssize_t offset = gridOffset + bid*chunkSize;
+
+ int maxOffset = min(chunkSize, size-offset);
+ if (rank == root) {
+ WAIT_NEXT;
+ if (thisInput == thisOutput) {
+ LL::ReduceCopy(
+ thisInput + offset,
+ nextOutput + boffset,
+ maxOffset, flag, llNthreads);
+ } else {
+ LL::ReduceCopy(
+ thisInput + offset,
+ thisOutput + offset,
+ nextOutput + boffset,
+ maxOffset, flag, llNthreads);
+ }
+ POST_SIZE;
+ NEXT_STEP_LL;
+ } else if (nextRank == root) {
+ LL::ReduceCopy(
+ prevInput + boffset,
+ thisOutput + offset,
+ maxOffset, flag, llNthreads);
+ NEXT_STEP_LL;
+ ACK_PREV;
+ } else {
+ WAIT_NEXT;
+ LL::ReduceCopy(
+ prevInput + boffset,
+ thisOutput + offset,
+ nextOutput + boffset,
+ maxOffset, flag, flag, llNthreads);
+ POST_SIZE;
+ NEXT_STEP_LL;
+ ACK_PREV;
+ }
+ }
+
+ // We need everyone to acknowledge data even if they didn't receive anything
+ // so that the next collective can start right away.
+ ACK_PREV;
+
+ FIFO_CLEANING_AND_SAVE_STEP(flag);
+}
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
new file mode 100644
index 0000000..c988913
--- /dev/null
+++ b/src/collectives/device/common.h
@@ -0,0 +1,90 @@
+/*************************************************************************
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_DEVICE_COMMON_H_
+#define NCCL_DEVICE_COMMON_H_
+
+#include "../collectives.h"
+#include "core.h"
+#include "nccl.h"
+
+typedef void(*ncclKern_t)(struct CollectiveArgs* args);
+extern __device__ ncclKern_t ncclFuncs[];
+
+static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
+ int* d = (int*)dst;
+ int* s = (int*)src;
+ __syncthreads();
+ for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
+ __syncthreads();
+}
+static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
+ load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
+ if (tid == 0) hostColl->active = 0;
+}
+
+/* Functions for aggregation case */
+#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
+__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
+ coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(args); \
+}
+/* Kernels with the first operation inlined */
+#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \
+__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
+__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
+ int tid = threadIdx.x; \
+ int bid = blockIdx.x; \
+ __shared__ struct ncclColl localColl; \
+ \
+ struct ncclComm* comm = firstColl.args.comm; \
+ struct ncclRing* ring = comm->rings+bid; \
+ struct ncclColl* c; \
+ if (bid == 0) { \
+    /* To optimize for latency, (only) the first operation is passed as an argument. */ \
+ c = &firstColl; \
+ } else { \
+ c = &localColl; \
+ load_coll(c, ring->devCollectives+ring->collFifoHead, tid); \
+ } \
+ while (1) { \
+ if (tid < c->nThreads) { \
+ if (c->funcIndex == fIndex) { \
+ coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
+ } else { \
+ ncclFuncs[c->funcIndex](&c->args); \
+ } \
+ } \
+ int nextIndex = c->nextIndex; \
+ if (tid == 0) ring->collFifoHead = nextIndex; \
+ \
+ if (c->active == 2) { \
+ return; \
+ } \
+ \
+    /* Load next collective operation */ \
+ c = &localColl; /* for bid 0 */ \
+ load_coll(c, ring->devCollectives+nextIndex, tid); \
+ } \
+}
+
+#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
+ IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \
+ IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \
+ IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
+ IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0)) \
+
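+/* IMPL_COLL3 emits both the LL and regular variants of a collective (a plain
+ * device function plus a kernel with the first operation inlined);
+ * IMPL_COLL2 instantiates it for every supported data type. */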
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
+ IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) \
+ IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) \
+ IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \
+ IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) \
+ IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \
+ IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) \
+ IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \
+ IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64)
+
+#endif
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
new file mode 100644
index 0000000..0eaa061
--- /dev/null
+++ b/src/collectives/device/common_kernel.h
@@ -0,0 +1,372 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_COMMON_KERNEL_H_
+#define NCCL_COMMON_KERNEL_H_
+
+#include "core.h"
+#include <cstdio>
+#include <cstdint>
+
+#include <cuda_runtime.h>
+
+// Define min for ssize_t
+static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
+
+typedef uint64_t PackType;
+
+// unpack x and y to elements of type T and apply FUNC to each element
+template<class FUNC, typename T>
+struct MULTI {
+ __device__ PackType operator()(const PackType x, const PackType y) const;
+};
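+// e.g. MULTI<FuncSum<float>, float> treats each 64-bit PackType as two packed
+// floats and adds them element-wise (see the specializations below).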
+
+template<class FUNC>
+struct MULTI<FUNC, int8_t> {
+ static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
+ "PackType must be twice the size of uint32_t.");
+ union converter {
+ PackType storage;
+ struct {
+ uint32_t a, b;
+ };
+ };
+
+ __device__ PackType operator()(const PackType x, const PackType y) const {
+ converter cx, cy, cr;
+ cx.storage = x;
+ cy.storage = y;
+
+ // for char, we do these as vector ops
+ cr.a = FUNC()(cx.a, cy.a);
+ cr.b = FUNC()(cx.b, cy.b);
+
+ return cr.storage;
+ }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, uint8_t> {
+ static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
+ "PackType must be twice the size of uint32_t.");
+ union converter {
+ PackType storage;
+ struct {
+ uint32_t a, b;
+ };
+ };
+
+ __device__ PackType operator()(const PackType x, const PackType y) const {
+ converter cx, cy, cr;
+ cx.storage = x;
+ cy.storage = y;
+
+ // for char, we do these as vector ops
+ cr.a = FUNC()(cx.a, cy.a);
+ cr.b = FUNC()(cx.b, cy.b);
+
+ return cr.storage;
+ }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, int32_t> {
+ static_assert(sizeof(PackType) == 2 * sizeof(int32_t),
+ "PackType must be twice the size of int.");
+ union converter {
+ PackType storage;
+ struct {
+ int32_t a, b;
+ };
+ };
+
+ __device__ PackType operator()(const PackType x, const PackType y) const {
+ converter cx, cy, cr;
+ cx.storage = x;
+ cy.storage = y;
+
+ cr.a = FUNC()(cx.a, cy.a);
+ cr.b = FUNC()(cx.b, cy.b);
+
+ return cr.storage;
+ }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, uint32_t> {
+ static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
+ "PackType must be twice the size of int.");
+ union converter {
+ PackType storage;
+ struct {
+ uint32_t a, b;
+ };
+ };
+
+ __device__ PackType operator()(const PackType x, const PackType y) const {
+ converter cx, cy, cr;
+ cx.storage = x;
+ cy.storage = y;
+
+ cr.a = FUNC()(cx.a, cy.a);
+ cr.b = FUNC()(cx.b, cy.b);
+
+ return cr.storage;
+ }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, half> {
+ static_assert(sizeof(PackType) == 4 * sizeof(half),
+ "PackType must be four times the size of half.");
+
+ struct PackHalf2 {
+ half2 a, b;
+ };
+
+ __device__ PackType operator()(const PackType x, const PackType y) const {
+ struct PackHalf2 cx, cy, cr;
+ cx = *(reinterpret_cast<const struct PackHalf2*>(&x));
+ cy = *(reinterpret_cast<const struct PackHalf2*>(&y));
+
+ cr.a = FUNC()(cx.a, cy.a);
+ cr.b = FUNC()(cx.b, cy.b);
+
+ return *(reinterpret_cast<PackType*>(&cr));
+ }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, float> {
+ static_assert(sizeof(PackType) == 2 * sizeof(float),
+ "PackType must be twice the size of float.");
+ union converter {
+ PackType storage;
+ struct {
+ float a, b;
+ };
+ };
+
+ __device__ PackType operator()(const PackType x, const PackType y) const {
+ converter cx, cy, cr;
+ cx.storage = x;
+ cy.storage = y;
+
+ cr.a = FUNC()(cx.a, cy.a);
+ cr.b = FUNC()(cx.b, cy.b);
+
+ return cr.storage;
+ }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, double> {
+ static_assert(sizeof(PackType) == sizeof(double),
+ "PackType must be the same size as double.");
+ __device__ PackType operator()(const PackType x, const PackType y) const {
+ double rv = FUNC()(__longlong_as_double(x), __longlong_as_double(y));
+ return __double_as_longlong(rv);
+ }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, uint64_t> {
+ static_assert(sizeof(PackType) == sizeof(uint64_t),
+ "PackType must be the same size as uint64_t.");
+ __device__ PackType operator()(const PackType x, const PackType y) const {
+ uint64_t rv = FUNC()(x, y);
+ return rv;
+ }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, int64_t> {
+ static_assert(sizeof(PackType) == sizeof(int64_t),
+ "PackType must be the same size as int64_t.");
+ __device__ PackType operator()(const PackType x, const PackType y) const {
+ int64_t rv = FUNC()((int64_t)x, (int64_t)y);
+ return rv;
+ }
+};
+
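+// Round x up to the next multiple of a (a must be a power of two).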
+#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a))
+
+template<typename T>
+__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
+ size_t ptrval = reinterpret_cast<size_t>(ptr);
+ return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
+}
+
+template<typename T> inline __device__
+T vFetch(const volatile T* ptr) {
+ return *ptr;
+}
+
+template<typename T> inline __device__
+void vStore(volatile T* ptr, const T val) {
+ *ptr = val;
+}
+
+#if CUDART_VERSION < 9000
+template<> inline __device__
+half vFetch<half>(const volatile half* ptr) {
+ half r;
+ r.x = ptr->x;
+ return r;
+}
+
+template<> inline __device__
+void vStore<half>(volatile half* ptr, const half val) {
+ ptr->x = val.x;
+}
+#else
+template<> inline __device__
+half vFetch<half>(const volatile half* ptr) {
+ half r;
+ r = ((half*)ptr)[0];
+ return r;
+}
+
+template<> inline __device__
+void vStore<half>(volatile half* ptr, const half val) {
+ ((half*)ptr)[0] = val;
+}
+#endif
+
+template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
+__device__ inline void ReduceCopy(
+ const int tid, const int nthreads,
+ const volatile T * __restrict__ const src0,
+ const volatile T * __restrict__ const src1,
+ volatile T * __restrict__ const dest0,
+ volatile T * __restrict__ const dest1, const int N) {
+ for (int idx = tid; idx < N; idx += nthreads) {
+ T val = vFetch(src0+idx);
+ if (TWO_INPUTS) {
+ val = FUNC()(val, vFetch(src1+idx));
+ }
+ vStore(dest0+idx, val);
+ if (TWO_OUTPUTS) {
+ vStore(dest1+idx, val);
+ }
+ }
+}
+
+typedef ulong2 Pack128;
+
+template<class FUNC, typename T>
+struct MULTI128 {
+ __device__ void operator()(Pack128& x, Pack128& y) {
+ x.x = MULTI<FUNC, T>()(x.x, y.x);
+ x.y = MULTI<FUNC, T>()(x.y, y.y);
+ }
+};
+
+inline __device__ void Fetch128(Pack128& v, Pack128* p) {
+ asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
+}
+inline __device__ void Store128(Pack128* p, Pack128& v) {
+ asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory");
+}
+
+#define WARP_SIZE 32
+template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL>
+__device__ inline void ReduceCopy128b( const int w, const int nw, const int t,
+ Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1,
+ const int N) {
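+  // Each warp handles a contiguous tile of UNROLL*WARP_SIZE Pack128 elements
+  // (lanes interleaved within the tile); warps advance by nw tiles per iteration.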
+ Pack128 t0[UNROLL];
+ Pack128 t1[UNROLL];
+ const Pack128* src0_end = src0 + N;
+ const int inc = nw * UNROLL * WARP_SIZE;
+ const int offset = w * UNROLL * WARP_SIZE + t;
+ src0 += offset; if (TWO_INPUTS) src1 += offset;
+ dest0 += offset; if (TWO_OUTPUTS) dest1 += offset;
+
+ while (src0 < src0_end) {
+#pragma unroll
+ for (int u = 0; u < UNROLL; ++u) {
+ Fetch128(t0[u], src0+u*WARP_SIZE);
+ if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE);
+ }
+#pragma unroll
+ for (int u = 0; u < UNROLL; ++u) {
+ if (TWO_INPUTS) MULTI128<FUNC, T>()(t0[u], t1[u]);
+ Store128(dest0+u*WARP_SIZE, t0[u]);
+ if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]);
+ }
+ src0 += inc; if (TWO_INPUTS) src1 += inc;
+ dest0 += inc; if (TWO_OUTPUTS) dest1 += inc;
+ }
+}
+
+template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1>
+__device__ inline void ReduceOrCopy(const int tid, const int nthreads,
+ volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
+ const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
+ int N) {
+ int Nrem = N;
+ if (Nrem <= 0) return;
+
+ int Npreamble = (Nrem<alignof(Pack128)) ? Nrem : AlignUp(dest0, alignof(Pack128)) - dest0;
+
+ // stage 0: check if we'll be able to use the fast, 128-bit aligned path.
+ // If not, we'll just use the slow preamble path for the whole operation
+ bool alignable = (((AlignUp(src0, alignof(Pack128)) == src0 + Npreamble)) &&
+ (!HAS_DEST1 || (AlignUp(dest1, alignof(Pack128)) == dest1 + Npreamble)) &&
+ (!HAS_SRC1 || (AlignUp(src1, alignof(Pack128)) == src1 + Npreamble)));
+
+ if (!alignable) {
+ Npreamble = Nrem;
+ }
+
+ // stage 1: preamble: handle any elements up to the point of everything coming
+ // into alignment
+ ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Npreamble);
+
+ Nrem -= Npreamble;
+ if (Nrem == 0) return;
+
+ dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
+ src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; }
+
+ // stage 2: fast path: use 128b loads/stores to do the bulk of the work,
+ // assuming the pointers we have are all 128-bit alignable.
+ int w = tid / WARP_SIZE; // Warp number
+ int nw = nthreads / WARP_SIZE; // Number of warps
+ int t = tid % WARP_SIZE; // Thread (inside the warp)
+
+ const int PackFactor = sizeof(Pack128) / sizeof(T);
+
+ // stage 2a: main loop
+ int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads))
+ * (UNROLL * nthreads); // round down
+
+ ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a);
+
+ int Ndone2a = Nalign2a * PackFactor;
+ Nrem -= Ndone2a;
+ if (Nrem == 0) return;
+ dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
+ src0 += Ndone2a; if (HAS_SRC1) { src1 += Ndone2a; }
+
+ // stage 2b: slightly less optimized for section when we don't have full
+ // UNROLLs
+
+ int Nalign2b = Nrem / PackFactor;
+
+ ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, 1>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b);
+
+ int Ndone2b = Nalign2b * PackFactor;
+ Nrem -= Ndone2b;
+ if (Nrem == 0) return;
+ dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
+ src0 += Ndone2b; if (HAS_SRC1) { src1 += Ndone2b; }
+
+ // stage 2c: tail
+ ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Nrem);
+}
+
+#endif // NCCL_COMMON_KERNEL_H_
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
new file mode 100644
index 0000000..16f1865
--- /dev/null
+++ b/src/collectives/device/functions.cu
@@ -0,0 +1,64 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "collectives.h"
+#include "common.h"
+
+#define NCCL_FUNC4(coll, op, dtype) \
+ NCCL_COLL_NAME(coll, op, dtype), \
+ NCCL_COLL_NAME(coll##LL, op, dtype) \
+
+// Must be consistent with ncclDataType_t
+#define NCCL_FUNCS3A(coll, op) \
+ NCCL_FUNC4(coll, op, i8), \
+ NCCL_FUNC4(coll, op, u8), \
+ NCCL_FUNC4(coll, op, i32), \
+ NCCL_FUNC4(coll, op, u32), \
+ NCCL_FUNC4(coll, op, i64), \
+ NCCL_FUNC4(coll, op, u64), \
+ NCCL_FUNC4(coll, op, f16), \
+ NCCL_FUNC4(coll, op, f32), \
+ NCCL_FUNC4(coll, op, f64)
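+// NCCL_FUNCS3B is used for the copy-based collectives (broadcast, allgather):
+// a copy ignores the data type, so every type entry maps to the int8 version.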
+#define NCCL_FUNCS3B(coll, op) \
+ NCCL_FUNC4(coll, op, i8), \
+ NCCL_FUNC4(coll, op, i8), \
+ NCCL_FUNC4(coll, op, i8), \
+ NCCL_FUNC4(coll, op, i8), \
+ NCCL_FUNC4(coll, op, i8), \
+ NCCL_FUNC4(coll, op, i8), \
+ NCCL_FUNC4(coll, op, i8), \
+ NCCL_FUNC4(coll, op, i8), \
+ NCCL_FUNC4(coll, op, i8)
+
+// Must be consistent with ncclRedOp_t
+#define NCCL_FUNCS2A(coll) \
+ NCCL_FUNCS3A(coll, sum ), \
+ NCCL_FUNCS3A(coll, prod), \
+ NCCL_FUNCS3A(coll, max ), \
+ NCCL_FUNCS3A(coll, min )
+#define NCCL_FUNCS2B(coll) \
+ NCCL_FUNCS3B(coll, copy), \
+ NCCL_FUNCS3B(coll, copy), \
+ NCCL_FUNCS3B(coll, copy), \
+ NCCL_FUNCS3B(coll, copy)
+
+// Must be consistent with ncclColl_t
+#define NCCL_FUNCS() { \
+ NCCL_FUNCS2B(ncclBroadcast), \
+ NCCL_FUNCS2A(ncclReduce), \
+ NCCL_FUNCS2B(ncclAllGather), \
+ NCCL_FUNCS2A(ncclReduceScatter), \
+ NCCL_FUNCS2A(ncclAllReduce) }
+
+// Must be consistent with the ncclFuncSet enum
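+// Layout: [collective][reduction op][data type][variant]; the trailing factor
+// of 2 covers the two variants emitted by NCCL_FUNC4 (regular and LL).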
+__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
+ NCCL_FUNCS2B(ncclBroadcast),
+ NCCL_FUNCS2A(ncclReduce),
+ NCCL_FUNCS2B(ncclAllGather),
+ NCCL_FUNCS2A(ncclReduceScatter),
+ NCCL_FUNCS2A(ncclAllReduce)
+};
diff --git a/src/collectives/device/ll_kernel.h b/src/collectives/device/ll_kernel.h
new file mode 100644
index 0000000..5ec3c9a
--- /dev/null
+++ b/src/collectives/device/ll_kernel.h
@@ -0,0 +1,154 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_LL_KERNEL_H_
+#define NCCL_LL_KERNEL_H_
+
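+// Low-latency (LL) protocol: each ncclLLFifoLine packs two 32-bit data words
+// interleaved with two copies of a 32-bit flag. readLL spins until both flags
+// match the expected value, which guarantees the 8 data bytes are valid without
+// any separate synchronization.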
+static __device__ uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) {
+ uint32_t data1, flag1, data2, flag2;
+ do {
+ asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
+ } while ((flag1 != flag) || (flag2 != flag));
+ uint64_t val64 = data1 + (((uint64_t)data2) << 32);
+ return val64;
+}
+
+static __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
+ asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
+}
+
+// Using memcpy handles misaligned pointers.
+static __device__ uint64_t readAL(uint64_t* src) {
+ uint64_t val;
+ memcpy((char*)&val, (char*)src, sizeof(uint64_t));
+ return val;
+}
+static __device__ void storeAL(uint64_t* dst, uint64_t val) {
+ memcpy((char*)dst, (char*)&val, sizeof(uint64_t));
+}
+
+template <typename T, class FUNC>
+class LLPrimitives {
+ private:
+ template <int HAS_SRC1, int HAS_SRC2, int HAS_DST1, int HAS_DST2>
+ static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
+ if (size <= 0) return;
+ size_t size64 = size * sizeof(T) / sizeof(uint64_t);
+ uint64_t* src1A = (uint64_t*)src1;
+ uint64_t* dst1A = (uint64_t*)dst1;
+ int offset = threadIdx.x;
+ // Do multiples of 64 bits
+#pragma unroll 1
+ for (; offset < size64; offset += nthreads) {
+ uint64_t val;
+ if (HAS_SRC1) {
+ val = readAL(src1A+offset);
+ if (HAS_SRC2) val = MULTI<FUNC, T>()(readLL(src2+offset, iflag), val);
+ } else if (HAS_SRC2) {
+ val = readLL(src2+offset, iflag);
+ }
+ if (HAS_DST1) storeAL(dst1A+offset, val);
+ if (HAS_DST2) storeLL(dst2+offset, val, oflag);
+ }
+ // Finish last word
+ int sizeDone = size64*(sizeof(uint64_t)/sizeof(T));
+ int sizeRem = size - sizeDone;
+ if (threadIdx.x == 0 && sizeRem) {
+ const T* src1B = src1 + sizeDone;
+ T* dst1B = dst1 + sizeDone;
+
+ uint64_t lastVal;
+ T* vals = (T*)&lastVal;
+
+ if (HAS_SRC2) {
+ uint64_t lastVal2 = readLL(src2+size64, iflag);
+ T* src2B = (T*)&lastVal2;
+ for (int offset = 0; offset < sizeRem; offset++) {
+ vals[offset] = HAS_SRC1 ? FUNC()(src2B[offset], src1B[offset]) : src2B[offset];
+ }
+ } else if (HAS_SRC1) {
+ for (int offset = 0; offset < sizeRem; offset++) {
+ vals[offset] = src1B[offset];
+ }
+ }
+ if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag);
+ if (HAS_DST1) {
+ for (int offset = 0; offset < sizeRem; offset++) {
+ dst1B[offset] = vals[offset];
+ }
+ }
+ }
+ }
+ public:
+ static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) {
+ return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads);
+ }
+
+ static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) {
+ return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads);
+ }
+
+ static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
+ return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads);
+ }
+
+ static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) {
+ return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads);
+ }
+
+ static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) {
+ return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads);
+ }
+
+ static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
+ return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads);
+ }
+
+ static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
+ return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, oflag, nthreads);
+ }
+};
+
+// Common macros
+
+#define STEP_TO_SLOT(step) \
+ (step % NCCL_LL_CHUNKS)
+
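+// WAIT_NEXT: thread 0 spins until the next rank has freed the FIFO slot we are
+// about to overwrite, then all LL threads synchronize.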
+#define WAIT_NEXT \
+ if (tid == 0) { \
+ while (sendHead + NCCL_LL_CHUNKS <= step) { \
+ sendHead = sendHeadPtr[0]; \
+ } \
+ } \
+ asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads));
+
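+// POST_SIZE: publish the byte count for this step into the send size FIFO
+// (-1 marks an empty step); the factor of 2 accounts for the flags interleaved
+// with the data in each LL line.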
+#define POST_SIZE \
+ if (tid == 0 && sizesFifo) sizesFifo[step % NCCL_LL_CHUNKS] = (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T));
+
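+// ACK_PREV: once all LL threads have synchronized, thread 0 advances the
+// receive head to tell the previous rank that this step's data has been consumed.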
+#define ACK_PREV \
+ asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)); \
+ if (tid == 0) recvHeadPtr[0] = step;
+
+#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \
+ if (step > ring->send.conn.llLastCleaning + NCCL_LL_CLEAN_FREQ) { \
+ /* Reset all flags */ \
+    static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of NCCL_LL_MAX_NTHREADS"); \
+    static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE must be at least sizeof(ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS"); \
+ const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \
+ for (int i=0; i<NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*llNthreads); i++) { \
+ prevInput[tid+i*llNthreads].i4 = resetLine.i4; \
+ } \
+ __threadfence_system(); \
+ /* Restart from the same slot, only make sure sender waits for data to be reset */ \
+ step += NCCL_LL_CHUNKS; \
+ ACK_PREV; \
+ while (sendHeadPtr[0] < step); \
+ if (tid == 0) ring->send.conn.llLastCleaning = step; \
+ } \
+ ring->send.conn.llStep = step; \
+} while (0);
+
+#endif
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
new file mode 100644
index 0000000..8df152e
--- /dev/null
+++ b/src/collectives/device/primitives.h
@@ -0,0 +1,226 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PRIMITIVES_H_
+#define NCCL_PRIMITIVES_H_
+
+#include <type_traits>
+#include "reduce_kernel.h" // for reduction funcs
+
+
+/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy.
+ *
+ * In order to reduce the repetition of template arguments, the operations
+ * are bundled as static methods of the Primitives class.
+ *
+ * Each primitive operation copies/reduces a contiguous buffer and syncs
+ * an optional set of flags against a sub-step counter. The sync value is
+ * based on the step parameter. Sync flags must be of type WaitFlag or
+ * PostFlag. The primitive routines wait for all WaitFlag args to attain
+ * at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of the
+ * corresponding substep of the previous step) before executing the transfer.
+ * After each substep is transferred, all PostFlag arguments get updated to
+ * the value SUBSTEPS*step+substep+1.
+ */
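+
+/* A typical call, as used by the ring kernels in this patch, copies one slice
+ * to the next GPU's buffer while synchronizing on the head/tail flags:
+ *
+ *   Prims::Copy(tid, nthreads,
+ *               thisInput + offset, nextOutput + boffset,
+ *               sliceSize, maxOffset, step,
+ *               waitDoneFromNext, postReadyToNext);
+ */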
+
+
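+// WaitFlag spins until *flag (plus a constant head-start 'shift') reaches the
+// requested value; PostFlag stores the value (minus 'shift') and can also
+// publish a per-step transfer size into a small FIFO.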
+class WaitFlag {
+ volatile uint64_t * const flag;
+ const int shift;
+ public:
+ __device__ __forceinline__
+ WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { }
+ __device__ __forceinline__
+ void wait(uint64_t val) { while ((*flag + shift) < val) /*SPIN*/; }
+};
+
+
+class PostFlag {
+ volatile uint64_t * const flag;
+ const int shift;
+ volatile int * const fifo;
+ const int fifo_size;
+ public:
+ __device__ __forceinline__
+ PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { }
+ __device__ __forceinline__
+ void post(uint64_t val) { *flag = (val - shift); }
+ __device__ __forceinline__
+ void postSize(uint64_t step, int size) { if (fifo != NULL) fifo[step%fifo_size] = size; };
+};
+
+
+// Helper to check if any argument is of type T.
+// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
+template<typename T> __device__ __forceinline__
+bool AnyAre() { return false; }
+
+template<typename T, typename FIRST_T, typename... TAIL_Ts>
+__device__ __forceinline__
+bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
+ return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...);
+}
+
+
+// Wait on all WaitFlags, ignore PostFlags
+__device__ __forceinline__
+void WaitOnFlags(uint64_t val) { }
+
+template <typename... TAIL_Ts> __device__ __forceinline__
+void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
+ flag.wait(val);
+ WaitOnFlags(val, tail...);
+}
+
+template <typename... TAIL_Ts> __device__ __forceinline__
+void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) {
+ WaitOnFlags(val, tail...);
+}
+
+
+// Post all PostFlags, ignore WaitFlags
+__device__ __forceinline__
+void PostToFlags(uint64_t val) { }
+
+template <typename... TAIL_Ts> __device__ __forceinline__
+void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
+ PostToFlags(val, tail...);
+}
+
+template <typename... TAIL_Ts> __device__ __forceinline__
+void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) {
+ flag.post(val);
+ PostToFlags(val, tail...);
+}
+
+
+// Post sizes for PostFlags, ignore WaitFlags
+__device__ __forceinline__
+void PostSizeToFlags(uint64_t step, int size) { }
+
+template <typename... TAIL_Ts> __device__ __forceinline__
+void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) {
+ PostSizeToFlags(step, size, tail...);
+}
+
+template <typename... TAIL_Ts> __device__ __forceinline__
+void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) {
+ flag.postSize(step, size);
+ PostSizeToFlags(step, size, tail...);
+}
+
+
+// Create pointer arithmetic syntax that doesn't break for nullptr_t
+template <typename Tptr> __device__ __forceinline__
+Tptr ptradd(Tptr ptr, int i) {
+ return ptr + i;
+}
+
+__device__ __forceinline__
+nullptr_t ptradd(nullptr_t ptr, int i) {
+ return nullptr;
+}
+
+
+// Implementation of primitive types
+template <int UNROLL, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> >
+class Primitives {
+ private:
+ template <typename SRC2_T, // either T* or nullptr_t
+ typename DST2_T, // either T* or nullptr_t
+ typename... SYNC_Ts> // either WaitFunc or PostFunc
+ static __device__ __forceinline__ void
+ GenericOp(const int tid, const int nthreads,
+ const T* src1,
+ const SRC2_T src2,
+ T* dst1,
+ DST2_T dst2,
+ int len, int maxoffset, uint64_t step, SYNC_Ts... flags) {
+
+ enum { noSrc2 = std::is_same<SRC2_T, nullptr_t>::value };
+ enum { noDst2 = std::is_same<DST2_T, nullptr_t>::value };
+ static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value,
+ "src2 must be of type T* or nullptr_t");
+ static_assert(noDst2 || std::is_same<DST2_T, T*>::value,
+ "dst2 must be of type T* or nullptr_t");
+
+ using OpType = typename std::conditional<noSrc2, FuncSum<T>, REDOP>::type;
+
+ int sliceSize = len / SUBSTEPS;
+ int sliceOffset = 0;
+
+#pragma unroll 1
+ for (int sub=0; sub<SUBSTEPS; ++sub) {
+ int realSize = max(0, min(sliceSize, maxoffset-sliceOffset));
+ if (tid < nthreads) {
+ if (AnyAre<WaitFlag>(flags...)) {
+ if (tid == 0) {
+ WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
+ }
+ asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+ }
+ ReduceOrCopy
+ <
+ UNROLL,
+ OpType,
+ T,
+ !std::is_same<DST2_T, nullptr_t>::value, // HAS_DEST1
+ !std::is_same<SRC2_T, nullptr_t>::value // HAS_SRC1
+ >
+ (
+ tid, nthreads,
+ ptradd(dst1, sliceOffset),
+ ptradd(dst2, sliceOffset),
+ ptradd(src1, sliceOffset),
+ ptradd(src2, sliceOffset),
+ realSize
+ );
+ if (AnyAre<PostFlag>(flags...)) {
+ __syncthreads();
+ }
+ } else {
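+        // The last thread of the block (the ring kernels launch with
+        // blockDim.x = nthreads+1) is reserved for posting sizes and flags.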
+ if (AnyAre<PostFlag>(flags...)) {
+ __syncthreads();
+ PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...);
+ __threadfence_system();
+ PostToFlags(SUBSTEPS*step + sub + 1, flags...);
+ }
+ }
+ sliceOffset += sliceSize;
+ }
+ }
+
+ public:
+ template <typename... SYNC_Ts>
+ static __device__ __forceinline__ void
+ Copy(const int tid, const int nthreads, const T* src, T* dst,
+ int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
+ GenericOp(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
+ }
+
+ template <typename... SYNC_Ts>
+ static __device__ __forceinline__ void
+ DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2,
+ int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
+ GenericOp(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
+ }
+
+ template <typename... SYNC_Ts>
+ static __device__ __forceinline__ void
+ Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst,
+ int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
+ GenericOp(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...);
+ }
+
+ template <typename... SYNC_Ts>
+ static __device__ __forceinline__ void
+ ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2,
+ int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
+ GenericOp(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...);
+ }
+};
+
+#endif // end include guard
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu
new file mode 100644
index 0000000..bd1d23c
--- /dev/null
+++ b/src/collectives/device/reduce.cu
@@ -0,0 +1,21 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "common.h"
+#include "reduce.h"
+#include "collectives.h"
+
+#define UNROLL 4
+
+#if NCCL_OP == 0
+IMPL_COLL2(ncclReduce, sum, FuncSum, ncclCollReduce, ncclSum);
+#elif NCCL_OP == 1
+IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd);
+#elif NCCL_OP == 2
+IMPL_COLL2(ncclReduce, min, FuncMin, ncclCollReduce, ncclMin);
+#elif NCCL_OP == 3
+IMPL_COLL2(ncclReduce, max, FuncMax, ncclCollReduce, ncclMax);
+#endif
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
new file mode 100644
index 0000000..f5694b1
--- /dev/null
+++ b/src/collectives/device/reduce.h
@@ -0,0 +1,190 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "primitives.h"
+#include "collectives.h"
+
+// Increase Step and boffset for buffer sync
+#define NEXT_STEP \
+ step++; \
+ boffset += sliceSize; \
+ if (boffset == buffSize) boffset = 0;
+
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclReduceKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = blockDim.x - 1;
+ const int bid = args->bid;
+ struct ncclComm* comm = args->comm;
+ struct ncclRing* ring = comm->rings+blockIdx.x;
+
+ WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
+ WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
+ PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
+ PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS);
+
+ typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;
+
+ const ssize_t size = args->N;
+ const int nranks = comm->nRanks;
+ const int buffSize = ring->buffSize / sizeof(T);
+ const int sliceSize = buffSize / REDUCE_BUFCHUNKS;
+ const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+ const int rank = ring->devUserRanks[0];
+ const int prevRank = ring->devUserRanks[nranks-1];
+ const int root = args->root;
+
+ if (tid == 0) {
+ // Update in case we skipped some collectives
+ *ring->recv.conn.opCount = args->opCount;
+
+ if (rank != root) {
+ // Wait for next to be ready
+ WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
+ waitOpCountNext.wait(args->opCount);
+ }
+ }
+ __syncthreads();
+
+ uint64_t step = 0ULL;
+ int boffset = 0;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
+ T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
+ ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int maxOffset = min(chunkSize, size-offset);
+ if (prevRank == root) {
+ Prims::Copy(tid, nthreads,
+ thisInput + offset,
+ nextOutput + boffset,
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext,
+ postReadyToNext);
+ } else if (rank == root) {
+ Prims::Reduce(tid, nthreads,
+ prevInput + boffset,
+ thisInput + offset,
+ thisOutput + offset,
+ sliceSize, maxOffset,
+ step,
+ waitReadyFromPrev,
+ postDoneToPrev);
+ } else {
+ Prims::Reduce(tid, nthreads,
+ prevInput + boffset,
+ thisInput + offset,
+ nextOutput + boffset,
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext, waitReadyFromPrev,
+ postReadyToNext, postDoneToPrev);
+ }
+ NEXT_STEP; // Increases step, boffset
+ }
+
+ if (tid == 0) {
+ if (rank != root) {
+ // Wait for next to have consumed data before resetting the flag
+ waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1));
+ *ring->send.conn.head = 0ULL;
+ }
+ *ring->recv.conn.tail = 0ULL;
+ __threadfence_system();
+ *ring->recv.conn.opCount = args->opCount+1;
+ }
+}
+
+#include "ll_kernel.h"
+
+#define NEXT_STEP_LL \
+ boffset += NCCL_LL_SLICE_LINES; \
+ if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
+ flag++; \
+ step++;
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int bid = args->bid;
+ const int llNthreads = args->nThreads;
+ struct ncclComm* comm = args->comm;
+ struct ncclRing* ring = comm->rings+blockIdx.x;
+ volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
+ volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
+ volatile int * sizesFifo = ring->send.conn.llFifo;
+ uint64_t sendHead = sendHeadPtr[0];
+ const int nranks = comm->nRanks;
+ const int rank = comm->rank;
+ const int prevRank = ring->devUserRanks[nranks-1];
+ const int root = args->root;
+
+ typedef LLPrimitives<T, FUNC> LL;
+
+ const ssize_t size = args->N;
+ ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nRings*chunkSize;
+
+ uint64_t step = ring->send.conn.llStep;
+ uint32_t flag = step + 1;
+ int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
+ union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ if (size-gridOffset < loopSize) {
+ chunkSize = args->lastChunkSize;
+ }
+ ssize_t offset = gridOffset + bid*chunkSize;
+
+ int maxOffset = min(chunkSize, size-offset);
+ if (prevRank == root) {
+ WAIT_NEXT;
+ LL::ReduceCopy(
+ thisInput + offset,
+ nextOutput + boffset,
+ maxOffset, flag, llNthreads);
+ POST_SIZE;
+ NEXT_STEP_LL;
+ } else if (rank == root) {
+ LL::ReduceCopy(
+ thisInput + offset,
+ prevInput + boffset,
+ thisOutput + offset,
+ maxOffset, flag, llNthreads);
+ NEXT_STEP_LL;
+ ACK_PREV;
+ } else {
+ WAIT_NEXT;
+ LL::ReduceCopy(
+ thisInput + offset,
+ prevInput + boffset,
+ nextOutput + boffset,
+ maxOffset, flag, flag, llNthreads);
+ POST_SIZE;
+ NEXT_STEP_LL;
+ ACK_PREV;
+ }
+ }
+
+ // We need everyone to acknowledge data even if they didn't receive anything
+ // so that the next collective can start right away.
+ ACK_PREV;
+
+ FIFO_CLEANING_AND_SAVE_STEP(flag);
+}
diff --git a/src/collectives/device/reduce_kernel.h b/src/collectives/device/reduce_kernel.h
new file mode 100644
index 0000000..0cb8f13
--- /dev/null
+++ b/src/collectives/device/reduce_kernel.h
@@ -0,0 +1,364 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+
+#ifndef NCCL_REDUCE_KERNEL_H_
+#define NCCL_REDUCE_KERNEL_H_
+
+#include "common_kernel.h"
+#include <limits>
+
+template<typename T>
+struct FuncNull {
+ __device__ T operator()(const T x, const T y) const {
+ return 0;
+ }
+};
+
+template<typename T>
+struct FuncSum {
+ __device__ T operator()(const T x, const T y) const {
+ return x + y;
+ }
+};
+
+template<typename T>
+struct FuncProd {
+ __device__ T operator()(const T x, const T y) const {
+ return x * y;
+ }
+};
+
+template<typename T>
+struct FuncMax {
+ __device__ T operator()(const T x, const T y) const {
+ return (x < y) ? y : x;
+ }
+};
+
+template<typename T>
+struct FuncMin {
+ __device__ T operator()(const T x, const T y) const {
+ return (x < y) ? x : y;
+ }
+};
+
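+// The 8-bit specializations below reduce four packed values per 32-bit word,
+// using SIMD video instructions where available (vadd4/vmax4/vmin4 on sm_3x,
+// per-byte vadd/vmax/vmin on sm_5x and sm_6x) and plain per-element code otherwise.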
+template<>
+struct FuncSum<int8_t> {
+ union converter { uint32_t storage; char4 a; };
+ __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
+ int32_t rv, z=0;
+ asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+ return rv;
+#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
+ int32_t rv;
+ asm("vadd.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
+ "vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+ "vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+ "vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+ return rv;
+#else
+ converter cx, cy, cr;
+ cx.storage = x;
+ cy.storage = y;
+ cr.a.x = cx.a.x + cy.a.x;
+ cr.a.y = cx.a.y + cy.a.y;
+ cr.a.z = cx.a.z + cy.a.z;
+ cr.a.w = cx.a.w + cy.a.w;
+ return cr.storage;
+#endif
+ }
+ __device__ int8_t operator()(const int8_t x, const int8_t y) const {
+ return x+y;
+ }
+};
+template<>
+struct FuncSum<uint8_t> {
+ union converter { uint32_t storage; uchar4 a; };
+ __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
+ int32_t rv, z=0;
+ asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+ return rv;
+#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
+ int32_t rv;
+ asm("vadd.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
+ "vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+ "vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+ "vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+ return rv;
+#else
+ converter cx, cy, cr;
+ cx.storage = x;
+ cy.storage = y;
+ cr.a.x = cx.a.x + cy.a.x;
+ cr.a.y = cx.a.y + cy.a.y;
+ cr.a.z = cx.a.z + cy.a.z;
+ cr.a.w = cx.a.w + cy.a.w;
+ return cr.storage;
+#endif
+ }
+ __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
+ return x+y;
+ }
+};
+
+static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
+ /* This can be used both for signed and unsigned 8-bit multiplication */
+#if (__CUDA_ARCH__ >= 300)
+ uint32_t rv;
+ asm("{ .reg .u32 t0, t1, t2, t3;\n\t"
+ " vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t"
+ " vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t"
+ " shl.b32 t3, t3, 16;\n\t"
+ " shl.b32 t2, t2, 16;\n\t"
+ " vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t"
+ " shl.b32 t1, t1, 8;\n\t"
+ " vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t"
+ " and.b32 t1, t1, 0xff00ff00;\n\t"
+ " and.b32 t0, t0, 0x00ff00ff;\n\t"
+ " or.b32 %0, t0, t1;\n\t"
+ "}" : "=r"(rv) : "r"(x), "r"(y));
+ return rv;
+#else
+ union converter { uint32_t storage; char4 a; };
+ converter cx, cy, cr;
+ cx.storage = x;
+ cy.storage = y;
+ cr.a.x = cx.a.x * cy.a.x;
+ cr.a.y = cx.a.y * cy.a.y;
+ cr.a.z = cx.a.z * cy.a.z;
+ cr.a.w = cx.a.w * cy.a.w;
+ return cr.storage;
+#endif
+}
+
+template<>
+struct FuncProd<int8_t> {
+ __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+ return mulChar4(x, y);
+ }
+ __device__ int8_t operator()(const int8_t x, const int8_t y) const {
+ return x*y;
+ }
+};
+template<>
+struct FuncProd<uint8_t> {
+ __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+ return mulChar4(x, y);
+ }
+ __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
+ return x*y;
+ }
+};
+
+template<>
+struct FuncMax<int8_t> {
+ union converter { uint32_t storage; char4 a; };
+ __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
+ int32_t rv, z=0;
+ asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+ return rv;
+#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
+ int32_t rv;
+ asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
+ "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+ "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+ "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+ return rv;
+#else
+ converter cx, cy, cr;
+ cx.storage = x;
+ cy.storage = y;
+ cr.a.x = max(cx.a.x, cy.a.x);
+ cr.a.y = max(cx.a.y, cy.a.y);
+ cr.a.z = max(cx.a.z, cy.a.z);
+ cr.a.w = max(cx.a.w, cy.a.w);
+ return cr.storage;
+#endif
+ }
+ __device__ int8_t operator()(const int8_t x, const int8_t y) const {
+ return (x>y) ? x : y;
+ }
+};
+template<>
+struct FuncMax<uint8_t> {
+ union converter { uint32_t storage; uchar4 a; };
+ __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
+ int32_t rv, z=0;
+ asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+ return rv;
+#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
+ int32_t rv;
+ asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
+ "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+ "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+ "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+ return rv;
+#else
+ converter cx, cy, cr;
+ cx.storage = x;
+ cy.storage = y;
+ cr.a.x = max(cx.a.x, cy.a.x);
+ cr.a.y = max(cx.a.y, cy.a.y);
+ cr.a.z = max(cx.a.z, cy.a.z);
+ cr.a.w = max(cx.a.w, cy.a.w);
+ return cr.storage;
+#endif
+ }
+ __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
+ return (x>y) ? x : y;
+ }
+};
+
+template<>
+struct FuncMin<int8_t> {
+ union converter { uint32_t storage; char4 a; };
+ __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
+ int32_t rv, z=0;
+ asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+ return rv;
+#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
+ int32_t rv;
+ asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
+ "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+ "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+ "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+ return rv;
+#else
+ converter cx, cy, cr;
+ cx.storage = x;
+ cy.storage = y;
+ cr.a.x = min(cx.a.x, cy.a.x);
+ cr.a.y = min(cx.a.y, cy.a.y);
+ cr.a.z = min(cx.a.z, cy.a.z);
+ cr.a.w = min(cx.a.w, cy.a.w);
+ return cr.storage;
+#endif
+ }
+ __device__ int8_t operator()(const int8_t x, const int8_t y) const {
+ return (x<y) ? x : y;
+ }
+};
+template<>
+struct FuncMin<uint8_t> {
+ union converter { uint32_t storage; uchar4 a; };
+ __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
+ int32_t rv, z=0;
+ asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+ return rv;
+#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
+ int32_t rv;
+ asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
+ "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+ "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+ "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+ return rv;
+#else
+ converter cx, cy, cr;
+ cx.storage = x;
+ cy.storage = y;
+ cr.a.x = min(cx.a.x, cy.a.x);
+ cr.a.y = min(cx.a.y, cy.a.y);
+ cr.a.z = min(cx.a.z, cy.a.z);
+ cr.a.w = min(cx.a.w, cy.a.w);
+ return cr.storage;
+#endif
+ }
+ __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
+ return (x<y) ? x : y;
+ }
+};
+
+template<>
+struct FuncSum<half> {
+ __device__ half2 operator()(const half2 x, const half2 y) const {
+#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
+ return __hadd2(x, y);
+#else
+ float2 fx, fy, fr;
+ fx = __half22float2(x);
+ fy = __half22float2(y);
+ fr.x = fx.x + fy.x;
+ fr.y = fx.y + fy.y;
+ return __float22half2_rn(fr);
+#endif
+ }
+ __device__ half operator()(const half x, const half y) const {
+#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
+ return __hadd(x, y);
+#else
+ return __float2half( __half2float(x) + __half2float(y) );
+#endif
+ }
+};
+
+template<>
+struct FuncProd<half> {
+ __device__ half2 operator()(const half2 x, const half2 y) const {
+#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
+ return __hmul2(x, y);
+#else
+ float2 fx, fy, fr;
+ fx = __half22float2(x);
+ fy = __half22float2(y);
+ fr.x = fx.x * fy.x;
+ fr.y = fx.y * fy.y;
+ return __float22half2_rn(fr);
+#endif
+ }
+ __device__ half operator()(const half x, const half y) const {
+#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
+ return __hmul(x, y);
+#else
+ return __float2half( __half2float(x) * __half2float(y) );
+#endif
+ }
+};
+
+template<>
+struct FuncMax<half> {
+ __device__ half2 operator()(const half2 x, const half2 y) const {
+ float2 fx, fy, fr;
+ fx = __half22float2(x);
+ fy = __half22float2(y);
+ fr.x = fmaxf(fx.x, fy.x);
+ fr.y = fmaxf(fx.y, fy.y);
+ return __float22half2_rn(fr);
+ }
+ __device__ half operator()(const half x, const half y) const {
+ float fx, fy, fm;
+ fx = __half2float(x);
+ fy = __half2float(y);
+ fm = fmaxf(fx, fy);
+ return __float2half(fm);
+ }
+};
+
+template<>
+struct FuncMin<half> {
+ __device__ half2 operator()(const half2 x, const half2 y) const {
+ float2 fx, fy, fr;
+ fx = __half22float2(x);
+ fy = __half22float2(y);
+ fr.x = fminf(fx.x, fy.x);
+ fr.y = fminf(fx.y, fy.y);
+ return __float22half2_rn(fr);
+ }
+ __device__ half operator()(const half x, const half y) const {
+ float fx, fy, fm;
+ fx = __half2float(x);
+ fy = __half2float(y);
+ fm = fminf(fx, fy);
+ return __float2half(fm);
+ }
+};
+#endif // REDUCE_KERNEL_H_
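
The fallback branch of FuncMin above treats a 32-bit register as four packed 8-bit lanes and takes the minimum lane by lane whenever the vmin4/vmin PTX paths are unavailable. A host-side sketch of the same lane-wise idea in plain C++ (min4_u8 is an illustrative name, not an NCCL symbol):

#include <cstdint>
#include <cstdio>
#include <algorithm>

// Host-side analogue of the FuncMin<uint8_t> fallback above: treat a
// 32-bit word as four packed uint8_t lanes and take the per-lane
// minimum. (Sketch only; the kernel uses uchar4 and PTX vmin4/vmin
// where the architecture supports them.)
static uint32_t min4_u8(uint32_t x, uint32_t y) {
  union { uint32_t storage; uint8_t lane[4]; } cx, cy, cr;
  cx.storage = x;
  cy.storage = y;
  for (int i = 0; i < 4; ++i) cr.lane[i] = std::min(cx.lane[i], cy.lane[i]);
  return cr.storage;
}

int main() {
  uint32_t a = 0x01FF0380u, b = 0x02EE0470u;
  printf("%08x\n", min4_u8(a, b)); // expect 0x01EE0370
  return 0;
}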
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu
new file mode 100644
index 0000000..b16053c
--- /dev/null
+++ b/src/collectives/device/reduce_scatter.cu
@@ -0,0 +1,21 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "common.h"
+#include "reduce_scatter.h"
+#include "collectives.h"
+
+#define UNROLL 4
+
+#if NCCL_OP == 0
+IMPL_COLL2(ncclReduceScatter, sum, FuncSum, ncclCollReduceScatter, ncclSum);
+#elif NCCL_OP == 1
+IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd);
+#elif NCCL_OP == 2
+IMPL_COLL2(ncclReduceScatter, min, FuncMin, ncclCollReduceScatter, ncclMin);
+#elif NCCL_OP == 3
+IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax);
+#endif
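
The #if NCCL_OP chain above means this translation unit is built once per reduction operation, each pass instantiating exactly one IMPL_COLL2 line; the build presumably defines a different NCCL_OP value for each pass. A minimal host-side sketch of that select-one-per-compile pattern (reduce_op_name is a made-up helper, not an NCCL symbol):

#include <cstdio>

// Compile this file several times, once per value of NCCL_OP, and the
// #if chain below selects exactly one branch per object file, just as
// the IMPL_COLL2 chain above selects one functor instantiation.
#ifndef NCCL_OP
#define NCCL_OP 0
#endif

static const char* reduce_op_name() {
#if   NCCL_OP == 0
  return "sum";
#elif NCCL_OP == 1
  return "prod";
#elif NCCL_OP == 2
  return "min";
#elif NCCL_OP == 3
  return "max";
#else
  return "unknown";
#endif
}

int main() { printf("compiled for op: %s\n", reduce_op_name()); return 0; }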
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
new file mode 100644
index 0000000..cad011b
--- /dev/null
+++ b/src/collectives/device/reduce_scatter.h
@@ -0,0 +1,217 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "primitives.h"
+#include "collectives.h"
+
+// Increase Step and poffset/noffset for buffer sync
+#define NEXT_STEP \
+ step++; \
+ poffset = noffset; \
+ noffset += sliceSize; \
+ if (noffset == buffSize) noffset = 0;
+
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = blockDim.x - 1;
+ const int bid = args->bid;
+ struct ncclComm* comm = args->comm;
+ struct ncclRing* ring = comm->rings+blockIdx.x;
+
+ WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
+ WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
+ PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
+ PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
+
+ typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;
+
+ const ssize_t size = args->N;
+ const int nranks = comm->nRanks;
+ const int buffSize = ring->buffSize / sizeof(T);
+ const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS;
+ const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+
+ if (tid == 0) {
+ // Update in case we skipped some collectives
+ *ring->recv.conn.opCount = args->opCount;
+ // Wait for next to be ready
+ WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
+ waitOpCountNext.wait(args->opCount);
+ }
+ __syncthreads();
+
+ uint64_t step = 0ULL;
+ int poffset, noffset = 0;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
+ T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
+ ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+ /////////////// begin ReduceScatter steps ///////////////
+ ssize_t offset;
+ int maxOffset = min(chunkSize, size-chunkOffset);
+ int rankDest;
+
+ // step 0: push data to next GPU
+ rankDest = ring->devUserRanks[nranks-1];
+ offset = chunkOffset + rankDest * size;
+
+ Prims::Copy(tid, nthreads,
+ thisInput + offset,
+ nextOutput + noffset,
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext,
+ postReadyToNext);
+
+ NEXT_STEP; // Increases step, poffset, noffset
+
+ // k-2 steps: reduce and copy to next GPU
+ for (int j=2; j<nranks; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + rankDest * size;
+
+ Prims::Reduce(tid, nthreads,
+ prevInput + poffset,
+ thisInput + offset,
+ nextOutput + noffset,
+ sliceSize, maxOffset,
+ step,
+ waitDoneFromNext, waitReadyFromPrev,
+ postReadyToNext, postDoneToPrev);
+
+ NEXT_STEP;
+ }
+
+ // step k-1: reduce this buffer and data, which will produce the final
+ // result that we store in this data
+ rankDest = ring->devUserRanks[0];
+ offset = chunkOffset + rankDest * size;
+
+ Prims::Reduce(tid, nthreads,
+ prevInput + poffset,
+ thisInput + offset,
+ thisOutput + chunkOffset,
+ sliceSize, maxOffset,
+ step,
+ waitReadyFromPrev,
+ postDoneToPrev);
+ }
+
+ if (tid == 0) {
+ waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS));
+ *ring->send.conn.head = 0ULL;
+ *ring->recv.conn.tail = 0ULL;
+ __threadfence_system();
+ *ring->recv.conn.opCount = args->opCount+1;
+ }
+}
+
+#include "ll_kernel.h"
+
+#define NEXT_STEP_LL \
+ poffset = noffset; \
+ pflag = nflag; \
+ noffset += NCCL_LL_SLICE_LINES; \
+ if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
+ nflag++; \
+ step++;
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int bid = args->bid;
+ const int llNthreads = args->nThreads;
+ struct ncclComm* comm = args->comm;
+ struct ncclRing* ring = comm->rings+blockIdx.x;
+ volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
+ volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
+ volatile int * sizesFifo = ring->send.conn.llFifo;
+ uint64_t sendHead = sendHeadPtr[0];
+
+ typedef LLPrimitives<T, FUNC> LL;
+
+ const ssize_t size = args->N;
+ //const int rank = comm->rank;
+ const int nranks = comm->nRanks;
+ ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nRings*chunkSize;
+
+ uint64_t step = ring->send.conn.llStep;
+ uint32_t pflag, nflag = step + 1;
+ int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
+ union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ if (size-gridOffset < loopSize) {
+ chunkSize = args->lastChunkSize;
+ }
+ ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+ /////////////// begin ReduceScatter steps ///////////////
+ ssize_t offset;
+ int maxOffset = min(chunkSize, size-chunkOffset);
+ int rankDest;
+
+ // step 0: push data to next GPU
+ rankDest = ring->devUserRanks[nranks-1];
+ offset = chunkOffset + rankDest * size;
+
+ WAIT_NEXT;
+ LL::ReduceCopy(
+ thisInput + offset,
+ nextOutput + noffset,
+ maxOffset, nflag, llNthreads);
+ POST_SIZE;
+
+ NEXT_STEP_LL;
+
+ // k-2 steps: reduce and copy to next GPU
+ for (int j=2; j<nranks; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + rankDest * size;
+
+ WAIT_NEXT;
+ LL::ReduceCopy(
+ thisInput + offset,
+ prevInput + poffset,
+ nextOutput + noffset,
+ maxOffset, pflag, nflag, llNthreads);
+ POST_SIZE;
+ ACK_PREV;
+
+ NEXT_STEP_LL;
+ }
+
+ // step k-1: reduce this buffer and data, which will produce the final
+ // result that we store in this data
+ rankDest = ring->devUserRanks[0];
+ offset = chunkOffset + rankDest * size;
+
+ LL::ReduceCopy(
+ thisInput + offset,
+ prevInput + poffset,
+ thisOutput + chunkOffset,
+ maxOffset, pflag, llNthreads);
+ ACK_PREV;
+ }
+
+ FIFO_CLEANING_AND_SAVE_STEP(nflag);
+}
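
Both kernels above walk the same ring schedule: step 0 pushes the chunk owned by the ring predecessor, the next k-2 steps add the local contribution for successively earlier ranks and forward the partial, and the last step folds in the local chunk and writes this rank's output. A small host-side simulation of that schedule over plain doubles (sum reduction, one value per chunk, no pipelining, flags or substeps), useful for checking which chunk ends up on which rank:

#include <cstdio>
#include <vector>

// Host-side simulation of the ring schedule implemented by
// ncclReduceScatterKernel above: at each step, rank r combines the
// partial arriving from its ring predecessor with its own input chunk
// for the same destination and forwards it, so after nranks-1 steps
// every rank holds the fully reduced chunk it owns.
int main() {
  const int nranks = 4;
  // input[r][c] = rank r's contribution to chunk c
  std::vector<std::vector<double>> input(nranks, std::vector<double>(nranks));
  for (int r = 0; r < nranks; ++r)
    for (int c = 0; c < nranks; ++c)
      input[r][c] = 10.0 * r + c;

  std::vector<double> inflight(nranks), nextInflight(nranks), output(nranks);

  // step 0: each rank pushes the chunk owned by its ring predecessor
  for (int r = 0; r < nranks; ++r) {
    int dest = (r + nranks - 1) % nranks;
    nextInflight[(r + 1) % nranks] = input[r][dest];
  }
  // steps j = 2 .. nranks-1: reduce with local data and forward
  for (int j = 2; j < nranks; ++j) {
    inflight = nextInflight;
    for (int r = 0; r < nranks; ++r) {
      int dest = (r + nranks - j) % nranks;
      nextInflight[(r + 1) % nranks] = inflight[r] + input[r][dest];
    }
  }
  // final step: reduce into this rank's own output chunk
  inflight = nextInflight;
  for (int r = 0; r < nranks; ++r)
    output[r] = inflight[r] + input[r][r];

  for (int r = 0; r < nranks; ++r)
    printf("rank %d owns chunk %d = %g\n", r, r, output[r]); // sum of input[*][r]
  return 0;
}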
diff --git a/src/collectives/reduce.cu b/src/collectives/reduce.cu
new file mode 100644
index 0000000..b7c91e6
--- /dev/null
+++ b/src/collectives/reduce.cu
@@ -0,0 +1,33 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "common_coll.h"
+#include "enqueue.h"
+#include "collectives.h"
+
+ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
+ ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+ size_t nbytes = count*ncclTypeSize(datatype);
+ INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, root, comm, comm->nRanks, stream);
+ if (comm->nRanks == 1) {
+ if (sendbuff != recvbuff)
+ CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
+ } else {
+ NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm));
+ NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1));
+ }
+
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+ return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype,
+ op, root, comm, stream);
+}
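
For reference, a minimal host program calling the ncclReduce entry point declared above; this assumes a single-process, single-device communicator created with ncclCommInitAll and elides all error checking:

#include <cuda_runtime.h>
#include <nccl.h>

// Minimal sketch of calling ncclReduce from a single-process,
// single-device setup.  A real program should check every
// ncclResult_t and cudaError_t.
int main() {
  ncclComm_t comm;
  int dev = 0;
  ncclCommInitAll(&comm, 1, &dev);        // one rank, on device 0

  const size_t count = 1024;
  float *sendbuff, *recvbuff;
  cudaMalloc(&sendbuff, count * sizeof(float));
  cudaMalloc(&recvbuff, count * sizeof(float));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Sum 'count' floats onto root rank 0.
  ncclReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, /*root=*/0, comm, stream);
  cudaStreamSynchronize(stream);

  cudaFree(sendbuff); cudaFree(recvbuff);
  cudaStreamDestroy(stream);
  ncclCommDestroy(comm);
  return 0;
}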
diff --git a/src/collectives/reduce_scatter.cu b/src/collectives/reduce_scatter.cu
new file mode 100644
index 0000000..9e052ff
--- /dev/null
+++ b/src/collectives/reduce_scatter.cu
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "common_coll.h"
+#include "enqueue.h"
+#include "collectives.h"
+
+ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+ size_t nbytes = count*ncclTypeSize(datatype);
+ INFO(COLL,"opCount %lx sendbuff %p recvbuff %p count %zi size %zi datatype %d op %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, nbytes, datatype, op, comm, comm->nRanks, stream);
+ if (comm->nRanks == 1) {
+ if (sendbuff != recvbuff)
+ CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
+ } else {
+ NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
+ NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1));
+ }
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
+ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+ return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype,
+ op, 0, comm, stream);
+}
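
ncclReduceScatter above sizes its proxy work as nbytes*nranks, reflecting that every rank contributes nranks*recvcount elements and receives only the recvcount elements of its own chunk. A sketch of the corresponding host-side buffer sizing, driving two devices from one thread with the group API (error checking elided):

#include <cuda_runtime.h>
#include <nccl.h>

// Each rank supplies nranks * recvcount elements and gets back the
// recvcount elements of its own chunk: chunk i of every sendbuff is
// reduced into rank i's recvbuff.
int main() {
  const int nranks = 2;
  int devs[nranks] = {0, 1};
  ncclComm_t comms[nranks];
  ncclCommInitAll(comms, nranks, devs);

  const size_t recvcount = 1 << 20;
  float* sendbuff[nranks];
  float* recvbuff[nranks];
  cudaStream_t streams[nranks];
  for (int i = 0; i < nranks; ++i) {
    cudaSetDevice(devs[i]);
    cudaMalloc(&sendbuff[i], nranks * recvcount * sizeof(float)); // nranks chunks in
    cudaMalloc(&recvbuff[i], recvcount * sizeof(float));          // one chunk out
    cudaStreamCreate(&streams[i]);
  }

  ncclGroupStart();
  for (int i = 0; i < nranks; ++i)
    ncclReduceScatter(sendbuff[i], recvbuff[i], recvcount,
                      ncclFloat, ncclSum, comms[i], streams[i]);
  ncclGroupEnd();

  for (int i = 0; i < nranks; ++i) {
    cudaSetDevice(devs[i]);
    cudaStreamSynchronize(streams[i]);
    cudaFree(sendbuff[i]); cudaFree(recvbuff[i]);
    cudaStreamDestroy(streams[i]);
    ncclCommDestroy(comms[i]);
  }
  return 0;
}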
diff --git a/src/common_coll.h b/src/common_coll.h
deleted file mode 100644
index 54050f8..0000000
--- a/src/common_coll.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef COMMON_COLL_H_
-#define COMMON_COLL_H_
-
-#include "core.h"
-
-static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
- cudaPointerAttributes attr;
- cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
- if (err != cudaSuccess || attr.devicePointer == NULL) {
- WARN("%s : %s is not a valid pointer\n", opname, ptrname);
- return ncclInvalidDevicePointer;
- }
- if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
- WARN("%s : %s allocated on device %d mismatchs with NCCL device %d \n", opname, ptrname, attr.device, comm->cudaDev);
- return ncclInvalidDevicePointer;
- }
- return ncclSuccess;
-}
-
-static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
- if (ptr == NULL) {
- WARN("%s : %s argument is NULL", opname, ptrname);
- return ncclInvalidArgument;
- }
- return ncclSuccess;
-}
-
-static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, int count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
- NCCLCHECK(PtrCheck(comm, opname, "comm"));
- // First, the easy ones
- if (root < 0 || root >= comm->nRanks) {
- WARN("%s : invalid root %d (root should be in the 0..%d range)\n", opname, root, comm->nRanks);
- return ncclInvalidRank;
- }
- if (type < 0 || type >= nccl_NUM_TYPES) {
- WARN("%s : invalid type %d\n", opname, type);
- return ncclInvalidType;
- }
- if (op < 0 || op >= nccl_NUM_OPS) {
- WARN("%s : invalid reduction operation %d\n", opname, op);
- return ncclInvalidOperation;
- }
- if (count < 0) {
- WARN("%s : invalid count %d\n", opname, count);
- return ncclInvalidArgument;
- }
-
- // Check pointers
- NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname))
- if (strcmp(opname, "Reduce") == 0 && comm->rank != root) {
- // No need to check recvbuff pointer for non-root reduce
- return ncclSuccess;
- }
- NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname))
- return ncclSuccess;
-}
-
-// Kernel launch
-template<typename T>
-struct KernelArgs {
- // general parameters
- int nRanks;
- int root;
- int buffSize;
- int N;
- int opIndex;
- volatile int * __restrict__ opCounter;
- int * __restrict__ doneCount;
- bool pushrecv;
-
- // some pre-computed sizes
- int SliceSize;
- int SliceOffset;
- int ChunkSize;
- int NumChunks;
-
- // local and remote input, output, and buffer
- const T * __restrict__ ThisInput;
- T * __restrict__ ThisOutput;
-
- DevRing<char>* ring;
-};
-
-template<typename T>
-void ArgsSetup(KernelArgs<T> *args, const void* sendbuff, void* recvbuff,
- const int root, const int count, ncclComm *comm) {
- args->nRanks = comm->nRanks;
- args->root = root;
- args->buffSize = comm->buffSize;
- args->N = count;
- args->opIndex = comm->opSched;
- args->opCounter = comm->opCounter;
- args->ThisInput = (const T*)sendbuff;
- args->ThisOutput = (T*)recvbuff;
- args->ring = comm->devRing;
- args->pushrecv = comm->globalMemSpace;
-}
-
-#define LAUNCH_KERNEL(K, THREADS, UNROLL, FUNC, T, \
- args, stream) do { \
- dim3 grid(1, 1, 1); \
- dim3 block(THREADS+1, 1, 1); \
- void* argptrs[] = {&args}; \
- CUDACHECK(cudaLaunchKernel( \
- (void*)K<THREADS, UNROLL, FUNC, T>, \
- grid, block, argptrs, 0, stream), ncclUnhandledCudaError); \
-} while (0)
-
-#endif
diff --git a/src/common_kernel.h b/src/common_kernel.h
deleted file mode 100644
index b96519f..0000000
--- a/src/common_kernel.h
+++ /dev/null
@@ -1,362 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-
-#ifndef COMMON_KERNEL_H_
-#define COMMON_KERNEL_H_
-
-#include <cstdio>
-#include <cstdint>
-
-#include <cuda_runtime.h>
-
-// BAR macro and helpers
-#define WARP_SIZE 32
-#define ROUNDUP(x, y) \
- (((((x) + (y) - 1) / (y))) * (y))
-#define DIVUP(x, y) \
- (((x)+(y)-1)/(y))
-#define BAR_EXEC(type, barid, nthreads) \
- asm("bar." #type " " #barid ", " #nthreads ";\n\t")
-#define BAR_EXPAND(type, barid, nthreads) \
- BAR_EXEC(type, barid, (nthreads))
-
-// Named barrier macro.
-// Expands to asm("bar.type barid, nthreads") where
-// nthreads has been rounded up to WARP_SIZE.
-#define BAR(type, barid, nthreads) \
- BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
-
-template<typename T> inline __device__
-T vFetch(const volatile T* ptr) {
- return *ptr;
-}
-
-template<typename T> inline __device__
-void vStore(volatile T* ptr, const T val) {
- *ptr = val;
-}
-
-#ifdef CUDA_HAS_HALF
-#if CUDART_VERSION < 9000
-template<> inline __device__
-half vFetch<half>(const volatile half* ptr) {
- half r;
- r.x = ptr->x;
- return r;
-}
-template<> inline __device__
-void vStore<half>(volatile half* ptr, const half val) {
- ptr->x = val.x;
-}
-#else
-template<> inline __device__
-half vFetch<half>(const volatile half* ptr) {
- return *((half*)ptr);
-}
-template<> inline __device__
-void vStore<half>(volatile half* ptr, const half val) {
- *((half*)ptr) = val;
-}
-#endif
-#endif
-
-__device__ unsigned int spinct;
-
-// Spin wait until func evaluates to true
-template<typename FUNC>
-__device__ inline void Wait(const FUNC& func) {
- while (!func()) {
- // waste time
- atomicInc(&spinct, 10);
- }
-}
-
-typedef uint64_t PackType;
-
-// unpack x and y to elements of type T and apply FUNC to each element
-template<class FUNC, typename T>
-struct MULTI {
- __device__ PackType operator()(const PackType x, const PackType y) const;
-};
-
-template<class FUNC>
-struct MULTI<FUNC, char> {
- static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
- "PackType must be twice the size of uint32_t.");
- union converter {
- PackType storage;
- struct {
- uint32_t a, b;
- };
- };
-
- __device__ PackType operator()(const PackType x, const PackType y) const {
- converter cx, cy, cr;
- cx.storage = x;
- cy.storage = y;
-
- // for char, we do these as vector ops
- cr.a = FUNC()(cx.a, cy.a);
- cr.b = FUNC()(cx.b, cy.b);
-
- return cr.storage;
- }
-};
-
-template<class FUNC>
-struct MULTI<FUNC, int> {
- static_assert(sizeof(PackType) == 2 * sizeof(int),
- "PackType must be twice the size of int.");
- union converter {
- PackType storage;
- struct {
- int a, b;
- };
- };
-
- __device__ PackType operator()(const PackType x, const PackType y) const {
- converter cx, cy, cr;
- cx.storage = x;
- cy.storage = y;
-
- cr.a = FUNC()(cx.a, cy.a);
- cr.b = FUNC()(cx.b, cy.b);
-
- return cr.storage;
- }
-};
-
-#ifdef CUDA_HAS_HALF
-template<class FUNC>
-struct MULTI<FUNC, half> {
- static_assert(sizeof(PackType) == 4 * sizeof(half),
- "PackType must be four times the size of half.");
-
- struct PackHalf2 {
- half2 a, b;
- };
-
- __device__ PackType operator()(const PackType x, const PackType y) const {
- struct PackHalf2 cx, cy, cr;
- cx = *(reinterpret_cast<const struct PackHalf2*>(&x));
- cy = *(reinterpret_cast<const struct PackHalf2*>(&y));
-
- cr.a = FUNC()(cx.a, cy.a);
- cr.b = FUNC()(cx.b, cy.b);
-
- return *(reinterpret_cast<PackType*>(&cr));
- }
-};
-#endif
-
-template<class FUNC>
-struct MULTI<FUNC, float> {
- static_assert(sizeof(PackType) == 2 * sizeof(float),
- "PackType must be twice the size of float.");
- union converter {
- PackType storage;
- struct {
- float a, b;
- };
- };
-
- __device__ PackType operator()(const PackType x, const PackType y) const {
- converter cx, cy, cr;
- cx.storage = x;
- cy.storage = y;
-
- cr.a = FUNC()(cx.a, cy.a);
- cr.b = FUNC()(cx.b, cy.b);
-
- return cr.storage;
- }
-};
-
-template<class FUNC>
-struct MULTI<FUNC, double> {
- static_assert(sizeof(PackType) == sizeof(double),
- "PackType must be the same size as double.");
- __device__ PackType operator()(const PackType x, const PackType y) const {
- double rv = FUNC()(__longlong_as_double(x), __longlong_as_double(y));
- return __double_as_longlong(rv);
- }
-};
-
-template<class FUNC>
-struct MULTI<FUNC, unsigned long long> {
- static_assert(sizeof(PackType) == sizeof(unsigned long long),
- "PackType must be the same size as unsigned long long.");
- __device__ PackType operator()(const PackType x, const PackType y) const {
- unsigned long long rv = FUNC()(x, y);
- return rv;
- }
-};
-
-template<class FUNC>
-struct MULTI<FUNC, long long> {
- static_assert(sizeof(PackType) == sizeof(long long),
- "PackType must be the same size as long long.");
- __device__ PackType operator()(const PackType x, const PackType y) const {
- long long rv = FUNC()((long long)x, (long long)y);
- return rv;
- }
-};
-
-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
-__device__ inline void ReduceCopy(
- const volatile T * __restrict__ const src0,
- const volatile T * __restrict__ const src1,
- volatile T * __restrict__ const dest0,
- volatile T * __restrict__ const dest1, const int idx) {
- T val = vFetch(src0+idx);
- if (TWO_INPUTS) {
- val = FUNC()(val, vFetch(src1+idx));
- }
- vStore(dest0+idx, val);
- if (TWO_OUTPUTS) {
- vStore(dest1+idx, val);
- }
-}
-
-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL, int THREADS>
-__device__ inline void ReduceCopy64b(
- const volatile T * __restrict__ const src0,
- const volatile T * __restrict__ const src1,
- volatile T * __restrict__ const dest0,
- volatile T * __restrict__ const dest1, const int offset) {
- PackType t0[UNROLL];
- PackType t1[UNROLL];
- #pragma unroll
- for (int u = 0; u < UNROLL; ++u) {
- int idx = offset + u*THREADS;
- t0[u] = (reinterpret_cast<const volatile PackType *>(src0))[idx];
- if (TWO_INPUTS) {
- t1[u] = (reinterpret_cast<const volatile PackType *>(src1))[idx];
- }
- }
- #pragma unroll
- for (int u = 0; u < UNROLL; ++u) {
- int idx = offset + u*THREADS;
- PackType val = TWO_INPUTS ? MULTI<FUNC, T>()(t0[u], t1[u]) : t0[u];
- (reinterpret_cast<volatile PackType *>(dest0))[idx] = val;
- if (TWO_OUTPUTS) {
- (reinterpret_cast<volatile PackType *>(dest1))[idx] = val;
- }
- }
-}
-
-#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a))
-
-template<typename T>
-__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
- size_t ptrval = reinterpret_cast<size_t>(ptr);
- return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
-}
-
-// Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of producer threads
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, class FUNC, typename T, bool HAS_DEST1,
- bool HAS_SRC1>
-__device__ inline void ReduceOrCopy(const int tid,
- volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
- const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
- int N) {
- if (N<=0) {
- return;
- }
-
- int Npreamble = (N<alignof(PackType)) ? N : AlignUp(dest0, alignof(PackType)) - dest0;
-
- // stage 0: check if we'll be able to use the fast, 64-bit aligned path.
- // If not, we'll just use the slow preamble path for the whole operation
- bool alignable = (((AlignUp(src0, alignof(PackType)) == src0 + Npreamble)) &&
- (!HAS_DEST1 || (AlignUp(dest1, alignof(PackType)) == dest1 + Npreamble)) &&
- (!HAS_SRC1 || (AlignUp(src1, alignof(PackType)) == src1 + Npreamble)));
-
- if (!alignable) {
- Npreamble = N;
- }
-
- // stage 1: preamble: handle any elements up to the point of everything coming
- // into alignment
- for (int idx = tid; idx < Npreamble; idx += THREADS) {
- // ought to be no way this is ever more than one iteration, except when
- // alignable is false
- ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(src0, src1, dest0, dest1, idx);
- }
-
- // stage 2: fast path: use 64b loads/stores to do the bulk of the work,
- // assuming the pointers we have are all 64-bit alignable.
- if (alignable) {
- const int PackFactor = sizeof(PackType) / sizeof(T);
- int Nrem = N - Npreamble;
- dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
- src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; }
-
- // stage 2a: main loop
- int Nalign2a = (Nrem / (PackFactor * UNROLL * THREADS))
- * (UNROLL * THREADS); // round down
-
- #pragma unroll 1 // don't unroll this loop
- for (int idx = tid; idx < Nalign2a; idx += UNROLL * THREADS) {
- ReduceCopy64b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL, THREADS>(src0, src1, dest0, dest1, idx);
- }
-
- int Ndone2a = Nalign2a * PackFactor;
- Nrem -= Ndone2a;
-
- // stage 2b: slightly less optimized for section when we don't have full
- // UNROLLs
-
- int Nalign2b = Nrem / PackFactor;
-
- #pragma unroll 4
- for (int idx = Nalign2a + tid; idx < Nalign2a + Nalign2b; idx += THREADS) {
- ReduceCopy64b<FUNC, T, HAS_SRC1, HAS_DEST1, 1, 0>(src0, src1, dest0, dest1, idx);
- }
-
- int Ndone2b = Nalign2b * PackFactor;
- Nrem -= Ndone2b;
- int Ndone2 = Ndone2a + Ndone2b;
- dest0 += Ndone2; if (HAS_DEST1) { dest1 += Ndone2; }
- src0 += Ndone2; if (HAS_SRC1) { src1 += Ndone2; }
-
- // stage 2c: tail
-
- for (int idx = tid; idx < Nrem; idx += THREADS) {
- // never ought to make it more than one time through this loop. only a
- // few threads should even participate
- ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(src0, src1, dest0, dest1, idx);
- }
- } // done fast path
-}
-
-template <typename T>
-__device__ inline void incrementOpCounter(const KernelArgs<T> *args) {
- // increment comm's operation counts
- __threadfence_system(); // Technically need to ensure that cleared flags
- // are visible before incrementing op counter.
- *args->opCounter = args->opIndex+1;
-}
-
-template <int THREADS, typename T> __device__ __forceinline__
-void LoadRing(const DevRing<char>* src, DevRing<T>* dst) {
- enum { NUM_WORDS = sizeof(DevRing<char>) / sizeof(long long) };
- static_assert(sizeof(DevRing<char>) % sizeof(long long) == 0, "Bad alignment");
- static_assert(THREADS >= NUM_WORDS, "Not enough threads to load DevRing");
- static_assert(sizeof(DevRing<char>) == sizeof(DevRing<T>), "DevRing size mismatch");
- long long* lldst = reinterpret_cast<long long*>(dst);
- const long long* llsrc = reinterpret_cast<const long long*>(src);
- if (threadIdx.x < NUM_WORDS) {
- lldst[threadIdx.x] = llsrc[threadIdx.x];
- }
-}
-
-
-#endif // COMMON_KERNEL_H_
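
The ALIGNUP macro and AlignUp helper in the file removed above round a value or pointer up to the next multiple of a power-of-two alignment before the packed 64-bit path is taken. The same expression on the host, with a couple of spot checks (align_up is an illustrative name):

#include <cassert>
#include <cstddef>

// Same rounding expression as the removed ALIGNUP macro: round x up to
// the next multiple of a power-of-two alignment a (x > 0 assumed).
static size_t align_up(size_t x, size_t a) {
  return ((x - 1) & ~(a - 1)) + a;
}

int main() {
  assert(align_up(8, 8)  == 8);   // already aligned values stay put
  assert(align_up(9, 8)  == 16);  // otherwise round up
  assert(align_up(13, 4) == 16);
  return 0;
}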
diff --git a/src/copy_kernel.h b/src/copy_kernel.h
deleted file mode 100644
index 0f69748..0000000
--- a/src/copy_kernel.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-
-#ifndef COPY_KERNEL_H_
-#define COPY_KERNEL_H_
-
-#include "common_kernel.h"
-
-template<typename T>
-struct FuncPassA {
- __device__ T operator()(const T x, const T y) const {
- return x;
- }
-};
-
-#ifdef CUDA_HAS_HALF
-template <>
-struct FuncPassA<half> {
- __device__ half2 operator()(const half2 x, const half2 y) const {
- return x;
- }
- __device__ half operator()(const half x, const half y) const {
- return x;
- }
-};
-#endif
-
-// Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of producer threads
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, typename T>
-__device__ void Copy(volatile T * __restrict__ const dest,
- const volatile T * __restrict__ const src, const int N) {
- ReduceOrCopy<UNROLL, THREADS, FuncPassA<T>, T, false, false>(threadIdx.x,
- dest, nullptr, src, nullptr, N);
-}
-
-// Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of producer threads
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, typename T>
-__device__ void DoubleCopy(volatile T * __restrict__ const dest0,
- volatile T * __restrict__ const dest1,
- const volatile T * __restrict__ const src, const int N) {
- ReduceOrCopy<UNROLL, THREADS, FuncPassA<T>, T, true, false>(threadIdx.x,
- dest0, dest1, src, nullptr, N);
-}
-
-#endif // COPY_KERNEL_H_
diff --git a/src/core.cu b/src/core.cu
deleted file mode 100644
index 1420d21..0000000
--- a/src/core.cu
+++ /dev/null
@@ -1,1019 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include "core.h"
-#include "libwrap.h"
-#include "common_coll.h"
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sched.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <cuda_runtime.h>
-#include <string.h>
-#include <errno.h>
-
-DebugLevel ncclDebugLevel;
-
-NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
-ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
- NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
- pid_t pid = getpid();
- static int count = 0;
- int commId = __sync_fetch_and_add(&count, 1);
- int len = snprintf(out->internal, NCCL_UNIQUE_ID_BYTES, "nccl-%d-%d", pid, commId);
- if(strlen(out->internal) < len) {
- WARN("ncclUniqueId truncated");
- return ncclInternalError;
- }
- return ncclSuccess;
-}
-
-
-static ncclResult_t shmOpen(const char* shmname, size_t bytes, void** ptr) {
- int fd = shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
- if (fd == -1) {
- WARN("shm_open failed to open %s", shmname);
- return ncclSystemError;
- }
-
- if (ftruncate(fd, bytes) == -1) {
- WARN("ftruncate failed to allocate %ld bytes", bytes);
- shm_unlink(shmname);
- close(fd);
- return ncclSystemError;
- }
-
- *ptr = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- if (*ptr == MAP_FAILED) {
- WARN("failure in mmap");
- shm_unlink(shmname);
- close(fd);
- return ncclSystemError;
- }
-
- close(fd);
- return ncclSuccess;
-}
-
-static ncclResult_t shmUnlink(const char* shmname) {
- if(shm_unlink(shmname) == -1) {
- WARN("smh_unlink failed");
- return ncclSystemError;
- } else {
- return ncclSuccess;
- }
-}
-
-static ncclResult_t shmUnmap(void* ptr, size_t bytes) {
- if(munmap(ptr, bytes) == -1) {
- WARN("munmap failed");
- return ncclSystemError;
- } else {
- return ncclSuccess;
- }
-}
-
-
-typedef struct {
- int rank;
- int ndev;
- int cudaDev;
- int sortId;
- pid_t pid;
- ncclMem* hostptr;
- ncclMem* devptr;
- cudaIpcMemHandle_t devipc;
- size_t buffSize;
-} RankEntry;
-
-static int compRanks(const void* a, const void* b) {
- const RankEntry* A = (const RankEntry*)a;
- const RankEntry* B = (const RankEntry*)b;
- if (A->sortId < B->sortId) return -1;
- if (A->sortId > B->sortId) return 1;
- return 0;
-}
-
-static void orderRanks(RankEntry* ranks, int count) {
- qsort(ranks, count, sizeof(RankEntry), compRanks);
-}
-
-
-typedef struct {
- union {
- struct {
- volatile int bar;
- int globalMemSpaceBroke;
- };
- char pad[16];
- };
- RankEntry ranks[1];
-} RankGather;
-
-static ncclResult_t initGather(RankGather** gather, ncclUniqueId commId,
- int ndev, int rank, RankEntry myInfo) {
- size_t bytes = offsetof(RankGather, ranks) + ndev*sizeof(RankEntry);
- RankGather* tmp = NULL;
- int bar_tmp;
-
- ncclResult_t res = shmOpen(commId.internal, bytes, (void**)&tmp);
- if (res != ncclSuccess) {
- WARN("rank %d failed to open shm segment for gather", rank);
- return res;
- }
-
- tmp->ranks[rank] = myInfo;
-
- bar_tmp = tmp->bar - 1;
- bool swapped;
- do {
- bar_tmp += 1;
- if (bar_tmp == ndev-1) { // everyone is done
- ncclResult_t res = shmUnlink(commId.internal);
- if (res != ncclSuccess) {
- WARN("rank %d failed to unlink shm segment for gather", rank);
- shmUnmap(tmp, bytes);
- return res;
- }
-
- orderRanks(tmp->ranks, ndev);
- }
- swapped = __sync_bool_compare_and_swap(&tmp->bar, bar_tmp, bar_tmp+1);
- } while(!swapped);
-
- while (tmp->bar < ndev)
- sched_yield();
- __sync_synchronize();
-
- *gather = tmp;
- return ncclSuccess;
-}
-
-static void syncRingDirect(RankGather* gather, int* globalMemSpaceOk) {
- int bar_tmp = gather->bar - 1;
- int ndev = gather->ranks[0].ndev;
- bool swapped;
- do {
- bar_tmp += 1;
- swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
- } while(!swapped);
-
- while (gather->bar < 2*ndev) // Wait for all ranks to arrive at this second barrier
- sched_yield();
- __sync_synchronize();
-
- *globalMemSpaceOk = gather->globalMemSpaceBroke ? 0 : 1;
-}
-
-static ncclResult_t closeGather(RankGather* gather, int ndev) {
- int bar_tmp = gather->bar - 1;
- bool swapped;
- do {
- bar_tmp += 1;
- swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
- } while(!swapped);
-
- while (gather->bar < 3*ndev) // Wait for all ranks to arrive at this third barrier
- sched_yield();
- __sync_synchronize();
-
- size_t bytes = offsetof(RankGather, ranks) + ndev*sizeof(RankEntry);
- ncclResult_t res = shmUnmap(gather, bytes);
- if (res != ncclSuccess) {
- WARN("failed to unmap %ld bytes of gather", bytes);
- return res;
- }
-
- return ncclSuccess;
-}
-
-
-static ncclResult_t allocDevMem(ncclMem** ptr, size_t buffSize) {
- size_t size = offsetof(struct ncclMem, buff) + buffSize;
- cudaError_t res = cudaMalloc((void**)ptr, size);
- if (res != cudaSuccess) {
- *ptr = NULL;
- WARN("failed to allocate %lu byte device buffer", size);
- return ncclCudaMallocFailed;
- }
- if (cudaMemset(*ptr, 0, size) != cudaSuccess) {
- WARN("failed to memset device buffer.");
- cudaFree(*ptr);
- *ptr = NULL;
- return ncclUnhandledCudaError;
- }
- return ncclSuccess;
-}
-
-static const int ShmMapped = 1;
-static const int ShmLinked = 2;
-
-static ncclResult_t allocHostMem(ncclMem** ptr, size_t buffSize) {
- size_t size = offsetof(struct ncclMem, buff) + buffSize;
- cudaError_t res = cudaMallocHost((void**)ptr, size);
- if (res != cudaSuccess) {
- *ptr = NULL;
- WARN("failed to allocate %lu byte host buffer", size);
- return ncclSystemError;
- }
- memset(*ptr, 0, size);
- return ncclSuccess;
-}
-
-static ncclResult_t openHostMemShm(const char* shmname, ncclMem** ptr, size_t buffSize) {
- size_t size = offsetof(struct ncclMem, buff) + buffSize;
- ncclResult_t res = shmOpen(shmname, size, (void**)ptr);
- if (res != ncclSuccess) {
- WARN("failed to allocate %lu byte shm buffer", size);
- *ptr = NULL;
- return res;
- }
-
- if(cudaHostRegister(*ptr, size, cudaHostRegisterMapped) != cudaSuccess) {
- WARN("failed to register host buffer");
- shmUnlink(shmname);
- shmUnmap(*ptr, size);
- *ptr = NULL;
- return ncclUnhandledCudaError;
- }
- return ncclSuccess;
-}
-
-static ncclResult_t populateRankInfo(RankEntry* info, int rank, ncclComm_t comm) {
- char busId[13];
- nvmlDevice_t nvmlHandle;
- cudaError_t res = cudaDeviceGetPCIBusId(busId, 13, comm->cudaDev);
- if (res == cudaErrorInvalidDevice) {
- WARN("rank %d attempted to access an invalid cuda device %d", rank, comm->cudaDev);
- return ncclInvalidDeviceIndex;
- } else if (res != cudaSuccess) {
- WARN("rank %d failed to get PCI Bus Id for device %d", rank, comm->cudaDev);
- return ncclUnhandledCudaError;
- }
- INFO("rank %d using device %d (%s)", rank, comm->cudaDev, busId);
-
- if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlHandle) != ncclSuccess) {
- WARN("rank %d failed to get nvml handle for device %s", rank, busId);
- return ncclUnhandledCudaError;
- }
- // Order by nvml index
- if (wrapNvmlDeviceGetIndex(nvmlHandle, (unsigned*)&info->sortId) != ncclSuccess) {
- WARN("rank %d failed to get nvml device index for device %d", rank, comm->cudaDev);
- return ncclUnhandledCudaError;
- }
-
- info->rank = rank;
- info->ndev = comm->nRanks;
- info->cudaDev = comm->cudaDev;
- info->pid = getpid();
- info->buffSize = comm->buffSize;
- info->hostptr = comm->hostMem;
- info->devptr = comm->devMem;
- if (cudaIpcGetMemHandle(&info->devipc, (void*)comm->devMem) != cudaSuccess) {
- WARN("rank %d failed to open CUDA IPC handle", rank);
- return ncclUnhandledCudaError;
- }
-
- return ncclSuccess;
-}
-
-
-static ncclResult_t commClearMaps(ncclComm_t comm) {
- ncclResult_t res, retval = ncclSuccess;
- cudaError_t cures;
-
- for(int d=0; d<comm->nRanks; ++d) {
- if (comm->ptrs[d].hostCleanup != NULL) {
- cures = cudaHostUnregister(comm->ptrs[d].hostCleanup);
- if (cures != cudaSuccess) {
- WARN("rank %d failed to unregister handle to device %d",
- comm->rank, d);
- retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
- }
- res = shmUnmap(comm->ptrs[d].hostCleanup, offsetof(ncclMem, buff) + comm->buffSize);
- if (res != ncclSuccess) {
- WARN("rank %d failed to unmap handle to device %d",
- comm->rank, d);
- retval = (retval == ncclSuccess) ? res : retval;
- }
- comm->ptrs[d].hostCleanup = NULL;
- }
-
- if (comm->ptrs[d].devCleanup != NULL) {
- cures = cudaIpcCloseMemHandle((void*)comm->ptrs[d].devCleanup);
- if (cures != cudaSuccess) {
- WARN("rank %d failed to close IPC handle to device %d: %s",
- comm->rank, d, cudaGetErrorString(cures));
- retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
- }
- }
- }
-
- if (comm->userFromRing != NULL)
- memset(comm->userFromRing, 0, sizeof(int)*comm->nRanks);
- if (comm->ncclFromRing != NULL)
- memset(comm->ncclFromRing, 0, sizeof(int)*comm->nRanks);
-
- if (comm->devUserFromRing != NULL) {
- cures = cudaMemset(comm->devUserFromRing, 0, sizeof(int)*comm->nRanks);
- if (cures != cudaSuccess) {
- WARN("Faild to clear dev map: %s", cudaGetErrorString(cures));
- retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
- }
- }
-
- if (comm->devRing != NULL) {
- cures = cudaMemset(comm->devRing, 0, sizeof(DevRing<char>));
- if (cures != cudaSuccess) {
- WARN("Failed to clear devRing: %s", cudaGetErrorString(cures));
- retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
- }
- }
- return retval;
-}
-
-static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int rank, RankEntry* ranks, int* globalMemSpaceBroke) {
- int ndev = comm->nRanks;
- comm->rank = rank;
-
- if (ndev > MAXRANKS) {
- WARN("%d ranks exceeds MAXRANKS of %d", ndev, MAXRANKS);
- return ncclUnsupportedDeviceCount;
- }
-
- // Check for inconsistencies between ranks
- // If two ranks use the same rank, then one slot of
- // ranks[] will be left unset with zero ndev/buffSize.
- for(int i=0; i<ndev; ++i) {
- if (ranks[i].buffSize != comm->buffSize
- || ranks[i].ndev != comm->nRanks) {
- commClearMaps(comm);
- return ncclRankMismatch;
- }
- }
-
- // Find self among ranks of gather
- int myNcclId = -1;
- for (int i=0; i<ndev; ++i) {
- if(ranks[i].rank == rank) {
- myNcclId = i;
- break;
- }
- }
- if (myNcclId == -1) {
- WARN("rank %d not found in communicator", rank);
- return ncclInvalidRank;
- }
-
- for(int ringPos=0; ringPos<ndev; ++ringPos) {
- int ncclPos = (ringPos+myNcclId) % ndev; // ring order relative to self
- int userRank = ranks[ncclPos].rank;
- comm->userFromRing[ringPos] = userRank;
- comm->ncclFromRing[ringPos] = ncclPos;
- }
-
- int myDev = ranks[myNcclId].cudaDev;
- pid_t myPid = ranks[myNcclId].pid;
-
- for (int i=0; i<ndev; ++i) {
- int iRank = ranks[i].rank;
- int iDev = ranks[i].cudaDev;
- pid_t iPid = ranks[i].pid;
- int canpeer = 0;
-
- int iIsNeighbor = (i == (myNcclId+1)%ndev) || (i == (myNcclId+ndev-1)%ndev);
-
- if (iIsNeighbor && cudaDeviceCanAccessPeer(&canpeer, myDev, iDev) != cudaSuccess) {
- INFO("peer query failed between rank %d (dev %d) and rank %d (dev %d)",
- rank, myDev, iRank, iDev);
- canpeer = 0;
- }
-
- cudaError_t err;
- ncclMem* remoteHostBuff;
-
- comm->ptrs[i].type = NodeRef::HOST; // Assume host buffer
- comm->ptrs[i].devCleanup = NULL;
- comm->ptrs[i].hostCleanup = NULL;
-
- if (iPid == myPid) {
- remoteHostBuff = ranks[i].hostptr;
-
- if (myDev == iDev) { // shared device
- INFO("rank access %d -> %d via common device", rank, iRank);
- comm->ptrs[i].type = NodeRef::DEVICE;
- comm->ptrs[i].local = ranks[myNcclId].devptr;
- comm->ptrs[i].remote = ranks[i].devptr;
- } else if (canpeer) {
- INFO("rank access %d -> %d via P2P device mem", rank, iRank);
- err = cudaDeviceEnablePeerAccess(iDev, 0);
- if (err == cudaErrorPeerAccessAlreadyEnabled) {
- cudaGetLastError();
- } else if (err != cudaSuccess) {
- WARN("rank %d failed to peer with device %d: %s",
- rank, iDev, cudaGetErrorString(err));
- commClearMaps(comm);
- return ncclUnhandledCudaError;
- }
- comm->ptrs[i].type = NodeRef::DEVICE;
- comm->ptrs[i].local = ranks[myNcclId].devptr;
- comm->ptrs[i].remote = ranks[i].devptr;
- }
- } else { // Separate processes
- *globalMemSpaceBroke = 1;
- char rankname[1024];
- sprintf(rankname, "%s-%d", commId->internal, ranks[i].rank);
- if (openHostMemShm(rankname, &remoteHostBuff, ranks[i].buffSize)
- != ncclSuccess) {
- WARN("rank %d failed to open sysmem buffer of rank %d", rank, iRank);
- commClearMaps(comm);
- return ncclUnhandledCudaError;
- }
- comm->ptrs[i].hostCleanup = remoteHostBuff;
-
- // TODO: Extend to same device (MPS) case.
- // At present that would go through host mem.
- if (canpeer) {
- INFO("rank access %d -> %d via IPC device mem", rank, iRank);
- comm->ptrs[i].type = NodeRef::DEVICE;
- comm->ptrs[i].local = ranks[myNcclId].devptr;
- err = cudaIpcOpenMemHandle((void**)(&comm->ptrs[i].remote),
- ranks[i].devipc, cudaIpcMemLazyEnablePeerAccess);
- if (err != cudaSuccess) {
- WARN("rank %d failed to open Ipc handle to rank %d: %s",
- rank, iRank, cudaGetErrorString(err));
- commClearMaps(comm);
- return ncclUnhandledCudaError;
- }
- comm->ptrs[i].devCleanup = comm->ptrs[i].remote;
- }
- }
-
- err = cudaHostGetDevicePointer(&comm->ptrs[i].opCounter,
- &(remoteHostBuff->opCounter), 0);
- if (err != cudaSuccess) {
- WARN("rank %d failed to obtain %d's zero copy pointer: %s",
- rank, iRank, cudaGetErrorString(err));
- commClearMaps(comm);
- return ncclUnhandledCudaError;
- }
-
- if (comm->ptrs[i].type == NodeRef::HOST) {
- *globalMemSpaceBroke = 1;
- INFO("rank access %d -> %d via zero-copy host mem", rank, iRank);
- if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myNcclId].hostptr, 0) != cudaSuccess) {
- WARN("rank %d failed to map zero copy buffer to device", rank);
- commClearMaps(comm);
- return ncclUnhandledCudaError;
- }
- if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, remoteHostBuff, 0) != cudaSuccess) {
- WARN("rank %d failed to map %d's zero copy buffer to device", rank, iRank);
- commClearMaps(comm);
- return ncclUnhandledCudaError;
- }
- }
- }
-
- // Setup device-side ring view
- if (cudaMemcpy(comm->devUserFromRing, comm->userFromRing, ndev*sizeof(int),
- cudaMemcpyHostToDevice) != cudaSuccess) {
- WARN("rank %d failed to copy maps to device", rank);
- commClearMaps(comm);
- return ncclUnhandledCudaError;
- }
-
- DevRing<char> ringTemp;
- memcpy(ringTemp.userRank, comm->userFromRing, ndev*sizeof(int));
-
- int prevIdx = comm->ncclFromRing[comm->nRanks-1];
- int nextIdx = comm->ncclFromRing[1 % comm->nRanks];
- NodeRef* prevPtrs = comm->ptrs+prevIdx;
- NodeRef* nextPtrs = comm->ptrs+nextIdx;
-
- ringTemp.prevOpCounter = prevPtrs->opCounter;
- ringTemp.nextOpCounter = nextPtrs->opCounter;
- ringTemp.sendFlagToNext = nextPtrs->remote->flags;
- ringTemp.recvFlagFromPrev = prevPtrs->local->flags;
- ringTemp.sendFlagToPrev = prevPtrs->remote->flags+1;
- ringTemp.recvFlagFromNext = nextPtrs->local->flags+1;
-
- ringTemp.recvPtrFromNext = (char**)&nextPtrs->local->recvPtrs;
- ringTemp.sendPtrToPrev = (char**)&prevPtrs->remote->recvPtrs;
-
- ringTemp.recvBuffer = prevPtrs->local->buff;
- ringTemp.sendBuffer = nextPtrs->remote->buff;
-
- if (cudaMemcpy(comm->devRing, &ringTemp, sizeof(ringTemp),
- cudaMemcpyHostToDevice) != cudaSuccess) {
- WARN("rank %d failed to copy ring maps to device", rank);
- commClearMaps(comm);
- return ncclUnhandledCudaError;
- }
-
- return ncclSuccess;
-}
-
-static void initDebug() {
- const char* nccl_debug = getenv("NCCL_DEBUG");
- if (nccl_debug == NULL) {
- ncclDebugLevel = NONE;
- } else if (strcmp(nccl_debug, "VERSION") == 0) {
- ncclDebugLevel = VERSION;
- } else if (strcmp(nccl_debug, "WARN") == 0) {
- ncclDebugLevel = WARN;
- } else if (strcmp(nccl_debug, "INFO") == 0) {
- ncclDebugLevel = INFO;
- INFO("NCCL debug level set to INFO");
- } else if (strcmp(nccl_debug, "ABORT") == 0) {
- ncclDebugLevel = ABORT;
- INFO("NCCL debug level set to ABORT");
- }
-}
-
-static void commFree(ncclComm_t comm) {
- if (comm == NULL)
- return;
-
- if (comm->doneEvent != NULL)
- if (cudaEventDestroy(comm->doneEvent) != cudaSuccess)
- INFO("ncclComm failed to destroy doneEvent");
-
- ncclResult_t res = commClearMaps(comm);
- if (res != ncclSuccess)
- INFO("failed to cleanup comm maps");
-
- if (comm->devRing != NULL)
- if (cudaFree(comm->devRing) != cudaSuccess)
- INFO("commFree failed to free devRing");
-
- if (comm->userFromRing != NULL)
- free(comm->userFromRing);
-
- if (comm->devUserFromRing != NULL)
- if (cudaFree(comm->devUserFromRing) != cudaSuccess)
- INFO("commFree failed to free dev maps");
-
- if (comm->ncclFromRing != NULL)
- free(comm->ncclFromRing);
-
- if (comm->devMem != NULL && cudaFree(comm->devMem) != cudaSuccess)
- INFO("Failed to free devMap");
-
- if (comm->hostMem != NULL) {
- if (comm->hostMemState & ShmMapped) {
- if (cudaHostUnregister(comm->hostMem) != cudaSuccess)
- INFO("Failed to unregister hostMem");
- size_t size = offsetof(ncclMem, buff) + comm->buffSize;
- if (shmUnmap(comm->hostMem, size) != ncclSuccess)
- INFO("Failed to unmap hostMem");
- comm->hostMemState ^= ShmMapped;
- } else {
- cudaFreeHost(comm->hostMem);
- }
- }
- free(comm);
-}
-
-static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, const ncclUniqueId* commId, int rank) {
- size_t commBytes = offsetof(ncclComm, ptrs) + ndev*sizeof(NodeRef);
- struct ncclComm* comm = (struct ncclComm*)malloc(commBytes);
- if (comm == NULL) {
- WARN("comm allocation failed");
- return ncclSystemError;
- }
- memset(comm, 0, commBytes);
-
- comm->nRanks = ndev;
- cudaGetDevice(&comm->cudaDev);
-
- const char* str = getenv("NCCL_BUFFSIZE");
- int buffsize;
- if (str != NULL) {
- errno = 0;
- buffsize = strtol(str, NULL, 10);
- if (errno == ERANGE || buffsize == 0) {
- INFO("rank %d invalid NCCL_BUFFSIZE: %s, using default %lu",
- rank, str, DEFAULT_BUFFER_SIZE_BYTES);
- buffsize = DEFAULT_BUFFER_SIZE_BYTES;
- }
- } else {
- buffsize = DEFAULT_BUFFER_SIZE_BYTES;
- }
- comm->buffSize = buffsize;
- INFO("rank %d using buffSize = %lu", rank, comm->buffSize);
-
-
- ncclResult_t res;
- res = allocDevMem(&comm->devMem, comm->buffSize);
- if (res != ncclSuccess) {
- WARN("rank %d failed to allocate device buffer", rank);
- commFree(comm);
- return res;
- }
-
- if (cudaMalloc(&comm->devRing, sizeof(DevRing<char>)) != cudaSuccess) {
- WARN("rank %d failed to allocate device-side ring views", rank);
- commFree(comm);
- return ncclCudaMallocFailed;
- }
-
- if (cudaMalloc(&comm->devUserFromRing, ndev*sizeof(int)) != cudaSuccess ) {
- WARN("rank %d failed to allocated device maps", rank);
- commFree(comm);
- return ncclCudaMallocFailed;
- }
-
- comm->userFromRing = (int*)malloc(ndev*sizeof(int));
- if (comm->userFromRing == NULL) {
- WARN("rank %d failed to allocate host maps", rank);
- commFree(comm);
- return ncclSystemError;
- }
-
- comm->ncclFromRing = (int*)malloc(ndev*sizeof(int));
- if (comm->ncclFromRing == NULL) {
- WARN("rank %d failed to allocate host maps", rank);
- commFree(comm);
- return ncclSystemError;
- }
-
- if (cudaEventCreateWithFlags(&comm->doneEvent, cudaEventDisableTiming) != cudaSuccess) {
- WARN("ncclComm on rank %d failed to create doneEvent", rank);
- commFree(comm);
- return ncclUnhandledCudaError;
- }
-
- if(commId == NULL) {
- comm->hostMemState = 0;
- res = allocHostMem(&comm->hostMem, comm->buffSize);
- } else {
- char rankname[1024];
- sprintf(rankname, "%s-%d", commId->internal, rank);
- res = openHostMemShm(rankname, &comm->hostMem, comm->buffSize);
- if (res != ncclSuccess) {
- WARN("rank %d failed to allocate host buffer", rank);
- commFree(comm);
- return res;
- }
- comm->hostMemState = ShmMapped | ShmLinked;
- }
-
- if (cudaHostGetDevicePointer(&comm->opCounter, &comm->hostMem->opCounter, 0) != cudaSuccess) {
- WARN("ncclComm on rank %d failed to map opCounter to device", rank);
- commFree(comm);
- return ncclUnhandledCudaError;
- }
-
- *comret = comm;
- return ncclSuccess;
-}
-
-static ncclResult_t devCommUpdate(ncclComm_t comm) {
- // Copy the comm on the device
- size_t commBytes = offsetof(ncclComm, ptrs) + comm->nRanks*sizeof(NodeRef);
- if (cudaMemcpy(comm->devComm, comm, commBytes, cudaMemcpyHostToDevice) != cudaSuccess) {
- WARN("failed to copy device comm");
- return ncclUnhandledCudaError;
- }
- // Fix the host pointer to be accessible from the device
- void* dptr;
- if (cudaHostGetDevicePointer(&dptr, comm->hostMem, 0) != cudaSuccess) {
- WARN("failed to get device pointer for host mem");
- return ncclUnhandledCudaError;
- }
- if (cudaMemcpy(&comm->devComm->hostMem, &dptr, sizeof(dptr), cudaMemcpyHostToDevice) != cudaSuccess) {
- WARN("failed to update host pointer");
- return ncclUnhandledCudaError;
- }
- return ncclSuccess;
-}
-
-static ncclResult_t devCommSetup(ncclComm_t comm) {
- // Fully duplicate the comm on the device
- size_t commBytes = offsetof(ncclComm, ptrs) + comm->nRanks*sizeof(NodeRef);
- if (cudaMalloc(&comm->devComm, commBytes) != cudaSuccess) {
- WARN("failed to allocated device comm");
- return ncclCudaMallocFailed;
- }
- return devCommUpdate(comm);
-}
-
-static ncclResult_t commUnlinkHostMem(ncclComm_t comm, ncclUniqueId commId, int rank) {
- char rankname[1024];
- sprintf(rankname, "%s-%d", commId.internal, rank);
- if (comm->hostMemState & ShmLinked)
- comm->hostMemState ^= ShmLinked;
- return shmUnlink(rankname);
-}
-
-static void showVersion() {
- static int shown = 0;
- if (shown == 0 && ncclDebugLevel >= VERSION) {
- printf("NCCL version %d.%d.%d compiled with CUDA %d.%d\n", NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH, CUDA_MAJOR, CUDA_MINOR);
- fflush(stdout);
- shown = 1;
- }
-}
-
-NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
-ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
- if (myrank == 0) showVersion();
-
- NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
-
- if (ndev < 1) {
- WARN("Invalid device count requested : %d", ndev);
- return ncclUnsupportedDeviceCount;
- }
- if (myrank >= ndev || myrank < 0) {
- WARN("Invalid rank %d, should be in the range 0..%d", myrank, ndev-1);
- return ncclInvalidRank;
- }
-
- if (strlen(commId.internal) < 1 ||
- strlen(commId.internal) >= NCCL_UNIQUE_ID_BYTES) {
- WARN("rank %d invalid commId", myrank);
- return ncclInvalidArgument;
- }
-
- initDebug();
- ncclResult_t res;
- RankEntry myStuff;
- RankGather* gath = NULL;
-
- res = wrapSymbols();
- if (res != ncclSuccess) {
- WARN("NCCL failed to initialize client libs");
- return res;
- }
-
- res = wrapNvmlInit();
- if (res != ncclSuccess) {
- WARN("rank %d failed to initialize nvml", myrank);
- return res;
- }
-
- res = commAlloc(newcomm, ndev, &commId, myrank);
- if (res != ncclSuccess) {
- WARN("rank %d failed to allocate communicator", myrank);
- return res;
- }
-
- res = populateRankInfo(&myStuff, myrank, *newcomm);
- if (res != ncclSuccess) {
- WARN("rank %d failed to obtain rank info", myrank);
- goto cleanup;
- }
-
- res = initGather(&gath, commId, ndev, myrank, myStuff);
- if (res != ncclSuccess) {
- WARN("rank %d failed to gather rank info", myrank);
- goto cleanup;
- }
-
- res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks, &gath->globalMemSpaceBroke);
- syncRingDirect(gath, &((*newcomm)->globalMemSpace));
- if (res != ncclSuccess) {
- WARN("rank %d failed to build comm maps", myrank);
- goto cleanup;
- }
-
- INFO("Global device memory space is %s", (*newcomm)->globalMemSpace ? "enabled" : "disabled");
-
- res = closeGather(gath, ndev); // includes a barrier
- gath = NULL;
- if (res != ncclSuccess) {
- WARN("rank %d failed to close gather", myrank);
- goto cleanup;
- }
-
- res = devCommSetup(*newcomm);
- if (res != ncclSuccess) {
- WARN("rank %d failed to copy dcomm", myrank);
- goto cleanup;
- }
-
- res = ncclSuccess;
- goto final;
-
- cleanup:
- if (gath != NULL)
- closeGather(gath, ndev);
- commFree(*newcomm);
-
- final:
- if ((*newcomm)->hostMemState & ShmLinked) {
- if (commUnlinkHostMem(*newcomm, commId, myrank) != ncclSuccess)
- INFO("rank %d failed to unlink host mem shm segment", myrank);
- }
-
- if (wrapNvmlShutdown() != ncclSuccess)
- INFO("rank %d did not shutdown nvml properly", myrank);
- return res;
-}
-
-NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
-ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
- initDebug();
-
- showVersion();
-
- NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
-
- if (ndev < 1) {
- WARN("Invalid device count requested : %d", ndev);
- return ncclUnsupportedDeviceCount;
- }
-
- ncclResult_t res;
- int savedDevice;
- RankEntry* ranks = NULL;
- int rank, cudaDev;
- ncclComm_t comm = NULL;
- char busId[13];
- nvmlDevice_t nvmlHandle;
- int affinity_set = 0;
- int globalMemSpaceBroke = 0; // Assume direct access to recv ptr OK
-
- res = wrapSymbols();
- if (res != ncclSuccess) {
- WARN("NCCL failed to initialize client libs");
- return res;
- }
-
- cudaGetDevice(&savedDevice);
- ranks = (RankEntry*)malloc(ndev*sizeof(RankEntry));
- if (ranks == NULL) {
- WARN("NCCL allocation failed");
- return ncclSystemError;
- }
- memset(ranks, 0, ndev*sizeof(RankEntry));
-
- res = wrapNvmlInit();
- if (res != ncclSuccess) {
- WARN("nccl failed to initialize nvml");
- return res;
- }
-
- for(rank=0; rank<ndev; ++rank)
- comms[rank] = NULL;
-
- for (rank=0; rank<ndev; ++rank) {
- cudaDev = (devlist == NULL) ? rank : devlist[rank];
- if (cudaSetDevice(cudaDev) != cudaSuccess) {
- WARN("rank %d failed to set cuda device %d", rank, cudaDev);
- res = ncclInvalidDeviceIndex;
- goto cleanup;
- }
-
- // Set CPU affinity
- affinity_set = 0;
- if (cudaDeviceGetPCIBusId(busId, 13, cudaDev) != cudaSuccess) {
- INFO("rank %d failed to get PCI Bus Id for device %d", rank, cudaDev);
- goto skipaffinity;
- }
- if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlHandle) != ncclSuccess) {
- INFO("rank %d failed to get nvml handle for device %s", rank, busId);
- goto skipaffinity;
- }
- if (wrapNvmlDeviceSetCpuAffinity(nvmlHandle) != ncclSuccess) {
- INFO("rank %d failed to set affinity", rank);
- goto skipaffinity;
- }
- affinity_set = 1;
- skipaffinity:
-
- res = commAlloc(&comm, ndev, NULL, rank);
- if (res != ncclSuccess) {
- WARN("rank %d failed to allocate communicator", rank);
- goto cleanup;
- }
- comms[rank] = comm;
-
- if (affinity_set && wrapNvmlDeviceClearCpuAffinity(nvmlHandle) != ncclSuccess) {
- INFO("rank %d set but failed to clear cpu affinity", rank);
- }
- res = populateRankInfo(ranks+rank, rank, comm);
- if (res != ncclSuccess) {
- WARN("rank %d failed to obtain rank info", rank);
- goto cleanup;
- }
- }
-
- orderRanks(ranks, ndev);
- for(rank=0; rank<ndev; ++rank) {
- comm = comms[rank];
- cudaSetDevice(comm->cudaDev);
- res = commBuildMaps(comm, NULL, rank, ranks, &globalMemSpaceBroke);
- if (res != ncclSuccess) {
- WARN("rank %d failed to build comm maps", rank);
- goto cleanup;
- }
- }
-
- INFO("Global device memory space is %s", (globalMemSpaceBroke) ? "disabled" : "enabled");
- for(rank=0; rank<ndev; ++rank) {
- comms[rank]->globalMemSpace = globalMemSpaceBroke ? 0 : 1;
- }
-
- for(rank=0; rank<ndev; ++rank) {
- res = devCommSetup(comms[rank]);
- if (res != ncclSuccess) {
- WARN("rank %d failed to copy dcomm", rank);
- goto cleanup;
- }
- }
-
- free(ranks);
- ranks = NULL;
- res = ncclSuccess;
- goto final;
-
- cleanup:
- if (ranks != NULL)
- free(ranks);
- for(rank=0; rank<ndev; ++rank) {
- if(comms[rank] != NULL) {
- commFree(comms[rank]);
- }
- }
-
- final:
- if(wrapNvmlShutdown() != ncclSuccess)
- INFO("NCCL did not shutdown nvml properly");
- cudaSetDevice(savedDevice);
- return res;
-}
-
-NCCL_API(void, ncclCommDestroy, ncclComm_t comm);
-void ncclCommDestroy(ncclComm_t comm) {
- if (comm == NULL)
- return;
-
- int savedDevice;
- cudaGetDevice(&savedDevice);
- int commDevice = comm->cudaDev;
-
- if (savedDevice != commDevice) {
- CUDACHECK(cudaSetDevice(commDevice), void());
- }
-
- commFree(comm);
-
- if (savedDevice != commDevice)
- cudaSetDevice(savedDevice);
-}
-
-NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
-const char* ncclGetErrorString(ncclResult_t code) {
- switch (code) {
- case ncclSuccess : return "no error";
- case ncclUnhandledCudaError : return "unhandled cuda error";
- case ncclSystemError : return "system error";
- case ncclInternalError : return "internal error";
- case ncclInvalidDevicePointer : return "invalid device pointer";
- case ncclInvalidRank : return "invalid rank";
- case ncclUnsupportedDeviceCount : return "unsupported device count";
- case ncclDeviceNotFound : return "device not found";
- case ncclInvalidDeviceIndex : return "invalid device index";
- case ncclLibWrapperNotSet : return "lib wrapper not initialized";
- case ncclCudaMallocFailed : return "cuda malloc failed";
- case ncclRankMismatch : return "parameter mismatch between ranks";
- case ncclInvalidArgument : return "invalid argument";
- case ncclInvalidType : return "invalid data type";
- case ncclInvalidOperation : return "invalid reduction operations";
- }
- return "unknown result code";
-}
-
-NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
-ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
- NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
- NCCLCHECK(PtrCheck(count, "CommCount", "count"));
- *count = comm->nRanks;
- return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
-ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
- NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
- NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
- *devid = comm->cudaDev;
- return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
-ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
- NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
- NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
- *rank = comm->rank;
- return ncclSuccess;
-}
-
diff --git a/src/core.h b/src/core.h
deleted file mode 100644
index 17794d7..0000000
--- a/src/core.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef CORE_H_
-#define CORE_H_
-
-
-#include "nccl.h"
-#include <cstdio>
-#include <cuda_runtime.h>
-
-#define MAXRANKS 32
-#define DEFAULT_BUFFER_SIZE_BYTES (1UL << 21)
-#define NCCL_MEM_PAD_ALIGN 65536
-
-
-struct ncclMem {
- union { // Pad this block so that devBuff is correctly aligned
- struct {
- int flags[2];
- void* recvPtrs;
- int opCounter; // Used to determine when remote Communicators are ready.
- // Only used in host memory.
- };
- char pad[NCCL_MEM_PAD_ALIGN];
- };
- // devBuff will be bigger ; we only use its offset/address.
- char buff[1];
-};
-
-template <typename T>
-struct alignas(long long) DevRing {
- volatile int* __restrict__ prevOpCounter;
- volatile int* __restrict__ nextOpCounter;
- volatile int* __restrict__ sendFlagToNext;
- volatile int* __restrict__ sendFlagToPrev;
- volatile int* __restrict__ recvFlagFromNext;
- volatile int* __restrict__ recvFlagFromPrev;
-
- T* volatile * __restrict__ recvPtrFromNext;
- T* volatile * __restrict__ sendPtrToPrev;
- T* __restrict__ recvBuffer;
- T* __restrict__ sendBuffer;
-
- int userRank[MAXRANKS];
-};
-
-struct NodeRef {
- ncclMem* remote; // TODO: Verify if these
- ncclMem* local; // are still needed.
- enum {DEVICE, HOST} type;
- ncclMem* devCleanup; // Used only when remote comm uses same process & GPU
- ncclMem* hostCleanup; // Used whenever target is in different process
- int* opCounter; // TODO: see if this can be removed too.
-};
-
-
-struct ncclComm {
- int rank; // my rank in the communicator
- int nRanks; // number of GPUs in communicator
- int cudaDev; // my cuda device index
-
- // Device and Host allocated chunks. Stored here to correctly free() memory.
- ncclMem* devMem;
- ncclMem* hostMem;
- int hostMemState;
- int opSched; // Scheduling operation index
- int* opCounter; // Counter of completed operations
-
- cudaStream_t prevStream; // cache last used stream
- cudaEvent_t doneEvent; // orders operations in different streams
-
- // Maps an internal nccl index to user-specified rank order. This is necessary
- // since we need to know how the user expects data to be ordered across
- // devices. Ordered from current device.
- int* userFromRing;
-
- // copy of the above stored on each device
- int* devUserFromRing;
-
- // Ring order
- int* ncclFromRing; // TODO: REMOVE IF NOT NEEDED BEYOND CORE.CU
-
- // Size of temp buffer in bytes.
- size_t buffSize;
-
- // Whether we have remote access to the recvbuff pointers passed from remote
- // GPUs. In single process mode this can be used as long as QPI links are
- // not present. In multi-process, we never push to a remote recvbuff.
- int globalMemSpace;
-
- // Device copy of the communicator
- struct ncclComm *devComm; // TODO: Remove this if not useful
-
- // Device-side ring view
- DevRing<char>* devRing;
-
- // Device-to-device communication structures to access remote or local device
- // memory. Actual allocation larger than 1.
- NodeRef ptrs[1];
-};
-
-
-typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4} DebugLevel;
-extern DebugLevel ncclDebugLevel;
-
-#define WARN(...) do { \
- if (ncclDebugLevel >= WARN) { \
- printf("WARN %s:%d ", __FILE__, __LINE__); \
- printf(__VA_ARGS__); \
- printf("\n"); \
- fflush(stdout); \
- if (ncclDebugLevel >= ABORT) abort(); \
- } \
-} while(0)
-
-#define INFO(...) do { \
- if (ncclDebugLevel >= INFO) { \
- printf("INFO "); printf(__VA_ARGS__); printf("\n"); \
- fflush(stdout); \
- } \
-} while(0)
-
-// Check CUDA calls
-#define CUDACHECK(cmd, retcode) do { \
- cudaError_t e = cmd; \
- if( e != cudaSuccess ) { \
- WARN("Cuda failure '%s'\n", cudaGetErrorString(e)); \
- return retcode; \
- } \
-} while(false)
-
-// Propagate errors up
-#define NCCLCHECK(call) do { \
- ncclResult_t res = call; \
- if (res != ncclSuccess) { \
- return res; \
- } \
-} while (0);
-
-#ifdef PROFAPI
-#define NCCL_API(ret, func, args...) \
- __attribute__ ((visibility("default"))) \
- __attribute__ ((alias(#func))) \
- ret p##func (args); \
- extern "C" \
- __attribute__ ((visibility("default"))) \
- __attribute__ ((weak)) \
- ret func(args)
-#else
-#define NCCL_API(ret, func, args...) \
- extern "C" \
- __attribute__ ((visibility("default"))) \
- ret func(args)
-#endif // end PROFAPI
-
-
-#endif // end include guard
-
diff --git a/src/enqueue.h b/src/enqueue.h
deleted file mode 100644
index 43d570e..0000000
--- a/src/enqueue.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef enqueue_h_
-#define enqueue_h_
-
-#include "core.h"
-#include "reduce_kernel.h"
-
-/* Syncronize previous collective (if in different stream) and enqueue
- * collective. Work is performed asynchronously with the host thread.
- * The ColFunc class should be templated on the datatype and reduction
- * operator (if applicable) and define a static entry() method as
- * follows.
- * template <typename T, template <typename> class RedOp>
- * class CollectiveFunctor {
- * public:
- * static ncclResult_t entry(const void* sendbuff, void* recvbuff, int count,
- * int root, ncclComm* comm, cudaStream_t stream);
- * };
- * The entry() method can assume that the appropriate cuda device has been set. */
-template< template<typename, template<typename> class> class ColFunc,
- typename T,
- template<typename> class Op >
-ncclResult_t enqueue(const void* sendbuff,
- void* recvbuff,
- int count,
- int root,
- ncclComm_t comm,
- cudaStream_t stream)
-{
- if (stream != comm->prevStream) { // sync required for calls in different streams
- comm->prevStream = stream;
- CUDACHECK(cudaStreamWaitEvent(stream, comm->doneEvent, 0), ncclUnhandledCudaError);
- }
-
- ncclResult_t ret;
- ret = ColFunc<T, Op>::entry(sendbuff, recvbuff, count, root, comm, stream);
-
- // Always have to record done event because we don't know what stream next
- // collective will be in.
- CUDACHECK(cudaEventRecord(comm->doneEvent, stream), ncclUnhandledCudaError);
- comm->opSched += 1;
- return ret;
-}
-
-
-// This version decodes type
-template< template<typename, template<typename> class> class ColFunc,
- template<typename> class Op >
-ncclResult_t enqueue(const void* sendbuff,
- void* recvbuff,
- int count,
- ncclDataType_t type,
- int root,
- ncclComm_t comm,
- cudaStream_t stream)
-{
- switch(type) {
- case ncclChar:
- return enqueue<ColFunc, char, Op>(sendbuff, recvbuff, count, root, comm, stream);
- case ncclInt:
- return enqueue<ColFunc, int, Op>(sendbuff, recvbuff, count, root, comm, stream);
-#ifdef CUDA_HAS_HALF
- case ncclHalf:
- return enqueue<ColFunc, half, Op>(sendbuff, recvbuff, count, root, comm, stream);
-#endif
- case ncclFloat:
- return enqueue<ColFunc, float, Op>(sendbuff, recvbuff, count, root, comm, stream);
- case ncclDouble:
- return enqueue<ColFunc, double, Op>(sendbuff, recvbuff, count, root, comm, stream);
- case ncclInt64:
- return enqueue<ColFunc, long long, Op>(sendbuff, recvbuff, count, root, comm, stream);
- case ncclUint64:
- return enqueue<ColFunc, unsigned long long, Op>(sendbuff, recvbuff, count, root, comm, stream);
- default:
- WARN("Invalid ncclType %d", type);
- return ncclInvalidType;
- }
-}
-
-// This version decodes both type and reduction op
-template< template<typename, template<typename> class> class ColFunc>
-ncclResult_t enqueue(const void* sendbuff,
- void* recvbuff,
- int count,
- ncclDataType_t type,
- ncclRedOp_t op,
- int root,
- ncclComm_t comm,
- cudaStream_t stream)
-{
- switch(op) {
- case ncclSum:
- return enqueue<ColFunc, FuncSum>(sendbuff, recvbuff, count, type, root, comm, stream);
- case ncclProd:
- return enqueue<ColFunc, FuncProd>(sendbuff, recvbuff, count, type, root, comm, stream);
- case ncclMax:
- return enqueue<ColFunc, FuncMax>(sendbuff, recvbuff, count, type, root, comm, stream);
- case ncclMin:
- return enqueue<ColFunc, FuncMin>(sendbuff, recvbuff, count, type, root, comm, stream);
- default:
- WARN("Invalid ncclRedOp: %d", op);
- return ncclInvalidOperation;
- }
-}
-
-#endif // End include guard
-
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
new file mode 100644
index 0000000..81af3a4
--- /dev/null
+++ b/src/include/bootstrap.h
@@ -0,0 +1,18 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_BOOTSTRAP_H_
+#define NCCL_BOOTSTRAP_H_
+
+#include "nccl.h"
+
+ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
+ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
+ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
+ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
+ncclResult_t bootstrapRingExchange(void* commState, void* prevNextData, int prev, int next, int size);
+ncclResult_t bootstrapClose(void* commState);
+#endif
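The header above only declares the bootstrap entry points; roughly, the root process obtains an ncclUniqueId with bootstrapGetUniqueId() and distributes it out-of-band, after which every rank connects to the root and exchanges per-rank data. Below is a minimal sketch of that flow (illustrative only; exampleBootstrapExchange is a hypothetical helper, it relies on the NCCLCHECK/ncclCalloc helpers from src/include/core.h, and it assumes the size passed to bootstrapAllGather is the per-rank contribution in bytes):

static ncclResult_t exampleBootstrapExchange(ncclUniqueId* id, int rank, int nranks) {
  void* state = NULL;
  NCCLCHECK(bootstrapInit(id, rank, nranks, &state));          // connect this rank to the root
  int* allRanks = NULL;
  NCCLCHECK(ncclCalloc(&allRanks, nranks));                    // nranks slots, one per rank
  allRanks[rank] = rank;                                       // contribute our own entry
  NCCLCHECK(bootstrapAllGather(state, allRanks, sizeof(int))); // every rank ends up with all entries
  free(allRanks);
  return bootstrapClose(state);
}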
diff --git a/src/include/common_coll.h b/src/include/common_coll.h
new file mode 100644
index 0000000..cd97bbd
--- /dev/null
+++ b/src/include/common_coll.h
@@ -0,0 +1,195 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_COLL_H_
+#define COMMON_COLL_H_
+
+#include "core.h"
+#include "enqueue.h"
+#include "collectives/collectives.h"
+
+static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
+ cudaPointerAttributes attr;
+ cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
+ if (err != cudaSuccess || attr.devicePointer == NULL) {
+ WARN("%s : %s is not a valid pointer", opname, ptrname);
+ return ncclInvalidArgument;
+ }
+#if __CUDACC_VER_MAJOR__ >= 10
+ if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+#else
+ if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+#endif
+ WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
+ return ncclInvalidArgument;
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
+ if (ptr == NULL) {
+ WARN("%s : %s argument is NULL", opname, ptrname);
+ return ncclInvalidArgument;
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
+ NCCLCHECK(PtrCheck(comm, opname, "comm"));
+ // First, the easy ones
+ if (root < 0 || root >= comm->nRanks) {
+ WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks);
+ return ncclInvalidArgument;
+ }
+ if (type < 0 || type >= ncclNumTypes) {
+ WARN("%s : invalid type %d", opname, type);
+ return ncclInvalidArgument;
+ }
+ if (op < 0 || op >= ncclNumOps) {
+ WARN("%s : invalid reduction operation %d", opname, op);
+ return ncclInvalidArgument;
+ }
+
+ if (comm->checkPointers) {
+ // Check CUDA device pointers
+ if (strcmp(opname, "Broadcast") != 0 || comm->rank == root) {
+ NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname));
+ }
+ if (strcmp(opname, "Reduce") != 0 || comm->rank == root) {
+ NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname));
+ }
+ }
+ return ncclSuccess;
+}
+
+static __inline__ int ncclTypeSize(ncclDataType_t type) {
+ switch (type) {
+ case ncclInt8:
+ case ncclUint8:
+ return 1;
+ case ncclFloat16:
+ return 2;
+ case ncclInt32:
+ case ncclUint32:
+ case ncclFloat32:
+ return 4;
+ case ncclInt64:
+ case ncclUint64:
+ case ncclFloat64:
+ return 8;
+ default:
+ return -1;
+ }
+}
+
+// In : comm, nbytes ; Out : nrings, nthreads, ll
+// - We start with the minimum number of threads possible (64) and see if the size fits in LL;
+// If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default)
+// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads
+// This ensures we don't use a large number of rings with a small number of threads
+// - Before we reach the max number of threads, we use NCCL_LL_RING_THRESHOLD as the per-thread threshold;
+//   once we are at the max, we use NCCL_THREAD_THRESHOLD instead
+// - If the size still cannot fit in LL with the max number of LL threads, we fall back to the non-LL setting
+// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too
+static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) {
+ *ll = 0;
+ int llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */
+ if (comm->llThreshold >= 0) { /* user sets total LL threshold */
+ if (nbytes > comm->llThreshold) { /* non-LL */
+ *nthreads = comm->nThreads+1;
+ *nrings = comm->nRings;
+ return;
+ } else {
+ llEnforced = 1; /* user wants to use LL */
+ }
+ }
+ int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */
+ size_t nr;
+ int ll_max_nthreads = std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* respect user's setting or platform's default setting */
+ int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS;
+ ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD);
+ while (nt < ll_max_nthreads && *ll == 0) {
+ nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks));
+ if (nr <= maxRings) { /* avoid using few threads but many rings */
+ nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
+ *ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1;
+ }
+ if (*ll == 0) {
+ nt = nt << 1;
+ }
+ }
+ if (*ll == 1) {
+ *nthreads = nt;
+ *nrings = (int)nr;
+ return; /* we can use smaller number of threads to make LL work, stop here */
+ }
+ nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */
+ nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
+ *ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1;
+ *nthreads = *ll ? ll_max_nthreads : comm->nThreads+1;
+ *nrings = *ll ? (int)nr : comm->nRings;
+}
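// Worked example of the selection above (hypothetical values; assumes comm->nThreads == 256,
// comm->nRings >= 4, NCCL_LL_THRESHOLD unset, and comm->threadThreshold == NCCL_THREAD_THRESHOLD, i.e. 32):
//   nRanks = 8, nbytes = 64 KiB, maxRings = 256/64 = 4
//   nt =  64 : nr = DIVUP(65536, 8*64*8)  = 16 > maxRings -> double nt
//   nt = 128 : nr = DIVUP(65536, 8*128*8) =  8 > maxRings -> double nt
//   nt = 256 : loop exits; nr = DIVUP(65536, 8*256*8) = 4
//   65536 <= 8*4*256*32 = 262144, so LL is kept: *ll = 1, *nthreads = 256, *nrings = 4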
+
+static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, size_t nbytes, int loopFactor) {
+ int llMode, nBlocks, nThreads;
+ ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode);
+ comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads);
+ if (comm->userStreamSet == false) {
+ comm->userStream = stream;
+ comm->userStreamSet = true;
+ } else if (stream != comm->userStream) {
+ WARN("Error : mixing different streams within a group call is not supported.");
+ return ncclInvalidUsage;
+ }
+ int lastChunkSize = 0;
+ if (llMode == 1) {
+ int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype);
+ const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize;
+ lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor);
+ ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype));
+ }
+ for (int bid=0; bid<nBlocks; bid++) {
+ struct ncclRing* ring = comm->rings+(comm->myParams->gridDim.x % comm->nRings);
+ if (ring->collCount == NCCL_MAX_OPS) {
+ WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
+ return ncclInvalidUsage;
+ }
+
+ comm->myParams->gridDim.x++;
+
+ int opIndex = ring->collFifoTail;
+ struct ncclColl* c = ring->collectives+opIndex;
+ volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
+ while (activePtr[0] != 0) sched_yield();
+
+ struct CollectiveArgs* args = &c->args;
+ args->root = root;
+ args->N = count;
+ args->ThisInput = sendbuff;
+ args->ThisOutput = recvbuff;
+ args->comm = comm->devComm;
+ args->opCount = comm->opCount;
+ args->bid = bid;
+ args->nRings = nBlocks;
+ args->nThreads = nThreads;
+ args->lastChunkSize = lastChunkSize;
+
+ c->nThreads = nThreads;
+ c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode);
+ c->active = 1;
+ opIndex = (opIndex+1)%NCCL_MAX_OPS;
+ c->nextIndex = opIndex;
+ ring->collFifoTail = opIndex;
+ ring->collCount++;
+ }
+ /*if (llMode == 0)*/ comm->opCount++;
+ return ncclSuccess;
+}
+
+extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl);
+
+#endif
diff --git a/src/include/core.h b/src/include/core.h
new file mode 100644
index 0000000..66b353c
--- /dev/null
+++ b/src/include/core.h
@@ -0,0 +1,385 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CORE_H_
+#define NCCL_CORE_H_
+
+#define NCCL_MAX_OPS 2048
+
+#include "nccl.h"
+#include "transport.h"
+#include "debug.h"
+#include <cstdio>
+#include <algorithm> // std::min/std::max
+#include <unistd.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+#if __CUDACC_VER_MAJOR__ < 9
+struct cudaLaunchParams {
+ void *func;
+ dim3 gridDim;
+ dim3 blockDim;
+ void **args;
+ size_t sharedMem;
+ cudaStream_t stream;
+};
+#endif
+
+#define MAXRINGS 16
+#define MAXTHREADS 256
+#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
+
+// Rings / LL tuning
+#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings
+#define NCCL_THREAD_THRESHOLD 32 // Per thread size before we switch to non-LL
+#define NCCL_LL_MAX_NTHREADS 256
+#define NCCL_LL_MIN_NTHREADS 64
+
+#define DIVUP(x, y) \
+ (((x)+(y)-1)/(y))
+#define ROUNDUP(x, y) \
+ (DIVUP((x), (y))*(y))
+
+#define ALIGN_SIZE(size, align) \
+ size = ((size + (align) - 1) / (align)) * (align);
+
+union ncclLLFifoLine {
+ /* Flags have to be *after* data, because otherwise, an incomplete receive
+ from the network may receive the flag but not the data.
+ Note this is assuming that either we receive contiguous chunks of data
+ (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
+ struct {
+ uint32_t data1;
+ uint32_t flag1;
+ uint32_t data2;
+ uint32_t flag2;
+ };
+ uint64_t v[2];
+ int4 i4;
+};
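// Consumer-side sketch of how this layout is used (illustrative only; it assumes the flag
// words carry the expected step value). The whole 16-byte line is read with one volatile
// vector load so that data and flags come from the same snapshot:
//   uint32_t data1, flag1, data2, flag2;
//   do {
//     asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];"
//                  : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&line->i4));
//   } while (flag1 != flag || flag2 != flag);
//   // both flags match the step, so data1/data2 were fully written before them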
+
+struct ncclConnInfo {
+ // Regular comm mechanism
+ char *buff; // Local for recv, remote for send
+ uint64_t *tail; // Local for recv, remote for send
+ uint64_t *head; // Local for send, remote for recv
+ uint64_t *opCount; // Local for recv, remote for send
+
+ int direct; // Direct communication
+ void **ptrExchange; // Pointer exchange for direct communication
+
+ int *fifo; // Size fifo for proxy
+
+ // Low latency mechanism
+ char *llBuff; // Local for recv, remote for send
+ uint64_t *llHead; // Local for send, remote for recv
+ int *llFifo; // LL Size fifo for proxy
+ uint64_t llStep; // Keep where we are
+ uint64_t llLastCleaning;
+};
+
+struct ncclConnector {
+ struct transportProxyInfo* proxyInfo;
+ struct ncclTransport* transport;
+ void* transportResources; // Host-side resources
+ struct ncclConnInfo conn;
+};
+
+#define CACHE_LINE_SIZE 128
+#define MEM_ALIGN 4096
+#define SIZES_FIFO_SIZE 32
+#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
+
+#define NCCL_LL_CHUNKS 8
+#define NUM_LINES_PER_THREAD 2
+#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 64K
+#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t)))
+#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS)
+#define NCCL_LL_CLEAN_FREQ 0x10000000
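// Spelled out with the values above: NCCL_LL_BUFF_SIZE = 2 * 256 * 8 * 16 B = 65536 B (the 64K
// noted above), NCCL_LL_BUFF_LINES = 65536 / 16 = 4096 lines, and NCCL_LL_SLICE_LINES = 4096 / 8
// = 512 lines per chunk.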
+
+struct ncclSendMem {
+ union {
+ struct {
+ uint64_t head;
+ char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+ void* ptrExchange;
+ char pad2[CACHE_LINE_SIZE-sizeof(void*)];
+ uint64_t llHead;
+ };
+ char pad3[MEM_ALIGN];
+ };
+};
+
+struct ncclRecvMem {
+ union {
+ struct {
+ uint64_t tail;
+ char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
+ uint64_t opCount;
+ char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)];
+ int sizesFifo[SIZES_FIFO_SIZE];
+ int llSizesFifo[SIZES_FIFO_SIZE];
+ };
+ char pad5[MEM_ALIGN];
+ };
+ char llBuff[NCCL_LL_BUFF_SIZE];
+ char buff[1]; // Actually larger than that
+};
+
+struct ncclRing {
+ union {
+ struct {
+ int id;
+ int nthreads;
+ // Per ring resources
+ struct ncclSendMem* devMemSend; // CUDA-side resources
+ struct ncclRecvMem* devMemRecv; // CUDA-side resources
+ int buffSize;
+ int devMemSendSize; // Keep the size for IPCs
+ int devMemRecvSize; // Keep the size for IPCs
+ struct ncclConnector send;
+ struct ncclConnector recv;
+
+ // Maps an internal nccl index to user-specified rank order. This is necessary
+ // since we need to know how the user expects data to be ordered across
+ // devices. Ordered from current device.
+ int* userRanks;
+ int* devUserRanks;
+
+ // Operation list for aggregation
+ struct ncclColl* collectives;
+ struct ncclColl* devCollectives;
+ int collStart;
+ int collCount;
+ int collFifoHead; // Only used by GPU
+ int collFifoTail; // Only used by CPU
+ };
+ int data[0x80];
+ };
+};
+static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size");
+
+/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
+/* to make sure the CUDA kernel's reads from host memory stay aligned. */
+/* Make sure to adjust padding at the end of ncclColl. */
+struct CollectiveArgs {
+ struct ncclComm* comm;
+ uint64_t opCount;
+
+ // local and remote input, output, and buffer
+ const void * ThisInput;
+ void * ThisOutput;
+
+ // general parameters
+ size_t N;
+ uint32_t root;
+ uint8_t bid;
+ uint8_t nRings;
+ uint16_t nThreads;
+
+ int lastChunkSize;
+};
+struct ncclColl {
+ union {
+ struct {
+ struct CollectiveArgs args;
+ uint16_t nThreads;
+ uint16_t funcIndex;
+ uint16_t nextIndex;
+ uint8_t active;
+ };
+ int data[0x10];
+ };
+};
+static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
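// Size accounting behind the assert (typical LP64 layout): CollectiveArgs packs five 8-byte fields
// (comm, opCount, ThisInput, ThisOutput, N), then root/bid/nRings/nThreads (4+1+1+2 = 8 bytes) and
// lastChunkSize (4 bytes), padding to 56 bytes; ncclColl adds nThreads/funcIndex/nextIndex/active
// (2+2+2+1 = 7 bytes), padding to the 64 bytes (0x10 ints) checked above.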
+
+struct ncclComm {
+ struct ncclRing rings[MAXRINGS];
+
+ int rank; // my rank in the communicator
+ int nRanks; // number of GPUs in communicator
+ int cudaDev; // my cuda device index
+
+ enum { GROUP, PARALLEL } launchMode;
+ cudaStream_t userStream;
+ bool userStreamSet;
+ cudaEvent_t doneEvent;
+ bool checkPointers;
+
+ // Counter to make sure collectives match (needed for bcast/reduce
+ // where syncs are not symmetric).
+ uint64_t opCount;
+
+ // Rings for collectives
+ int nRings;
+ int nThreads;
+
+ // Low-latency algorithm threshold
+ ssize_t llThreshold;
+ ssize_t threadThreshold;
+
+ // An internal CUDA stream for NCCL kernel CGMD launches
+ int groupCudaStream;
+ cudaStream_t groupStream;
+
+ // Device copy of the communicator
+ struct ncclComm *devComm;
+
+ // Intra-process sync
+ int intraRank;
+ int intraRanks;
+ int* intraBarrier;
+ int intraPhase;
+
+ // Storage for deferred intra-process launch
+ struct cudaLaunchParams * intraParams;
+ struct cudaLaunchParams *myParams;
+ int* intraCudaDevs;
+ int* intraCGMode; // Whether we can use CUDA9 CGMD or not
+ int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
+ struct ncclColl args;
+ void* argsptr;
+};
+
+// Check CUDA calls
+#define CUDACHECK(cmd) do { \
+ cudaError_t e = cmd; \
+ if( e != cudaSuccess ) { \
+ WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
+ return ncclUnhandledCudaError; \
+ } \
+} while(false)
+
+#define CUDACHECKGOTO(cmd, res, label) do { \
+ cudaError_t e = cmd; \
+ if( e != cudaSuccess ) { \
+ WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
+ res = ncclUnhandledCudaError; \
+ goto label; \
+ } \
+} while(false)
+
+#include <errno.h>
+// Check system calls
+#define SYSCHECK(call, name) do { \
+ int ret = -1; \
+ while (ret == -1) { \
+ SYSCHECKVAL(call, name, ret); \
+ if (ret == -1) { \
+ INFO(ALL,"Got %s, retrying", strerror(errno)); \
+ }\
+ } \
+} while (0);
+
+#define SYSCHECKVAL(call, name, retval) do { \
+ retval = call; \
+ if (retval == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { \
+ WARN("Call to " name " failed : %s", strerror(errno)); \
+ return ncclSystemError; \
+ } \
+} while (0);
+
+#define SYSCHECKNTIMES(call, name, times, usec, exptype) do { \
+ int ret = -1; \
+ int count = 0; \
+ while (ret == -1 && count < times) { \
+ SYSCHECKVALEXP(call, name, ret, exptype); \
+ count++; \
+ if (ret == -1) { \
+ usleep(usec); \
+ }\
+ } \
+ if (ret == -1) { \
+ WARN("Call to " name " timeout : %s", strerror(errno)); \
+ return ncclSystemError; \
+ } \
+} while (0);
+
+#define SYSCHECKVALEXP(call, name, retval, exptype) do { \
+ retval = call; \
+ if (retval == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN && errno != exptype) { \
+ WARN("Call to " name " failed : %s", strerror(errno)); \
+ return ncclSystemError; \
+ } \
+} while (0);
+
+// Propagate errors up
+#define NCCLCHECK(call) do { \
+ ncclResult_t res = call; \
+ if (res != ncclSuccess) { \
+ /* Print the back trace*/ \
+ INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ return res; \
+ } \
+} while (0);
+
+#define NCCLCHECKGOTO(call, res, label) do { \
+ res = call; \
+ if (res != ncclSuccess) { \
+ /* Print the back trace*/ \
+ INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ goto label; \
+ } \
+} while (0);
+
+#ifdef PROFAPI
+#define NCCL_API(ret, func, args...) \
+ __attribute__ ((visibility("default"))) \
+ __attribute__ ((alias(#func))) \
+ ret p##func (args); \
+ extern "C" \
+ __attribute__ ((visibility("default"))) \
+ __attribute__ ((weak)) \
+ ret func(args)
+#else
+#define NCCL_API(ret, func, args...) \
+ extern "C" \
+ __attribute__ ((visibility("default"))) \
+ ret func(args)
+#endif // end PROFAPI
+
+int ncclCudaCompCap();
+
+#include <sys/mman.h>
+static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
+ CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
+ memset(*ptr, 0, size);
+ *devPtr = *ptr;
+ return ncclSuccess;
+}
+
+static inline ncclResult_t ncclCudaHostFree(void* ptr) {
+ CUDACHECK(cudaFreeHost(ptr));
+ return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
+ void* p = malloc(nelem*sizeof(T));
+ if (p == NULL) {
+ WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
+ return ncclSystemError;
+ }
+ memset(p, 0, nelem*sizeof(T));
+ *ptr = (T*)p;
+ return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
+ CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
+ CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
+ return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
+ CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
+ return ncclSuccess;
+}
+
+#endif // end include guard
diff --git a/src/include/debug.h b/src/include/debug.h
new file mode 100644
index 0000000..83ae98b
--- /dev/null
+++ b/src/include/debug.h
@@ -0,0 +1,179 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_DEBUG_H_
+#define NCCL_DEBUG_H_
+
+#include <pthread.h>
+#include <stdio.h>
+#include <chrono>
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <limits.h>
+#include <string.h>
+#include "nccl.h"
+#define gettid() (pid_t) syscall(SYS_gettid)
+
+typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4, TRACE=5} DebugLevel;
+typedef enum {INIT=1, COLL=2, P2P=4, SHM=8, NET=16, ALL=~0} SubSys;
+extern DebugLevel ncclDebugLevel;
+extern uint64_t ncclDebugMask;
+extern pthread_mutex_t ncclDebugOutputLock;
+extern FILE *ncclDebugFile;
+extern ncclResult_t getHostName(char* hostname, int maxlen);
+
+#define WARN(...) do { \
+ if (ncclDebugLevel >= WARN) { \
+ char hostname[1024]; \
+ getHostName(hostname, 1024); \
+ int cudaDev; \
+ cudaGetDevice(&cudaDev); \
+ pthread_mutex_lock(&ncclDebugOutputLock); \
+ fprintf(ncclDebugFile,"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); \
+ fprintf(ncclDebugFile,__VA_ARGS__); \
+ fprintf(ncclDebugFile,"\n"); \
+ fflush(ncclDebugFile); \
+ pthread_mutex_unlock(&ncclDebugOutputLock); \
+ if (ncclDebugLevel == ABORT) { fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); abort(); } \
+ } \
+} while(0)
+
+#define INFO(FLAGS, ...) do { \
+ if (ncclDebugLevel >= INFO && ((FLAGS) & ncclDebugMask)) { \
+ char hostname[1024]; \
+ getHostName(hostname, 1024); \
+ int cudaDev; \
+ cudaGetDevice(&cudaDev); \
+ pthread_mutex_lock(&ncclDebugOutputLock); \
+ fprintf(ncclDebugFile,"%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); \
+ fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
+ fflush(ncclDebugFile); \
+ pthread_mutex_unlock(&ncclDebugOutputLock); \
+ } \
+} while(0)
+
+#ifdef ENABLE_TRACE
+#define TRACE(FLAGS, ...) do { \
+ if (ncclDebugLevel == TRACE && ((FLAGS) & ncclDebugMask)) { \
+ char hostname[1024]; \
+ getHostName(hostname, 1024); \
+ int cudaDev; \
+ cudaGetDevice(&cudaDev); \
+ pthread_mutex_lock(&ncclDebugOutputLock); \
+ auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; \
+ double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000; \
+ fprintf(ncclDebugFile,"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, __func__, __LINE__); \
+ fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
+ fflush(ncclDebugFile); \
+ pthread_mutex_unlock(&ncclDebugOutputLock); \
+ } \
+} while(0)
+
+extern std::chrono::high_resolution_clock::time_point ncclEpoch;
+
+#else
+#define TRACE(...)
+#endif
+
+#include <stdlib.h>
+
+static inline void initDebug() {
+ const char* nccl_debug = getenv("NCCL_DEBUG");
+ if (nccl_debug == NULL) {
+ ncclDebugLevel = NONE;
+ } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
+ ncclDebugLevel = VERSION;
+ } else if (strcasecmp(nccl_debug, "WARN") == 0) {
+ ncclDebugLevel = WARN;
+ } else if (strcasecmp(nccl_debug, "INFO") == 0) {
+ ncclDebugLevel = INFO;
+ } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
+ ncclDebugLevel = ABORT;
+ } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
+ ncclDebugLevel = TRACE;
+ }
+
+ /* Parse the NCCL_DEBUG_SUBSYS env var
+ * This can be a comma separated list such as INIT,COLL
+ * or ^INIT,COLL etc
+ */
+ char* nccl_debug_subsys = getenv("NCCL_DEBUG_SUBSYS");
+ if (nccl_debug_subsys != NULL) {
+ char *subsys = strtok(nccl_debug_subsys, ",");
+ while (subsys != NULL) {
+ int invert = 0;
+ uint64_t mask = 0;
+ if (subsys[0] == '^') { invert = 1; subsys++; }
+ if (strcasecmp(subsys, "INIT") == 0) {
+ mask = INIT;
+ } else if (strcasecmp(subsys, "COLL") == 0) {
+ mask = COLL;
+ } else if (strcasecmp(subsys, "P2P") == 0) {
+ mask = P2P;
+ } else if (strcasecmp(subsys, "SHM") == 0) {
+ mask = SHM;
+ } else if (strcasecmp(subsys, "NET") == 0) {
+ mask = NET;
+ } else if (strcasecmp(subsys, "ALL") == 0) {
+ mask = ALL;
+ }
+ if (mask) {
+ if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
+ }
+ subsys = strtok(NULL, ",");
+ }
+ }
+
+ /* Parse and expand the NCCL_DEBUG_FILE path and
+ * then create the debug file. But don't bother unless the
+ * NCCL_DEBUG level is > VERSION
+ */
+ const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
+ if (ncclDebugLevel > VERSION && nccl_debug_file != NULL) {
+ int c = 0;
+ char debug_fn[PATH_MAX+1] = "";
+ char *dfn = debug_fn;
+ while (nccl_debug_file[c] != '\0' && c < PATH_MAX) {
+ if (nccl_debug_file[c++] != '%') {
+ *dfn++ = nccl_debug_file[c-1];
+ continue;
+ }
+ switch (nccl_debug_file[c++]) {
+ case '%': // Double %
+ *dfn++ = '%';
+ break;
+ case 'h': // %h = hostname
+ char hostname[1024];
+ getHostName(hostname, 1024);
+ dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
+ break;
+ case 'p': // %p = pid
+ dfn += snprintf(dfn, PATH_MAX, "%d", getpid());
+ break;
+ default: // Echo everything we don't understand
+ *dfn++ = '%';
+ *dfn++ = nccl_debug_file[c-1];
+ break;
+ }
+ }
+ *dfn = '\0';
+ if (debug_fn[0] != '\0') {
+ FILE *file = fopen(debug_fn, "w");
+ if (file != NULL) {
+ INFO(ALL,"DEBUG file is '%s'", debug_fn);
+ ncclDebugFile = file;
+ }
+ }
+ }
+ pthread_mutex_init(&ncclDebugOutputLock, NULL);
+
+#ifdef ENABLE_TRACE
+ ncclEpoch = std::chrono::high_resolution_clock::now();
+#endif
+}
+
+#endif
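For example, NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,NET adds the initialization and network subsystems to the output mask, a leading '^' (e.g. NCCL_DEBUG_SUBSYS=^NET) clears the named subsystems instead, and NCCL_DEBUG_FILE=/tmp/nccl.%h.%p.log redirects the log to a per-host, per-process file, %h expanding to the hostname and %p to the pid.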
diff --git a/src/include/enqueue.h b/src/include/enqueue.h
new file mode 100644
index 0000000..69d0463
--- /dev/null
+++ b/src/include/enqueue.h
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ENQUEUE_H_
+#define NCCL_ENQUEUE_H_
+
+#include "core.h"
+#include "group.h"
+
+typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+
+ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
+ void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
+ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
+ncclResult_t ncclCpuBarrierOut(ncclComm_t comm);
+ncclResult_t ncclBarrierEnqueue(ncclComm_t comm);
+ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm);
+ncclResult_t ncclEnqueueEvents(ncclComm_t comm);
+
+#endif // End include guard
diff --git a/src/include/group.h b/src/include/group.h
new file mode 100644
index 0000000..76da30f
--- /dev/null
+++ b/src/include/group.h
@@ -0,0 +1,24 @@
+/*************************************************************************
+ * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_GROUP_H_
+#define NCCL_GROUP_H_
+
+#include "nccl.h"
+#include "core.h"
+
+bool ncclAsyncMode();
+ncclResult_t ncclAsyncErrCheck(ncclResult_t ret);
+
+typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
+
+ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
+
+typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+
+ncclResult_t ncclAsyncColl(ncclComm_t comm);
+#endif
diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h
new file mode 100644
index 0000000..4f3e831
--- /dev/null
+++ b/src/include/ibvwrap.h
@@ -0,0 +1,1109 @@
+/*************************************************************************
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_IBVWRAP_H_
+#define NCCL_IBVWRAP_H_
+
+#include "core.h"
+#include <sys/types.h>
+#include <unistd.h>
+
+// Dynamically handle dependencies on IB verbs
+
+#if __GNUC__ >= 3
+# define __attribute_const __attribute__((const))
+#else
+# define __attribute_const
+#endif
+
+union ibv_gid {
+ uint8_t raw[16];
+ struct {
+ uint64_t subnet_prefix;
+ uint64_t interface_id;
+ } global;
+};
+
+#ifndef container_of
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr: the pointer to the member.
+ * @type: the type of the container struct this is embedded in.
+ * @member: the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) \
+ ((type *) ((uint8_t *)(ptr) - offsetof(type, member)))
+#endif
+
+#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz))
+
+/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/
+//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1;
+
+enum ibv_node_type {
+ IBV_NODE_UNKNOWN = -1,
+ IBV_NODE_CA = 1,
+ IBV_NODE_SWITCH,
+ IBV_NODE_ROUTER,
+ IBV_NODE_RNIC,
+
+ /* Leave a gap for future node types before starting with
+ * experimental node types.
+ */
+ IBV_EXP_NODE_TYPE_START = 32,
+ IBV_EXP_NODE_MIC = IBV_EXP_NODE_TYPE_START
+};
+
+enum ibv_transport_type {
+ IBV_TRANSPORT_UNKNOWN = -1,
+ IBV_TRANSPORT_IB = 0,
+ IBV_TRANSPORT_IWARP,
+
+ /* Leave a gap for future transport types before starting with
+ * experimental transport types.
+ */
+ IBV_EXP_TRANSPORT_TYPE_START = 32,
+ IBV_EXP_TRANSPORT_SCIF = IBV_EXP_TRANSPORT_TYPE_START
+};
+
+enum ibv_device_cap_flags {
+ IBV_DEVICE_RESIZE_MAX_WR = 1,
+ IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1,
+ IBV_DEVICE_BAD_QKEY_CNTR = 1 << 2,
+ IBV_DEVICE_RAW_MULTI = 1 << 3,
+ IBV_DEVICE_AUTO_PATH_MIG = 1 << 4,
+ IBV_DEVICE_CHANGE_PHY_PORT = 1 << 5,
+ IBV_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6,
+ IBV_DEVICE_CURR_QP_STATE_MOD = 1 << 7,
+ IBV_DEVICE_SHUTDOWN_PORT = 1 << 8,
+ IBV_DEVICE_INIT_TYPE = 1 << 9,
+ IBV_DEVICE_PORT_ACTIVE_EVENT = 1 << 10,
+ IBV_DEVICE_SYS_IMAGE_GUID = 1 << 11,
+ IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12,
+ IBV_DEVICE_SRQ_RESIZE = 1 << 13,
+ IBV_DEVICE_N_NOTIFY_CQ = 1 << 14,
+ IBV_DEVICE_XRC = 1 << 20,
+ IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29
+};
+
+enum ibv_atomic_cap {
+ IBV_ATOMIC_NONE,
+ IBV_ATOMIC_HCA,
+ IBV_ATOMIC_GLOB
+};
+
+struct ibv_device_attr {
+ char fw_ver[64];
+ uint64_t node_guid;
+ uint64_t sys_image_guid;
+ uint64_t max_mr_size;
+ uint64_t page_size_cap;
+ uint32_t vendor_id;
+ uint32_t vendor_part_id;
+ uint32_t hw_ver;
+ int max_qp;
+ int max_qp_wr;
+ int device_cap_flags;
+ int max_sge;
+ int max_sge_rd;
+ int max_cq;
+ int max_cqe;
+ int max_mr;
+ int max_pd;
+ int max_qp_rd_atom;
+ int max_ee_rd_atom;
+ int max_res_rd_atom;
+ int max_qp_init_rd_atom;
+ int max_ee_init_rd_atom;
+ enum ibv_atomic_cap atomic_cap;
+ int max_ee;
+ int max_rdd;
+ int max_mw;
+ int max_raw_ipv6_qp;
+ int max_raw_ethy_qp;
+ int max_mcast_grp;
+ int max_mcast_qp_attach;
+ int max_total_mcast_qp_attach;
+ int max_ah;
+ int max_fmr;
+ int max_map_per_fmr;
+ int max_srq;
+ int max_srq_wr;
+ int max_srq_sge;
+ uint16_t max_pkeys;
+ uint8_t local_ca_ack_delay;
+ uint8_t phys_port_cnt;
+};
+
+enum ibv_mtu {
+ IBV_MTU_256 = 1,
+ IBV_MTU_512 = 2,
+ IBV_MTU_1024 = 3,
+ IBV_MTU_2048 = 4,
+ IBV_MTU_4096 = 5
+};
+
+enum ibv_port_state {
+ IBV_PORT_NOP = 0,
+ IBV_PORT_DOWN = 1,
+ IBV_PORT_INIT = 2,
+ IBV_PORT_ARMED = 3,
+ IBV_PORT_ACTIVE = 4,
+ IBV_PORT_ACTIVE_DEFER = 5
+};
+
+enum {
+ IBV_LINK_LAYER_UNSPECIFIED,
+ IBV_LINK_LAYER_INFINIBAND,
+ IBV_LINK_LAYER_ETHERNET,
+
+ /* Leave a gap for future link layer types before starting with
+ * experimental link layer.
+ */
+ IBV_EXP_LINK_LAYER_START = 32,
+ IBV_EXP_LINK_LAYER_SCIF = IBV_EXP_LINK_LAYER_START
+};
+
+enum ibv_port_cap_flags {
+ IBV_PORT_SM = 1 << 1,
+ IBV_PORT_NOTICE_SUP = 1 << 2,
+ IBV_PORT_TRAP_SUP = 1 << 3,
+ IBV_PORT_OPT_IPD_SUP = 1 << 4,
+ IBV_PORT_AUTO_MIGR_SUP = 1 << 5,
+ IBV_PORT_SL_MAP_SUP = 1 << 6,
+ IBV_PORT_MKEY_NVRAM = 1 << 7,
+ IBV_PORT_PKEY_NVRAM = 1 << 8,
+ IBV_PORT_LED_INFO_SUP = 1 << 9,
+ IBV_PORT_SYS_IMAGE_GUID_SUP = 1 << 11,
+ IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12,
+ IBV_PORT_EXTENDED_SPEEDS_SUP = 1 << 14,
+ IBV_PORT_CM_SUP = 1 << 16,
+ IBV_PORT_SNMP_TUNNEL_SUP = 1 << 17,
+ IBV_PORT_REINIT_SUP = 1 << 18,
+ IBV_PORT_DEVICE_MGMT_SUP = 1 << 19,
+ IBV_PORT_VENDOR_CLASS = 1 << 24,
+ IBV_PORT_CLIENT_REG_SUP = 1 << 25,
+ IBV_PORT_IP_BASED_GIDS = 1 << 26,
+};
+
+struct ibv_port_attr {
+ enum ibv_port_state state;
+ enum ibv_mtu max_mtu;
+ enum ibv_mtu active_mtu;
+ int gid_tbl_len;
+ uint32_t port_cap_flags;
+ uint32_t max_msg_sz;
+ uint32_t bad_pkey_cntr;
+ uint32_t qkey_viol_cntr;
+ uint16_t pkey_tbl_len;
+ uint16_t lid;
+ uint16_t sm_lid;
+ uint8_t lmc;
+ uint8_t max_vl_num;
+ uint8_t sm_sl;
+ uint8_t subnet_timeout;
+ uint8_t init_type_reply;
+ uint8_t active_width;
+ uint8_t active_speed;
+ uint8_t phys_state;
+ uint8_t link_layer;
+ uint8_t reserved;
+};
+
+enum ibv_event_type {
+ IBV_EVENT_CQ_ERR,
+ IBV_EVENT_QP_FATAL,
+ IBV_EVENT_QP_REQ_ERR,
+ IBV_EVENT_QP_ACCESS_ERR,
+ IBV_EVENT_COMM_EST,
+ IBV_EVENT_SQ_DRAINED,
+ IBV_EVENT_PATH_MIG,
+ IBV_EVENT_PATH_MIG_ERR,
+ IBV_EVENT_DEVICE_FATAL,
+ IBV_EVENT_PORT_ACTIVE,
+ IBV_EVENT_PORT_ERR,
+ IBV_EVENT_LID_CHANGE,
+ IBV_EVENT_PKEY_CHANGE,
+ IBV_EVENT_SM_CHANGE,
+ IBV_EVENT_SRQ_ERR,
+ IBV_EVENT_SRQ_LIMIT_REACHED,
+ IBV_EVENT_QP_LAST_WQE_REACHED,
+ IBV_EVENT_CLIENT_REREGISTER,
+ IBV_EVENT_GID_CHANGE,
+
+ /* new experimental events start here leaving enough
+ * room for 14 events which should be enough
+ */
+ IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32,
+ IBV_EXP_EVENT_DCT_ACCESS_ERR,
+ IBV_EXP_EVENT_DCT_REQ_ERR,
+};
+
+struct ibv_async_event {
+ union {
+ struct ibv_cq *cq;
+ struct ibv_qp *qp;
+ struct ibv_srq *srq;
+ struct ibv_exp_dct *dct;
+ int port_num;
+ /* For source compatibility with the legacy API */
+ uint32_t xrc_qp_num;
+ } element;
+ enum ibv_event_type event_type;
+};
+
+enum ibv_wc_status {
+ IBV_WC_SUCCESS,
+ IBV_WC_LOC_LEN_ERR,
+ IBV_WC_LOC_QP_OP_ERR,
+ IBV_WC_LOC_EEC_OP_ERR,
+ IBV_WC_LOC_PROT_ERR,
+ IBV_WC_WR_FLUSH_ERR,
+ IBV_WC_MW_BIND_ERR,
+ IBV_WC_BAD_RESP_ERR,
+ IBV_WC_LOC_ACCESS_ERR,
+ IBV_WC_REM_INV_REQ_ERR,
+ IBV_WC_REM_ACCESS_ERR,
+ IBV_WC_REM_OP_ERR,
+ IBV_WC_RETRY_EXC_ERR,
+ IBV_WC_RNR_RETRY_EXC_ERR,
+ IBV_WC_LOC_RDD_VIOL_ERR,
+ IBV_WC_REM_INV_RD_REQ_ERR,
+ IBV_WC_REM_ABORT_ERR,
+ IBV_WC_INV_EECN_ERR,
+ IBV_WC_INV_EEC_STATE_ERR,
+ IBV_WC_FATAL_ERR,
+ IBV_WC_RESP_TIMEOUT_ERR,
+ IBV_WC_GENERAL_ERR
+};
+const char *ibv_wc_status_str(enum ibv_wc_status status);
+
+enum ibv_wc_opcode {
+ IBV_WC_SEND,
+ IBV_WC_RDMA_WRITE,
+ IBV_WC_RDMA_READ,
+ IBV_WC_COMP_SWAP,
+ IBV_WC_FETCH_ADD,
+ IBV_WC_BIND_MW,
+/*
+ * Set value of IBV_WC_RECV so consumers can test if a completion is a
+ * receive by testing (opcode & IBV_WC_RECV).
+ */
+ IBV_WC_RECV = 1 << 7,
+ IBV_WC_RECV_RDMA_WITH_IMM
+};
+
+enum ibv_wc_flags {
+ IBV_WC_GRH = 1 << 0,
+ IBV_WC_WITH_IMM = 1 << 1
+};
+
+struct ibv_wc {
+ uint64_t wr_id;
+ enum ibv_wc_status status;
+ enum ibv_wc_opcode opcode;
+ uint32_t vendor_err;
+ uint32_t byte_len;
+ uint32_t imm_data; /* in network byte order */
+ uint32_t qp_num;
+ uint32_t src_qp;
+ int wc_flags;
+ uint16_t pkey_index;
+ uint16_t slid;
+ uint8_t sl;
+ uint8_t dlid_path_bits;
+};
+
+enum ibv_access_flags {
+ IBV_ACCESS_LOCAL_WRITE = 1,
+ IBV_ACCESS_REMOTE_WRITE = (1<<1),
+ IBV_ACCESS_REMOTE_READ = (1<<2),
+ IBV_ACCESS_REMOTE_ATOMIC = (1<<3),
+ IBV_ACCESS_MW_BIND = (1<<4)
+};
+
+struct ibv_pd {
+ struct ibv_context *context;
+ uint32_t handle;
+};
+
+enum ibv_xrcd_init_attr_mask {
+ IBV_XRCD_INIT_ATTR_FD = 1 << 0,
+ IBV_XRCD_INIT_ATTR_OFLAGS = 1 << 1,
+ IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2
+};
+
+struct ibv_xrcd_init_attr {
+ uint32_t comp_mask;
+ int fd;
+ int oflags;
+};
+
+struct ibv_xrcd {
+ struct ibv_context *context;
+};
+
+enum ibv_rereg_mr_flags {
+ IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0),
+ IBV_REREG_MR_CHANGE_PD = (1 << 1),
+ IBV_REREG_MR_CHANGE_ACCESS = (1 << 2),
+ IBV_REREG_MR_KEEP_VALID = (1 << 3)
+};
+
+struct ibv_mr {
+ struct ibv_context *context;
+ struct ibv_pd *pd;
+ void *addr;
+ size_t length;
+ uint32_t handle;
+ uint32_t lkey;
+ uint32_t rkey;
+};
+
+enum ibv_mw_type {
+ IBV_MW_TYPE_1 = 1,
+ IBV_MW_TYPE_2 = 2
+};
+
+struct ibv_mw {
+ struct ibv_context *context;
+ struct ibv_pd *pd;
+ uint32_t rkey;
+};
+
+struct ibv_global_route {
+ union ibv_gid dgid;
+ uint32_t flow_label;
+ uint8_t sgid_index;
+ uint8_t hop_limit;
+ uint8_t traffic_class;
+};
+
+struct ibv_grh {
+ uint32_t version_tclass_flow;
+ uint16_t paylen;
+ uint8_t next_hdr;
+ uint8_t hop_limit;
+ union ibv_gid sgid;
+ union ibv_gid dgid;
+};
+
+enum ibv_rate {
+ IBV_RATE_MAX = 0,
+ IBV_RATE_2_5_GBPS = 2,
+ IBV_RATE_5_GBPS = 5,
+ IBV_RATE_10_GBPS = 3,
+ IBV_RATE_20_GBPS = 6,
+ IBV_RATE_30_GBPS = 4,
+ IBV_RATE_40_GBPS = 7,
+ IBV_RATE_60_GBPS = 8,
+ IBV_RATE_80_GBPS = 9,
+ IBV_RATE_120_GBPS = 10,
+ IBV_RATE_14_GBPS = 11,
+ IBV_RATE_56_GBPS = 12,
+ IBV_RATE_112_GBPS = 13,
+ IBV_RATE_168_GBPS = 14,
+ IBV_RATE_25_GBPS = 15,
+ IBV_RATE_100_GBPS = 16,
+ IBV_RATE_200_GBPS = 17,
+ IBV_RATE_300_GBPS = 18
+};
+
+/**
+ * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the
+ * base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be
+ * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec.
+ * @rate: rate to convert.
+ */
+int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const;
+
+/**
+ * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum.
+ * @mult: multiple to convert.
+ */
+enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const;
+
+/**
+ * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec.
+ * For example, IBV_RATE_5_GBPS will return the value 5000.
+ * @rate: rate to convert.
+ */
+int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const;
+
+/**
+ * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum.
+ * @mbps: value to convert.
+ */
+enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const;
+
+struct ibv_ah_attr {
+ struct ibv_global_route grh;
+ uint16_t dlid;
+ uint8_t sl;
+ uint8_t src_path_bits;
+ uint8_t static_rate;
+ uint8_t is_global;
+ uint8_t port_num;
+};
+
+enum ibv_srq_attr_mask {
+ IBV_SRQ_MAX_WR = 1 << 0,
+ IBV_SRQ_LIMIT = 1 << 1
+};
+
+struct ibv_srq_attr {
+ uint32_t max_wr;
+ uint32_t max_sge;
+ uint32_t srq_limit;
+};
+
+struct ibv_srq_init_attr {
+ void *srq_context;
+ struct ibv_srq_attr attr;
+};
+
+enum ibv_srq_type {
+ IBV_SRQT_BASIC,
+ IBV_SRQT_XRC
+};
+
+enum ibv_srq_init_attr_mask {
+ IBV_SRQ_INIT_ATTR_TYPE = 1 << 0,
+ IBV_SRQ_INIT_ATTR_PD = 1 << 1,
+ IBV_SRQ_INIT_ATTR_XRCD = 1 << 2,
+ IBV_SRQ_INIT_ATTR_CQ = 1 << 3,
+ IBV_SRQ_INIT_ATTR_RESERVED = 1 << 4
+};
+
+struct ibv_srq_init_attr_ex {
+ void *srq_context;
+ struct ibv_srq_attr attr;
+
+ uint32_t comp_mask;
+ enum ibv_srq_type srq_type;
+ struct ibv_pd *pd;
+ struct ibv_xrcd *xrcd;
+ struct ibv_cq *cq;
+};
+
+enum ibv_qp_type {
+ IBV_QPT_RC = 2,
+ IBV_QPT_UC,
+ IBV_QPT_UD,
+ /* XRC compatible code */
+ IBV_QPT_XRC,
+ IBV_QPT_RAW_PACKET = 8,
+ IBV_QPT_RAW_ETH = 8,
+ IBV_QPT_XRC_SEND = 9,
+ IBV_QPT_XRC_RECV,
+
+ /* Leave a gap for future qp types before starting with
+ * experimental qp types.
+ */
+ IBV_EXP_QP_TYPE_START = 32,
+ IBV_EXP_QPT_DC_INI = IBV_EXP_QP_TYPE_START
+};
+
+struct ibv_qp_cap {
+ uint32_t max_send_wr;
+ uint32_t max_recv_wr;
+ uint32_t max_send_sge;
+ uint32_t max_recv_sge;
+ uint32_t max_inline_data;
+};
+
+struct ibv_qp_init_attr {
+ void *qp_context;
+ struct ibv_cq *send_cq;
+ struct ibv_cq *recv_cq;
+ struct ibv_srq *srq;
+ struct ibv_qp_cap cap;
+ enum ibv_qp_type qp_type;
+ int sq_sig_all;
+ /* Below is needed for backwards compatibility */
+ struct ibv_xrc_domain *xrc_domain;
+};
+
+enum ibv_qp_init_attr_mask {
+ IBV_QP_INIT_ATTR_PD = 1 << 0,
+ IBV_QP_INIT_ATTR_XRCD = 1 << 1,
+ IBV_QP_INIT_ATTR_RESERVED = 1 << 2
+};
+
+struct ibv_qp_init_attr_ex {
+ void *qp_context;
+ struct ibv_cq *send_cq;
+ struct ibv_cq *recv_cq;
+ struct ibv_srq *srq;
+ struct ibv_qp_cap cap;
+ enum ibv_qp_type qp_type;
+ int sq_sig_all;
+
+ uint32_t comp_mask;
+ struct ibv_pd *pd;
+ struct ibv_xrcd *xrcd;
+};
+
+enum ibv_qp_open_attr_mask {
+ IBV_QP_OPEN_ATTR_NUM = 1 << 0,
+ IBV_QP_OPEN_ATTR_XRCD = 1 << 1,
+ IBV_QP_OPEN_ATTR_CONTEXT = 1 << 2,
+ IBV_QP_OPEN_ATTR_TYPE = 1 << 3,
+ IBV_QP_OPEN_ATTR_RESERVED = 1 << 4
+};
+
+struct ibv_qp_open_attr {
+ uint32_t comp_mask;
+ uint32_t qp_num;
+ struct ibv_xrcd *xrcd;
+ void *qp_context;
+ enum ibv_qp_type qp_type;
+};
+
+enum ibv_qp_attr_mask {
+ IBV_QP_STATE = 1 << 0,
+ IBV_QP_CUR_STATE = 1 << 1,
+ IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2,
+ IBV_QP_ACCESS_FLAGS = 1 << 3,
+ IBV_QP_PKEY_INDEX = 1 << 4,
+ IBV_QP_PORT = 1 << 5,
+ IBV_QP_QKEY = 1 << 6,
+ IBV_QP_AV = 1 << 7,
+ IBV_QP_PATH_MTU = 1 << 8,
+ IBV_QP_TIMEOUT = 1 << 9,
+ IBV_QP_RETRY_CNT = 1 << 10,
+ IBV_QP_RNR_RETRY = 1 << 11,
+ IBV_QP_RQ_PSN = 1 << 12,
+ IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13,
+ IBV_QP_ALT_PATH = 1 << 14,
+ IBV_QP_MIN_RNR_TIMER = 1 << 15,
+ IBV_QP_SQ_PSN = 1 << 16,
+ IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17,
+ IBV_QP_PATH_MIG_STATE = 1 << 18,
+ IBV_QP_CAP = 1 << 19,
+ IBV_QP_DEST_QPN = 1 << 20
+};
+
+enum ibv_qp_state {
+ IBV_QPS_RESET,
+ IBV_QPS_INIT,
+ IBV_QPS_RTR,
+ IBV_QPS_RTS,
+ IBV_QPS_SQD,
+ IBV_QPS_SQE,
+ IBV_QPS_ERR,
+ IBV_QPS_UNKNOWN
+};
+
+enum ibv_mig_state {
+ IBV_MIG_MIGRATED,
+ IBV_MIG_REARM,
+ IBV_MIG_ARMED
+};
+
+struct ibv_qp_attr {
+ enum ibv_qp_state qp_state;
+ enum ibv_qp_state cur_qp_state;
+ enum ibv_mtu path_mtu;
+ enum ibv_mig_state path_mig_state;
+ uint32_t qkey;
+ uint32_t rq_psn;
+ uint32_t sq_psn;
+ uint32_t dest_qp_num;
+ int qp_access_flags;
+ struct ibv_qp_cap cap;
+ struct ibv_ah_attr ah_attr;
+ struct ibv_ah_attr alt_ah_attr;
+ uint16_t pkey_index;
+ uint16_t alt_pkey_index;
+ uint8_t en_sqd_async_notify;
+ uint8_t sq_draining;
+ uint8_t max_rd_atomic;
+ uint8_t max_dest_rd_atomic;
+ uint8_t min_rnr_timer;
+ uint8_t port_num;
+ uint8_t timeout;
+ uint8_t retry_cnt;
+ uint8_t rnr_retry;
+ uint8_t alt_port_num;
+ uint8_t alt_timeout;
+};
+
+enum ibv_wr_opcode {
+ IBV_WR_RDMA_WRITE,
+ IBV_WR_RDMA_WRITE_WITH_IMM,
+ IBV_WR_SEND,
+ IBV_WR_SEND_WITH_IMM,
+ IBV_WR_RDMA_READ,
+ IBV_WR_ATOMIC_CMP_AND_SWP,
+ IBV_WR_ATOMIC_FETCH_AND_ADD
+};
+
+enum ibv_send_flags {
+ IBV_SEND_FENCE = 1 << 0,
+ IBV_SEND_SIGNALED = 1 << 1,
+ IBV_SEND_SOLICITED = 1 << 2,
+ IBV_SEND_INLINE = 1 << 3
+};
+
+struct ibv_sge {
+ uint64_t addr;
+ uint32_t length;
+ uint32_t lkey;
+};
+
+struct ibv_send_wr {
+ uint64_t wr_id;
+ struct ibv_send_wr *next;
+ struct ibv_sge *sg_list;
+ int num_sge;
+ enum ibv_wr_opcode opcode;
+ int send_flags;
+ uint32_t imm_data; /* in network byte order */
+ union {
+ struct {
+ uint64_t remote_addr;
+ uint32_t rkey;
+ } rdma;
+ struct {
+ uint64_t remote_addr;
+ uint64_t compare_add;
+ uint64_t swap;
+ uint32_t rkey;
+ } atomic;
+ struct {
+ struct ibv_ah *ah;
+ uint32_t remote_qpn;
+ uint32_t remote_qkey;
+ } ud;
+ } wr;
+ union {
+ union {
+ struct {
+ uint32_t remote_srqn;
+ } xrc;
+ } qp_type;
+
+ uint32_t xrc_remote_srq_num;
+ };
+};
+
+struct ibv_recv_wr {
+ uint64_t wr_id;
+ struct ibv_recv_wr *next;
+ struct ibv_sge *sg_list;
+ int num_sge;
+};
+
+struct ibv_mw_bind {
+ uint64_t wr_id;
+ struct ibv_mr *mr;
+ void *addr;
+ size_t length;
+ int send_flags;
+ int mw_access_flags;
+};
+
+struct ibv_srq {
+ struct ibv_context *context;
+ void *srq_context;
+ struct ibv_pd *pd;
+ uint32_t handle;
+
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ uint32_t events_completed;
+
+ /* below are for source compatibility with legacy XRC,
+ * padding based on ibv_srq_legacy.
+ */
+ uint32_t xrc_srq_num_bin_compat_padding;
+ struct ibv_xrc_domain *xrc_domain_bin_compat_padding;
+ struct ibv_cq *xrc_cq_bin_compat_padding;
+ void *ibv_srq_padding;
+
+ /* legacy fields */
+ uint32_t xrc_srq_num;
+ struct ibv_xrc_domain *xrc_domain;
+ struct ibv_cq *xrc_cq;
+};
+
+/* Not in use in new API, needed for compilation as part of source compat layer */
+enum ibv_event_flags {
+ IBV_XRC_QP_EVENT_FLAG = 0x80000000,
+};
+
+
+
+struct ibv_qp {
+ struct ibv_context *context;
+ void *qp_context;
+ struct ibv_pd *pd;
+ struct ibv_cq *send_cq;
+ struct ibv_cq *recv_cq;
+ struct ibv_srq *srq;
+ uint32_t handle;
+ uint32_t qp_num;
+ enum ibv_qp_state state;
+ enum ibv_qp_type qp_type;
+
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ uint32_t events_completed;
+};
+
+struct ibv_comp_channel {
+ struct ibv_context *context;
+ int fd;
+ int refcnt;
+};
+
+struct ibv_cq {
+ struct ibv_context *context;
+ struct ibv_comp_channel *channel;
+ void *cq_context;
+ uint32_t handle;
+ int cqe;
+
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ uint32_t comp_events_completed;
+ uint32_t async_events_completed;
+};
+
+struct ibv_ah {
+ struct ibv_context *context;
+ struct ibv_pd *pd;
+ uint32_t handle;
+};
+
+enum ibv_flow_flags {
+ IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1,
+ IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1,
+};
+
+enum ibv_flow_attr_type {
+ /* steering according to rule specifications */
+ IBV_FLOW_ATTR_NORMAL = 0x0,
+ /* default unicast and multicast rule -
+ * receive all Eth traffic which isn't steered to any QP
+ */
+ IBV_FLOW_ATTR_ALL_DEFAULT = 0x1,
+ /* default multicast rule -
+ * receive all Eth multicast traffic which isn't steered to any QP
+ */
+ IBV_FLOW_ATTR_MC_DEFAULT = 0x2,
+};
+
+enum ibv_flow_spec_type {
+ IBV_FLOW_SPEC_ETH = 0x20,
+ IBV_FLOW_SPEC_IPV4 = 0x30,
+ IBV_FLOW_SPEC_TCP = 0x40,
+ IBV_FLOW_SPEC_UDP = 0x41,
+};
+
+struct ibv_flow_eth_filter {
+ uint8_t dst_mac[6];
+ uint8_t src_mac[6];
+ uint16_t ether_type;
+ /*
+ * same layout as 802.1q: prio 3, cfi 1, vlan id 12
+ */
+ uint16_t vlan_tag;
+};
+
+struct ibv_flow_spec_eth {
+ enum ibv_flow_spec_type type;
+ uint16_t size;
+ struct ibv_flow_eth_filter val;
+ struct ibv_flow_eth_filter mask;
+};
+
+struct ibv_flow_ipv4_filter {
+ uint32_t src_ip;
+ uint32_t dst_ip;
+};
+
+struct ibv_flow_spec_ipv4 {
+ enum ibv_flow_spec_type type;
+ uint16_t size;
+ struct ibv_flow_ipv4_filter val;
+ struct ibv_flow_ipv4_filter mask;
+};
+
+struct ibv_flow_tcp_udp_filter {
+ uint16_t dst_port;
+ uint16_t src_port;
+};
+
+struct ibv_flow_spec_tcp_udp {
+ enum ibv_flow_spec_type type;
+ uint16_t size;
+ struct ibv_flow_tcp_udp_filter val;
+ struct ibv_flow_tcp_udp_filter mask;
+};
+
+struct ibv_flow_spec {
+ union {
+ struct {
+ enum ibv_flow_spec_type type;
+ uint16_t size;
+ } hdr;
+ struct ibv_flow_spec_eth eth;
+ struct ibv_flow_spec_ipv4 ipv4;
+ struct ibv_flow_spec_tcp_udp tcp_udp;
+ };
+};
+
+struct ibv_flow_attr {
+ uint32_t comp_mask;
+ enum ibv_flow_attr_type type;
+ uint16_t size;
+ uint16_t priority;
+ uint8_t num_of_specs;
+ uint8_t port;
+ uint32_t flags;
+ /* Following are the optional layers according to user request
+ * struct ibv_flow_spec_xxx [L2]
+ * struct ibv_flow_spec_yyy [L3/L4]
+ */
+};
+
+struct ibv_flow {
+ uint32_t comp_mask;
+ struct ibv_context *context;
+ uint32_t handle;
+};
+
+struct ibv_device;
+struct ibv_context;
+
+struct ibv_device_ops {
+ struct ibv_context * (*alloc_context)(struct ibv_device *device, int cmd_fd);
+ void (*free_context)(struct ibv_context *context);
+};
+
+enum {
+ IBV_SYSFS_NAME_MAX = 64,
+ IBV_SYSFS_PATH_MAX = 256
+};
+
+struct ibv_device {
+ struct ibv_device_ops ops;
+ enum ibv_node_type node_type;
+ enum ibv_transport_type transport_type;
+ /* Name of underlying kernel IB device, eg "mthca0" */
+ char name[IBV_SYSFS_NAME_MAX];
+ /* Name of uverbs device, eg "uverbs0" */
+ char dev_name[IBV_SYSFS_NAME_MAX];
+ /* Path to infiniband_verbs class device in sysfs */
+ char dev_path[IBV_SYSFS_PATH_MAX];
+ /* Path to infiniband class device in sysfs */
+ char ibdev_path[IBV_SYSFS_PATH_MAX];
+};
+
+struct verbs_device {
+ struct ibv_device device; /* Must be first */
+ size_t sz;
+ size_t size_of_context;
+ int (*init_context)(struct verbs_device *device,
+ struct ibv_context *ctx, int cmd_fd);
+ void (*uninit_context)(struct verbs_device *device,
+ struct ibv_context *ctx);
+ /* future fields added here */
+};
+
+struct ibv_context_ops {
+ int (*query_device)(struct ibv_context *context,
+ struct ibv_device_attr *device_attr);
+ int (*query_port)(struct ibv_context *context, uint8_t port_num,
+ struct ibv_port_attr *port_attr);
+ struct ibv_pd * (*alloc_pd)(struct ibv_context *context);
+ int (*dealloc_pd)(struct ibv_pd *pd);
+ struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length,
+ int access);
+ struct ibv_mr * (*rereg_mr)(struct ibv_mr *mr,
+ int flags,
+ struct ibv_pd *pd, void *addr,
+ size_t length,
+ int access);
+ int (*dereg_mr)(struct ibv_mr *mr);
+ struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type);
+ int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw,
+ struct ibv_mw_bind *mw_bind);
+ int (*dealloc_mw)(struct ibv_mw *mw);
+ struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe,
+ struct ibv_comp_channel *channel,
+ int comp_vector);
+ int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc);
+ int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only);
+ void (*cq_event)(struct ibv_cq *cq);
+ int (*resize_cq)(struct ibv_cq *cq, int cqe);
+ int (*destroy_cq)(struct ibv_cq *cq);
+ struct ibv_srq * (*create_srq)(struct ibv_pd *pd,
+ struct ibv_srq_init_attr *srq_init_attr);
+ int (*modify_srq)(struct ibv_srq *srq,
+ struct ibv_srq_attr *srq_attr,
+ int srq_attr_mask);
+ int (*query_srq)(struct ibv_srq *srq,
+ struct ibv_srq_attr *srq_attr);
+ int (*destroy_srq)(struct ibv_srq *srq);
+ int (*post_srq_recv)(struct ibv_srq *srq,
+ struct ibv_recv_wr *recv_wr,
+ struct ibv_recv_wr **bad_recv_wr);
+ struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr);
+ int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+ int attr_mask,
+ struct ibv_qp_init_attr *init_attr);
+ int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+ int attr_mask);
+ int (*destroy_qp)(struct ibv_qp *qp);
+ int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr,
+ struct ibv_send_wr **bad_wr);
+ int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad_wr);
+ struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr);
+ int (*destroy_ah)(struct ibv_ah *ah);
+ int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid,
+ uint16_t lid);
+ int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid,
+ uint16_t lid);
+ void (*async_event)(struct ibv_async_event *event);
+};
+
+struct ibv_context {
+ struct ibv_device *device;
+ struct ibv_context_ops ops;
+ int cmd_fd;
+ int async_fd;
+ int num_comp_vectors;
+ pthread_mutex_t mutex;
+ void *abi_compat;
+};
+
+enum verbs_context_mask {
+ VERBS_CONTEXT_XRCD = (uint64_t)1 << 0,
+ VERBS_CONTEXT_SRQ = (uint64_t)1 << 1,
+ VERBS_CONTEXT_QP = (uint64_t)1 << 2,
+ VERBS_CONTEXT_RESERVED = (uint64_t)1 << 3,
+ VERBS_CONTEXT_EXP = (uint64_t)1 << 62
+};
+
+struct verbs_context {
+ /* "grows up" - new fields go here */
+ int (*_reserved_2) (void);
+ int (*destroy_flow) (struct ibv_flow *flow);
+ int (*_reserved_1) (void);
+ struct ibv_flow * (*create_flow) (struct ibv_qp *qp,
+ struct ibv_flow_attr *flow_attr);
+ struct ibv_qp * (*open_qp)(struct ibv_context *context,
+ struct ibv_qp_open_attr *attr);
+ struct ibv_qp * (*create_qp_ex)(struct ibv_context *context,
+ struct ibv_qp_init_attr_ex *qp_init_attr_ex);
+ int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num);
+ struct ibv_srq * (*create_srq_ex)(struct ibv_context *context,
+ struct ibv_srq_init_attr_ex *srq_init_attr_ex);
+ struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context,
+ struct ibv_xrcd_init_attr *xrcd_init_attr);
+ int (*close_xrcd)(struct ibv_xrcd *xrcd);
+ uint64_t has_comp_mask;
+ size_t sz; /* Must be immediately before struct ibv_context */
+ struct ibv_context context;/* Must be last field in the struct */
+};
+
+/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/
+/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx)
+{
+ return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ?
+ NULL : container_of(ctx, struct verbs_context, context);
+}
+
+#define verbs_get_ctx_op(ctx, op) ({ \
+ struct verbs_context *_vctx = verbs_get_ctx(ctx); \
+ (!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \
+ !_vctx->op) ? NULL : _vctx; })*/
+
+#define verbs_set_ctx_op(_vctx, op, ptr) ({ \
+ struct verbs_context *vctx = _vctx; \
+ if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \
+ vctx->op = ptr; })
+
+static inline struct verbs_device *verbs_get_device(struct ibv_device *dev)
+{
+ return (dev->ops.alloc_context) ?
+ NULL : container_of(dev, struct verbs_device, device);
+}
+
+typedef enum ibv_return_enum
+{
+ IBV_SUCCESS = 0, //!< The operation was successful
+} ibv_return_t;
+
+ncclResult_t wrap_ibv_symbols(void);
+ncclResult_t wrap_ibv_fork_init(void);
+ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices);
+ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list);
+const char *wrap_ibv_get_device_name(struct ibv_device *device);
+ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device);
+ncclResult_t wrap_ibv_close_device(struct ibv_context *context);
+ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event);
+ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event);
+ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr);
+ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
+ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
+ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
+ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context);
+ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
+ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
+struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access);
+ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
+ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
+ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
+ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
+ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq);
+static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) {
+ int done = cq->context->ops.poll_cq(cq, num_entries, wc); /*returns the number of wcs or 0 on success, a negative number otherwise*/
+ if (done < 0) {
+ WARN("Call to ibv_poll_cq() returned %d", done);
+ return ncclSystemError;
+ }
+ *num_done = done;
+ return ncclSuccess;
+}
+ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
+ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
+ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
+static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
+ return qp->context->ops.post_send(qp, wr, bad_wr);
+}
+
+static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
+ int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
+ if (ret != IBV_SUCCESS) {
+ WARN("ibv_post_send() failed with error %s", strerror(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) {
+ int ret = qp->context->ops.post_recv(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
+ if (ret != IBV_SUCCESS) {
+ WARN("ibv_post_recv() failed with error %s", strerror(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event);
+
+#endif //End include guard
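
For orientation, here is a minimal sketch of how the wrap_ibv_* entry points above chain together. It is an illustration only, not part of this commit: NCCLCHECK comes from core.h, the device index, buffer and CQ depth are placeholders, and IBV_ACCESS_LOCAL_WRITE is assumed to come from the access-flag enum earlier in this header.

// Sketch only: open the first HCA, register a buffer, then drain its CQ.
static ncclResult_t exampleDrainCq(void* buffer, size_t size) {
  struct ibv_device** devList; int nDevs;
  NCCLCHECK(wrap_ibv_symbols());
  NCCLCHECK(wrap_ibv_get_device_list(&devList, &nDevs));
  if (nDevs == 0) return ncclSystemError;

  struct ibv_context* ctx;
  NCCLCHECK(wrap_ibv_open_device(&ctx, devList[0]));
  struct ibv_pd* pd;
  NCCLCHECK(wrap_ibv_alloc_pd(&pd, ctx));
  struct ibv_mr* mr;
  NCCLCHECK(wrap_ibv_reg_mr(&mr, pd, buffer, size, IBV_ACCESS_LOCAL_WRITE));
  struct ibv_cq* cq;
  NCCLCHECK(wrap_ibv_create_cq(&cq, ctx, 128/*cqe*/, NULL/*cq_context*/, NULL/*channel*/, 0));

  // wrap_ibv_poll_cq reports the completion count through its last argument.
  struct ibv_wc wc[4]; int done;
  do { NCCLCHECK(wrap_ibv_poll_cq(cq, 4, wc, &done)); } while (done > 0);

  NCCLCHECK(wrap_ibv_destroy_cq(cq));
  NCCLCHECK(wrap_ibv_dereg_mr(mr));
  NCCLCHECK(wrap_ibv_dealloc_pd(pd));
  NCCLCHECK(wrap_ibv_close_device(ctx));
  NCCLCHECK(wrap_ibv_free_device_list(devList));
  return ncclSuccess;
}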
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
new file mode 100644
index 0000000..58c1335
--- /dev/null
+++ b/src/include/nccl_net.h
@@ -0,0 +1,64 @@
+/*************************************************************************
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NET_H_
+#define NCCL_NET_H_
+
+#include "nccl.h"
+
+#define NCCL_NET_MAJOR 1
+#define NCCL_NET_MINOR 0
+
+#define NCCL_NET_HANDLE_MAXSIZE 64
+
+#define NCCL_PTR_HOST 0x1
+#define NCCL_PTR_CUDA 0x2
+
+#define NCCL_MAX_SCORE 0x7
+
+typedef struct {
+ // Name of the network (mainly for logs)
+ const char* name;
+ // Return the number of network devices along with their scores relative to the
+ // current CUDA device. The per device score should be a value from 1-7 with a
+ // higher score representing a better choice for performance.
+ // This call should allocate the 'scores' array using malloc(3), and it
+ // will then be freed automatically by NCCL.
+ ncclResult_t (*devices)(int* ndev, int** scores);
+ // Return whether this device supports host pointers and/or CUDA pointers
+ // as data from the current GPU. Supported types should be composed with
+ // NCCL_PTR_HOST and NCCL_PTR_CUDA.
+ ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
+ // Create a receiving object and provide a handle to connect to it. The
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+ // between ranks to create a connection.
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+ // Connect to a handle and return a sending comm object for that peer.
+ ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+ // Finalize connection establishment after the remote peer has called connect()
+ ncclResult_t (*accept)(void* listenComm, void** recvComm);
+ // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ ncclResult_t (*isend)(void* sendComm, void* data, int size, int type, void** request);
+ // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ ncclResult_t (*irecv)(void* recvComm, void* data, int size, int type, void** request);
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+ // visible to the GPU
+ ncclResult_t (*flush)(void* recvComm, void* data, int size);
+ // Test whether a request is complete and return the size received (can be less than requested).
+ ncclResult_t (*test)(void* request, int* done, int* size);
+ // Close and free send/recv comm objects
+ ncclResult_t (*closeSend)(void* sendComm);
+ ncclResult_t (*closeRecv)(void* recvComm);
+ ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_t;
+
+extern
+#ifdef __cplusplus
+"C"
+#endif
+ncclNet_t* ncclNet;
+
+#endif // end include guard
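
The struct above is the whole contract an external network plugin has to satisfy. The following skeleton is purely hypothetical (every "dummy*" name is invented for illustration and is not part of this commit); only the metadata calls are filled in, the data-path calls are stubs.

#include <stdlib.h>

static ncclResult_t dummyDevices(int* ndev, int** scores) {
  *ndev = 1;
  *scores = (int*)malloc(sizeof(int)); // NCCL frees this, per the comment above
  (*scores)[0] = NCCL_MAX_SCORE;
  return ncclSuccess;
}
static ncclResult_t dummyPtrSupport(int dev, int* supportedTypes) {
  *supportedTypes = NCCL_PTR_HOST;     // host memory only, no GPU Direct RDMA
  return ncclSuccess;
}
// The real transport logic would live in these; they are left as stubs here.
static ncclResult_t dummyListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
static ncclResult_t dummyConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; }
static ncclResult_t dummyAccept(void* listenComm, void** recvComm) { return ncclInternalError; }
static ncclResult_t dummyIsend(void* sendComm, void* data, int size, int type, void** request) { return ncclInternalError; }
static ncclResult_t dummyIrecv(void* recvComm, void* data, int size, int type, void** request) { return ncclInternalError; }
static ncclResult_t dummyFlush(void* recvComm, void* data, int size) { return ncclSuccess; }
static ncclResult_t dummyTest(void* request, int* done, int* size) { return ncclInternalError; }
static ncclResult_t dummyClose(void* comm) { return ncclSuccess; }

static ncclNet_t dummyNet = {
  "Dummy",
  dummyDevices, dummyPtrSupport,
  dummyListen, dummyConnect, dummyAccept,
  dummyIsend, dummyIrecv, dummyFlush, dummyTest,
  dummyClose, dummyClose, dummyClose
};
// An application could then point the global at it before creating communicators:
//   ncclNet = &dummyNet;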
diff --git a/src/include/net.h b/src/include/net.h
new file mode 100644
index 0000000..d841419
--- /dev/null
+++ b/src/include/net.h
@@ -0,0 +1,40 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_INT_NET_H_
+#define NCCL_INT_NET_H_
+
+#include "nccl.h"
+#include "nccl_net.h"
+
+typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
+
+/* Socket Interface Selection type */
+typedef enum { findSubnetIf = -1,
+ dontCareIf = -2
+} ncclSocketIfSl_t;
+
+// Translation to external API
+static const char* ncclNetName() { return ncclNet->name; }
+static ncclResult_t ncclNetDevices(int* ndev, int** scores) { NCCLCHECK(ncclNet->devices(ndev, scores)); return ncclSuccess; }
+static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; }
+static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
+static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
+static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; }
+static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; }
+static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; }
+static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
+static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
+static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
+
+extern bool ncclIbSupport();
+extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
+extern ncclNet_t ncclNetIb;
+extern ncclNet_t ncclNetSocket;
+
+#endif
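
A sketch of the intended call order between two ranks, assuming the handle is exchanged out of band (through the bootstrap network in practice); buffer and size are placeholders, not names from this commit.

// Rank B (receiver): publish a handle, then accept the incoming connection.
ncclNetHandle_t handle;
void *listenComm, *recvComm;
NCCLCHECK(ncclNetListen(0 /*dev*/, handle, &listenComm));
// ... handle is shipped to rank A out of band ...
NCCLCHECK(ncclNetAccept(listenComm, &recvComm));

// Rank A (sender): connect with the received handle and post an async send.
void *sendComm, *request;
NCCLCHECK(ncclNetConnect(0 /*dev*/, handle, &sendComm));
NCCLCHECK(ncclNetIsend(sendComm, buffer, size, NCCL_PTR_HOST, &request));
int done = 0, bytes = 0;
while (!done) NCCLCHECK(ncclNetTest(request, &done, &bytes));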
diff --git a/src/include/nvlink.h b/src/include/nvlink.h
new file mode 100644
index 0000000..7eb74c9
--- /dev/null
+++ b/src/include/nvlink.h
@@ -0,0 +1,155 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NVLINK_H_
+#define NCCL_NVLINK_H_
+
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "nvmlwrap.h"
+#include "topo.h"
+
+#define CONNECT_NVLINK 0x10
+#define CONNECT_NVSWITCH 0x100
+
+enum ncclNvLinkDeviceType {
+ ncclNvLinkDeviceGpu,
+ ncclNvLinkDeviceSwitch,
+};
+
+static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
+ char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class";
+ memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
+ char* rPath = realpath(classPath, NULL);
+ int fd;
+ SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
+ free(rPath);
+ char pciClass[9];
+ strncpy(pciClass, "0x000000", 9);
+ int len;
+ SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
+ SYSCHECK(close(fd), "close");
+ if (strcmp(pciClass, "0x068000") == 0) {
+ // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
+ *type = ncclNvLinkDeviceSwitch;
+ } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
+ || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
+ *type = ncclNvLinkDeviceGpu;
+ } else {
+ // Ignore if we don't know what's on the other side.
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+/* Get the maximum number of NVLinks based on the GPU generation */
+static ncclResult_t getMaxNvlinks(int* maxLinks) {
+ int cudaDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ int ccMajor;
+ CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
+ // 6 for Volta, 4 for Pascal
+ *maxLinks = (ccMajor > 6) ? 6 : 4;
+ // INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
+ return ncclSuccess;
+}
+
+static int getNvlinkGpu(const char* busId1, const char* busId2) {
+ // Determine if that connection is through NVLink
+ int links = 0;
+ int nvswitch_links = 0;
+ int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
+ nvmlDevice_t nvmlDev;
+ ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId1, &nvmlDev);
+ if (res != ncclSuccess) return 0;
+
+ for(int l=0; l<maxNvLinks; ++l) {
+ // nvmlDeviceGetNvLinkCapability(NVML_NVLINK_CAP_P2P_SUPPORTED) would seem to
+ // report whether the NVLink connects to a peer GPU (versus a POWER CPU?). I
+ // don't know whether nvmlDeviceGetNvLinkRemotePciInfo() would succeed in
+ // the POWER CPU case, so it seems best to check this as well.
+ unsigned canP2P;
+ if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
+
+ // nvmlDeviceGetNvLinkRemotePciInfo() will return NVML_ERROR_NOT_SUPPORTED
+ // if the links don't exist, or are disabled. So checking for that return
+ // here would probably make the nvmlDeviceGetNvLinkCapability check above
+ // redundant. Presumably, we still need to check the P2P capability above,
+ // since even non-GPUs would possess PCI info.
+ nvmlPciInfo_t remoteProc;
+ if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
+
+ // Old versions of NVML return a lowercase PCI ID
+ char* p = remoteProc.busId;
+ for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+ if (p[c] == 0) break;
+ p[c] = toupper(p[c]);
+ }
+
+ if (strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
+ links++;
+ } else {
+ // Make a lower case copy of the bus ID for calling ncclDeviceType
+ // PCI system path is in lower case
+ char* p = remoteProc.busId;
+ char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+ if (p[c] == 0) break;
+ lowerId[c] = tolower(p[c]);
+ }
+
+ // Determine if the remote side is NVswitch
+ enum ncclNvLinkDeviceType type;
+ if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
+ //TODO: we are making an assumption that all GPUs are connected to this switch
+ //This assumption may change for future architectures
+ nvswitch_links++;
+ }
+ }
+ }
+ return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
+}
+
+static int getNumNvlinks(const char* busId) {
+ nvmlDevice_t nvmlDev;
+ ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev);
+ if (res != ncclSuccess) return 0;
+
+ int nvlinks = 0, nvswitch_links = 0;
+ int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
+ for(int l=0; l<maxNvLinks; ++l) {
+ unsigned canP2P;
+ nvmlEnableState_t isActive;
+ if (wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) == ncclSuccess && canP2P &&
+ wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) == ncclSuccess && isActive == NVML_FEATURE_ENABLED) {
+ nvlinks++;
+ } else {
+ continue;
+ }
+
+ nvmlPciInfo_t remoteProc;
+ if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
+
+ // Make a lower case copy of the bus ID for calling ncclDeviceType
+ // PCI system path is in lower case
+ char* p = remoteProc.busId;
+ char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+ if (p[c] == 0) break;
+ lowerId[c] = tolower(p[c]);
+ }
+
+ // Determine if the remote side is NVswitch
+ enum ncclNvLinkDeviceType type;
+ if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
+ //TODO: we are making an assumption that all GPUs are connected to this switch
+ //This assumption may change for future architectures
+ nvswitch_links++;
+ }
+ }
+ return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*nvlinks;
+}
+#endif
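
The value returned by getNvlinkGpu()/getNumNvlinks() packs the link count into the CONNECT_* constants defined at the top of this header. A small decoding sketch (the bus IDs passed in are placeholders):

static void exampleDecodeNvlink(const char* busId1, const char* busId2) {
  int v = getNvlinkGpu(busId1, busId2);
  if (v >= CONNECT_NVSWITCH) {
    INFO(INIT, "Peers reached through an NVswitch (%d links)", v/CONNECT_NVSWITCH);
  } else if (v >= CONNECT_NVLINK) {
    INFO(INIT, "Peers directly connected over %d NVLink(s)", v/CONNECT_NVLINK);
  } else {
    INFO(INIT, "No NVLink path detected between the two bus IDs");
  }
}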
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
new file mode 100644
index 0000000..ddfd233
--- /dev/null
+++ b/src/include/nvmlwrap.h
@@ -0,0 +1,149 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NVMLWRAP_H_
+#define NCCL_NVMLWRAP_H_
+
+#include "core.h"
+
+//#define NVML_DIRECT 1
+#ifdef NVML_DIRECT
+#include "nvml.h"
+
+#define NVMLCHECK(cmd) do { \
+ nvmlReturn_t e = cmd; \
+ if( e != NVML_SUCCESS ) { \
+ WARN("NVML failure '%s'", nvmlErrorString(e)); \
+ return ncclSystemError; \
+ } \
+} while(false)
+
+static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; }
+static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; }
+static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; }
+static ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
+ NVMLCHECK(nvmlDeviceGetHandleByPciBusId(pciBusId, device));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
+ NVMLCHECK(nvmlDeviceGetIndex(device, index));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
+ NVMLCHECK(nvmlDeviceSetCpuAffinity(device));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
+ NVMLCHECK(nvmlDeviceClearCpuAffinity(device));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
+ NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetHandleByPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
+ NVMLCHECK(nvmlDeviceGetPciInfo(device, pci));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
+ NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
+ NVMLCHECK(nvmlDeviceGetNvLinkRemotePciInfo(device, link, pci));
+ return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+ nvmlNvLinkCapability_t capability, unsigned int *capResult) {
+ NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
+ return ncclSuccess;
+}
+#else
+// Dynamically handle dependencies on NVML
+
+/* Extracted from nvml.h */
+typedef struct nvmlDevice_st* nvmlDevice_t;
+#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
+
+typedef enum nvmlEnableState_enum
+{
+ NVML_FEATURE_DISABLED = 0, //!< Feature disabled
+ NVML_FEATURE_ENABLED = 1 //!< Feature enabled
+} nvmlEnableState_t;
+
+typedef enum nvmlNvLinkCapability_enum
+{
+ NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported
+ NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported
+ NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported
+ NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported
+ NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link
+ NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device
+ // should be last
+ NVML_NVLINK_CAP_COUNT
+} nvmlNvLinkCapability_t;
+
+typedef enum nvmlReturn_enum
+{
+ NVML_SUCCESS = 0, //!< The operation was successful
+ NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit()
+ NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid
+ NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device
+ NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation
+ NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
+ NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful
+ NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough
+ NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached
+ NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded
+ NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed
+ NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU
+ NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded
+ NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
+ NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted
+ NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible
+ NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again
+ NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups
+ NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch
+ NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use
+ NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred
+} nvmlReturn_t;
+
+typedef struct nvmlPciInfo_st
+{
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (&amp; NULL terminator)
+ unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffff
+ unsigned int bus; //!< The bus on which the device resides, 0 to 0xff
+ unsigned int device; //!< The device's id on the bus, 0 to 31
+ unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id
+
+ // Added in NVML 2.285 API
+ unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID
+
+ // NVIDIA reserved for internal use only
+ unsigned int reserved0;
+ unsigned int reserved1;
+ unsigned int reserved2;
+ unsigned int reserved3;
+} nvmlPciInfo_t;
+/* End of nvml.h */
+
+ncclResult_t wrapNvmlSymbols(void);
+
+ncclResult_t wrapNvmlInit(void);
+ncclResult_t wrapNvmlShutdown(void);
+ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
+ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
+ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
+ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
+ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
+ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
+ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
+ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
+ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+ nvmlNvLinkCapability_t capability, unsigned int *capResult);
+#endif // NVML_DIRECT
+
+#endif // End include guard
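
Both build modes (NVML_DIRECT and the default dynamic loading) expose the same wrapNvml* surface, so callers such as nvlink.h above stay agnostic. A brief usage sketch, assuming device index 0 exists and at most 6 links (the Volta count used elsewhere in this tree):

static ncclResult_t exampleCountActiveLinks(int* nActive) {
  *nActive = 0;
  NCCLCHECK(wrapNvmlSymbols());
  NCCLCHECK(wrapNvmlInit());
  nvmlDevice_t dev;
  NCCLCHECK(wrapNvmlDeviceGetHandleByIndex(0, &dev));
  for (int l = 0; l < 6; l++) {
    nvmlEnableState_t state;
    if (wrapNvmlDeviceGetNvLinkState(dev, l, &state) == ncclSuccess
        && state == NVML_FEATURE_ENABLED) (*nActive)++;
  }
  NCCLCHECK(wrapNvmlShutdown());
  return ncclSuccess;
}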
diff --git a/src/include/param.h b/src/include/param.h
new file mode 100644
index 0000000..44c1b9a
--- /dev/null
+++ b/src/include/param.h
@@ -0,0 +1,81 @@
+/*************************************************************************
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PARAM_H_
+#define NCCL_PARAM_H_
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <pwd.h>
+
+static const char* userHomeDir() {
+ struct passwd *pwUser = getpwuid(getuid());
+ return pwUser == NULL ? NULL : pwUser->pw_dir;
+}
+
+static void setEnvFile(const char* fileName) {
+ FILE * file = fopen(fileName, "r");
+ if (file == NULL) return;
+
+ char *line = NULL;
+ char envVar[1024];
+ char envValue[1024];
+ size_t n = 0;
+ ssize_t read;
+ while ((read = getline(&line, &n, file)) != -1) {
+ if (line[read-1] == '\n') line[read-1] = '\0';
+ int s=0; // Env Var Size
+ while (line[s] != '\0' && line[s] != '=') s++;
+ if (line[s] == '\0') continue;
+ strncpy(envVar, line, std::min(1024,s));
+ envVar[s] = '\0';
+ s++;
+ strncpy(envValue, line+s, 1024);
+ setenv(envVar, envValue, 0);
+ char *str = getenv(envVar);
+ }
+ if (line) free(line);
+ fclose(file);
+}
+
+static void initEnv() {
+ char confFilePath[1024];
+ const char * userDir = userHomeDir();
+ if (userDir) {
+ sprintf(confFilePath, "%s/.nccl.conf", userDir);
+ setEnvFile(confFilePath);
+ }
+ sprintf(confFilePath, "/etc/nccl.conf");
+ setEnvFile(confFilePath);
+}
+
+
+#define NCCL_PARAM(name, env, default_value) \
+pthread_mutex_t ncclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \
+int64_t ncclParam##name() { \
+ static_assert(default_value != -1LL, "default value cannot be -1"); \
+ static int64_t value = -1LL; \
+ pthread_mutex_lock(&ncclParamMutex##name); \
+ if (value == -1LL) { \
+ value = default_value; \
+ char* str = getenv("NCCL_" env); \
+ if (str && strlen(str) > 0) { \
+ errno = 0; \
+ int64_t v = strtoll(str, NULL, 0); \
+ if (errno) { \
+ INFO(ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \
+ } else { \
+ value = v; \
+ INFO(ALL,"%s set by environment to %lu.", "NCCL_" env, value); \
+ } \
+ } \
+ } \
+ pthread_mutex_unlock(&ncclParamMutex##name); \
+ return value; \
+}
+
+#endif
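
The macro expands to a mutex-protected, lazily-initialized accessor named ncclParam<Name>(). A hypothetical declaration and its use (the real parameters appear in init.cu further down; this one is invented for illustration):

// Hypothetical parameter, for illustration only.
NCCL_PARAM(ExampleChunkSize, "EXAMPLE_CHUNK_SIZE", 1 << 20);

int64_t exampleGetChunkSize() {
  // The first call reads NCCL_EXAMPLE_CHUNK_SIZE from the environment (if set),
  // otherwise keeps the 1<<20 default; later calls return the cached value.
  return ncclParamExampleChunkSize();
}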
diff --git a/src/include/ring.h b/src/include/ring.h
new file mode 100644
index 0000000..fa5e099
--- /dev/null
+++ b/src/include/ring.h
@@ -0,0 +1,14 @@
+/*************************************************************************
+ * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_RING_H_
+#define NCCL_RING_H_
+#include "core.h"
+
+ncclResult_t initRing(struct ncclComm* comm, int ringid);
+ncclResult_t freeRing(struct ncclRing* ring);
+
+#endif
diff --git a/src/include/rings.h b/src/include/rings.h
new file mode 100644
index 0000000..751846c
--- /dev/null
+++ b/src/include/rings.h
@@ -0,0 +1,17 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_RINGS_H_
+#define NCCL_RINGS_H_
+
+static int getDefaultThreads() {
+ // On Kepler, rings are doubled later.
+ return ncclCudaCompCap() == 3 ? 128 : 256;
+}
+
+ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next);
+
+#endif
diff --git a/src/include/shm.h b/src/include/shm.h
new file mode 100644
index 0000000..ce46a16
--- /dev/null
+++ b/src/include/shm.h
@@ -0,0 +1,76 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_SHM_H_
+#define NCCL_SHM_H_
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPtr, void** devShmPtr, int create) {
+ *shmPtr = NULL;
+ int fd = shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+ if (fd == -1) {
+ WARN("shm_open failed to open %s : %s", shmname, strerror(errno));
+ return ncclSystemError;
+ }
+
+ if (create) {
+ int res = posix_fallocate(fd, 0, shmsize);
+ if (res != 0) {
+ WARN("Unable to allocate shared memory (%d bytes) : %s", shmsize, strerror(res));
+ shm_unlink(shmname);
+ close(fd);
+ return ncclSystemError;
+ }
+ }
+
+ void *ptr = mmap(NULL, shmsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ close(fd);
+ if (ptr == MAP_FAILED) {
+ WARN("failure in mmap of %s (size %d) : %s", shmname, shmsize, strerror(errno));
+ shm_unlink(shmname);
+ return ncclSystemError;
+ }
+ if (create) {
+ memset(ptr, 0, shmsize);
+ }
+
+ cudaError_t e;
+ if ((e=cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped)) != cudaSuccess) {
+ WARN("failed to register host buffer %p : %s", ptr, cudaGetErrorString(e));
+ if (create) shm_unlink(shmname);
+ munmap(ptr, shmsize);
+ return ncclUnhandledCudaError;
+ }
+
+ if ((e=cudaHostGetDevicePointer(devShmPtr, ptr, 0)) != cudaSuccess) {
+ WARN("failed to get device pointer for local shmem %p : %s", ptr, cudaGetErrorString(e));
+ if (create) shm_unlink(shmname);
+ munmap(ptr, shmsize);
+ return ncclUnhandledCudaError;
+ }
+ *shmPtr = ptr;
+ return ncclSuccess;
+}
+
+static ncclResult_t shmUnlink(const char* shmname) {
+ if (shmname != NULL) SYSCHECK(shm_unlink(shmname), "shm_unlink");
+ return ncclSuccess;
+}
+
+static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) {
+ CUDACHECK(cudaHostUnregister(shmPtr));
+ if (munmap(shmPtr, shmsize) != 0) {
+ WARN("munmap of shared memory failed");
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+#endif
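
A sketch of the expected create/attach/unlink/close pairing; the segment name and size are placeholders and the attaching peer is elided.

static ncclResult_t exampleShm() {
  const char* name = "/nccl-example-0";  // placeholder name
  const int size = 1 << 20;
  void *hostPtr, *devPtr;

  // Creating side: create=1 allocates, zeroes and CUDA-registers the mapping.
  NCCLCHECK(shmOpen(name, size, &hostPtr, &devPtr, 1));
  // ... the peer attaches with create=0 using the same name ...

  // Once both sides are attached the name can be unlinked, and each side
  // eventually releases its own mapping.
  NCCLCHECK(shmUnlink(name));
  NCCLCHECK(shmClose(hostPtr, devPtr, size));
  return ncclSuccess;
}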
diff --git a/src/include/socket.h b/src/include/socket.h
new file mode 100644
index 0000000..3321e4d
--- /dev/null
+++ b/src/include/socket.h
@@ -0,0 +1,401 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_SOCKET_H_
+#define NCCL_SOCKET_H_
+
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <netinet/tcp.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <ifaddrs.h>
+#include <net/if.h>
+#include "utils.h"
+
+#define MAX_IF_NAME_SIZE 16
+#define SLEEP_INT 1000 // sleep interval in usec
+#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec)
+
+/* Common socket address storage structure for IPv4/IPv6 */
+union socketAddress {
+ struct sockaddr sa;
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+};
+
+/* Format a string representation of a (struct sockaddr *) socket address using getnameinfo()
+ *
+ * Output: "IPv4/IPv6 address<port>"
+ */
+static inline const char *socketToString(struct sockaddr *saddr, char *buf) {
+ if (buf == NULL || saddr == NULL) return NULL;
+ if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
+ char host[NI_MAXHOST], service[NI_MAXSERV];
+ (void) getnameinfo(saddr, sizeof(union socketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV);
+ sprintf(buf, "%s<%s>", host, service);
+ return buf;
+}
+
+/* Allow the user to force the IPv4/IPv6 interface selection */
+static inline int envSocketFamily(void) {
+ int family = -1; // Family selection is not forced, will use first one found
+ char* env = getenv("NCCL_SOCKET_FAMILY");
+ if (env == NULL)
+ return family;
+
+ if (strcmp(env, "AF_INET") == 0)
+ family = AF_INET; // IPv4
+ else if (strcmp(env, "AF_INET6") == 0)
+ family = AF_INET6; // IPv6
+ return family;
+}
+
+static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
+ char line[1024];
+ struct netIf userIfs[maxIfs];
+ bool searchNot = prefixList && prefixList[0] == '^';
+ int nUserIfs = parseStringList(prefixList, userIfs, maxIfs);
+
+ int found = 0;
+ struct ifaddrs *interfaces, *interface;
+ getifaddrs(&interfaces);
+ for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
+ if (interface->ifa_addr == NULL) continue;
+
+ /* We only support IPv4 & IPv6 */
+ int family = interface->ifa_addr->sa_family;
+ if (family != AF_INET && family != AF_INET6)
+ continue;
+
+ TRACE(INIT|NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
+
+ /* Allow the caller to force the socket family type */
+ if (sock_family != -1 && family != sock_family)
+ continue;
+
+ /* We also need to skip IPv6 loopback interfaces */
+ if (family == AF_INET6) {
+ struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
+ if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
+ }
+
+ // check against user specified interfaces
+ if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs) ^ searchNot)) {
+ continue;
+ }
+
+ // Check that this interface has not already been saved
+ // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
+ bool duplicate = false;
+ for (int i = 0; i < found; i++) {
+ if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
+ }
+
+ if (!duplicate) {
+ // Store the interface name
+ strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
+ // Store the IP address
+ int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
+ memcpy(addrs+found, interface->ifa_addr, salen);
+ INFO(INIT|NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
+ found++;
+ }
+ }
+
+ freeifaddrs(interfaces);
+ return found;
+}
+
+static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
+ /* Check family first */
+ int family = local_if.ifa_addr->sa_family;
+ if (family != remote.sa.sa_family) {
+ return false;
+ }
+
+ if (family == AF_INET) {
+ struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
+ struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
+ struct sockaddr_in& remote_addr = remote.sin;
+ struct in_addr local_subnet, remote_subnet;
+ local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
+ remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
+ return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
+ } else if (family == AF_INET6) {
+ struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
+ struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
+ struct sockaddr_in6& remote_addr = remote.sin6;
+ struct in6_addr& local_in6 = local_addr->sin6_addr;
+ struct in6_addr& mask_in6 = mask->sin6_addr;
+ struct in6_addr& remote_in6 = remote_addr.sin6_addr;
+ bool same = true;
+ int len = 16; // an IPv6 address is 16 bytes
+ for (int c = 0; c < len; c++) { //Network byte order is big-endian
+ char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
+ char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
+ if (c1 ^ c2) {
+ same = false;
+ break;
+ }
+ }
+ // Finally, compare the scope id:
+ // two link-local addresses can have the same subnet address even though they are not in the same scope.
+ // For global addresses this field is 0, so the comparison is a no-op.
+ same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
+ return same;
+ } else {
+ WARN("Net : Unsupported address family type");
+ return false;
+ }
+}
+
+static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) {
+ char line[1024], line_a[1024];
+ int found = 0;
+ struct ifaddrs *interfaces, *interface;
+ getifaddrs(&interfaces);
+ for (interface = interfaces; interface && !found; interface = interface->ifa_next) {
+ if (interface->ifa_addr == NULL) continue;
+
+ /* We only support IPv4 & IPv6 */
+ int family = interface->ifa_addr->sa_family;
+ if (family != AF_INET && family != AF_INET6)
+ continue;
+
+ // check against user specified interfaces
+ if (!matchSubnet(*interface, remoteAddr)) {
+ continue;
+ }
+
+ // Store the local IP address
+ int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
+ memcpy(localAddrs+found, interface->ifa_addr, salen);
+
+ // Store the interface name
+ strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
+
+ INFO(INIT|NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
+ found++;
+ if (found == maxIfs) break;
+ }
+
+ if (found == 0) {
+ WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr.sa), line_a));
+ }
+ freeifaddrs(interfaces);
+ return found;
+}
+
+static ncclResult_t GetSocketAddrFromString(union socketAddress* ua, const char* ip_port_pair) {
+ if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
+ WARN("Net : string is null");
+ return ncclInvalidArgument;
+ }
+
+ bool ipv6 = ip_port_pair[0] == '[';
+ /* Construct the sockaddress structure */
+ if (!ipv6) {
+ struct netIf ni;
+ // parse <ip_or_hostname>:<port> string, expect one pair
+ if (parseStringList(ip_port_pair, &ni, 1) != 1) {
+ WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
+ return ncclInvalidArgument;
+ }
+
+ struct addrinfo hints, *p;
+ int rv;
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_family = AF_UNSPEC;
+ hints.ai_socktype = SOCK_STREAM;
+
+ if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
+ WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
+ return ncclInvalidArgument;
+ }
+
+ // use the first
+ if (p->ai_family == AF_INET) {
+ struct sockaddr_in& sin = ua->sin;
+ memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
+ sin.sin_family = AF_INET; // IPv4
+ //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address
+ sin.sin_port = htons(ni.port); // port
+ } else if (p->ai_family == AF_INET6) {
+ struct sockaddr_in6& sin6 = ua->sin6;
+ memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
+ sin6.sin6_family = AF_INET6; // IPv6
+ sin6.sin6_port = htons(ni.port); // port
+ sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
+ sin6.sin6_scope_id = 0; // should be global scope, set to 0
+ } else {
+ WARN("Net : unsupported IP family");
+ return ncclInvalidArgument;
+ }
+
+ freeaddrinfo(p); // all done with this structure
+
+ } else {
+ int i, j = -1, len = strlen(ip_port_pair);
+ for (i = 1; i < len; i++) {
+ if (ip_port_pair[i] == '%') j = i;
+ if (ip_port_pair[i] == ']') break;
+ }
+ if (i == len) {
+ WARN("Net : No valid [IPv6]:port pair found");
+ return ncclInvalidArgument;
+ }
+ bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope
+
+ char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
+ memset(ip_str, '\0', sizeof(ip_str));
+ memset(port_str, '\0', sizeof(port_str));
+ memset(if_name, '\0', sizeof(if_name));
+ strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1);
+ strncpy(port_str, ip_port_pair+i+2, len-i-1);
+ int port = atoi(port_str);
+ if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name
+
+ struct sockaddr_in6& sin6 = ua->sin6;
+ sin6.sin6_family = AF_INET6; // IPv6
+ inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address
+ sin6.sin6_port = htons(port); // port
+ sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
+ sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope
+ }
+ return ncclSuccess;
+}
+
+static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
+ int nIfs = 0;
+ // Allow user to force the INET socket family selection
+ int sock_family = envSocketFamily();
+ // User specified interface
+ char* env = getenv("NCCL_SOCKET_IFNAME");
+ if (env && strlen(env) > 1) {
+ // Specified by user : find or fail
+ nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ } else {
+ // Try to automatically pick the right one
+ // Start with IB
+ nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ // else see if we can get some hint from COMM ID
+ if (nIfs == 0) {
+ char* commId = getenv("NCCL_COMM_ID");
+ if (commId && strlen(commId) > 1) {
+ // Try to find interface that is in the same subnet as the IP in comm id
+ union socketAddress idAddr;
+ GetSocketAddrFromString(&idAddr, commId);
+ nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, idAddr, ifNameMaxSize, maxIfs);
+ }
+ }
+ // Then look for anything else (but not docker or lo)
+ if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ // Finally look for docker, then lo.
+ if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ }
+ return nIfs;
+}
+
+static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) {
+ /* IPv4/IPv6 support */
+ int family = localAddr->sa.sa_family;
+ int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
+
+ /* Create socket and bind it to a port */
+ int sockfd = socket(family, SOCK_STREAM, 0);
+ if (sockfd == -1) {
+ WARN("Net : Socket creation failed : %s", strerror(errno));
+ return ncclSystemError;
+ }
+
+ int opt = 1;
+ SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
+
+ // localAddr port should be 0 (Any port)
+ SYSCHECK(bind(sockfd, &localAddr->sa, salen), "bind");
+
+ /* Get the assigned Port */
+ socklen_t size = salen;
+ SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname");
+
+#ifdef ENABLE_TRACE
+ char line[1024];
+ TRACE(INIT|NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
+#endif
+
+ /* Put the socket in listen mode */
+ SYSCHECK(listen(sockfd, 128), "listen");
+ *fd = sockfd;
+ return ncclSuccess;
+}
+
+static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
+ /* IPv4/IPv6 support */
+ int family = remoteAddr->sa.sa_family;
+ int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
+
+ /* Connect to a hostname / port */
+ *fd = socket(family, SOCK_STREAM, 0);
+ if (*fd == -1) {
+ WARN("Net : Socket creation failed : %s", strerror(errno));
+ return ncclSystemError;
+ }
+
+ const int one = 1;
+ SYSCHECK(setsockopt(*fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
+
+ /* const int bufsize = 128*1024;
+ SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
+ SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
+
+#ifdef ENABLE_TRACE
+ char line[1024];
+ TRACE(INIT|NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
+#endif
+
+ SYSCHECKNTIMES(connect(*fd, &remoteAddr->sa, salen), "connect", RETRY_TIMES, SLEEP_INT, ECONNREFUSED);
+ return ncclSuccess;
+}
+
+static ncclResult_t socketReceive(int fd, void* ptr, int size) {
+ char* data = (char*)ptr;
+ int offset = 0;
+ while (offset < size) {
+ int recvsize;
+ SYSCHECKVAL(recv(fd, data, size-offset, 0), "recv", recvsize);
+ if (recvsize == 0) {
+ WARN("Net : Connection closed by remote peer");
+ return ncclSystemError;
+ }
+ if (recvsize == -1) {
+ INFO(NET,"Recv : got retcode %d, retrying", errno);
+ continue;
+ }
+ data += recvsize;
+ offset += recvsize;
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t socketSend(int fd, void* ptr, int size) {
+ char* data = (char*)ptr;
+ int offset = 0;
+ while (offset < size) {
+ int sendsize;
+ SYSCHECKVAL(write(fd, data, size-offset), "write", sendsize);
+ if (sendsize == -1) {
+ INFO(NET,"Send : got retcode %d, retrying", errno);
+ continue;
+ }
+ data += sendsize;
+ offset += sendsize;
+ }
+ return ncclSuccess;
+}
+
+#endif
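
A condensed sketch of how the helpers above combine, done as a loopback exchange inside a single process; real callers exchange the listening address out of band and run the two sides in different processes. The 4-byte payload is small enough that connect/send can complete before accept is called.

static ncclResult_t exampleExchange() {
  // "Server" side: bind to any port on the first interface found.
  char ifName[MAX_IF_NAME_SIZE];
  union socketAddress addr;
  if (findInterfaces(ifName, &addr, MAX_IF_NAME_SIZE, 1) == 0) return ncclSystemError;
  int listenFd;
  NCCLCHECK(createListenSocket(&listenFd, &addr));   // addr now carries the assigned port

  // "Client" side (normally another process): connect to that address and send.
  int sendFd;
  NCCLCHECK(connectAddress(&sendFd, &addr));
  int payload = 42;
  NCCLCHECK(socketSend(sendFd, &payload, sizeof(payload)));

  // Back on the server side: accept and receive.
  union socketAddress peer; socklen_t len = sizeof(peer);
  int recvFd = accept(listenFd, &peer.sa, &len);
  if (recvFd == -1) return ncclSystemError;
  int value;
  NCCLCHECK(socketReceive(recvFd, &value, sizeof(value)));

  close(sendFd); close(recvFd); close(listenFd);
  return ncclSuccess;
}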
diff --git a/src/include/topo.h b/src/include/topo.h
new file mode 100644
index 0000000..24d7e9d
--- /dev/null
+++ b/src/include/topo.h
@@ -0,0 +1,83 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TOPO_H_
+#define NCCL_TOPO_H_
+
+#include "nccl.h"
+#include <ctype.h>
+
+#define MAXPATHSIZE 1024
+
+static ncclResult_t getCudaPath(int cudaDev, char** path) {
+ char busId[16];
+ CUDACHECK(cudaDeviceGetPCIBusId(busId, 16, cudaDev));
+ for (int i=0; i<16; i++) busId[i] = tolower(busId[i]);
+ char busPath[] = "/sys/class/pci_bus/0000:00/device";
+ memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, sizeof("0000:00")-1);
+ char* cudaRpath = realpath(busPath, NULL);
+ char pathname[MAXPATHSIZE];
+ strncpy(pathname, cudaRpath, MAXPATHSIZE);
+ strncpy(pathname+strlen(pathname), "/", MAXPATHSIZE-strlen(pathname));
+ strncpy(pathname+strlen(pathname), busId, MAXPATHSIZE-strlen(pathname));
+ free(cudaRpath);
+ *path = realpath(pathname, NULL);
+ if (*path == NULL) {
+ WARN("Could not find real path of %s", pathname);
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t getMlxPath(char* ibName, char** path) {
+ char devicepath[MAXPATHSIZE];
+ snprintf(devicepath, MAXPATHSIZE, "/sys/class/infiniband/%s/device", ibName);
+ *path = realpath(devicepath, NULL);
+ if (*path == NULL) {
+ WARN("Could not find real path of %s", devicepath);
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t getSockPath(char* ifName, char** path) {
+ char devicepath[MAXPATHSIZE];
+ snprintf(devicepath, MAXPATHSIZE, "/sys/class/net/%s/device", ifName);
+ *path = realpath(devicepath, NULL);
+ if (*path == NULL) {
+ INFO(NET|INIT, "Could not find real path of %s", devicepath);
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+enum ncclIbPathDist {
+ PATH_PIX = 0,
+ PATH_PXB = 1,
+ PATH_PHB = 2,
+ PATH_SOC = 3
+};
+
+static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
+
+static int pciDistance(char* path1, char* path2) {
+ int score = 0;
+ int depth = 0;
+ int same = 1;
+ for (int i=0; i<strlen(path1); i++) {
+ if (path1[i] != path2[i]) same = 0;
+ if (path1[i] == '/') {
+ depth++;
+ if (same == 1) score++;
+ }
+ }
+ if (score == 3) return PATH_SOC;
+ if (score == 4) return PATH_PHB;
+ if (score == depth-1) return PATH_PIX;
+ return PATH_PXB;
+}
+
+#endif
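
pciDistance() walks the two sysfs paths in lockstep and counts how many '/'-separated components they share; the more they share, the closer the devices. A sketch with hypothetical paths for a GPU and a NIC hanging off the same PCIe switch:

static void examplePciDistance() {
  // Hypothetical sysfs paths: both devices sit behind switch ports under root port 0000:00:02.0.
  char gpu[] = "/sys/devices/pci0000:00/0000:00:02.0/0000:05:08.0/0000:06:00.0";
  char nic[] = "/sys/devices/pci0000:00/0000:00:02.0/0000:05:0c.0/0000:07:00.0";
  int dist = pciDistance(gpu, nic);  // shares every hop but the last one -> PATH_PIX
  INFO(INIT, "GPU/NIC PCI distance: %s", pathDists[dist]);
}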
diff --git a/src/include/transport.h b/src/include/transport.h
new file mode 100644
index 0000000..59f83c9
--- /dev/null
+++ b/src/include/transport.h
@@ -0,0 +1,113 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TRANSPORT_H_
+#define NCCL_TRANSPORT_H_
+
+#include "nccl.h"
+#include <stdint.h>
+
+#define NTRANSPORTS 3
+
+extern struct ncclTransport ncclTransports[];
+
+// Forward declarations
+struct ncclRing;
+struct ncclConnector;
+struct ncclComm;
+
+#define RANK_INFO_SIZE 64
+typedef char ncclTinfo_t[RANK_INFO_SIZE];
+
+struct ncclInfo {
+ ncclTinfo_t tinfo[NTRANSPORTS];
+};
+
+// Used to hold the transport connection values
+typedef int64_t ncclTvalue_t;
+
+#define CONNECT_SIZE 128
+struct ncclConnect {
+ char data[CONNECT_SIZE];
+};
+
+struct ncclProxyArgs {
+ struct ncclRing* ring;
+ int substeps;
+ int nsteps;
+ uint64_t opCount;
+ int llMode;
+ bool needProxy;
+ int active; // add component before this line -- it is left out during initialization
+};
+
+struct ncclTransportComm {
+ ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*);
+ ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
+ ncclResult_t (*free)(void*);
+ ncclResult_t (*proxy)(struct ncclProxyArgs*);
+};
+
+struct ncclTransport {
+ const char name[4];
+ ncclResult_t (*fillInfo)(ncclTinfo_t*, int);
+ ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*);
+ ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
+ struct ncclTransportComm send;
+ struct ncclTransportComm recv;
+};
+
+#include <pthread.h>
+
+typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
+
+#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS
+
+struct transportProxyInfo {
+ struct ncclComm* comm;
+ pthread_t thread;
+ threadFunc_t func;
+ volatile int proxyReady;
+ struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE];
+ volatile uint64_t argsFifoHead;
+ volatile uint64_t argsFifoTail;
+ pthread_cond_t cond;
+ pthread_mutex_t mutex;
+};
+
+ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm);
+ncclResult_t transportDestroyProxy(struct ncclConnector* connector);
+
+enum proxyMode {
+ proxyRing = 0,
+ proxyFrom = 1,
+ proxyTo = 2
+};
+
+static int proxyPatternRing = proxyRing;
+static inline int proxyPatternFrom(int root) { return 1+root; }
+static inline int proxyPatternTo(int root) { return -1-root; }
+static inline enum proxyMode proxyPatternMode(int pattern) { return (pattern == 0) ? proxyRing : ((pattern > 0) ? proxyFrom : proxyTo); }
+static inline int proxyPatternRoot(int pattern) { return (pattern > 0) ? pattern-1 : -pattern-1; }
+
+ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm);
+ncclResult_t transportStartProxies(struct ncclComm* comm);
+
+#include <unistd.h>
+
+// Spin wait until func evaluates to true
+template<typename FUNC>
+inline void transportProxyWait(const FUNC& func) {
+ while (!func()) {
+ sched_yield();
+ }
+}
+
+inline void transportProxyIdle(int idle) {
+ sched_yield();
+}
+
+#endif
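
The proxy pattern is a single int encoding both a mode and a root rank: 0 is a plain ring, positive values mean "from root", negative values mean "to root". A round-trip sketch:

static void exampleProxyPattern() {
  int pattern = proxyPatternFrom(3);                // encodes as 1+3 == 4
  enum proxyMode mode = proxyPatternMode(pattern);  // proxyFrom
  int root = proxyPatternRoot(pattern);             // 3
  // The opposite direction, toward rank 3:
  int toPattern = proxyPatternTo(3);                // encodes as -1-3 == -4
  // proxyPatternMode(toPattern) == proxyTo, proxyPatternRoot(toPattern) == 3
  (void)mode; (void)root; (void)toPattern;          // silence unused warnings in this sketch
}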
diff --git a/src/include/utils.h b/src/include/utils.h
new file mode 100644
index 0000000..5a6a588
--- /dev/null
+++ b/src/include/utils.h
@@ -0,0 +1,25 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_UTILS_H_
+#define NCCL_UTILS_H_
+
+#include "nccl.h"
+#include <stdint.h>
+
+ncclResult_t getHostName(char* hostname, int maxlen);
+uint64_t getHostHash();
+uint64_t getPidHash();
+
+struct netIf {
+ char prefix[64];
+ int port;
+};
+
+int parseStringList(const char* string, struct netIf* ifList, int maxList);
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize);
+
+#endif
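
parseStringList()/matchIfList() back the interface filtering used in socket.h above (e.g. NCCL_SOCKET_IFNAME="ib" or "^docker,lo"). A hedged usage sketch, with semantics inferred from that caller; the exact matching rules live in the .cu implementation, which is not shown here.

struct netIf ifs[8];
int n = parseStringList("eth0,ib", ifs, 8);   // up to 8 prefix[:port] entries
bool hit = matchIfList("ib0", -1, ifs, n);    // expected to match the "ib" prefix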
diff --git a/src/init.cu b/src/init.cu
new file mode 100644
index 0000000..6669251
--- /dev/null
+++ b/src/init.cu
@@ -0,0 +1,815 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "core.h"
+#include "ring.h"
+#include "param.h"
+#include "nvmlwrap.h"
+#include "rings.h"
+#include "bootstrap.h"
+#include "transport.h"
+#include "common_coll.h"
+#include "group.h"
+#include "utils.h"
+#include "net.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sched.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <cuda_runtime.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+
+DebugLevel ncclDebugLevel;
+uint64_t ncclDebugMask = INIT; // Default debug sub-system mask is INIT
+pthread_mutex_t ncclDebugOutputLock;
+FILE *ncclDebugFile = stdout;
+
+#ifdef ENABLE_TRACE
+std::chrono::high_resolution_clock::time_point ncclEpoch;
+#endif
+
+#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)
+#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
+#else
+#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
+#endif
+
+NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
+
+NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
+
+extern "C" __attribute__ ((visibility("default")))
+ncclNet_t* ncclNet = NULL;
+
+// We define this as weak to let tests redefine their own
+#pragma weak ncclCudaCompCap
+int ncclCudaCompCap() {
+ int cudaDev;
+ if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
+ int ccMajor;
+ if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
+ return ccMajor;
+}
+int ncclCudaFullCompCap() {
+ int cudaDev;
+ if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
+ int ccMajor, ccMinor;
+ if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
+ if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0;
+ return ccMajor*10+ccMinor;
+}
+
+void initNet() {
+ if (ncclNet != NULL) {
+ INFO(INIT,"Using external Network %s", ncclNetName());
+ } else {
+ ncclNet = ncclIbSupport() ? &ncclNetIb : &ncclNetSocket;
+ INFO(INIT,"Using internal Network %s", ncclNetName());
+ }
+}
+
+NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
+NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", NCCL_THREAD_THRESHOLD);
+
+pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
+static bool initialized = false;
+static ncclResult_t ncclInit() {
+ if (initialized) return ncclSuccess;
+ pthread_mutex_lock(&initLock);
+ if (!initialized) {
+ initEnv();
+ initDebug();
+ initNet();
+ initialized = true;
+ }
+ pthread_mutex_unlock(&initLock);
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclGetVersion, int* version);
+ncclResult_t ncclGetVersion(int* version) {
+ if (version == NULL) return ncclInvalidArgument;
+ *version = NCCL_VERSION_CODE;
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
+ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
+ NCCLCHECK(ncclInit());
+ NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
+ return bootstrapGetUniqueId(out);
+}
+
+static ncclResult_t commFree(ncclComm_t comm) {
+ if (comm == NULL)
+ return ncclSuccess;
+
+ CUDACHECK(cudaFree(comm->devComm));
+
+ for (int ring=0; ring<comm->nRings; ring++)
+ NCCLCHECK(freeRing(comm->rings+ring));
+
+ if (comm->doneEvent != NULL)
+ CUDACHECK(cudaEventDestroy(comm->doneEvent));
+
+ if (comm->launchMode == ncclComm::GROUP) {
+ CUDACHECK(cudaStreamDestroy(comm->groupStream));
+ }
+
+ // Last rank frees shared resources between threads
+ int isLast;
+ NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
+ if (isLast) {
+ free(comm->intraBarrier);
+ free(comm->intraParams);
+ free(comm->intraCudaDevs);
+ free(comm->intraCGMode);
+ free(comm->intraCC);
+ }
+
+ free(comm);
+ return ncclSuccess;
+}
+
+static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
+ if (ndev < 1) {
+ WARN("invalid device count (%d) requested", ndev);
+ return ncclInvalidArgument;
+ }
+ if (rank >= ndev || rank < 0) {
+ WARN("rank %d exceeds ndev=%d", rank, ndev);
+ return ncclInvalidArgument;
+ }
+
+ // Try to create a CUDA object right away. If there is something wrong with
+ // the device we're on (failure cause #1), better to know it early.
+ cudaEvent_t doneEvent;
+ CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));
+
+ struct ncclComm* comm;
+ NCCLCHECK(ncclCalloc(&comm, 1));
+
+ INFO(INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
+ comm->rank = rank;
+ comm->nRanks = ndev;
+ cudaGetDevice(&comm->cudaDev);
+ comm->doneEvent = doneEvent;
+ comm->llThreshold = ncclParamLlThreshold();
+ comm->threadThreshold = ncclParamThreadThreshold();
+ comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
+#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)
+ comm->groupCudaStream = ncclParamGroupCudaStream();
+#else
+ // Don't allow the user to overload the default setting in older CUDA builds
+ comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
+#endif
+
+ comm->argsptr = &comm->args;
+
+ *comret = comm;
+ return ncclSuccess;
+}
+
+static ncclResult_t devCommSetup(ncclComm_t comm) {
+ // Fully duplicate the comm on the device
+ NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
+ // Copy the comm on the device
+ NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
+ // Copy userRanks
+ for (int r=0; r<comm->nRings; r++) {
+ NCCLCHECK(ncclCudaMemcpy(comm->rings[r].devUserRanks, comm->rings[r].userRanks, comm->nRanks));
+ }
+ return ncclSuccess;
+}
+
+// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
+#define STR2(v) #v
+#define STR(v) STR2(v)
+#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
+static void showVersion() {
+ static int shown = 0;
+ if (shown == 0 && ncclDebugLevel >= VERSION) {
+ printf("%s\n", VERSION_STRING);
+ fflush(stdout);
+ if (ncclDebugFile != stdout)
+ INFO(ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
+ shown = 1;
+ }
+}
+
+static ncclResult_t fillInfo(struct ncclInfo* info, int rank) {
+ for (int t=0; t<NTRANSPORTS; t++) {
+ NCCLCHECK(ncclTransports[t].fillInfo(info->tinfo+t, rank));
+ }
+ return ncclSuccess;
+}
+
+template <int type>
+static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) {
+ for (int t=0; t<NTRANSPORTS; t++) {
+ struct ncclTransport *transport = ncclTransports+t;
+ struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
+ ncclTvalue_t ret = 0;
+ NCCLCHECK(transport->canConnect(&ret, myInfo->tinfo+t, peerInfo->tinfo+t));
+ if (ret > 0) {
+ NCCLCHECK(transportComm->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring));
+ *transportRet = transport;
+ return ncclSuccess;
+ }
+ }
+ WARN("No transport found!");
+ *transportRet = NULL;
+ return ncclInternalError;
+}
+
+static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) {
+ NCCLCHECK(initRing(comm, ringid));
+
+ struct ncclRing* ring = comm->rings+ringid;
+ // Reorganize ranks to start with rank.
+ int shift;
+ for (shift = 0; shift<nranks; shift++) {
+ if (ringRanks[shift] == rank) {
+ break;
+ }
+ }
+ for (int i=0; i<nranks; i++) {
+ ring->userRanks[i] = ringRanks[(i+shift)%nranks];
+ }
+ int prev = ring->userRanks[nranks-1];
+ int next = ring->userRanks[1];
+
+ NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+0, &ring->recv.transport, ring));
+ NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+1, &ring->send.transport, ring));
+ NCCLCHECK(transportCreateProxy(0, ring, comm));
+ NCCLCHECK(transportCreateProxy(1, ring, comm));
+ return ncclSuccess;
+}
+
+static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
+ for (int r=0; r<nranks; r++) {
+ connectTransport[r] = -1;
+ for (int t=0; t<NTRANSPORTS; t++) {
+ NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, allInfo[rank].tinfo+t, allInfo[r].tinfo+t));
+ if (connectValue[r] > 0) {
+ connectTransport[r] = t;
+ break;
+ }
+ }
+ }
+ return ncclSuccess;
+}
+
+static void swap(void* mem1, void* mem2, int size) {
+ char tmp[size];
+ memcpy(tmp, mem1, size); memcpy(mem1, mem2, size); memcpy(mem2, tmp, size);
+}
+
+#define MAXWIDTH 20
+#define PREFIXLEN 15
+#define STRLENGTH (PREFIXLEN+4*MAXWIDTH)
+void dumpMatrix(int* connectMatrix, int nranks) {
+ char line[STRLENGTH+1];
+ line[STRLENGTH] = '\0';
+ memset(line, ' ', STRLENGTH);
+ for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j);
+ INFO(INIT,"%s", line);
+ for (int i=0; i<nranks; i++) {
+ memset(line, ' ', STRLENGTH);
+ sprintf(line, "%3d ", i);
+ for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]);
+ INFO(INIT,"%s", line);
+ }
+}
+
+void dumpLine(int* values, int nranks, const char* prefix) {
+ int prefixlen = strlen(prefix);
+ char line[STRLENGTH+1];
+ line[STRLENGTH] = '\0';
+ memset(line, ' ', STRLENGTH);
+ strncpy(line, prefix, PREFIXLEN);
+ for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
+ INFO(INIT,"%s", line);
+}
+
+static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
+ for (int r=0; r<nrings; r++) {
+ char prefix[30];
+ /*sprintf(prefix, "[%d] Ring %d Prev : ", rank, r);
+ dumpLine(prev+r*nranks, nranks, prefix);
+ sprintf(prefix, "[%d] Ring %d Next : ", rank, r);
+ dumpLine(next+r*nranks, nranks, prefix);*/
+
+ int current = rank;
+ for (int i=0; i<nranks; i++) {
+ rings[r*nranks+i] = current;
+ current = next[r*nranks+current];
+ }
+ sprintf(prefix, "Ring %02d : ", r);
+ if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
+ if (current != rank) {
+ WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
+ return ncclInternalError;
+ }
+ // Check that all ranks are there
+ for (int i=0; i<nranks; i++) {
+ int found = 0;
+ for (int j=0; j<nranks; j++) {
+ if (rings[r*nranks+j] == i) {
+ found = 1;
+ break;
+ }
+ }
+ if (found == 0) {
+ WARN("Error : ring %d does not contain rank %d", r, i);
+ return ncclInternalError;
+ }
+ }
+ }
+ return ncclSuccess;
+}
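buildRings follows the next[] chain starting at the calling rank and checks that the resulting ring both loops back to the start and contains every rank. A tiny worked sketch (not part of this patch) for a single 4-rank ring:

  // Illustrative only: next = {1,2,3,0} yields the ring 0 1 2 3 for rank 0.
  int prev[4] = {3, 0, 1, 2};
  int next[4] = {1, 2, 3, 0};
  int rings[4];
  buildRings(1, rings, 0, 4, prev, next);   // rings == {0, 1, 2, 3}, loops back to rank 0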
+
+void* waitForNonNullPtr(void* p) {
+ volatile void** ptr = (volatile void**) p;
+ while (*ptr == NULL) sched_yield();
+ return (void*)*ptr;
+}
+
+ncclResult_t initParams(struct ncclComm* comm) {
+ struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
+ params->args = &comm->argsptr;
+ params->stream = NULL;
+ params->sharedMem = 0;
+ params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
+ params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1;
+ return ncclSuccess;
+}
+
+// Allocate/Set Intra Process Structures and set CG options
+ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
+ comm->intraRank = rank;
+ comm->intraRanks = ranks;
+ comm->intraPhase = 0;
+
+ // Alloc shared structures
+ if (rank == 0) {
+ assert(comm == comm0);
+ int* bar;
+ NCCLCHECK(ncclCalloc(&bar, 2));
+ bar[0] = bar[1] = 0;
+ comm->intraBarrier = bar;
+ NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
+ NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
+ int* CGMode;
+ NCCLCHECK(ncclCalloc(&CGMode, 1));
+ *CGMode = 0x11;
+ comm->intraCGMode = CGMode;
+ int* CC;
+ NCCLCHECK(ncclCalloc(&CC, 1));
+ *CC = ncclCudaFullCompCap();
+ comm->intraCC = CC;
+ } else {
+ comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
+ comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
+ comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
+ comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
+ comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
+ }
+ comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
+ NCCLCHECK(initParams(comm));
+
+ int cgMdLaunch = 0;
+
+ // Set CG Mode
+ comm->launchMode = ncclComm::GROUP;
+ char* str = getenv("NCCL_LAUNCH_MODE");
+ if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
+ comm->launchMode = ncclComm::PARALLEL;
+ }
+ if (comm->launchMode == ncclComm::GROUP) {
+ CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking));
+#if __CUDACC_VER_MAJOR__ >= 9
+ if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
+ // Check whether the GPU supports Cooperative Group Multi Device Launch
+ (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
+ }
+#endif
+ }
+
+ // Disable cgMdLaunch if any rank does not support it
+ if (cgMdLaunch == 0) {
+ *comm->intraCGMode = 0x10;
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
+ int rank = comm->rank;
+ int nranks = comm->nRanks;
+ void* commState;
+ NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState));
+
+ struct ncclInfo* allInfo;
+ NCCLCHECK(ncclCalloc(&allInfo, nranks));
+ NCCLCHECK(fillInfo(allInfo+rank, rank));
+ NCCLCHECK(bootstrapAllGather(commState, allInfo, sizeof(struct ncclInfo)));
+
+ int* connectTransport;
+ ncclTvalue_t* connectValue;
+ NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
+ NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
+
+ NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
+ NCCLCHECK(bootstrapAllGather(commState, connectTransport, nranks*(sizeof(int))));
+ NCCLCHECK(bootstrapAllGather(commState, connectValue, nranks*(sizeof(ncclTvalue_t))));
+ //if (rank == 0) dumpMatrix(connectTransport, nranks);
+ //if (rank == 0) dumpMatrix(connectValue, nranks);
+
+ // Get my rings
+ int nrings;
+ int* prev, *next;
+ NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
+ NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
+ comm->nThreads = getDefaultThreads();
+ NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next));
+ free(connectTransport);
+ free(connectValue);
+
+ // Find max nThreads
+ int allData[nranks];
+ allData[rank] = comm->nThreads;
+ NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
+ for (int i=0; i<nranks; i++)
+ comm->nThreads = std::max(allData[i], comm->nThreads);
+ if (rank == 0) INFO(INIT,"Using %d threads", comm->nThreads);
+
+ // Determine the minimum CUDA Compute capability of all GPUs
+ int myCompCap = ncclCudaCompCap();
+ int minCompCap = myCompCap;
+ allData[rank] = myCompCap;
+ NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
+ for (int i=0; i<nranks; i++)
+ minCompCap = std::min(allData[i], minCompCap);
+ if (rank == 0) INFO(INIT,"Min Comp Cap %d", minCompCap);
+
+ // Find min nrings across ranks
+ allData[rank] = nrings;
+ NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
+ for (int i=0; i<nranks; i++)
+ nrings = std::min(allData[i], nrings);
+
+ // Exchange data with others to build complete rings
+ comm->nRings = nrings;
+ for (int r=0; r<nrings; r++) {
+ NCCLCHECK(bootstrapAllGather(commState, prev+r*nranks, sizeof(int)));
+ NCCLCHECK(bootstrapAllGather(commState, next+r*nranks, sizeof(int)));
+ }
+ int *rings;
+ NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
+ NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
+ free(prev);
+ free(next);
+
+ // Connect with prev/next for each ring
+ for (int r=0; r<nrings; r++) {
+ int* ringRanks = rings+r*nranks;
+ struct ncclRing *ring = comm->rings+r;
+ struct ncclConnect connect[2];
+ NCCLCHECK(setupRing(comm, r, rank, nranks, ringRanks, allInfo, connect));
+ NCCLCHECK(bootstrapRingExchange(commState, connect, ring->userRanks[nranks-1], ring->userRanks[1], sizeof(struct ncclConnect)));
+ NCCLCHECK(ring->send.transport->send.connect(connect+1, &ring->send));
+ NCCLCHECK(ring->recv.transport->recv.connect(connect+0, &ring->recv));
+ }
+ free(rings);
+ free(allInfo);
+
+ // Intra-process barrier setup
+ struct rankInfo {
+ uint64_t hostHash;
+ uint64_t pidHash;
+ struct ncclComm* comm;
+ } rankInfos[nranks];
+ rankInfos[rank].hostHash = getHostHash();
+ rankInfos[rank].pidHash = getPidHash();
+ rankInfos[rank].comm = comm;
+ NCCLCHECK(bootstrapAllGather(commState, rankInfos, sizeof(struct rankInfo)));
+
+ // Compute intra ranks
+ int intraRank0 = -1, intraRank = -1, intraRanks = 0;
+ for (int r=0; r<nranks; r++) {
+ if ((rankInfos[r].hostHash == rankInfos[rank].hostHash) &&
+ (rankInfos[r].pidHash == rankInfos[rank].pidHash)) {
+ if (intraRanks == 0) intraRank0 = r;
+ if (r == rank) intraRank = intraRanks;
+ intraRanks++;
+ }
+ }
+ TRACE(INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+ rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
+ if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
+ WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+ rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
+ return ncclInternalError;
+ }
+ NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, rankInfos[intraRank0].comm));
+
+ // Barrier
+ bootstrapClose(commState);
+ return ncclSuccess;
+}
+
+bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) {
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ if (cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != cudaSuccess) return false;
+ if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false;
+ if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) {
+ WARN("Failed to set CPU affinity");
+ return false;
+ }
+ return true;
+}
+
+ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
+ cpu_set_t affinitySave;
+ sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+
+ NCCLCHECK(wrapNvmlSymbols());
+ NCCLCHECK(wrapNvmlInit());
+
+ // Make sure all host memory allocations are close to the GPU
+ int cudaDev;
+ nvmlDevice_t nvmlDevice;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ SetCpuAffinity(cudaDev, &nvmlDevice);
+ ncclResult_t res;
+
+ NCCLCHECKGOTO(commAlloc(newcomm, ndev, myrank), res, cleanup);
+ NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
+ NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
+
+ sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
+ return ncclSuccess;
+cleanup:
+ *newcomm = NULL;
+ sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ return res;
+}
+
+NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
+ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
+ char* env = getenv("NCCL_COMM_ID");
+ if (env && myrank == 0) {
+ NCCLCHECK(bootstrapCreateRoot(&commId, true));
+ }
+
+ NCCLCHECK(ncclInit());
+ if (myrank == 0) showVersion();
+
+ INFO(INIT,"rank %d nranks %d", myrank, nranks);
+
+ // Make sure the CUDA runtime is initialized.
+ CUDACHECK(cudaFree(NULL));
+
+ NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
+ if (nranks < 1 || myrank < 0 || myrank >= nranks) {
+ WARN("Invalid rank requested : %d/%d", myrank, nranks);
+ return ncclInvalidArgument;
+ }
+
+ if (ncclAsyncMode()) {
+ int cudaDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
+ } else {
+ return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
+ }
+}
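A usage sketch for the multi-process path above: rank 0 creates the unique id, every rank receives it out of band, then all ranks call ncclCommInitRank collectively. MPI is used here purely as an example transport for the id; any out-of-band broadcast works.

  // Illustrative only: one process per GPU, id distributed via MPI (an assumption).
  #include <mpi.h>
  #include <nccl.h>
  #include <cuda_runtime.h>

  void initOneRankPerGpu(int myRank, int nRanks, int localDev, ncclComm_t* comm) {
    ncclUniqueId id;
    if (myRank == 0) ncclGetUniqueId(&id);                    // rank 0 creates the id
    MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);  // share it with all ranks
    cudaSetDevice(localDev);                                  // select this rank's GPU
    ncclCommInitRank(comm, nRanks, id, myRank);               // collective over all ranks
  }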
+
+static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
+ struct ncclInfo* allInfo;
+ NCCLCHECK(ncclCalloc(&allInfo, nranks));
+ for (int rank=0; rank<nranks; rank++) {
+ CUDACHECK(cudaSetDevice(devs[rank]));
+ NCCLCHECK(fillInfo(allInfo+rank, rank));
+ }
+
+ int* connectTransport;
+ ncclTvalue_t* connectValue;
+ NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
+ NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
+ for (int rank=0; rank<nranks; rank++)
+ NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
+
+ int* prev, *prevFinal, *next, *nextFinal;
+ NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
+ NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXRINGS));
+ NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
+ NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXRINGS));
+ int nrings = MAXRINGS;
+ int nthreads=0;
+ int myCompCap = ncclCudaCompCap();
+ int minCompCap = myCompCap;
+ for (int rank=0; rank<nranks; rank++) {
+ CUDACHECK(cudaSetDevice(devs[rank]));
+ int nringsRank;
+ int nthreadsRank = getDefaultThreads();
+ myCompCap = ncclCudaCompCap();
+ NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next));
+ nrings = std::min(nrings, nringsRank);
+ nthreads = std::max(nthreads, nthreadsRank);
+ minCompCap = std::min(minCompCap, myCompCap);
+ for (int ring=0; ring<nrings; ring++) {
+ int index = ring*nranks+rank;
+ prevFinal[index] = prev[index];
+ nextFinal[index] = next[index];
+ }
+ }
+ free(connectTransport);
+ free(connectValue);
+ free(prev);
+ free(next);
+
+ INFO(INIT,"Using %d threads", nthreads);
+ INFO(INIT,"Min Comp Cap %d", minCompCap);
+
+ int* rings;
+ NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
+ NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
+ free(prevFinal);
+ free(nextFinal);
+
+ for (int rank=0; rank<nranks; rank++) {
+ comms[rank]->nRings = nrings;
+ comms[rank]->nThreads = nthreads;
+ }
+
+ for (int r=0; r<nrings; r++) {
+ struct ncclConnect connect[2*nranks];
+ int* ringRanks = rings+r*nranks;
+ for (int rank=0; rank<nranks; rank++) {
+ CUDACHECK(cudaSetDevice(devs[rank]));
+ NCCLCHECK(setupRing(comms[rank], r, rank, nranks, ringRanks, allInfo, connect+2*rank));
+ }
+ // RingExchange connect information
+ for (int rank=0; rank<nranks; rank++) {
+ // Swap rank->prev and prevRank->next
+ struct ncclRing *ring = comms[rank]->rings+r;
+ int prevRank = ring->userRanks[nranks-1];
+ struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1;
+ struct ncclConnect* rankPrevConnect = connect+2*rank;
+ swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect));
+ }
+ for (int rank=0; rank<nranks; rank++) {
+ CUDACHECK(cudaSetDevice(devs[rank]));
+ struct ncclRing *ring = comms[rank]->rings+r;
+ NCCLCHECK(ring->send.transport->send.connect(connect+2*rank+1, &ring->send));
+ NCCLCHECK(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv));
+ }
+ }
+ free(rings);
+ free(allInfo);
+ return ncclSuccess;
+}
+
+
+NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
+ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
+ NCCLCHECK(ncclInit());
+ NCCLCHECK(wrapNvmlSymbols());
+ NCCLCHECK(wrapNvmlInit());
+ showVersion();
+
+ INFO(INIT,"nranks %d", ndev);
+
+ NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
+ if (ndev < 1) {
+ WARN("Invalid device count requested : %d", ndev);
+ return ncclInvalidArgument;
+ }
+
+ ncclResult_t res;
+ int savedDevice;
+ int rank, cudaDev;
+ ncclComm_t comm = NULL;
+ nvmlDevice_t nvmlDevice;
+ int ncclDevList[ndev];
+ for (int i=0; i<ndev; i++) {
+ ncclDevList[i] = devlist ? devlist[i] : i;
+ }
+
+ cudaGetDevice(&savedDevice);
+
+ for(rank=0; rank<ndev; ++rank)
+ comms[rank] = NULL;
+
+ cpu_set_t affinitySave;
+ sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+
+ for (rank=0; rank<ndev; ++rank) {
+ cudaDev = ncclDevList[rank];
+ CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
+
+ SetCpuAffinity(cudaDev, &nvmlDevice);
+
+ NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
+ comms[rank] = comm;
+
+ NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup);
+ }
+
+ sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+
+ NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup);
+
+ for(rank=0; rank<ndev; ++rank) {
+ cudaDev = ncclDevList[rank];
+ CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
+ NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
+ }
+
+ res = ncclSuccess;
+ goto final;
+
+cleanup:
+ for(rank=0; rank<ndev; ++rank) {
+ if(comms[rank] != NULL) {
+ commFree(comms[rank]);
+ }
+ }
+
+final:
+ if(wrapNvmlShutdown() != ncclSuccess)
+ INFO(INIT,"NCCL did not shutdown nvml properly");
+ cudaSetDevice(savedDevice);
+ sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ return res;
+}
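For the single-process path, ncclCommInitAll creates one communicator per listed device. A minimal usage sketch (not part of this patch):

  // Illustrative only: one process driving four GPUs.
  int devs[4] = {0, 1, 2, 3};
  ncclComm_t comms[4];
  ncclCommInitAll(comms, 4, devs);     // one communicator per device
  /* ... issue collectives ... */
  for (int i = 0; i < 4; i++) ncclCommDestroy(comms[i]);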
+
+NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
+ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+
+ if (comm == NULL)
+ return ncclSuccess;
+ int savedDevice;
+ CUDACHECK(cudaGetDevice(&savedDevice));
+ int commDevice = comm->cudaDev;
+
+ if (savedDevice != commDevice) {
+ CUDACHECK(cudaSetDevice(commDevice));
+ }
+
+ NCCLCHECK(commFree(comm));
+
+ if (savedDevice != commDevice)
+ CUDACHECK(cudaSetDevice(savedDevice));
+
+ return ncclSuccess;
+}
+
+NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
+const char* ncclGetErrorString(ncclResult_t code) {
+ switch (code) {
+ case ncclSuccess : return "no error";
+ case ncclUnhandledCudaError : return "unhandled cuda error";
+ case ncclSystemError : return "unhandled system error";
+ case ncclInternalError : return "internal error";
+ case ncclInvalidArgument : return "invalid argument";
+ case ncclInvalidUsage : return "invalid usage";
+ default : return "unknown result code";
+ }
+}
+
+NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
+ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
+ NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
+ NCCLCHECK(PtrCheck(count, "CommCount", "count"));
+ *count = comm->nRanks;
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
+ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
+ NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
+ NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
+ *devid = comm->cudaDev;
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
+ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
+ NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
+ NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
+ *rank = comm->rank;
+ return ncclSuccess;
+}
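A short usage sketch for the accessors above:

  // Illustrative only: querying an existing communicator.
  int n, dev, rank;
  ncclCommCount(comm, &n);        // number of ranks in the communicator
  ncclCommCuDevice(comm, &dev);   // CUDA device used by this rank
  ncclCommUserRank(comm, &rank);  // this rank's index within the communicator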
diff --git a/src/libwrap.cu b/src/libwrap.cu
deleted file mode 100644
index 1ac19a6..0000000
--- a/src/libwrap.cu
+++ /dev/null
@@ -1,155 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "libwrap.h"
-#include <dlfcn.h>
-#include "core.h"
-
-int symbolsLoaded = 0;
-
-static nvmlReturn_t (*nvmlInternalInit)(void);
-static nvmlReturn_t (*nvmlInternalShutdown)(void);
-static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
-static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
-static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
-static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
-
-ncclResult_t wrapSymbols(void) {
-
- if (symbolsLoaded)
- return ncclSuccess;
-
- static void* nvmlhandle = NULL;
- void* tmp;
- void** cast;
-
- nvmlhandle=dlopen("libnvidia-ml.so", RTLD_NOW);
- if (!nvmlhandle) {
- nvmlhandle=dlopen("libnvidia-ml.so.1", RTLD_NOW);
- if (!nvmlhandle) {
- WARN("Failed to open libnvidia-ml.so[.1]");
- goto teardown;
- }
- }
-
- #define LOAD_SYM(handle, symbol, funcptr) do { \
- cast = (void**)&funcptr; \
- tmp = dlsym(handle, symbol); \
- if (tmp == NULL) { \
- WARN("dlsym failed on %s - %s", symbol, dlerror());\
- goto teardown; \
- } \
- *cast = tmp; \
- } while (0)
-
- LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit);
- LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
- LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
- LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
- LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity);
- LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
- LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
-
- symbolsLoaded = 1;
- return ncclSuccess;
-
- teardown:
- nvmlInternalInit = NULL;
- nvmlInternalShutdown = NULL;
- nvmlInternalDeviceGetHandleByPciBusId = NULL;
- nvmlInternalDeviceGetIndex = NULL;
- nvmlInternalDeviceSetCpuAffinity = NULL;
- nvmlInternalDeviceClearCpuAffinity = NULL;
-
- if (nvmlhandle != NULL) dlclose(nvmlhandle);
- return ncclSystemError;
-}
-
-
-ncclResult_t wrapNvmlInit(void) {
- if (nvmlInternalInit == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclLibWrapperNotSet;
- }
- nvmlReturn_t ret = nvmlInternalInit();
- if (ret != NVML_SUCCESS) {
- WARN("nvmlInit() failed: %s",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlShutdown(void) {
- if (nvmlInternalShutdown == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclLibWrapperNotSet;
- }
- nvmlReturn_t ret = nvmlInternalShutdown();
- if (ret != NVML_SUCCESS) {
- WARN("nvmlShutdown() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
- if (nvmlInternalDeviceGetHandleByPciBusId == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclLibWrapperNotSet;
- }
- nvmlReturn_t ret = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device);
- if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
- if (nvmlInternalDeviceGetIndex == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclLibWrapperNotSet;
- }
- nvmlReturn_t ret = nvmlInternalDeviceGetIndex(device, index);
- if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceGetIndex() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
- if (nvmlInternalDeviceSetCpuAffinity == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclLibWrapperNotSet;
- }
- nvmlReturn_t ret = nvmlInternalDeviceSetCpuAffinity(device);
- if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceSetCpuAffinity() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
- if (nvmlInternalInit == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclLibWrapperNotSet;
- }
- nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device);
- if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceClearCpuAffinity() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
- return ncclSuccess;
-}
diff --git a/src/libwrap.h b/src/libwrap.h
deleted file mode 100644
index cdce480..0000000
--- a/src/libwrap.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-
-// Dynamically handle dependencies on external libraries (other than cudart).
-
-#ifndef SRC_LIBWRAP_H_
-#define SRC_LIBWRAP_H_
-
-#include "core.h"
-
-/* Extracted from nvml.h */
-typedef struct nvmlDevice_st* nvmlDevice_t;
-
-typedef enum nvmlReturn_enum
-{
- NVML_SUCCESS = 0, //!< The operation was successful
- NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit()
- NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid
- NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device
- NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation
- NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
- NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful
- NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough
- NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached
- NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded
- NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed
- NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU
- NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded
- NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
- NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted
- NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible
- NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again
- NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups
- NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch
- NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use
- NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred
-} nvmlReturn_t;
-/* End of nvml.h */
-
-ncclResult_t wrapSymbols(void);
-
-ncclResult_t wrapNvmlInit(void);
-ncclResult_t wrapNvmlShutdown(void);
-ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
-ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
-
-#endif // End include guard
-
diff --git a/src/misc/enqueue.cu b/src/misc/enqueue.cu
new file mode 100644
index 0000000..dc5d9cc
--- /dev/null
+++ b/src/misc/enqueue.cu
@@ -0,0 +1,248 @@
+/*************************************************************************
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "common_coll.h"
+#include "param.h"
+
+#include "collectives/collectives.h"
+
+#define NCCL_FUNC4(coll, op, dtype) \
+ (void*)NCCL_KERN_NAME(coll, op, dtype), \
+ (void*)NCCL_KERN_NAME(coll##LL, op, dtype)
+
+// Must be consistent with ncclDataType_t
+#define NCCL_FUNCS3A(coll, op) \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, u8), \
+ (void*)NCCL_FUNC4(coll, op, i32), \
+ (void*)NCCL_FUNC4(coll, op, u32), \
+ (void*)NCCL_FUNC4(coll, op, i64), \
+ (void*)NCCL_FUNC4(coll, op, u64), \
+ (void*)NCCL_FUNC4(coll, op, f16), \
+ (void*)NCCL_FUNC4(coll, op, f32), \
+ (void*)NCCL_FUNC4(coll, op, f64)
+#define NCCL_FUNCS3B(coll, op) \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8)
+
+// Must be consistent with ncclRedOp_t
+#define NCCL_FUNCS2A(coll) \
+ NCCL_FUNCS3A(coll, sum ), \
+ NCCL_FUNCS3A(coll, prod), \
+ NCCL_FUNCS3A(coll, max ), \
+ NCCL_FUNCS3A(coll, min )
+#define NCCL_FUNCS2B(coll) \
+ NCCL_FUNCS3B(coll, copy), \
+ NCCL_FUNCS3B(coll, copy), \
+ NCCL_FUNCS3B(coll, copy), \
+ NCCL_FUNCS3B(coll, copy)
+
+// Must be consistent with the ncclFuncSet enum
+static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
+ NCCL_FUNCS2B(ncclBroadcast),
+ NCCL_FUNCS2A(ncclReduce),
+ NCCL_FUNCS2B(ncclAllGather),
+ NCCL_FUNCS2A(ncclReduceScatter),
+ NCCL_FUNCS2A(ncclAllReduce)
+};
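Given the macro layout above (collective-major, then reduction op, then datatype, then regular/LL kernel), an index into ncclKerns would be computed as below; whether funcIndex is produced exactly this way elsewhere in the patch is an assumption.

  // Illustrative only: index arithmetic matching the table layout above.
  static inline int kernIndex(int coll, int op, int type, int ll) {
    return ((coll*ncclNumOps + op)*ncclNumTypes + type)*2 + ll;
  }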
+
+ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
+#if __CUDACC_VER_MAJOR__ >= 9
+ if (cgMode & 0x01) {
+ CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
+ // These flags are to reduce the latency of using this API
+ cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync));
+ return ncclSuccess;
+ }
+#endif
+ int savedDev;
+ CUDACHECK(cudaGetDevice(&savedDev));
+ for (int i = 0; i < numDevices; i++) {
+ struct cudaLaunchParams* params = paramsList+i;
+ CUDACHECK(cudaSetDevice(cudaDevs[i]));
+ CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
+ }
+ CUDACHECK(cudaSetDevice(savedDev));
+ return ncclSuccess;
+}
+
+ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
+ params->gridDim.x = std::min((int) params->gridDim.x, comm->nRings);
+
+ // Set active = 2 for the last operation
+ for (int r=0; r<params->gridDim.x; r++) {
+ struct ncclRing* ring = comm->rings+r;
+ ring->collectives[(ring->collStart+ring->collCount-1)%NCCL_MAX_OPS].active = 2;
+ }
+
+ // Find the first operation, choose the kernel accordingly and pass it
+ // as the first argument.
+ struct ncclColl* coll = comm->rings[0].collectives+comm->rings[0].collStart;
+ memcpy(&comm->args, coll, sizeof(struct ncclColl));
+ // As we pass that coll directly, we can free it immediately.
+ coll->active = 0;
+
+ params->func = ncclKerns[coll->funcIndex];
+ return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
+ volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+ int val = *ptr;
+ bool done = false;
+ while (done == false) {
+ if (val >= comm->intraRanks) {
+ WARN("Trying to launch too many collectives");
+ return ncclInvalidUsage;
+ }
+ if (val+1 == comm->intraRanks) {
+ // Reset the barrier.
+ comm->intraBarrier[comm->intraPhase^1] = 0;
+ *isLast = 1;
+ return ncclSuccess;
+ }
+ done = __sync_bool_compare_and_swap(ptr, val, val+1);
+ val++;
+ }
+ *isLast = 0;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
+ volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+ int val = *ptr;
+ if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
+ WARN("Trying to launch too many collectives");
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
+ volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+ while (*ptr < comm->intraRanks) pthread_yield();
+ comm->intraPhase ^= 1;
+ return ncclSuccess;
+}
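The three functions above form a two-phase intra-process barrier: every rank checks in, the last arrival does the shared work and checks in once more, and everyone then waits for the full count before flipping the phase. This is the calling pattern used by ncclBarrierEnqueue and ncclBarrierEnqueueWait below, sketched here for reference:

  // Illustrative only: how the barrier trio is meant to be called.
  int isLast = 0;
  ncclCpuBarrierIn(comm, &isLast);   // arrive; the last caller gets isLast == 1
  if (isLast) {
    /* launch work on behalf of all intra-process ranks */
    ncclCpuBarrierLast(comm);        // final check-in releases the waiters
  }
  ncclCpuBarrierOut(comm);           // wait for everyone, then flip the phase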
+
+ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
+ if (comm->nRanks == 1) return ncclSuccess;
+ struct cudaLaunchParams* params = comm->myParams;
+
+ NCCLCHECK(setupLaunch(comm, params));
+
+ // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
+ if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
+ // Enqueue event in user stream
+ CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream));
+ // Create dependency between user stream and internal NCCL stream
+ CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
+ params->stream = comm->groupStream;
+ } else {
+ if (comm->userStream != params->stream) {
+ // Stream changed from last call, create dependency against last NCCL kernel launch
+ CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+ }
+ params->stream = comm->userStream;
+ }
+
+ int isLast = 0;
+ NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
+
+ if (isLast) {
+ if (comm->launchMode == ncclComm::GROUP) {
+ // I'm the last. Launch all operations.
+ NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
+ }
+ NCCLCHECK(ncclCpuBarrierLast(comm));
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
+ if (comm->nRanks == 1) return ncclSuccess;
+ // We can't print the CG mode before the first barrier has happened.
+ if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
+ *comm->intraCGMode ^= 0x10;
+ INFO(INIT,"Launch mode %s%s%s",
+ comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
+ *comm->intraCGMode ? "/CGMD" : "",
+ (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
+ }
+
+ NCCLCHECK(ncclCpuBarrierOut(comm));
+
+ struct cudaLaunchParams *params = comm->myParams;
+ if (comm->launchMode == ncclComm::PARALLEL) {
+ CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
+ }
+ // Start the network proxies as soon as the kernel has been launched. We can't
+ // perform any CUDA call between the two: a cudaFree between the CUDA launch and
+ // the transportStartProxies call could cause a deadlock.
+ // Also, starting the proxies after the CUDA launch seems to be better for
+ // performance (latency).
+ for (int r=0; r<params->gridDim.x; r++) {
+ struct ncclRing* ring = comm->rings+r;
+ ring->collStart = ring->collFifoTail;
+ ring->collCount = 0;
+ }
+ params->gridDim.x = params->blockDim.x = 0;
+ NCCLCHECK(transportStartProxies(comm));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
+ struct cudaLaunchParams *params = comm->myParams;
+ // Enqueue event after NCCL kernel
+ CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
+ // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
+ if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
+ // Create dependency between NCCL internal stream and user stream
+ CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+ }
+ comm->userStreamSet = false;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
+ void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
+ ncclComm_t comm, cudaStream_t stream) {
+ if (comm == NULL) return ncclInvalidArgument;
+ // Launch asynchronously if needed
+ if (ncclAsyncMode()) {
+ ncclResult_t ret = ncclSuccess;
+ int savedDev = -1;
+ if (comm->checkPointers) {
+ CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
+ CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, end);
+ }
+ // Check arguments
+ NCCLCHECKGOTO(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName), ret, end);
+ // Always register comm even in case of error to make sure ncclGroupEnd
+ // cleans it up.
+ NCCLCHECK(ncclAsyncColl(comm));
+ NCCLCHECKGOTO(func(sendbuff, recvbuff, count, type, op, root, comm, stream), ret, end);
+end:
+ if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
+ ncclAsyncErrCheck(ret);
+ return ret;
+ } else {
+ NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName));
+ NCCLCHECK(func(sendbuff, recvbuff, count, type, op, root, comm, stream));
+ NCCLCHECK(ncclBarrierEnqueue(comm));
+ NCCLCHECK(ncclBarrierEnqueueWait(comm));
+ NCCLCHECK(ncclEnqueueEvents(comm));
+ return ncclSuccess;
+ }
+}
diff --git a/src/misc/group.cu b/src/misc/group.cu
new file mode 100644
index 0000000..c7b31cf
--- /dev/null
+++ b/src/misc/group.cu
@@ -0,0 +1,198 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "group.h"
+#include "debug.h"
+#include "enqueue.h"
+
+#define MAX_ASYNC_OPS 128
+thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
+thread_local int ncclGroupIndex = 0;
+thread_local int ncclGroupMode = 0;
+thread_local ncclResult_t ncclGroupError = ncclSuccess;
+
+bool ncclAsyncMode() {
+ return ncclGroupMode > 0;
+}
+
+ncclResult_t ncclAsyncErrCheck(ncclResult_t ret) {
+ if (ncclGroupError == ncclSuccess || ret != ncclSuccess) ncclGroupError = ret;
+ return ret;
+}
+
+struct ncclInitArgs {
+ ncclInitFunc_t func;
+ int cudaDev;
+ ncclComm_t* newcomm;
+ int ndev;
+ ncclUniqueId commId;
+ int myrank;
+};
+struct ncclCollArgs {
+ ncclComm_t comm;
+};
+
+enum ncclAsyncFuncType {
+ ASYNC_FUNC_INVALID = 0,
+ ASYNC_FUNC_INIT = 1,
+ ASYNC_FUNC_COLL = 2,
+};
+struct ncclAsyncArgs {
+ ncclResult_t ret;
+ enum ncclAsyncFuncType funcType;
+ union {
+ ncclCollArgs coll;
+ ncclInitArgs init;
+ };
+};
+
+thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
+
+ncclResult_t ncclSetDevice(int cudaDev) {
+ CUDACHECK(cudaSetDevice(cudaDev));
+ return ncclSuccess;
+}
+
+#define CHECK(a) do { \
+ if ((args->ret = (a)) != ncclSuccess) { \
+ INFO(INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
+ return args; \
+ } \
+} while(0)
+
+void* ncclAsyncThreadMain(void* args_) {
+ struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
+ CHECK(ncclSetDevice(args->init.cudaDev));
+ CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank));
+ return args;
+}
+
+ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
+ if (ncclGroupIndex >= MAX_ASYNC_OPS) {
+ WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
+ return ncclAsyncErrCheck(ncclInternalError);
+ }
+ int index = ncclGroupIndex++;
+ struct ncclAsyncArgs* args = ncclGroupArgs+index;
+ args->funcType = ASYNC_FUNC_INIT;
+ args->init.func = func;
+ args->init.cudaDev = cudaDev;
+ args->init.newcomm = newcomm;
+ args->init.ndev = ndev;
+ memcpy(&args->init.commId, &commId, sizeof(commId));
+ args->init.myrank = myrank;
+ // We need to use threads for Init
+ pthread_create(ncclGroupThreads+index, NULL, ncclAsyncThreadMain, args);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclAsyncColl(ncclComm_t comm) {
+ struct ncclAsyncArgs* args = ncclGroupArgs;
+ for (int i=0; i<ncclGroupIndex; i++) {
+ if (args->coll.comm == comm) return ncclSuccess;
+ args++;
+ }
+ if (ncclGroupIndex >= MAX_ASYNC_OPS) {
+ WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
+ return ncclAsyncErrCheck(ncclInternalError);
+ }
+ ncclGroupIndex++;
+ args->funcType = ASYNC_FUNC_COLL;
+ args->coll.comm = comm;
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclGroupStart);
+ncclResult_t ncclGroupStart() {
+ ncclGroupMode++;
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclGroupEnd);
+ncclResult_t ncclGroupEnd() {
+ ncclGroupMode--;
+ if (ncclGroupMode > 0) return ncclSuccess;
+ int savedDev;
+ CUDACHECK(cudaGetDevice(&savedDev));
+ int done = ncclGroupIndex;
+ int doneArray[ncclGroupIndex];
+ for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
+
+ ncclResult_t ret = ncclGroupError;
+ if (ret != ncclSuccess) goto group_cleanup;
+
+ /* Collectives are done in three steps :
+ * 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
+ * 2. Barrier Wait. No CUDA call is permitted
+ * 3. Enqueue Events. CUDA event wait/enqueue.
+ * This is needed because step 2 cannot call any CUDA primitive, otherwise if
+ * cudaFree happens between 1 and 3, it could block that CUDA call and
+ * prevent some ranks from launching their network threads, which would
+ * prevent the NCCL call from completing, blocking the cudaFree call.
+ */
+ for (int i=0; i<ncclGroupIndex; i++) {
+ struct ncclAsyncArgs* args = ncclGroupArgs+i;
+ if (args->funcType == ASYNC_FUNC_COLL) {
+ if (args->coll.comm->userStream == NULL)
+ CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
+ NCCLCHECKGOTO(ncclBarrierEnqueue(args->coll.comm), ret, end);
+ }
+ }
+ for (int i=0; i<ncclGroupIndex; i++) {
+ struct ncclAsyncArgs* args = ncclGroupArgs+i;
+ if (args->funcType == ASYNC_FUNC_COLL) {
+ CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
+ NCCLCHECKGOTO(ncclBarrierEnqueueWait(args->coll.comm), ret, end);
+ }
+ }
+ for (int i=0; i<ncclGroupIndex; i++) {
+ struct ncclAsyncArgs* args = ncclGroupArgs+i;
+ if (args->funcType == ASYNC_FUNC_COLL) {
+ if (args->coll.comm->userStream == NULL)
+ CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
+ NCCLCHECKGOTO(ncclEnqueueEvents(args->coll.comm), ret, end);
+ doneArray[i] = 1;
+ done--;
+ }
+ }
+
+ /* For init, since we use threads, we just wait for threads to complete */
+ while (done) {
+ for (int i=0; i<ncclGroupIndex; i++) {
+ struct ncclAsyncArgs* args = ncclGroupArgs+i;
+ if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
+ int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
+ if (err == EBUSY) continue;
+ if (err != 0) { ret = ncclSystemError; goto end; }
+ if (args->ret != ncclSuccess) { ret = args->ret; goto end; }
+ doneArray[i] = 1;
+ done--;
+ }
+ }
+ }
+ goto end;
+group_cleanup:
+ // At least one call in the group failed. Since we want to make that group
+ // an atomic operation, we need to cancel all operations.
+ for (int i=0; i<ncclGroupIndex; i++) {
+ struct ncclComm* comm = ncclGroupArgs[i].coll.comm;
+ for (int r=0; r<comm->nRings; r++) {
+ struct ncclRing* ring = comm->rings+r;
+ for (int i=0; i<ring->collCount; i++) {
+ ring->collectives[(ring->collStart + i)%NCCL_MAX_OPS].active = 0;
+ }
+ ring->collFifoTail = ring->collStart;
+ ring->collCount = 0;
+ }
+ comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
+ comm->userStreamSet = false;
+ }
+end:
+ ncclGroupError = ncclSuccess;
+ ncclGroupIndex = 0;
+ CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too
+ return ret;
+}
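A usage sketch for the group API: wrapping several per-communicator calls in ncclGroupStart/ncclGroupEnd lets the three-step launch above run once for the whole batch.

  // Illustrative only: one allreduce per communicator, launched as a single group.
  ncclGroupStart();
  for (int i = 0; i < ndev; i++) {
    ncclAllReduce(sendbuff[i], recvbuff[i], count, ncclFloat, ncclSum, comms[i], streams[i]);
  }
  ncclGroupEnd();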
diff --git a/src/misc/ibvwrap.cu b/src/misc/ibvwrap.cu
new file mode 100644
index 0000000..7ac3431
--- /dev/null
+++ b/src/misc/ibvwrap.cu
@@ -0,0 +1,290 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "ibvwrap.h"
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <dlfcn.h>
+#include "core.h"
+
+static enum { ibvUninitialized, ibvInitializing, ibvInitialized, ibvError } ibvState = ibvUninitialized;
+
+/*Function Pointers*/
+int (*ibv_internal_fork_init)(void);
+struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
+void (*ibv_internal_free_device_list)(struct ibv_device **list);
+const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
+struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
+int (*ibv_internal_close_device)(struct ibv_context *context);
+int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
+void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
+int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
+int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
+int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
+int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
+struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
+int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
+struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
+int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
+struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
+int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
+struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
+int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
+int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
+const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
+
+// IBVERBS Library versioning
+#define IBVERBS_VERSION "IBVERBS_1.1"
+
+ncclResult_t wrap_ibv_symbols(void) {
+ if (ibvState == ibvInitialized)
+ return ncclSuccess;
+ if (ibvState == ibvError)
+ return ncclSystemError;
+
+ if (__sync_bool_compare_and_swap(&ibvState, ibvUninitialized, ibvInitializing) == false) {
+ // Another thread raced in front of us. Wait for it to be done.
+ while (ibvState == ibvInitializing) pthread_yield();
+ return (ibvState == ibvInitialized) ? ncclSuccess : ncclSystemError;
+ }
+
+ static void* ibvhandle = NULL;
+ void* tmp;
+ void** cast;
+
+ ibvhandle=dlopen("libibverbs.so", RTLD_NOW);
+ if (!ibvhandle) {
+ ibvhandle=dlopen("libibverbs.so.1", RTLD_NOW);
+ if (!ibvhandle) {
+ WARN("Failed to open libibverbs.so[.1]");
+ goto teardown;
+ }
+ }
+
+#define LOAD_SYM(handle, symbol, funcptr) do { \
+ cast = (void**)&funcptr; \
+ tmp = dlvsym(handle, symbol, IBVERBS_VERSION); \
+ if (tmp == NULL) { \
+ WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), IBVERBS_VERSION); \
+ goto teardown; \
+ } \
+ *cast = tmp; \
+ } while (0)
+
+ LOAD_SYM(ibvhandle, "ibv_get_device_list", ibv_internal_get_device_list);
+ LOAD_SYM(ibvhandle, "ibv_free_device_list", ibv_internal_free_device_list);
+ LOAD_SYM(ibvhandle, "ibv_get_device_name", ibv_internal_get_device_name);
+ LOAD_SYM(ibvhandle, "ibv_open_device", ibv_internal_open_device);
+ LOAD_SYM(ibvhandle, "ibv_close_device", ibv_internal_close_device);
+ LOAD_SYM(ibvhandle, "ibv_get_async_event", ibv_internal_get_async_event);
+ LOAD_SYM(ibvhandle, "ibv_ack_async_event", ibv_internal_ack_async_event);
+ LOAD_SYM(ibvhandle, "ibv_query_device", ibv_internal_query_device);
+ LOAD_SYM(ibvhandle, "ibv_query_port", ibv_internal_query_port);
+ LOAD_SYM(ibvhandle, "ibv_query_gid", ibv_internal_query_gid);
+ LOAD_SYM(ibvhandle, "ibv_query_qp", ibv_internal_query_qp);
+ LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibv_internal_alloc_pd);
+ LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibv_internal_dealloc_pd);
+ LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr);
+ LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr);
+ LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq);
+ LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq);
+ LOAD_SYM(ibvhandle, "ibv_create_qp", ibv_internal_create_qp);
+ LOAD_SYM(ibvhandle, "ibv_modify_qp", ibv_internal_modify_qp);
+ LOAD_SYM(ibvhandle, "ibv_destroy_qp", ibv_internal_destroy_qp);
+ LOAD_SYM(ibvhandle, "ibv_fork_init", ibv_internal_fork_init);
+ LOAD_SYM(ibvhandle, "ibv_event_type_str", ibv_internal_event_type_str);
+
+ ibvState = ibvInitialized;
+ return ncclSuccess;
+
+teardown:
+ ibv_internal_get_device_list = NULL;
+ ibv_internal_free_device_list = NULL;
+ ibv_internal_get_device_name = NULL;
+ ibv_internal_open_device = NULL;
+ ibv_internal_close_device = NULL;
+ ibv_internal_get_async_event = NULL;
+ ibv_internal_ack_async_event = NULL;
+ ibv_internal_query_device = NULL;
+ ibv_internal_query_port = NULL;
+ ibv_internal_query_gid = NULL;
+ ibv_internal_query_qp = NULL;
+ ibv_internal_alloc_pd = NULL;
+ ibv_internal_dealloc_pd = NULL;
+ ibv_internal_reg_mr = NULL;
+ ibv_internal_dereg_mr = NULL;
+ ibv_internal_create_cq = NULL;
+ ibv_internal_destroy_cq = NULL;
+ ibv_internal_create_qp = NULL;
+ ibv_internal_modify_qp = NULL;
+ ibv_internal_destroy_qp = NULL;
+ ibv_internal_fork_init = NULL;
+ ibv_internal_event_type_str = NULL;
+
+ if (ibvhandle != NULL) dlclose(ibvhandle);
+ ibvState = ibvError;
+ return ncclSystemError;
+}
+
+#define IBV_PTR_CHECK_ERRNO(name_internal, call, retval, error_retval, name) \
+ if (name_internal == NULL) { \
+ WARN("lib wrapper not initialized."); \
+ return ncclInternalError; \
+ } \
+ retval = call; \
+ if (retval == error_retval) { \
+ WARN("Call to " name " failed with error %s", strerror(errno)); \
+ return ncclSystemError; \
+ } \
+ return ncclSuccess;
+
+#define IBV_PTR_CHECK(name_internal, call, retval, error_retval, name) \
+ if (name_internal == NULL) { \
+ WARN("lib wrapper not initialized."); \
+ return ncclInternalError; \
+ } \
+ retval = call; \
+ if (retval == error_retval) { \
+ WARN("Call to " name " failed"); \
+ return ncclSystemError; \
+ } \
+ return ncclSuccess;
+
+#define IBV_INT_CHECK_RET_ERRNO(name_internal, call, success_retval, name) \
+ if (name_internal == NULL) { \
+ WARN("lib wrapper not initialized."); \
+ return ncclInternalError; \
+ } \
+ int ret = call; \
+ if (ret != success_retval) { \
+ WARN("Call to " name " failed with error %s", strerror(ret)); \
+ return ncclSystemError; \
+ } \
+ return ncclSuccess;
+
+#define IBV_INT_CHECK(name_internal, call, error_retval, name) \
+ if (name_internal == NULL) { \
+ WARN("lib wrapper not initialized."); \
+ return ncclInternalError; \
+ } \
+ int ret = call; \
+ if (ret == error_retval) { \
+ WARN("Call to " name " failed"); \
+ return ncclSystemError; \
+ } \
+ return ncclSuccess;
+
+#define IBV_PASSTHRU(name_internal, call) \
+ if (name_internal == NULL) { \
+ WARN("lib wrapper not initialized."); \
+ return ncclInternalError; \
+ } \
+ call; \
+ return ncclSuccess;
+
+ncclResult_t wrap_ibv_fork_init() {
+ IBV_INT_CHECK(ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init");
+}
+
+ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices) {
+ *ret = ibv_internal_get_device_list(num_devices);
+ if (*ret == NULL) *num_devices = 0;
+ return ncclSuccess;
+}
+
+ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list) {
+ IBV_PASSTHRU(ibv_internal_free_device_list, ibv_internal_free_device_list(list));
+}
+
+const char *wrap_ibv_get_device_name(struct ibv_device *device) {
+ if (ibv_internal_get_device_name == NULL) {
+ WARN("lib wrapper not initialized.");
+ exit(-1);
+ }
+ return ibv_internal_get_device_name(device);
+}
+
+ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device) { /*returns 0 on success, -1 on failure*/
+ IBV_PTR_CHECK(ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device");
+}
+
+ncclResult_t wrap_ibv_close_device(struct ibv_context *context) { /*returns 0 on success, -1 on failure*/
+ IBV_INT_CHECK(ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device");
+}
+
+ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event) { /*returns 0 on success, and -1 on error*/
+ IBV_INT_CHECK(ibv_internal_get_async_event, ibv_internal_get_async_event(context, event), -1, "ibv_get_async_event");
+}
+
+ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event) {
+ IBV_PASSTHRU(ibv_internal_ack_async_event, ibv_internal_ack_async_event(event));
+}
+
+ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
+ IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device");
+}
+
+ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
+ IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port");
+}
+
+ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) {
+ IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid");
+}
+
+ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) {
+ IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_qp, ibv_internal_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp");
+}
+
+ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) {
+ IBV_PTR_CHECK(ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd");
+}
+
+ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
+ IBV_INT_CHECK_RET_ERRNO(ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd");
+}
+
+ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) {
+ IBV_PTR_CHECK(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
+}
+
+struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
+ if (ibv_internal_reg_mr == NULL) {
+ WARN("lib wrapper not initialized.");
+ return NULL;
+ }
+ return ibv_internal_reg_mr(pd, addr, length, access);
+}
+
+ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
+ IBV_INT_CHECK_RET_ERRNO(ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr");
+}
+
+ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) {
+ IBV_PTR_CHECK(ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq");
+}
+
+ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) {
+ IBV_INT_CHECK_RET_ERRNO(ibv_internal_destroy_cq, ibv_internal_destroy_cq(cq), 0, "ibv_destroy_cq");
+}
+
+ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) {
+ IBV_INT_CHECK_RET_ERRNO(ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp");
+}
+
+ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) {
+ IBV_PTR_CHECK(ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
+}
+
+ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
+ IBV_INT_CHECK_RET_ERRNO(ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp");
+}
+
+ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) {
+ *ret = (char *) ibv_internal_event_type_str(event);
+ return ncclSuccess;
+}
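
The wrappers above exist so that NCCL takes no hard link-time dependency on libibverbs: every entry point is resolved at runtime with dlopen/dlsym, stored in an ibv_internal_* pointer, and the IBV_*_CHECK macros refuse to run if that resolution never happened. Below is a minimal standalone sketch of the same pattern; it is an illustration only (not part of this patch) and uses an opaque pointer type instead of struct ibv_device to stay self-contained.

/* sketch_dlopen_verbs.c - illustration of the LOAD_SYM pattern above.
 * Build with: gcc sketch_dlopen_verbs.c -ldl */
#include <stdio.h>
#include <dlfcn.h>

int main(void) {
  void* handle = dlopen("libibverbs.so.1", RTLD_NOW);
  if (handle == NULL) {
    fprintf(stderr, "libibverbs.so.1 not available: %s\n", dlerror());
    return 1;  /* caller can fall back to another transport */
  }
  /* Opaque stand-in for ibv_get_device_list(int *num_devices). */
  void** (*get_device_list)(int*) = NULL;
  *(void**)&get_device_list = dlsym(handle, "ibv_get_device_list");
  if (get_device_list == NULL) {
    fprintf(stderr, "dlsym(ibv_get_device_list) failed: %s\n", dlerror());
    dlclose(handle);
    return 1;
  }
  int ndev = 0;
  void** list = get_device_list(&ndev);
  printf("found %d verbs device(s)\n", list ? ndev : 0);
  dlclose(handle);
  return 0;
}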
diff --git a/src/misc/nvmlwrap.cu b/src/misc/nvmlwrap.cu
new file mode 100644
index 0000000..52d4d41
--- /dev/null
+++ b/src/misc/nvmlwrap.cu
@@ -0,0 +1,248 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nvmlwrap.h"
+
+#ifndef NVML_DIRECT
+#include <dlfcn.h>
+#include "core.h"
+
+static enum { nvmlUninitialized, nvmlInitializing, nvmlInitialized, nvmlError } nvmlState = nvmlUninitialized;
+
+static nvmlReturn_t (*nvmlInternalInit)(void);
+static nvmlReturn_t (*nvmlInternalShutdown)(void);
+static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
+static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
+static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
+static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
+static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
+static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
+static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
+static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
+static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
+ nvmlNvLinkCapability_t capability, unsigned int *capResult);
+
+ncclResult_t wrapNvmlSymbols(void) {
+ if (nvmlState == nvmlInitialized)
+ return ncclSuccess;
+ if (nvmlState == nvmlError)
+ return ncclSystemError;
+
+ if (__sync_bool_compare_and_swap(&nvmlState, nvmlUninitialized, nvmlInitializing) == false) {
+ // Another thread raced in front of us. Wait for it to be done.
+ while (nvmlState == nvmlInitializing) pthread_yield();
+ return (nvmlState == nvmlInitialized) ? ncclSuccess : ncclSystemError;
+ }
+
+ static void* nvmlhandle = NULL;
+ void* tmp;
+ void** cast;
+
+ nvmlhandle=dlopen("libnvidia-ml.so.1", RTLD_NOW);
+ if (!nvmlhandle) {
+ WARN("Failed to open libnvidia-ml.so.1");
+ goto teardown;
+ }
+
+#define LOAD_SYM(handle, symbol, funcptr) do { \
+ cast = (void**)&funcptr; \
+ tmp = dlsym(handle, symbol); \
+ if (tmp == NULL) { \
+ WARN("dlsym failed on %s - %s", symbol, dlerror());\
+ goto teardown; \
+ } \
+ *cast = tmp; \
+ } while (0)
+
+#define LOAD_SYM_OPTIONAL(handle, symbol, funcptr) do {\
+ cast = (void**)&funcptr; \
+ tmp = dlsym(handle, symbol); \
+ if (tmp == NULL) { \
+ INFO(INIT,"dlsym failed on %s, ignoring", symbol); \
+ } \
+ *cast = tmp; \
+ } while (0)
+
+ LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit);
+ LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
+ LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
+ LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
+ LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity);
+ LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
+ LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
+ LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
+ LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
+ LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
+ LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
+
+ nvmlState = nvmlInitialized;
+ return ncclSuccess;
+
+teardown:
+ nvmlInternalInit = NULL;
+ nvmlInternalShutdown = NULL;
+ nvmlInternalDeviceGetHandleByPciBusId = NULL;
+ nvmlInternalDeviceGetIndex = NULL;
+ nvmlInternalDeviceSetCpuAffinity = NULL;
+ nvmlInternalDeviceClearCpuAffinity = NULL;
+ nvmlInternalDeviceGetPciInfo = NULL;
+ nvmlInternalDeviceGetNvLinkState = NULL;
+ nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
+ nvmlInternalDeviceGetNvLinkCapability = NULL;
+
+ if (nvmlhandle != NULL) dlclose(nvmlhandle);
+ nvmlState = nvmlError;
+ return ncclSystemError;
+}
+
+
+ncclResult_t wrapNvmlInit(void) {
+ if (nvmlInternalInit == NULL) {
+ WARN("lib wrapper not initialized.");
+ return ncclInternalError;
+ }
+ nvmlReturn_t ret = nvmlInternalInit();
+ if (ret != NVML_SUCCESS) {
+ WARN("nvmlInit() failed: %s",
+ nvmlInternalErrorString(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlShutdown(void) {
+ if (nvmlInternalShutdown == NULL) {
+ WARN("lib wrapper not initialized.");
+ return ncclInternalError;
+ }
+ nvmlReturn_t ret = nvmlInternalShutdown();
+ if (ret != NVML_SUCCESS) {
+ WARN("nvmlShutdown() failed: %s ",
+ nvmlInternalErrorString(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
+ if (nvmlInternalDeviceGetHandleByPciBusId == NULL) {
+ WARN("lib wrapper not initialized.");
+ return ncclInternalError;
+ }
+ nvmlReturn_t ret = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device);
+ if (ret != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ",
+ nvmlInternalErrorString(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
+ if (nvmlInternalDeviceGetIndex == NULL) {
+ WARN("lib wrapper not initialized.");
+ return ncclInternalError;
+ }
+ nvmlReturn_t ret = nvmlInternalDeviceGetIndex(device, index);
+ if (ret != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetIndex() failed: %s ",
+ nvmlInternalErrorString(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
+ if (nvmlInternalDeviceSetCpuAffinity == NULL) {
+ WARN("lib wrapper not initialized.");
+ return ncclInternalError;
+ }
+ // Workaround: it seems nvmlDeviceSetCpuAffinity is not thread-safe.
+ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+ pthread_mutex_lock(&lock);
+ nvmlReturn_t ret = nvmlInternalDeviceSetCpuAffinity(device);
+ pthread_mutex_unlock(&lock);
+ if (ret != NVML_SUCCESS) {
+ WARN("nvmlDeviceSetCpuAffinity() failed: %s ",
+ nvmlInternalErrorString(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
+ if (nvmlInternalDeviceClearCpuAffinity == NULL) {
+ WARN("lib wrapper not initialized.");
+ return ncclInternalError;
+ }
+ nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device);
+ if (ret != NVML_SUCCESS) {
+ WARN("nvmlDeviceClearCpuAffinity() failed: %s ",
+ nvmlInternalErrorString(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
+ if (nvmlInternalDeviceGetPciInfo == NULL) {
+ WARN("lib wrapper not initialized.");
+ return ncclInternalError;
+ }
+ nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
+ if (ret != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetPciInfo() failed: %s ",
+ nvmlInternalErrorString(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
+ if (nvmlInternalDeviceGetNvLinkState == NULL) {
+ /* Do not warn, this symbol is optional. */
+ return ncclInternalError;
+ }
+ nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
+ if (ret != NVML_SUCCESS) {
+ INFO(INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
+ nvmlInternalErrorString(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
+ if (nvmlInternalDeviceGetNvLinkRemotePciInfo == NULL) {
+ /* Do not warn, this symbol is optional. */
+ return ncclInternalError;
+ }
+ nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci);
+ if (ret != NVML_SUCCESS) {
+ if (ret != NVML_ERROR_NOT_SUPPORTED)
+ INFO(INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
+ nvmlInternalErrorString(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+ nvmlNvLinkCapability_t capability, unsigned int *capResult) {
+ if (nvmlInternalDeviceGetNvLinkCapability == NULL) {
+ /* Do not warn, this symbol is optional. */
+ return ncclInternalError;
+ }
+ nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult);
+ if (ret != NVML_SUCCESS) {
+ if (ret != NVML_ERROR_NOT_SUPPORTED)
+ INFO(INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
+ nvmlInternalErrorString(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+#endif
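
wrapNvmlSymbols (like wrap_ibv_symbols before it) uses a small state machine plus __sync_bool_compare_and_swap so that exactly one thread performs the dlopen/dlsym sequence while concurrent callers spin until the result is published. The sketch below isolates that idiom; doExpensiveInit() is a hypothetical placeholder for the symbol loading, sched_yield stands in for pthread_yield, and the state is marked volatile so the spin loop rereads it.

#include <sched.h>
#include <stdio.h>

typedef enum { stUninitialized, stInitializing, stInitialized, stError } initState_t;
static volatile initState_t state = stUninitialized;

static int doExpensiveInit(void) { return 0; }  /* placeholder for dlopen/dlsym */

int lazyInit(void) {
  if (state == stInitialized) return 0;
  if (state == stError) return -1;
  /* Exactly one thread wins the Uninitialized -> Initializing transition. */
  if (!__sync_bool_compare_and_swap(&state, stUninitialized, stInitializing)) {
    /* Everyone else waits for the winner to publish the final state. */
    while (state == stInitializing) sched_yield();
    return (state == stInitialized) ? 0 : -1;
  }
  int rc = doExpensiveInit();
  state = (rc == 0) ? stInitialized : stError;  /* publish the outcome */
  return rc;
}

int main(void) { printf("init -> %d\n", lazyInit()); return 0; }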
diff --git a/src/misc/rings.cu b/src/misc/rings.cu
new file mode 100644
index 0000000..9ecda49
--- /dev/null
+++ b/src/misc/rings.cu
@@ -0,0 +1,355 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "net.h"
+#include "param.h"
+
+/* Parse user-defined rings. The format looks like:
+ * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
+ * Rings with a non-matching number of ranks are ignored so we can provide
+ * rings for multiple cases.
+ */
+#define MAX_ENV_RANKS 512
+static ncclResult_t parseRings(const char* str, int* nringsRet, int nranks, int* prev, int* next) {
+ int ranks[MAX_ENV_RANKS];
+ int nrings = 0;
+ int rank = 0;
+ int offset = 0;
+ int status = 0; // 0 : between numbers, 1 : inside number
+ do {
+ int digit = str[offset] - '0';
+ if (digit >= 0 && digit <= 9) {
+ if (status == 0) {
+ ranks[rank] = digit;
+ status = 1;
+ } else {
+ ranks[rank] = ranks[rank]*10+digit;
+ }
+ } else {
+ if (status == 1) {
+ rank++;
+ if (rank == MAX_ENV_RANKS) goto end;
+ }
+ status = 0;
+ if (str[offset] == '|' || str[offset] == '\0') {
+ int prevRank = ranks[rank-1];
+ // Ignore rings if nranks doesn't match
+ if (rank != nranks) goto newring;
+
+ for (int r=0; r<nranks; r++) {
+ int rank = ranks[r];
+ // Ignore rings with ranks out of bounds
+ if (rank < 0 || rank >= nranks) goto newring;
+ // Ignore rings with duplicate ranks
+ for (int i=0; i<r; i++)
+ if (ranks[i] == rank) goto newring;
+
+ next[nrings*nranks+prevRank] = rank;
+ prev[nrings*nranks+rank] = prevRank;
+ prevRank = rank;
+ }
+ nrings++;
+newring:
+ rank = 0;
+ }
+ }
+ } while (str[offset++] != 0);
+end:
+ *nringsRet = nrings;
+ return ncclSuccess;
+}
+
+/*
+ * Ring creation algorithm
+ *
+ * First, we establish hierarchical coordinates depending on the way ranks can
+ * communicate. After fillCoords, we have for each rank a unique 3-int array
+ * { node, pci_domain, rank } corresponding to the three transports :
+ * { 2[NET], 1[SHM], 0[P2P] }.
+ * Also, we renumber ranks (to indexes) based on their growing coordinates.
+ *
+ * Then, we ask transports to connect groups together. We start with net, then
+ * shm, then p2p. We maintain two arrays, prev and next, where values are equal
+ * to -1 when ranks are not yet connected, and a rank otherwise. We never
+ * connect ranks outside our group, meaning that on 4 nodes of 2 sockets of 4
+ * ranks, if we are rank 13, we should see something like (provided we have a
+ * single net interface, hence a single ring):
+ *
+ * Connecting all nodes <13>
+ * 2[NET] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 -1 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
+ * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 -1 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
+ *
+ * Connecting P2P domains with shared memory <13>
+ * 1[SHM] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
+ * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
+ *
+ * Connecting ranks (only inside the P2P domain) <13>
+ * 0[P2P] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 12 13 14 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
+ * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 13 14 15 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
+ *
+ * Hence, when we ask a transport to connect groups, we provide it with a subview of the ranks (except for net
+ * which always sees the full world). That way, P2P can brute-force all combinations inside the node without
+ * risking a combinatorial explosion, and we scale better.
+ *
+ * Finally, we loop over Network scores to try to create rings with high scores (=locality) and decrease until
+ * we get at least one ring.
+ */
+
+static void recIsConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
+ connected[rank] = 1;
+ for (int r=0; r<nranks; r++) {
+ if (connected[r] == 0 && matrix[rank*nranks+r] == transport) {
+ recIsConnected(r, connected, nranks, matrix, transport);
+ }
+ }
+}
+
+static void isConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
+ for (int r=0; r<nranks; r++) connected[r] = 0;
+ recIsConnected(rank, connected, nranks, matrix, transport);
+}
+
+#define NEW_IDX(rank) do { \
+ rankToIdx[rank] = idx; \
+ idxToRank[idx] = rank; \
+ for (int t=0; t<NTRANSPORTS; t++) coords[rank*NTRANSPORTS+t] = current[t]; \
+ idx++; \
+} while (0)
+
+int findConnected(int rank, int* matrix, int nranks, int transport, int* coords) {
+ for (int r=0; r<nranks; r++) {
+ if (coords[r*NTRANSPORTS] == -1 && matrix[rank*nranks+r] == transport) return r;
+ }
+ return -1;
+}
+
+static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankToIdx, int* idxToRank) {
+ int current[NTRANSPORTS];
+ int* p2pConnected;
+ NCCLCHECK(ncclCalloc(&p2pConnected, nranks));
+ for (int i=0; i<NTRANSPORTS; i++) current[i] = 0;
+ int curRank = 0, idx = 0;
+ while (1) {
+ // P2P is handled separately as there is no level below it and we need to
+ // cover the case of being connected to another GPU indirectly.
+ // So we detect all GPUs in the same P2P domain once and add them all at
+ // once.
+ isConnected(curRank, p2pConnected, nranks, matrix, 0);
+ for (int r=0; r<nranks; r++) {
+ if (p2pConnected[r]) {
+ NEW_IDX(r);
+ curRank = r;
+ current[0]++;
+ }
+ }
+ current[0] = 0;
+
+ if (idx == nranks) {
+ free(p2pConnected);
+ return ncclSuccess;
+ }
+
+ // Find next group, either connected through SHM or NET.
+ int rank;
+ int transport = 1;
+ while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) {
+ current[transport] = 0;
+ transport++;
+ if (transport == NTRANSPORTS) { free(p2pConnected); return ncclInternalError; }
+ }
+ curRank = rank;
+ current[transport]++;
+ }
+}
+
+NCCL_PARAM(MinNrings, "MIN_NRINGS", 0);
+NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0);
+
+/* Users can force the number of threads with an environment variable */
+NCCL_PARAM(Nthreads, "NTHREADS", -2);
+ncclResult_t getEnvThreads(int* nthreads) {
+ int64_t nt = ncclParamNthreads();
+ if (nt != -2)
+ *nthreads = nt;
+ return ncclSuccess;
+}
+
+/* Main ring creation function */
+ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next) {
+ *nrings = 0;
+
+ if (nranks == 1) return ncclSuccess;
+
+ char* str = getenv("NCCL_RINGS");
+ if (str && strlen(str)>0) {
+ int ret = parseRings(str, nrings, nranks, prev, next);
+ if (ret == ncclSuccess && *nrings > 0) {
+ if (rank == 0) INFO(INIT,"%d ring(s) set by environment", *nrings);
+ NCCLCHECK(getEnvThreads(nthreads));
+ return ncclSuccess;
+ }
+ if (rank == 0) INFO(INIT,"No valid ring found in environment, ignoring");
+ *nrings = 0;
+ }
+
+ // Compute hierarchical topology groups, indexes, and rank<->index tables
+ int* coords, *globalIdxToRank, *globalRankToIdx;
+ NCCLCHECK(ncclCalloc(&coords, nranks*NTRANSPORTS));
+ for (int i=0; i<nranks*NTRANSPORTS; i++) coords[i] = -1;
+ NCCLCHECK(ncclCalloc(&globalIdxToRank, nranks));
+ NCCLCHECK(ncclCalloc(&globalRankToIdx, nranks));
+
+ NCCLCHECK(fillCoords(nranks, transports, coords, globalRankToIdx, globalIdxToRank));
+
+ // Start with a high score, then decrease until we find rings
+ int minScore = NCCL_MAX_SCORE;
+ int nringsTmp;
+ int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
+ NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXRINGS));
+ NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXRINGS));
+ NCCLCHECK(ncclCalloc(&idxToRank, nranks));
+ NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
+ NCCLCHECK(ncclCalloc(&groups, nranks));
+ NCCLCHECK(ncclCalloc(&subgroups, nranks));
+
+ int nThreads;
+ do {
+ nThreads = *nthreads;
+ for (int i=0; i<nranks*MAXRINGS; i++) prevTmp[i] = nextTmp[i] = -1;
+ nringsTmp = MAXRINGS;
+ // Loop over transports to connect groups
+ for (int t=NTRANSPORTS-1; t>=0; t--) {
+ for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;
+
+ int nidx = 0;
+ for (int i=0; i<nranks; i++) {
+ // Extract only ranks in the same local area as rank
+ // We need to extract them in the topological order, hence we iterate over indexes, not ranks
+ int r = globalIdxToRank[i];
+ int sameLocal = 1;
+ for (int tr = NTRANSPORTS-1; tr > t; tr--) if (coords[r*NTRANSPORTS+tr] != coords[rank*NTRANSPORTS+tr]) sameLocal = 0;
+ if (!sameLocal) continue;
+
+ groups[nidx] = coords[r*NTRANSPORTS+t];
+ subgroups[nidx] = t ? coords[r*NTRANSPORTS+t-1] : nidx;
+ rankToIdx[r] = nidx;
+ idxToRank[nidx] = r;
+ nidx++;
+ }
+
+ int ngroups = groups[nidx-1] + 1; // Coords should be ordered
+
+ ncclTvalue_t* subvalues;
+ int *subprev, *subnext;
+ NCCLCHECK(ncclCalloc(&subvalues, nidx*nidx));
+ NCCLCHECK(ncclCalloc(&subprev, nidx*nringsTmp));
+ NCCLCHECK(ncclCalloc(&subnext, nidx*nringsTmp));
+ if (ngroups > 1) {
+ /* Extract subvalues */
+ for (int i=0; i<nidx; i++) {
+ for (int j=0; j<nidx; j++) {
+ if (transports[idxToRank[i]*nranks+idxToRank[j]] == t)
+ subvalues[i*nidx+j] = values[idxToRank[i]*nranks+idxToRank[j]];
+ else
+ subvalues[i*nidx+j] = 0;
+ }
+ }
+ /* Extract subprev/subnext */
+ for (int i=0; i<nidx*nringsTmp; i++) {
+ subprev[i] = subnext[i] = -1;
+ }
+ for (int r=0; r<nringsTmp; r++) {
+ int start = -1, end = -1;
+ for (int i=0; i<nranks; i++) {
+ if (rankToIdx[i] == -1) continue;
+ if (prevTmp[r*nranks+i] != -1) start = i;
+ if (nextTmp[r*nranks+i] != -1) end = i;
+ }
+ if (start != -1 && end != -1) {
+ subprev[r*nidx+rankToIdx[start]] = rankToIdx[end];
+ subnext[r*nidx+rankToIdx[end]] = rankToIdx[start];
+ }
+ }
+ /* Get rings */
+ NCCLCHECK(ncclTransports[t].getRings(nidx, groups, subgroups, subvalues, &nringsTmp, subprev, subnext, minScore, &nThreads));
+ /* Merge subprev/subnext into prev/next */
+ for (int r=0; r<nringsTmp; r++) {
+ for (int i=0; i<nidx; i++) {
+ if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
+ if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
+ }
+ }
+ //for (int r=0; r<nringsTmp; r++) {
+ //printf("[%d] [%d] [%d] [%d] Prev ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", prevTmp[r*nranks+i]); printf("\n");
+ //printf("[%d] [%d] [%d] [%d] Next ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", nextTmp[r*nranks+i]); printf("\n");
+ //}
+ }
+ free(subvalues);
+ free(subprev);
+ free(subnext);
+ if (nringsTmp == 0) break;
+ }
+ minScore--;
+ if (nringsTmp > *nrings) {
+ *nrings = nringsTmp;
+ for (int i=0; i<nranks*(*nrings); i++) {
+ prev[i] = prevTmp[i];
+ next[i] = nextTmp[i];
+ }
+ }
+ } while (nringsTmp == 0 && minScore);
+
+ free(coords);
+ free(globalRankToIdx);
+ free(globalIdxToRank);
+ free(prevTmp);
+ free(nextTmp);
+ free(idxToRank);
+ free(rankToIdx);
+ free(groups);
+ free(subgroups);
+
+ *nthreads = nThreads;
+
+ if (*nrings == 0) {
+ WARN("Could not create rings, falling back on simple ring");
+ *nrings = 1;
+ prev[rank] = (rank-1+nranks) % nranks;
+ next[rank] = (rank+1)%nranks;
+ }
+
+ int maxNrings = ncclParamMaxNrings();
+ int minNrings = ncclParamMinNrings();
+ if (maxNrings > 0 && minNrings > maxNrings) {
+ if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
+ minNrings = 0;
+ }
+ if (minNrings > MAXRINGS) {
+ if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXRINGS, MAXRINGS);
+ minNrings = MAXRINGS;
+ }
+ if (maxNrings > 0 && maxNrings <= *nrings) {
+ if (rank == 0) INFO(INIT,"Limiting to %d rings per user request.", maxNrings);
+ *nrings = maxNrings;
+ } else {
+ int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
+ if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
+ if (minNrings > 0 && minNrings > *nrings) {
+ if (rank == 0 && minNrings > defaultMinNrings) INFO(INIT,"Duplicating rings to %d per user request.", minNrings);
+ for (int r=*nrings; r<MAXRINGS && r <minNrings; r++) {
+ for (int i=0; i<nranks; i++) {
+ prev[r*nranks+i] = prev[(r-*nrings)*nranks+i];
+ next[r*nranks+i] = next[(r-*nrings)*nranks+i];
+ }
+ }
+ *nrings = minNrings;
+ }
+ }
+
+ NCCLCHECK(getEnvThreads(nthreads));
+ return ncclSuccess;
+}
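
As a concrete reference for the NCCL_RINGS format parsed above, a hypothetical 4-rank value such as NCCL_RINGS="0 2 3 1" describes one ring that visits the ranks in that order; parseRings turns it into per-rank prev/next tables. The sketch below (not NCCL code) derives equivalent tables from an already-tokenized ring so the expected layout is easy to see.

#include <stdio.h>

int main(void) {
  const int nranks = 4;
  const int ring[4] = {0, 2, 3, 1};   /* hypothetical NCCL_RINGS="0 2 3 1" */
  int prev[4], next[4];
  for (int i = 0; i < nranks; i++) {
    int rank = ring[i];
    prev[rank] = ring[(i - 1 + nranks) % nranks];
    next[rank] = ring[(i + 1) % nranks];
  }
  /* Expected output: prev = {1, 3, 0, 2}, next = {2, 0, 3, 1} */
  for (int r = 0; r < nranks; r++)
    printf("rank %d: prev=%d next=%d\n", r, prev[r], next[r]);
  return 0;
}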
diff --git a/src/misc/utils.cu b/src/misc/utils.cu
new file mode 100644
index 0000000..e22a2c3
--- /dev/null
+++ b/src/misc/utils.cu
@@ -0,0 +1,129 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "utils.h"
+#include "debug.h"
+#include <unistd.h>
+#include <string.h>
+
+ncclResult_t getHostName(char* hostname, int maxlen) {
+ if (gethostname(hostname, maxlen) != 0) {
+ strncpy(hostname, "unknown", maxlen);
+ return ncclSystemError;
+ }
+ int i = 0;
+ while ((hostname[i] != '.') && (hostname[i] != '\0') && (i < maxlen-1)) i++;
+ hostname[i] = '\0';
+ return ncclSuccess;
+}
+
+uint64_t getHash(const char* string) {
+ // Based on DJB2, result = result * 33 + char
+ uint64_t result = 5381;
+ for (int c = 0; string[c] != '\0'; c++) {
+ result = ((result << 5) + result) + string[c];
+ }
+ return result;
+}
+
+/* Generate a hash of the unique identifying string for this host
+ * that will be unique for both bare-metal and container instances
+ * Equivalent to a hash of:
+ *
+ * $(hostname) $(readlink /proc/self/ns/uts)
+ */
+uint64_t getHostHash(void) {
+ char uname[1024];
+ // Start off with the hostname
+ (void) getHostName(uname, sizeof(uname));
+ int hlen = strlen(uname);
+ int len = readlink("/proc/self/ns/uts", uname+hlen, sizeof(uname)-1-hlen);
+ if (len < 0) len = 0;
+
+ uname[hlen+len]='\0';
+ TRACE(INIT,"unique hostname '%s'", uname);
+
+ return getHash(uname);
+}
+
+/* Generate a hash of the unique identifying string for this process
+ * that will be unique for both bare-metal and container instances
+ * Equivalent to a hash of:
+ *
+ * $$ $(readlink /proc/self/ns/pid)
+ */
+uint64_t getPidHash(void) {
+ char pname[1024];
+ // Start off with our pid ($$)
+ sprintf(pname, "%ld", (long) getpid());
+ int plen = strlen(pname);
+ int len = readlink("/proc/self/ns/pid", pname+plen, sizeof(pname)-1-plen);
+ if (len < 0) len = 0;
+
+ pname[plen+len]='\0';
+ TRACE(INIT,"unique PID '%s'", pname);
+
+ return getHash(pname);
+}
+
+int parseStringList(const char* string, struct netIf* ifList, int maxList) {
+ if (!string) return 0;
+
+ const char* ptr = string;
+ // Ignore "^" prefix, will be detected outside of this function
+ if (ptr[0] == '^') ptr++;
+
+ int ifNum = 0;
+ int ifC = 0;
+ char c;
+ do {
+ c = *ptr;
+ if (c == ':') {
+ if (ifC > 0) {
+ ifList[ifNum].prefix[ifC] = '\0';
+ ifList[ifNum].port = atoi(ptr+1);
+ ifNum++; ifC = 0;
+ }
+ while (c != ',' && c != '\0') c = *(++ptr);
+ } else if (c == ',' || c == '\0') {
+ if (ifC > 0) {
+ ifList[ifNum].prefix[ifC] = '\0';
+ ifList[ifNum].port = -1;
+ ifNum++; ifC = 0;
+ }
+ } else {
+ ifList[ifNum].prefix[ifC] = c;
+ ifC++;
+ }
+ ptr++;
+ } while (c);
+ return ifNum;
+}
+
+static bool matchPrefix(const char* string, const char* prefix) {
+ return (strncmp(string, prefix, strlen(prefix)) == 0);
+}
+
+static bool matchPort(const int port1, const int port2) {
+ if (port1 == -1) return true;
+ if (port2 == -1) return true;
+ if (port1 == port2) return true;
+ return false;
+}
+
+
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize) {
+ // Make an exception for the case where no user list is defined
+ if (listSize == 0) return true;
+
+ for (int i=0; i<listSize; i++) {
+ if (matchPrefix(string, ifList[i].prefix)
+ && matchPort(port, ifList[i].port)) {
+ return true;
+ }
+ }
+ return false;
+}
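
getHash above is the classic DJB2 string hash (result = result*33 + c, seeded with 5381); getHostHash and getPidHash simply feed it the hostname or pid with the corresponding /proc/self/ns link target appended, so two containers on the same machine hash differently. A tiny self-contained check of the hash itself, using made-up identifying strings:

#include <stdint.h>
#include <stdio.h>

/* Same DJB2 variant as getHash() above: h = h*33 + c, seed 5381. */
static uint64_t djb2(const char* s) {
  uint64_t h = 5381;
  for (int i = 0; s[i] != '\0'; i++) h = ((h << 5) + h) + s[i];
  return h;
}

int main(void) {
  /* Hypothetical "hostname + namespace" strings; values differ per host. */
  printf("%llx\n", (unsigned long long)djb2("hosta 4026531838"));
  printf("%llx\n", (unsigned long long)djb2("hostb 4026531838"));
  return 0;
}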
diff --git a/src/nccl.h b/src/nccl.h
deleted file mode 100644
index 7bb5aa5..0000000
--- a/src/nccl.h
+++ /dev/null
@@ -1,203 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_H_
-#define NCCL_H_
-
-#include <cuda_runtime.h>
-
-#if CUDART_VERSION >= 7050
-#include <cuda_fp16.h>
-#define CUDA_HAS_HALF 1
-#else
-#undef CUDA_HAS_HALF
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* Opaque handle to communicator */
-typedef struct ncclComm* ncclComm_t;
-
-#define NCCL_UNIQUE_ID_BYTES 128
-typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
-
-/* Error type */
-typedef enum { ncclSuccess = 0,
- ncclUnhandledCudaError = 1,
- ncclSystemError = 2,
- ncclInternalError = 3,
- ncclInvalidDevicePointer = 4,
- ncclInvalidRank = 5,
- ncclUnsupportedDeviceCount = 6,
- ncclDeviceNotFound = 7,
- ncclInvalidDeviceIndex = 8,
- ncclLibWrapperNotSet = 9,
- ncclCudaMallocFailed = 10,
- ncclRankMismatch = 11,
- ncclInvalidArgument = 12,
- ncclInvalidType = 13,
- ncclInvalidOperation = 14,
- nccl_NUM_RESULTS = 15 } ncclResult_t;
-
-/* Generates a unique Id with each call. Used to generate commId for
- * ncclCommInitAll. uniqueId will be created in such a way that it is
- * guaranteed to be unique accross the host. */
-ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
-ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
-
-/* Creates a new communicator (multi process version).
- * rank must be between 0 and ndev-1 and unique within a communicator clique.
- * ndev is number of logical devices
- * The communicator is created on the current CUDA device.
- * ncclCommInitRank implicitly syncronizes with other ranks, so INIT OF EACH RANK MUST
- * BE CALLED IN A SEPARATE HOST THREADS to avoid deadlock. */
-ncclResult_t ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank);
-ncclResult_t pncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank);
-
-/* Creates a clique of communicators.
- * This is a convenience function to create a single-process communicator clique.
- * Returns an array of ndev newly initialized communicators in comm.
- * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
- * If devlist is NULL, the first ndev CUDA devices are used.
- * Order of devlist defines user-order of processors within the communicator. */
-ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
-ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
-
-/* Frees resources associated with communicator object. */
-void ncclCommDestroy(ncclComm_t comm);
-void pncclCommDestroy(ncclComm_t comm);
-
-/* Returns nice error message. */
-const char* ncclGetErrorString(ncclResult_t result);
-const char* pncclGetErrorString(ncclResult_t result);
-
-/* Sets count to number of devices in the communicator clique. */
-ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
-ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
-
-/* Returns cuda device number associated with communicator. */
-ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
-ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
-
-/* Returns user-ordered "rank" assocaiated with communicator. */
-ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
-ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
-
-/* Reduction opperation selector */
-typedef enum { ncclSum = 0,
- ncclProd = 1,
- ncclMax = 2,
- ncclMin = 3,
- nccl_NUM_OPS = 4 } ncclRedOp_t;
-
-/* Data types */
-typedef enum { ncclChar = 0,
- ncclInt = 1,
-#ifdef CUDA_HAS_HALF
- ncclHalf = 2,
-#endif
- ncclFloat = 3,
- ncclDouble = 4,
- ncclInt64 = 5,
- ncclUint64 = 6,
- nccl_NUM_TYPES = 7 } ncclDataType_t;
-
-/* Reduces data arrays of length count in sendbuff into recvbuf using op operation.
- * recvbuf may be NULL on all calls except for root device.
- * On the root device, sendbuff and recvbuff are assumed to reside on
- * the same device.
- * Must be called separately for each communicator in communicator clique.
-*/
-ncclResult_t ncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype,
- ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t pncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype,
- ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
-
-/* Reduces data arrays of length count in sendbuff using op operation, and leaves
- * identical copies of result on each GPUs recvbuff.
- * Sendbuff and recvbuff are assumed to reside on the same device.
- * Must be called separately for each communicator in communicator clique. */
-ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
- ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, int count,
- ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
-
-/* Reduces data in sendbuff using op operation and leaves reduced result scattered
- * over the devices so that recvbuff on the i-th GPU will contain the i-th block of
- * the result. Sendbuff and recvbuff are assumed to reside on same device. Assumes
- * sendbuff has size at least ndev*recvcount elements, where ndev is number of
- * communicators in communicator clique
- * Must be called separately for each communicator in communicator clique.*/
-ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
- int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
- cudaStream_t stream);
-ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
- int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
- cudaStream_t stream);
-
-/* Copies count values from root to all other devices.
- * Root specifies the source device in user-order
- * (see ncclCommInit).
- * Must be called separately for each communicator in communicator clique. */
-ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
- ncclComm_t comm, cudaStream_t stream);
-ncclResult_t pncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
- ncclComm_t comm, cudaStream_t stream);
-
-
-/* Each device gathers count values from other GPUs.
- * Result is ordered by comm's logical device order.
- * Assumes recvbuff has size at least ndev*count, where ndev is number of communicators
- * in communicator clique.
- * Sendbuff and recvbuff are assumed to reside on same device.
- * Must be called separately for each communicator in communicator clique. */
-ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
- void* recvbuff, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t pncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
- void* recvbuff, ncclComm_t comm, cudaStream_t stream);
-
-
-/* The following collective operations are not implemented yet */
-///* Gather count values from each device to recvbuff.
-// * Result is ordered by comm's logical device order.
-// * recvbuff may be NULL for all calls except for root device.
-// * On the root device, sendbuff and recvbuff are assumed to reside on the same device.
-// * Must be called separately for each communicator in communicator clique. */
-// * All GPUs, including root, perform copies into recvbuff.
-//ncclResult_t ncclGather(const void* sendbuff, int count, ncclDataType_t datatype,
-// void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream);
-//ncclResult_t pncclGather(const void* sendbuff, int count, ncclDataType_t datatype,
-// void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream);
-
-///* Root device scatters count values to each devices.
-// * sendbuff may be NULL on all devices except a single root
-// * device where it is assumed to have size at least nGPUs*count.
-// * recvbuff allocated on each gpu, including root, size=count.
-// * Result is ordered by comm's logical device order.
-// * Called separately for each device in the ncclComm. */
-//ncclResult_t ncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff,
-// int count, int root, ncclComm_t comm, cudaStream_t stream);
-//ncclResult_t pncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff,
-// int count, int root, ncclComm_t comm, cudaStream_t stream);
-//
-///* All GPUs scatter blocks of count elements to other devices.
-// * Must be called separately for each device in the ncclComm.
-// * sendbuff and recvbuff assumed to reside on same device and
-// * have size at least nGPUs*count.
-// * Called separately for each device in the ncclComm. */
-//ncclResult_t ncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype,
-// void* recvbuff, ncclComm_t comm, cudaStream_t stream);
-//ncclResult_t pncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype,
-// void* recvbuff, ncclComm_t comm, cudaStream_t stream);
-
-#ifdef __cplusplus
-} // end extern "C"
-#endif
-
-#endif // end include guard
-
diff --git a/src/nccl.h.in b/src/nccl.h.in
new file mode 100644
index 0000000..7227625
--- /dev/null
+++ b/src/nccl.h.in
@@ -0,0 +1,251 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_H_
+#define NCCL_H_
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#define NCCL_MAJOR ${nccl:Major}
+#define NCCL_MINOR ${nccl:Minor}
+#define NCCL_PATCH ${nccl:Patch}
+#define NCCL_SUFFIX "${nccl:Suffix}"
+
+#define NCCL_VERSION_CODE ${nccl:Version}
+#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Opaque handle to communicator */
+typedef struct ncclComm* ncclComm_t;
+
+#define NCCL_UNIQUE_ID_BYTES 128
+typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
+
+/* Error type */
+typedef enum { ncclSuccess = 0,
+ ncclUnhandledCudaError = 1,
+ ncclSystemError = 2,
+ ncclInternalError = 3,
+ ncclInvalidArgument = 4,
+ ncclInvalidUsage = 5,
+ ncclNumResults = 6 } ncclResult_t;
+
+/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
+ * This integer is coded with the MAJOR, MINOR and PATCH level of the
+ * NCCL library
+ */
+ncclResult_t ncclGetVersion(int *version);
+ncclResult_t pncclGetVersion(int *version);
+
+/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
+ * called once and the Id should be distributed to all ranks in the
+ * communicator before calling ncclCommInitRank. */
+ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
+ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
+
+/* Creates a new communicator (multi thread/process version).
+ * rank must be between 0 and nranks-1 and unique within a communicator clique.
+ * Each rank is associated to a CUDA device, which has to be set before calling
+ * ncclCommInitRank.
+ * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
+ * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
+ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+
+/* Creates a clique of communicators (single process version).
+ * This is a convenience function to create a single-process communicator clique.
+ * Returns an array of ndev newly initialized communicators in comm.
+ * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
+ * If devlist is NULL, the first ndev CUDA devices are used.
+ * Order of devlist defines user-order of processors within the communicator. */
+ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+
+/* Frees resources associated with communicator object. */
+ncclResult_t ncclCommDestroy(ncclComm_t comm);
+ncclResult_t pncclCommDestroy(ncclComm_t comm);
+
+/* Returns a human-readable error message. */
+const char* ncclGetErrorString(ncclResult_t result);
+const char* pncclGetErrorString(ncclResult_t result);
+
+/* Gets the number of ranks in the communicator clique. */
+ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
+ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
+
+/* Returns the cuda device number associated with the communicator. */
+ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
+ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
+
+/* Returns the user-ordered "rank" associated with the communicator. */
+ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
+ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
+
+/* Reduction operation selector */
+typedef enum { ncclSum = 0,
+ ncclProd = 1,
+ ncclMax = 2,
+ ncclMin = 3,
+ ncclNumOps = 4 } ncclRedOp_t;
+
+/* Data types */
+typedef enum { ncclInt8 = 0, ncclChar = 0,
+ ncclUint8 = 1,
+ ncclInt32 = 2, ncclInt = 2,
+ ncclUint32 = 3,
+ ncclInt64 = 4,
+ ncclUint64 = 5,
+ ncclFloat16 = 6, ncclHalf = 6,
+ ncclFloat32 = 7, ncclFloat = 7,
+ ncclFloat64 = 8, ncclDouble = 8,
+ ncclNumTypes = 9 } ncclDataType_t;
+
+/*
+ * Collective communication operations
+ *
+ * Collective communication operations must be called separately for each
+ * communicator in a communicator clique.
+ *
+ * They return when operations have been enqueued on the CUDA stream.
+ *
+ * Since they may perform inter-CPU synchronization, each call has to be done
+ * from a different thread or process, or must use Group Semantics (see
+ * below).
+ */
+
+/*
+ * Reduce
+ *
+ * Reduces data arrays of length count in sendbuff into recvbuff using op
+ * operation.
+ * recvbuff may be NULL on all calls except for root device.
+ * root is the rank (not the CUDA device) where data will reside after the
+ * operation is complete.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+ ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+ ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * (deprecated) Broadcast (in-place)
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * This operation is implicitly in place.
+ */
+ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Broadcast
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * All-Reduce
+ *
+ * Reduces data arrays of length count in sendbuff using op operation, and
+ * leaves identical copies of result on each recvbuff.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Reduce-Scatter
+ *
+ * Reduces data in sendbuff using op operation and leaves reduced result
+ * scattered over the devices so that recvbuff on rank i will contain the i-th
+ * block of the result.
+ * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
+ * should have a size of at least nranks*recvcount elements.
+ *
+ * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
+ */
+ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
+ size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+ cudaStream_t stream);
+ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
+ size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+ cudaStream_t stream);
+
+/*
+ * All-Gather
+ *
+ * Each device gathers sendcount values from other GPUs into recvbuff,
+ * receiving data from rank i at offset i*sendcount.
+ * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
+ * should have a size of at least nranks*sendcount elements.
+ *
+ * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
+ */
+ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Group semantics
+ *
+ * When managing multiple GPUs from a single thread, and since NCCL collective
+ * calls may perform inter-CPU synchronization, we need to "group" calls for
+ * different ranks/devices into a single call.
+ *
+ * Grouping NCCL calls as being part of the same collective operation is done
+ * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
+ * collective calls until the ncclGroupEnd call, which will wait for all calls
+ * to be complete. Note that for collective communication, ncclGroupEnd only
+ * guarantees that the operations are enqueued on the streams, not that
+ * the operation is effectively done.
+ *
+ * Both collective communication and ncclCommInitRank can be used in conjunction
+ * with ncclGroupStart/ncclGroupEnd.
+ */
+
+/*
+ * Group Start
+ *
+ * Start a group call. All subsequent calls to NCCL until ncclGroupEnd will not
+ * block due to inter-CPU synchronization.
+ */
+ncclResult_t ncclGroupStart();
+
+/*
+ * Group End
+ *
+ * End a group call. Wait for all calls since ncclGroupStart to complete
+ * before returning.
+ */
+ncclResult_t ncclGroupEnd();
+
+#ifdef __cplusplus
+} // end extern "C"
+#endif
+
+#endif // end include guard
diff --git a/src/primitives.h b/src/primitives.h
deleted file mode 100644
index bcaeca8..0000000
--- a/src/primitives.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef PRIMITIVES_H_
-#define PRIMITIVES_H_
-
-#include <type_traits>
-#include "copy_kernel.h" // for FuncPassA
-#include "reduce_kernel.h" // for reduction funcs
-
-
-/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy.
- *
- * In order to reduce the reptetion of template arguments, the operations
- * are bundled as static methods of the Primitives class.
- *
- * Each primitive operation copies/reduces a contiguous buffer and syncs
- * an optional set of flags against a sub-step counter. The sync value is
- * based on the step parameter. Sync flags must be of type WaitFlag or
- * PostFlag. The primitive routines wait for all WaitFlag args to attain
- * at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of
- * corresponding substep by previous step) before executing the transfer.
- * After each substep is transfered, all PostFlag arguments get updated to
- * the value SUBSTEPS*step+substep+1.
- */
-
-
-class WaitFlag {
- volatile int * const flag;
- const int shift;
- public:
- __device__ __forceinline__
- WaitFlag(volatile int * const flag, const int shift) : flag(flag), shift(shift) { }
- __device__ __forceinline__
- void wait(int val) { while (*flag < (val + shift)) /*SPIN*/; }
-};
-
-
-class PostFlag {
- volatile int * const flag;
- const int shift;
- public:
- __device__ __forceinline__
- PostFlag(volatile int* const flag, const int shift) : flag(flag), shift(shift) { }
- __device__ __forceinline__
- void post(int val) { *flag = (val + shift); }
-};
-
-
-// Helper to check if any argument is of type T.
-// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
-template<typename T> __device__ __forceinline__
-bool AnyAre() { return false; }
-
-template<typename T, typename FIRST_T, typename... TAIL_Ts>
-__device__ __forceinline__
-bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
- return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...);
-}
-
-
-// Wait on all WaitFlags, ignore PostFlags
-__device__ __forceinline__
-void WaitOnFlags(int val) { }
-
-template <typename... TAIL_Ts> __device__ __forceinline__
-void WaitOnFlags(int val, WaitFlag flag, TAIL_Ts... tail) {
- flag.wait(val);
- WaitOnFlags(val, tail...);
-}
-
-template <typename... TAIL_Ts> __device__ __forceinline__
-void WaitOnFlags(int val, PostFlag, TAIL_Ts... tail) {
- WaitOnFlags(val, tail...);
-}
-
-
-// Post all PostFlags, ingnore WaitFlags
-__device__ __forceinline__
-void PostToFlags(int val) { }
-
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostToFlags(int val, WaitFlag flag, TAIL_Ts... tail) {
- PostToFlags(val, tail...);
-}
-
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostToFlags(int val, PostFlag flag, TAIL_Ts... tail) {
- flag.post(val);
- PostToFlags(val, tail...);
-}
-
-
-// Create pointer arithmetic syntax that doesn't break for nullptr_t
-template <typename Tptr> __device__ __forceinline__
-Tptr ptradd(Tptr ptr, int i) {
- return ptr + i;
-}
-
-__device__ __forceinline__
-std::nullptr_t ptradd(std::nullptr_t ptr, int i) {
- return nullptr;
-}
-
-
-// Implementation of primitive types
-template <int THREADS, int UNROLL, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> >
-class Primitives {
- private:
- template <typename SRC2_T, // either T* or nullptr_t
- typename DST2_T, // either T* or nullptr_t
- typename... SYNC_Ts> // either WaitFunc or PostFunc
- static __device__ __forceinline__ void
- GenericOp(const T* src1,
- const SRC2_T src2,
- T* dst1,
- DST2_T dst2,
- int len, int maxoffset, int step, SYNC_Ts... flags) {
-
- enum { noSrc2 = std::is_same<SRC2_T, std::nullptr_t>::value };
- enum { noDst2 = std::is_same<DST2_T, std::nullptr_t>::value };
- static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value,
- "src2 must be of type T* or nullptr_t");
- static_assert(noDst2 || std::is_same<DST2_T, T*>::value,
- "dst2 must be of type T* or nullptr_t");
-
- using OpType = typename std::conditional<noSrc2, FuncPassA<T>, REDOP>::type;
-
- if (threadIdx.x < THREADS) {
- int sliceSize = len / SUBSTEPS;
- int sliceOffset = 0;
- #pragma unroll 1
- for (int sub=0; sub<SUBSTEPS; ++sub) {
- if (AnyAre<WaitFlag>(flags...)) {
- if (threadIdx.x == 0) {
- WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
- }
- asm volatile ("bar.sync 1, %0;" :: "r"(THREADS));
- }
- ReduceOrCopy
- <
- UNROLL,
- THREADS,
- OpType,
- T,
- !std::is_same<DST2_T, std::nullptr_t>::value, // HAS_DEST1
- !std::is_same<SRC2_T, std::nullptr_t>::value // HAS_SRC1
- >
- (
- threadIdx.x,
- ptradd(dst1, sliceOffset),
- ptradd(dst2, sliceOffset),
- ptradd(src1, sliceOffset),
- ptradd(src2, sliceOffset),
- min(sliceSize, maxoffset-sliceOffset)
- );
- if (AnyAre<PostFlag>(flags...)) {
- __syncthreads();
- }
- sliceOffset += sliceSize;
- }
- } else {
- for(int sub=0; sub<SUBSTEPS; ++sub) {
- if (AnyAre<PostFlag>(flags...)) {
- __syncthreads();
- __threadfence_system();
- PostToFlags(SUBSTEPS*step + sub + 1, flags...);
- }
- }
- }
- }
-
- public:
- template <typename... SYNC_Ts>
- static __device__ __forceinline__ void
- Copy(const T* src, T* dst,
- int len, int maxOffset, int step, SYNC_Ts... flags) {
- GenericOp(src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
- }
-
- template <typename... SYNC_Ts>
- static __device__ __forceinline__ void
- DoubleCopy(const T* src, T* dst1, T* dst2,
- int len, int maxOffset, int step, SYNC_Ts... flags) {
- GenericOp(src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
- }
-
- template <typename... SYNC_Ts>
- static __device__ __forceinline__ void
- Reduce(const T* src1, const T* src2, T* dst,
- int len, int maxOffset, int step, SYNC_Ts... flags) {
- GenericOp(src1, src2, dst, nullptr, len, maxOffset, step, flags...);
- }
-
- template <typename... SYNC_Ts>
- static __device__ __forceinline__ void
- ReduceCopy(const T* src1, const T* src2, T* dst1, T* dst2,
- int len, int maxOffset, int step, SYNC_Ts... flags) {
- GenericOp(src1, src2, dst1, dst2, len, maxOffset, step, flags...);
- }
-};
-
-#endif // end include guard
diff --git a/src/reduce.cu b/src/reduce.cu
deleted file mode 100644
index 7215183..0000000
--- a/src/reduce.cu
+++ /dev/null
@@ -1,148 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "primitives.h"
-
-#define NUM_SUBSTEPS 2
-#define NUM_BUFCHUNKS 2
-
-// Increase Step and boffset for buffer sync
-#define NEXT_STEP \
- step++; \
- boffset += sliceSize; \
- if (boffset == buffSize) boffset = 0;
-
-#define ALIGN_SIZE(size, align) \
- size = ((size + (align) - 1) / (align)) * (align);
-
-template<int THREADS, int UNROLL, class FUNC, typename T>
-__launch_bounds__(THREADS+WARP_SIZE, 1)
-__global__ void ReduceKernel(const KernelArgs<T> args) {
- const int tid = threadIdx.x;
- __shared__ DevRing<T> ring;
-
- LoadRing<THREADS>(args.ring, &ring);
- __syncthreads();
-
- if (tid == 0) {
- WaitFlag prevCommOp(ring.prevOpCounter, 0);
- WaitFlag nextCommOp(ring.nextOpCounter, 0);
- prevCommOp.wait(args.opIndex);
- nextCommOp.wait(args.opIndex);
- }
- __syncthreads();
-
- WaitFlag waitDoneFromNext(ring.recvFlagFromNext, (1-NUM_BUFCHUNKS)*NUM_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, 0);
- PostFlag postDoneToPrev(ring.sendFlagToPrev, 0);
- PostFlag postReadyToNext(ring.sendFlagToNext, 0);
-
- typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T, FUNC> Prims;
-
- const int size = args.N;
- const int nranks = args.nRanks;
- const int rank = ring.userRank[0];
- const int prevRank = ring.userRank[nranks-1];
- const int root = args.root;
- const int buffSize = args.buffSize / sizeof(T);
- const int sliceSize = buffSize / NUM_BUFCHUNKS;
-
- int step = 0;
- int boffset = 0;
-
- // Compute pointers
- const T * __restrict__ thisInput = args.ThisInput;
- T * __restrict__ thisOutput = args.ThisOutput;
- T * __restrict__ prevInput = ring.recvBuffer;
- T * __restrict__ nextOutput = ring.sendBuffer;
-
- for (int offset = 0; offset < size; offset += sliceSize) {
- int maxOffset = size-offset;
- if (prevRank == root) {
- Prims::Copy(
- thisInput + offset,
- nextOutput + boffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
- } else if (rank == root) {
- Prims::Reduce(
- thisInput + offset,
- prevInput + boffset,
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
- } else {
- Prims::Reduce(
- thisInput + offset,
- prevInput + boffset,
- nextOutput + boffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
- }
- NEXT_STEP; // Increases step, boffset
- }
-
- // wait for the last data to be pushed to us
- if (tid == 0) {
- if (rank != root) {
- // Wait for last update from next then reset the flag
- waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
- *ring.recvFlagFromNext = 0;
- }
-
- if (prevRank != root) {
- // reset the flag
- *ring.recvFlagFromPrev = 0;
- }
-
- incrementOpCounter(&args);
- }
-}
-
-#define THREADS 512
-#define UNROLL 8
-
-template<class FUNC, typename T>
-ncclResult_t RingReduce(const void* sendbuff, void* recvbuff, const int count, const int root,
- ncclComm* comm, cudaStream_t stream) {
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError);
- } else {
- KernelArgs<T> args;
- ArgsSetup(&args, sendbuff, recvbuff, root, count, comm);
- LAUNCH_KERNEL(ReduceKernel, THREADS, UNROLL, FUNC, T, args, stream);
- }
-
- return ncclSuccess;
-}
-
-template<typename T, template<typename> class RedOp>
-class ReduceFunctor {
- public:
- static ncclResult_t entry(const void* sendbuff, void* recvbuff,
- int count, int root, ncclComm* comm, cudaStream_t stream) {
- return RingReduce<RedOp<T>, T>(sendbuff, recvbuff, count, root, comm, stream);
- }
-};
-
-NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, int count,
- ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, int count,
- ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, datatype, op, root, comm, "Reduce"));
- return enqueue<ReduceFunctor>(sendbuff, recvbuff, count, datatype, op, root, comm, stream);
-}
-
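
As an aside, the NEXT_STEP macro deleted above advances a double-buffered offset into the communication buffer. The short host program below only illustrates that bookkeeping; the sizes are made up.

#include <cstdio>

int main() {
  const int NUM_BUFCHUNKS = 2;
  const int buffSize = 8, sliceSize = buffSize / NUM_BUFCHUNKS; // 4
  int step = 0, boffset = 0;
  for (int i = 0; i < 6; ++i) {
    std::printf("step %d uses buffer offset %d\n", step, boffset);
    step++;                                  // same bookkeeping as NEXT_STEP
    boffset += sliceSize;
    if (boffset == buffSize) boffset = 0;    // wrap to reuse the first chunk
  }
  return 0;
}
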
diff --git a/src/reduce_kernel.h b/src/reduce_kernel.h
deleted file mode 100644
index f2cd512..0000000
--- a/src/reduce_kernel.h
+++ /dev/null
@@ -1,309 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-
-#ifndef REDUCE_KERNEL_H_
-#define REDUCE_KERNEL_H_
-
-#include "common_kernel.h"
-#include <limits>
-
-template<typename T>
-struct FuncNull {
- __device__ T operator()(const T x, const T y) const {
- return 0;
- }
-};
-
-template<typename T>
-struct FuncSum {
- __device__ T operator()(const T x, const T y) const {
- return x + y;
- }
-};
-
-template<typename T>
-struct FuncProd {
- __device__ T operator()(const T x, const T y) const {
- return x * y;
- }
-};
-
-template<typename T>
-struct FuncMax {
- __device__ T operator()(const T x, const T y) const {
- return (x < y) ? y : x;
- }
-};
-
-template<typename T>
-struct FuncMin {
- __device__ T operator()(const T x, const T y) const {
- return (x < y) ? x : y;
- }
-};
-
-template<>
-struct FuncSum<char> {
- union converter {
- uint32_t storage;
- char4 a;
- };
- __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
- int32_t rv, z=0;
- asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
- return rv;
-#elif (__CUDA_ARCH__ >= 500)
- int32_t rv;
- asm("vadd.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
- "vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
-#else
- converter cx, cy, cr;
- cx.storage = x;
- cy.storage = y;
- cr.a.x = cx.a.x + cy.a.x;
- cr.a.y = cx.a.y + cy.a.y;
- cr.a.z = cx.a.z + cy.a.z;
- cr.a.w = cx.a.w + cy.a.w;
- return cr.storage;
-#endif
- }
- __device__ char operator()(const char x, const char y) const {
- return x+y;
- }
-};
-
-template<>
-struct FuncProd<char> {
- union converter { uint32_t storage; char4 a; };
- __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if (__CUDA_ARCH__ >= 300)
- int32_t rv, zero=0;
- asm("{ .reg .u32 t0, t1, t2, t3;\n\t"
- " vmad.u32.u32.u32 t3, %1.b3, %2.b3, %3;\n\t"
- " vmad.u32.u32.u32 t2, %1.b2, %2.b2, %3;\n\t"
- " shl.b32 t3, t3, 16;\n\t"
- " shl.b32 t2, t2, 16;\n\t"
- " vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t"
- " shl.b32 t1, t1, 8;\n\t"
- " vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t"
- " and.b32 t1, t1, 0xff00ff00;\n\t"
- " and.b32 t0, t0, 0x00ff00ff;\n\t"
- " or.b32 %0, t0, t1;\n\t"
- "}" : "=r"(rv) : "r"(x), "r"(y), "r"(zero));
- return rv;
-#else
- converter cx, cy, cr;
- cx.storage = x;
- cy.storage = y;
- cr.a.x = cx.a.x * cy.a.x;
- cr.a.y = cx.a.y * cy.a.y;
- cr.a.z = cx.a.z * cy.a.z;
- cr.a.w = cx.a.w * cy.a.w;
- return cr.storage;
-#endif
- }
- __device__ char operator()(const char x, const char y) const {
- return x*y;
- }
-};
-
-template<>
-struct FuncMax<char> {
- union converter { uint32_t storage; char4 a; };
- __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
- int32_t rv, z=0;
- if (std::numeric_limits<char>::is_signed)
- asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
- else
- asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
- return rv;
-#elif (__CUDA_ARCH__ >= 500)
- int32_t rv;
- if (std::numeric_limits<char>::is_signed)
- asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
- "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- else
- asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
- "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
-#else
- converter cx, cy, cr;
- cx.storage = x;
- cy.storage = y;
- cr.a.x = max(cx.a.x, cy.a.x);
- cr.a.y = max(cx.a.y, cy.a.y);
- cr.a.z = max(cx.a.z, cy.a.z);
- cr.a.w = max(cx.a.w, cy.a.w);
- return cr.storage;
-#endif
- }
- __device__ char operator()(const char x, const char y) const {
- return (x>y) ? x : y;
- }
-};
-
-template<>
-struct FuncMin<char> {
- union converter { uint32_t storage; char4 a; };
- __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
-#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
- int32_t rv, z=0;
- if (std::numeric_limits<char>::is_signed)
- asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
- else
- asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
- return rv;
-#elif (__CUDA_ARCH__ >= 500)
- int32_t rv;
- if (std::numeric_limits<char>::is_signed)
- asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
- "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- else
- asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
- "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
-#else
- converter cx, cy, cr;
- cx.storage = x;
- cy.storage = y;
- cr.a.x = min(cx.a.x, cy.a.x);
- cr.a.y = min(cx.a.y, cy.a.y);
- cr.a.z = min(cx.a.z, cy.a.z);
- cr.a.w = min(cx.a.w, cy.a.w);
- return cr.storage;
-#endif
- }
- __device__ char operator()(const char x, const char y) const {
- return (x<y) ? x : y;
- }
-};
-
-#ifdef CUDA_HAS_HALF
-template<>
-struct FuncSum<half> {
- __device__ half2 operator()(const half2 x, const half2 y) const {
-#if __CUDA_ARCH__ >= 530
- return __hadd2(x, y);
-#else
- float2 fx, fy, fr;
- fx = __half22float2(x);
- fy = __half22float2(y);
- fr.x = fx.x + fy.x;
- fr.y = fx.y + fy.y;
- return __float22half2_rn(fr);
-#endif
- }
- __device__ half operator()(const half x, const half y) const {
-#if __CUDA_ARCH__ >= 530
- return __hadd(x, y);
-#else
- return __float2half( __half2float(x) + __half2float(y) );
-#endif
- }
-};
-
-template<>
-struct FuncProd<half> {
- __device__ half2 operator()(const half2 x, const half2 y) const {
-#if __CUDA_ARCH__ >= 530
- return __hmul2(x, y);
-#else
- float2 fx, fy, fr;
- fx = __half22float2(x);
- fy = __half22float2(y);
- fr.x = fx.x * fy.x;
- fr.y = fx.y * fy.y;
- return __float22half2_rn(fr);
-#endif
- }
- __device__ half operator()(const half x, const half y) const {
-#if __CUDA_ARCH__ >= 530
- return __hmul(x, y);
-#else
- return __float2half( __half2float(x) * __half2float(y) );
-#endif
- }
-};
-
-template<>
-struct FuncMax<half> {
- __device__ half2 operator()(const half2 x, const half2 y) const {
- float2 fx, fy, fr;
- fx = __half22float2(x);
- fy = __half22float2(y);
- fr.x = fmaxf(fx.x, fy.x);
- fr.y = fmaxf(fx.y, fy.y);
- return __float22half2_rn(fr);
- }
- __device__ half operator()(const half x, const half y) const {
- float fx, fy, fm;
- fx = __half2float(x);
- fy = __half2float(y);
- fm = fmaxf(fx, fy);
- return __float2half(fm);
- }
-};
-
-template<>
-struct FuncMin<half> {
- __device__ half2 operator()(const half2 x, const half2 y) const {
- float2 fx, fy, fr;
- fx = __half22float2(x);
- fy = __half22float2(y);
- fr.x = fminf(fx.x, fy.x);
- fr.y = fminf(fx.y, fy.y);
- return __float22half2_rn(fr);
- }
- __device__ half operator()(const half x, const half y) const {
- float fx, fy, fm;
- fx = __half2float(x);
- fy = __half2float(y);
- fm = fminf(fx, fy);
- return __float2half(fm);
- }
-};
-#endif
-
-// Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of threads in the CTA
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, class FUNC, typename T>
-__device__ void Reduce(volatile T * __restrict__ const dest,
- const volatile T * __restrict__ const src0,
- const volatile T * __restrict__ const src1, const int N) {
- ReduceOrCopy<UNROLL, THREADS, FUNC, T, false, true>(threadIdx.x, dest,
- nullptr, src0, src1, N);
-}
-
-// Assumptions:
-// - there is exactly 1 block
-// - THREADS is the number of threads in the CTA
-// - this function is called by all producer threads
-template<int UNROLL, int THREADS, class FUNC, typename T>
-__device__ void ReduceAndCopy(volatile T * __restrict__ const dest0,
- volatile T * __restrict__ const dest1,
- const volatile T * __restrict__ const src0,
- const volatile T * __restrict__ const src1, const int N) {
- ReduceOrCopy<UNROLL, THREADS, FUNC, T, true, true>(threadIdx.x, dest0, dest1,
- src0, src1, N);
-}
-
-#endif // REDUCE_KERNEL_H_
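
For reference, the <char> specializations deleted above pack four 8-bit lanes into one 32-bit word and reduce each lane independently. The host-side sketch below mirrors the non-vadd4 fallback of FuncSum<char>; the struct and union names are illustrative only, and the printed value assumes a little-endian host.

#include <cstdint>
#include <cstdio>

struct Char4 { signed char x, y, z, w; };
union Converter { uint32_t storage; Char4 a; };

uint32_t sum4(uint32_t x, uint32_t y) {
  Converter cx, cy, cr;
  cx.storage = x; cy.storage = y;
  cr.a.x = cx.a.x + cy.a.x;   // each byte lane is reduced independently,
  cr.a.y = cx.a.y + cy.a.y;   // matching the per-byte vadd4 semantics
  cr.a.z = cx.a.z + cy.a.z;
  cr.a.w = cx.a.w + cy.a.w;
  return cr.storage;
}

int main() {
  uint32_t a = 0x01020304u, b = 0x10203040u;
  std::printf("0x%08x\n", sum4(a, b)); // prints 0x11223344 on little-endian
  return 0;
}
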
diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
deleted file mode 100644
index b1100dd..0000000
--- a/src/reduce_scatter.cu
+++ /dev/null
@@ -1,165 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "common_coll.h"
-#include "enqueue.h"
-#include "primitives.h"
-
-#define NUM_SUBSTEPS 2
-#define NUM_BUFCHUNKS 2
-
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
- step++; \
- poffset = noffset; \
- noffset += sliceSize; \
- if (noffset == buffSize) noffset = 0;
-
-#define ALIGN_SIZE(size, align) \
- size = ((size + (align) - 1) / (align)) * (align);
-
-template<int THREADS, int UNROLL, class FUNC, typename T>
-__launch_bounds__(THREADS+WARP_SIZE, 1)
-__global__ void ReduceScatterKernel(const KernelArgs<T> args) {
- const int tid = threadIdx.x;
- __shared__ DevRing<T> ring;
-
- LoadRing<THREADS>(args.ring, &ring);
- __syncthreads();
-
- if (tid == 0) {
- WaitFlag prevCommOp(ring.prevOpCounter, 0);
- WaitFlag nextCommOp(ring.nextOpCounter, 0);
- prevCommOp.wait(args.opIndex);
- nextCommOp.wait(args.opIndex);
- }
- __syncthreads();
-
- WaitFlag waitDoneFromNext(ring.recvFlagFromNext, -NUM_BUFCHUNKS*NUM_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring.recvFlagFromPrev, -1*NUM_SUBSTEPS);
- PostFlag postDoneToPrev(ring.sendFlagToPrev, -1*NUM_SUBSTEPS);
- PostFlag postReadyToNext(ring.sendFlagToNext, 0);
-
- typedef Primitives<THREADS, UNROLL, NUM_SUBSTEPS, T, FUNC> Prims;
-
- const int size = args.N;
- const int nranks = args.nRanks;
- const int buffSize = args.buffSize / sizeof(T);
- const int sliceSize = buffSize / NUM_BUFCHUNKS;
-
- int step = 0;
- int poffset, noffset = 0;
-
- // Compute pointers
- const T * __restrict__ thisInput = args.ThisInput;
- T * __restrict__ thisOutput = args.ThisOutput;
- T * __restrict__ prevInput = ring.recvBuffer;
- T * __restrict__ nextOutput = ring.sendBuffer;
-
- for (int chunkOffset = 0; chunkOffset < size; chunkOffset += sliceSize) {
- /////////////// begin ReduceScatter steps ///////////////
- int offset;
- int maxOffset = size-chunkOffset;
- int rankDest;
-
- // step 0: push data to next GPU
- rankDest = ring.userRank[nranks-1];
- offset = chunkOffset + rankDest * size;
-
- Prims::Copy(
- thisInput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP; // Increases step, poffset, noffset
-
- // k-2 steps: reduce and copy to next GPU
- for (int j=2; j<nranks; ++j) {
- rankDest = ring.userRank[nranks-j];
- offset = chunkOffset + rankDest * size;
-
- Prims::Reduce(
- prevInput + poffset,
- thisInput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
-
- // step k-1: reduce this buffer and data, which will produce the final
- // result that we store in this data and push to the next GPU
- rankDest = ring.userRank[0];
- offset = chunkOffset + rankDest * size;
-
- Prims::Reduce(
- prevInput + poffset,
- thisInput + offset,
- thisOutput + chunkOffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
-
- // wait for the last data to be pushed to us
- if (tid == 0) {
- // Wait for last update from next then reset the flag
- waitDoneFromNext.wait(NUM_SUBSTEPS*(step+NUM_BUFCHUNKS-1));
- *ring.recvFlagFromNext = 0;
-
- // Wait for last update from prev then reset the flag
- waitReadyFromPrev.wait(NUM_SUBSTEPS*(step+1));
- *ring.recvFlagFromPrev = 0;
-
- incrementOpCounter(&args);
- }
-}
-
-#define THREADS 512
-#define UNROLL 8
-
-template<class FUNC, typename T>
-ncclResult_t RingReduceScatter(const void* sendbuff, void* recvbuff,
- const int count, ncclComm* comm, cudaStream_t stream) {
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, count*sizeof(T), cudaMemcpyDeviceToDevice, stream), ncclUnhandledCudaError);
- } else {
- KernelArgs<T> args;
- ArgsSetup(&args, sendbuff, recvbuff, 0, count, comm);
- LAUNCH_KERNEL(ReduceScatterKernel, THREADS, UNROLL, FUNC, T, args, stream);
- }
-
- return ncclSuccess;
-}
-
-template<typename T, template <typename> class RedOp>
-class ReduceScatter {
- public:
- static ncclResult_t entry(const void* sendbuff, void* recvbuff,
- int count, int /*root*/, ncclComm* comm, cudaStream_t stream) {
- return RingReduceScatter<RedOp<T>, T>(sendbuff, recvbuff, count, comm, stream);
- }
-};
-
-NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, int recvcount,
- ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
-ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, int recvcount,
- ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
- NCCLCHECK(ArgsCheck(sendbuff, recvbuff, recvcount, datatype, op, 0, comm, "ReduceScatter"));
- return enqueue<ReduceScatter>(sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream);
-}
-
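
To make the ring schedule above concrete: after one copy step and nranks-1 reduce steps, each rank holds the fully reduced chunk whose index equals its own rank. The following standalone simulation of that schedule is only an illustration (4 ranks, made-up contributions), not code from this patch.

#include <cstdio>

int main() {
  const int nranks = 4;
  // contrib[r][c]: rank r's input for chunk c (chosen so the full reduction
  // of any chunk is 1+2+3+4 = 10).
  int contrib[nranks][nranks];
  for (int r = 0; r < nranks; ++r)
    for (int c = 0; c < nranks; ++c) contrib[r][c] = r + 1;

  int inflight[nranks] = {0};   // partial sum each rank forwards to the next
  int result[nranks]   = {0};   // fully reduced chunk each rank keeps

  for (int s = 0; s < nranks; ++s) {
    int sendNext[nranks] = {0};
    for (int r = 0; r < nranks; ++r) {
      int chunk = ((r - s - 1) % nranks + nranks) % nranks; // chunk handled at step s
      int partial = contrib[r][chunk] + (s == 0 ? 0 : inflight[r]);
      if (s == nranks - 1) result[r] = partial;             // final local reduce
      else sendNext[(r + 1) % nranks] = partial;            // forward to next rank
    }
    for (int r = 0; r < nranks; ++r) inflight[r] = sendNext[r];
  }
  for (int r = 0; r < nranks; ++r)
    std::printf("rank %d owns reduced chunk %d = %d\n", r, r, result[r]);
  return 0;
}
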
diff --git a/src/ring.cu b/src/ring.cu
new file mode 100644
index 0000000..e9e9573
--- /dev/null
+++ b/src/ring.cu
@@ -0,0 +1,70 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "ring.h"
+#include "param.h"
+
+NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
+
+ncclResult_t initRing(struct ncclComm* comm, int ringid) {
+ struct ncclRing* ring = comm->rings+ringid;
+ ring->id = ringid;
+
+ // Setup intermediate buffering
+ ring->buffSize = ncclParamBuffsize();
+
+ const int sendSize = ring->devMemSendSize = sizeof(struct ncclSendMem);
+ struct ncclSendMem* sendMem;
+ NCCLCHECK(ncclCudaCalloc((char**)&sendMem, sendSize));
+ ring->devMemSend = sendMem;
+
+ const int recvSize = ring->devMemRecvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+ struct ncclRecvMem* recvMem;
+ NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize));
+ ring->devMemRecv = recvMem;
+
+ TRACE(INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
+
+ // Pre-configure send/recv pointers. These are the defaults; they may change later.
+ ring->recv.conn.buff = recvMem->buff;
+ ring->recv.conn.llBuff = recvMem->llBuff;
+ ring->recv.conn.tail = &recvMem->tail;
+ ring->recv.conn.opCount = &recvMem->opCount;
+ ring->recv.conn.direct = 0;
+ ring->send.conn.head = &sendMem->head;
+ ring->send.conn.llHead = &sendMem->llHead;
+ ring->send.conn.direct = 0;
+ ring->send.conn.llStep = 0;
+ ring->send.conn.llLastCleaning = 0;
+
+ // Ring index to user rank table.
+ NCCLCHECK(ncclCudaCalloc(&ring->devUserRanks, comm->nRanks));
+ NCCLCHECK(ncclCalloc(&ring->userRanks, comm->nRanks));
+
+ // Per-ring operation list.
+ NCCLCHECK(ncclCudaHostAlloc((void**)&ring->collectives, (void**)&ring->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
+ return ncclSuccess;
+}
+
+ncclResult_t freeRing(struct ncclRing* ring) {
+ // Intermediate buffering
+ CUDACHECK(cudaFree(ring->devMemSend));
+ CUDACHECK(cudaFree(ring->devMemRecv));
+
+ // Index to rank table
+ free(ring->userRanks);
+ CUDACHECK(cudaFree(ring->devUserRanks));
+
+ // Operation list
+ NCCLCHECK(ncclCudaHostFree(ring->collectives));
+
+ // Free transport proxy resources
+ if (ring->send.transportResources) NCCLCHECK(ring->send.transport->send.free(ring->send.transportResources));
+ NCCLCHECK(transportDestroyProxy(&ring->send));
+ if (ring->recv.transportResources) NCCLCHECK(ring->recv.transport->recv.free(ring->recv.transportResources));
+ NCCLCHECK(transportDestroyProxy(&ring->recv));
+ return ncclSuccess;
+}
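
The NCCL_PARAM(Buffsize, "BUFFSIZE", ...) line above wires a tunable to an environment variable with a compiled-in default. The snippet below is a simplified, illustrative equivalent of that idea, not the actual NCCL macro; the default value shown is arbitrary.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

static int64_t paramFromEnv(const char* name, int64_t deflt) {
  const char* str = getenv(name);
  if (str == nullptr) return deflt;           // unset: keep the default
  char* end;
  int64_t v = strtoll(str, &end, 0);
  return (end == str) ? deflt : v;            // unparsable: keep the default
}

int main() {
  int64_t buffSize = paramFromEnv("NCCL_BUFFSIZE", 4 << 20); // illustrative default
  std::printf("using buffer size %lld bytes\n", (long long)buffSize);
  return 0;
}
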
diff --git a/src/transport.cu b/src/transport.cu
new file mode 100644
index 0000000..f5f9d75
--- /dev/null
+++ b/src/transport.cu
@@ -0,0 +1,187 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "common_coll.h"
+
+extern struct ncclTransport p2pTransport;
+extern struct ncclTransport shmTransport;
+extern struct ncclTransport netTransport;
+
+struct ncclTransport ncclTransports[NTRANSPORTS] = {
+ p2pTransport,
+ shmTransport,
+ netTransport,
+};
+
+static void FifoPullArgs(struct transportProxyInfo* info, struct ncclProxyArgs *args) {
+ struct ncclProxyArgs *fifoArgs = info->argsFifo + (info->argsFifoHead % TRANSPORT_PROXY_FIFO_SIZE);
+ pthread_mutex_lock(&info->mutex);
+ while (fifoArgs->active == 0)
+ pthread_cond_wait(&info->cond, &info->mutex);
+ __sync_synchronize();
+ memcpy(args, fifoArgs, sizeof(struct ncclProxyArgs));
+ __sync_synchronize();
+ fifoArgs->active = 0;
+ pthread_cond_signal(&info->cond);
+ pthread_mutex_unlock(&info->mutex);
+ info->argsFifoHead++;
+}
+
+static struct ncclProxyArgs* FifoGetNextArgs(struct transportProxyInfo* info) {
+ if (info == NULL) return NULL;
+ struct ncclProxyArgs* fifoArgs = info->argsFifo + (info->argsFifoTail % TRANSPORT_PROXY_FIFO_SIZE);
+ pthread_mutex_lock(&info->mutex);
+ while (fifoArgs->active == 1)
+ pthread_cond_wait(&info->cond, &info->mutex);
+ pthread_mutex_unlock(&info->mutex);
+ info->argsFifoTail++;
+ return fifoArgs;
+}
+
+static void FifoPushArgs(struct transportProxyInfo* info) {
+ if (info == NULL) return;
+
+ struct ncclProxyArgs* fifoArgs = info->argsFifo + ((info->argsFifoTail-1) % TRANSPORT_PROXY_FIFO_SIZE);
+ if (fifoArgs->active == 0) return;
+
+ pthread_mutex_lock(&info->mutex);
+ pthread_cond_signal(&info->cond);
+ pthread_mutex_unlock(&info->mutex);
+}
+
+static void WaitProxyReady(struct transportProxyInfo* info) {
+ pthread_mutex_lock(&info->mutex);
+ while (info->proxyReady == 0)
+ pthread_cond_wait(&info->cond, &info->mutex);
+ pthread_mutex_unlock(&info->mutex);
+}
+
+static void SetProxyReady(struct transportProxyInfo* info) {
+ pthread_mutex_lock(&info->mutex);
+ info->proxyReady = 1;
+ pthread_cond_signal(&info->cond);
+ pthread_mutex_unlock(&info->mutex);
+}
+
+static void StopProxy(struct transportProxyInfo* info) {
+ struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);
+ fifoArgs->active = -1;
+ FifoPushArgs(info);
+}
+
+#define RECV 0
+#define SEND 1
+
+static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks) {
+ enum proxyMode mode = proxyPatternMode(pattern);
+ if (mode == proxyRing) return true;
+
+ /* In chains, one rank does not need a proxy. Let's figure out which one it is */
+ int root = proxyPatternRoot(pattern);
+ // Which index in the reorganized rings should we compare root against?
+ const int myrank = 0, nextrank = 1, prevrank = nranks-1;
+ int index = mode == proxyFrom ?
+ /* no recv / no send if root = */
+ /* bcast */ (type == RECV ? myrank : nextrank ):
+ /* reduce */ (type == RECV ? prevrank : myrank );
+ int rank = ring->userRanks[index];
+ return (root != rank);
+}
+
+static void SaveProxy(struct ncclConnector* connector, struct ncclProxyArgs* args, int needProxy) {
+ struct transportProxyInfo* info = connector->proxyInfo;
+ if (info == NULL) return;
+ struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);
+ args->needProxy = needProxy;
+ __sync_synchronize();
+ memcpy(fifoArgs, args, sizeof(struct ncclProxyArgs));
+ __sync_synchronize();
+ fifoArgs->active = 1;
+}
+
+ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t nbytes, int pattern, struct ncclComm* comm) {
+ int llMode, nrings, nthreads;
+ ncclGetCollResource(comm, nbytes, &nrings, &nthreads, &llMode);
+ nbytes = llMode ? nbytes * 2 : nbytes;
+ substeps = llMode ? 1 : substeps;
+ subchunks = llMode ? NCCL_LL_CHUNKS : subchunks;
+ int buffSize = llMode ? NCCL_LL_BUFF_SIZE : comm->rings[0].buffSize;
+
+ int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow
+ int nsteps = nstepsPerRound * nrounds * substeps;
+ TRACE(NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, substeps, subchunks, nrounds, nsteps, comm);
+ TRACE(NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
+ for (int r=0; r<nrings; r++) {
+ struct ncclRing* ring = comm->rings+((comm->myParams->gridDim.x+r)%comm->nRings);
+ struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 };
+ SaveProxy(&ring->recv, &args, NeedProxy(RECV, pattern, ring, comm->nRanks));
+ SaveProxy(&ring->send, &args, NeedProxy(SEND, pattern, ring, comm->nRanks));
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t transportStartProxies(ncclComm* comm) {
+ for (int r=0; r<comm->nRings; r++) {
+ FifoPushArgs(comm->rings[r].send.proxyInfo);
+ FifoPushArgs(comm->rings[r].recv.proxyInfo);
+ }
+ pthread_yield(); // Let other threads run
+ return ncclSuccess;
+}
+
+void* persistentThread(void *opaqueInfo) {
+ struct transportProxyInfo* info = (struct transportProxyInfo*)opaqueInfo;
+ // We need to initialize the context before launching any NCCL cuda kernel,
+ // otherwise we would create it during the first cudaMemcpyAsync inside the
+ // proxy function and that would cause a deadlock
+ cudaSetDevice(info->comm->cudaDev);
+ // Signal the main thread the context is created and it can proceed.
+ SetProxyReady(info);
+ while (1) {
+ struct ncclProxyArgs args;
+ FifoPullArgs(info, &args);
+ if (args.active == -1) {
+ // Main thread asked to stop
+ return NULL;
+ }
+ ncclResult_t res = info->func(&args);
+ if (res != ncclSuccess) {
+ WARN("%s:%d -> %d [Proxy thread error]", __FILE__, __LINE__, res);
+ }
+ }
+}
+
+ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm) {
+ struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send;
+ threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy);
+ if (proxyfunc) {
+ TRACE(NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
+ struct transportProxyInfo* info;
+ NCCLCHECK(ncclCalloc(&info, 1));
+ connector->proxyInfo = info;
+ info->comm = comm;
+ info->cond = PTHREAD_COND_INITIALIZER;
+ info->mutex = PTHREAD_MUTEX_INITIALIZER;
+ info->func = proxyfunc;
+ info->argsFifoHead = info->argsFifoTail = 0;
+ info->proxyReady = 0;
+ pthread_create(&connector->proxyInfo->thread, NULL, persistentThread, info);
+ // Wait for thread to initialize its CUDA context.
+ WaitProxyReady(info);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t transportDestroyProxy(struct ncclConnector* connector) {
+ if (connector->proxyInfo) {
+ StopProxy(connector->proxyInfo);
+ pthread_join(connector->proxyInfo->thread, NULL);
+ free(connector->proxyInfo);
+ connector->proxyInfo = NULL;
+ }
+ return ncclSuccess;
+}
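
FifoPullArgs / FifoGetNextArgs / FifoPushArgs above implement a slot-based handshake between the main thread and the proxy thread, guarded by one mutex/condvar pair. The reduced single-slot sketch below only illustrates that pattern; all names are made up.

#include <pthread.h>
#include <cstdio>

struct Slot { pthread_mutex_t m; pthread_cond_t c; int active; int payload; };

void* consumer(void* arg) {
  Slot* s = (Slot*)arg;
  pthread_mutex_lock(&s->m);
  while (s->active == 0) pthread_cond_wait(&s->c, &s->m);  // wait for work
  std::printf("consumed payload %d\n", s->payload);
  s->active = 0;                                           // slot reusable again
  pthread_cond_signal(&s->c);
  pthread_mutex_unlock(&s->m);
  return nullptr;
}

int main() {
  Slot s = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0};
  pthread_t t;
  pthread_create(&t, nullptr, consumer, &s);
  pthread_mutex_lock(&s.m);
  while (s.active == 1) pthread_cond_wait(&s.c, &s.m);     // wait for a free slot
  s.payload = 42;
  s.active = 1;                                            // publish the work
  pthread_cond_signal(&s.c);
  pthread_mutex_unlock(&s.m);
  pthread_join(t, nullptr);
  return 0;
}
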
diff --git a/src/transport/net.cu b/src/transport/net.cu
new file mode 100644
index 0000000..ed62a66
--- /dev/null
+++ b/src/transport/net.cu
@@ -0,0 +1,521 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "transport.h"
+#include "nvmlwrap.h"
+#include "net.h"
+#include "param.h"
+#include "nvlink.h"
+#include <cuda_runtime.h>
+#include <assert.h>
+
+#define NET_MAX_IFS 16
+
+// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit)
+#define NET_BITS_PER_IF 3
+#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)
+static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
+
+struct netInfo {
+ int rank;
+ int ndev;
+ short scores[NET_MAX_IFS];
+};
+
+struct netConnectInfo {
+ ncclNetHandle_t netHandle;
+};
+
+struct netSendResources {
+ void* netSendComm;
+ struct ncclSendMem* hostSendMem;
+ struct ncclRecvMem* hostRecvMem;
+ struct ncclSendMem* devHostSendMem;
+ struct ncclRecvMem* devHostRecvMem;
+ struct ncclSendMem* hostDevMem;
+ int netDev;
+ bool cudaSupport;
+ struct ncclRecvMem* devNetMem;
+ uint64_t llStep;
+ uint64_t llLastCleaning;
+};
+
+struct netRecvResources {
+ void* netListenComm;
+ void* netRecvComm;
+ struct ncclSendMem* hostSendMem;
+ struct ncclRecvMem* hostRecvMem;
+ struct ncclSendMem* devHostSendMem;
+ struct ncclRecvMem* devHostRecvMem;
+ struct ncclRecvMem* hostDevMem;
+ int netDev;
+ bool cudaSupport;
+ uint64_t llStep;
+ uint64_t llLastCleaning;
+};
+
+/* Fill information necessary to exchange between ranks to choose whether or not
+ * to use this transport */
+ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
+ struct netInfo* info = (struct netInfo*)opaqueInfo;
+ static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large");
+ info->rank = rank;
+ int *scores;
+ NCCLCHECK(ncclNetDevices(&info->ndev, &scores));
+ if (info->ndev == 0) {
+ WARN("Error : Network returned 0 devices");
+ return ncclSystemError;
+ }
+ if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS;
+ for (int d=0; d<info->ndev; d++) info->scores[d] = scores[d];
+ free(scores);
+ return ncclSuccess;
+}
+
+/* Determine if we can communicate with the peer */
+ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
+ ret[0] = 0;
+ struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
+ for (int d=0; d<myInfo->ndev; d++) {
+ // Keep 3 bits of score info per dev
+ ret[0] |= ((myInfo->scores[d] & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
+ }
+ return ncclSuccess;
+}
+
+static inline int groupBestStart(int nranks, int* groups, int group, ncclTvalue_t* values, int card, int minScore) {
+ int bestRank = -1;
+ int bestScore = 0;
+ for (int rank=0; rank<nranks; rank++) {
+ if (groups[rank] != group) continue;
+ for (int i=0; i<nranks; i++) {
+ ncclTvalue_t netValue = values[rank*nranks+i];
+ if (netValue != 0) {
+ ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
+ if (score >= minScore && score > bestScore) {
+ bestScore = score;
+ bestRank = rank;
+ }
+ // All other values should be the same, stop here for this rank
+ break;
+ }
+ }
+ }
+ return bestRank;
+}
+static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroups, int startSubGroup, int startRank, ncclTvalue_t* values, int card, int minScore) {
+ // For the last rank, we don't need the absolute best score, just to be within minScore.
+ for (int rank=nranks-1; rank>=0; rank--) {
+ if (groups[rank] != group) continue;
+ if (startSubGroup != -1 && startSubGroup == subgroups[rank]) continue;
+ if (startRank == rank) continue;
+ for (int i=0; i<nranks; i++) {
+ ncclTvalue_t netValue = values[rank*nranks+i];
+ if (netValue != 0) {
+ ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
+ if (score >= minScore) {
+ return rank;
+ }
+ // All other values should be the same, stop here for this rank
+ break;
+ }
+ }
+ }
+ return -1;
+}
+
+
+ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
+ int nGroups = groups[nranks-1] + 1;
+ int cardUsed[NET_MAX_IFS*nGroups];
+ for (int c=0; c<NET_MAX_IFS*nGroups; c++) cardUsed[c] = 0;
+
+ for (int ring = 0; ring<*nringsRet; ring++) {
+ int starts[nGroups];
+ int ends[nGroups];
+ for (int group = 0; group<nGroups; group++) {
+ int nranksInGroup = 0;
+ int nsubGroups = 0;
+ for (int rank=0; rank<nranks; rank++) if (groups[rank] == group) {
+ nranksInGroup++;
+ nsubGroups = std::max(subgroups[rank], nsubGroups);
+ }
+ starts[group] = ends[group] = -1;
+ // Receive on the rank closest to the NIC
+ for (int card=0; card<NET_MAX_IFS; card++) {
+ if (cardUsed[group*NET_MAX_IFS+card] == 1) continue;
+ int start = groupBestStart(nranks, groups, group, values, card, minScore);
+ // Send from any rank, ideally one in a different subgroup that is also close to the NIC.
+ int end = (nranksInGroup == 1) ? start
+ : groupBestEnd(nranks, groups, group, subgroups, nsubGroups ? subgroups[start] : -1, start, values, card, minScore);
+ //printf("Ring %d, Minscore %d, Card %d, group %d, start = %d, end = %d\n", ring, minScore, card, group, start, end);
+ if (start != -1 && end != -1) {
+ cardUsed[group*NET_MAX_IFS+card] = 1;
+ starts[group] = start;
+ ends[group] = end;
+ break;
+ }
+ }
+ if (starts[group] == -1 || ends[group] == -1) {
+ *nringsRet = ring;
+ return ncclSuccess;
+ }
+ }
+ // Link groups together
+ for (int group = 0; group<nGroups; group++) {
+ int nextGroup = (group+1)%nGroups;
+ next[ring*nranks+ends[group]] = starts[nextGroup];
+ prev[ring*nranks+starts[nextGroup]] = ends[group];
+ }
+ }
+ return ncclSuccess;
+}
+
+int getDev(int ringId, int nDev, short* scores) {
+ int maxScore = 0;
+ for (int d=0; d<nDev; d++) if (scores[d] > maxScore) maxScore = scores[d];
+ int skip = ringId+1;
+ while (skip) {
+ for (int d=0; d<nDev; d++) {
+ if (scores[d] == maxScore) {
+ skip--;
+ if (skip == 0) return d;
+ }
+ }
+ }
+ return 0;
+}
+
+NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
+
+/* Determine if we will use this transport for this peer and return connect
+ * information for this peer */
+ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ struct netSendResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ ring->send.transportResources = resources;
+
+ struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
+ resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->scores);
+ resources->cudaSupport = false;
+
+ // Get user's GDR READ setting
+ int gdrReadParam = ncclParamNetGdrRead();
+
+ // Determine whether the GPU has NVLink
+ int cudaDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+ int nvlinks = getNumNvlinks(busId);
+
+ // Enable GDR read when:
+ // 1) user sets it, or
+ // 2) we are on an NVSwitch platform (i.e. no P2P traffic over a PCI-E switch) AND the GPU is Volta
+ bool enableGdrRead = (gdrReadParam > 0) || (nvlinks >= CONNECT_NVSWITCH && ncclCudaCompCap() > 6 && gdrReadParam != 0);
+ if (enableGdrRead) {
+ int flags;
+ NCCLCHECK(ncclNetPtrSupport(resources->netDev, &flags));
+ if (flags & NCCL_PTR_CUDA)
+ resources->cudaSupport = true;
+ }
+ if (resources->cudaSupport)
+ INFO(INIT|NET, "Net: enabling net device %d to read from rank %d", resources->netDev, myInfo->rank);
+
+ int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+ if (resources->cudaSupport) {
+ NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size));
+ }
+
+ NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, size));
+ NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, size));
+
+ return ncclSuccess;
+}
+
+ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ struct netRecvResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ ring->recv.transportResources = resources;
+
+ struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
+ resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->scores);
+ int flags;
+ NCCLCHECK(ncclNetPtrSupport(resources->netDev, &flags));
+ resources->cudaSupport = (flags & NCCL_PTR_CUDA) ? true : false;
+
+ int sendSize = sizeof(struct ncclSendMem);
+ NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
+
+ int recvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+ NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
+
+ struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo;
+ INFO(INIT|NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
+ resources->cudaSupport ? "/GDRDMA" : "",
+ (resources->hostDevMem != NULL) ? "/GDCopy" : "");
+ struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
+ NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
+ return ncclSuccess;
+}
+
+ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+ // Setup device pointers
+ struct netSendResources* resources = (struct netSendResources*)send->transportResources;
+
+ if (resources->cudaSupport) {
+ send->conn.buff = resources->devNetMem->buff;
+ // We don't use devMem for llMode because the CPU has to read the data
+ send->conn.llBuff = resources->devHostRecvMem->llBuff;
+ } else {
+ send->conn.buff = resources->devHostRecvMem->buff;
+ send->conn.llBuff = resources->devHostRecvMem->llBuff;
+ }
+ send->conn.tail = &resources->devHostRecvMem->tail;
+ send->conn.opCount = &resources->devHostRecvMem->opCount;
+ send->conn.fifo = resources->devHostRecvMem->sizesFifo;
+ send->conn.llFifo = resources->devHostRecvMem->llSizesFifo;
+
+ if (resources->hostDevMem == NULL) {
+ send->conn.head = &resources->devHostSendMem->head;
+ send->conn.llHead = &resources->devHostSendMem->llHead;
+ }
+
+ // Connect to remote peer
+ struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
+ NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
+ return ncclSuccess;
+}
+
+/* Connect to this peer */
+ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ // Setup device pointers
+ struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
+
+ recv->conn.head = &resources->devHostSendMem->head;
+ recv->conn.llHead = &resources->devHostSendMem->llHead;
+
+ if (resources->cudaSupport == false) {
+ recv->conn.buff = resources->devHostRecvMem->buff;
+ recv->conn.llBuff = resources->devHostRecvMem->llBuff;
+ }
+
+ if (resources->hostDevMem == NULL) {
+ recv->conn.tail = &resources->devHostRecvMem->tail;
+ recv->conn.opCount = &resources->devHostRecvMem->opCount;
+ }
+
+ // Finish connection establishment
+ NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
+ NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
+
+ return ncclSuccess;
+}
+
+ncclResult_t netSendFree(void* transportResources) {
+ struct netSendResources* resources = (struct netSendResources*)transportResources;
+ NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+ NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+ if (resources->cudaSupport)
+ CUDACHECK(cudaFree(resources->devNetMem));
+ NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
+ free(resources);
+ return ncclSuccess;
+}
+
+ncclResult_t netRecvFree(void* transportResources) {
+ struct netRecvResources* resources = (struct netRecvResources*)transportResources;
+ NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+ NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+ NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
+ free(resources);
+ return ncclSuccess;
+}
+
+ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
+ struct ncclRing* ring = args->ring;
+ struct netSendResources* resources = (struct netSendResources*) (ring->send.transportResources);
+ const int llMode = args->llMode;
+
+ volatile uint64_t* prevTail = &resources->hostRecvMem->tail;
+ struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem;
+ uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head;
+ struct ncclRecvMem* localMem = resources->cudaSupport ? resources->devNetMem : resources->hostRecvMem;
+ char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff;
+ int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
+ volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo;
+ int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
+ int sliceSize = buffSize / args->substeps;
+
+ assert(args->substeps <= SIZES_FIFO_SIZE);
+
+ uint64_t head = llMode ? resources->llStep : 0ULL;
+ uint64_t tail = llMode ? resources->llStep : 0ULL;
+ uint64_t end = head + args->nsteps;
+
+ int idle = 0;
+ void* requests[args->substeps];
+
+ if (!args->needProxy) goto nextColl;
+
+ TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
+ TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
+
+ // Update in case we skipped some collectives
+ if (llMode == 0) resources->hostRecvMem->opCount = args->opCount;
+
+ while (head < end) {
+ idle++;
+ if (llMode) {
+ if (tail < end && tail < head + args->substeps) {
+ int slot = tail%args->substeps;
+ int size = sizesFifo[slot];
+ if (size != 0) {
+ if (size == -1) size = 0;
+ uint32_t flag = tail + 1;
+ int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+ size = nFifoLines * sizeof(union ncclLLFifoLine);
+ union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+slot*sliceSize);
+ for (int i=0; i<nFifoLines; i++) {
+ volatile uint32_t *f1 = &lines[i].flag1;
+ volatile uint32_t *f2 = &lines[i].flag2;
+ while (f1[0] != flag || f2[0] != flag);
+ }
+ NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, ptrType, requests+slot));
+ sizesFifo[slot] = size;
+ tail++;
+ idle = 0;
+ }
+ }
+ } else while (tail < *prevTail) {
+ // Send through network
+ int slot = tail%args->substeps;
+ NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+slot*sliceSize, sizesFifo[slot], ptrType, requests+slot));
+ tail++;
+ idle = 0;
+ }
+ if (head < tail) {
+ int done;
+ int slot = head%args->substeps;
+ NCCLCHECK(ncclNetTest(requests[slot], &done, NULL));
+ if (done) {
+ if (llMode) {
+ sizesFifo[slot] = 0;
+ // Make sure size is reset to zero before we update the head.
+ __sync_synchronize();
+ }
+ head++;
+ *prevHead = head;
+ idle = 0;
+ }
+ }
+ if (idle) transportProxyIdle(idle);
+ }
+
+ // Reset
+ if (llMode == 0) *prevTail = 0;
+
+nextColl:
+ if (llMode) {
+ resources->llStep += args->nsteps;
+ // Don't forget to ack otherwise the GPU won't be able to push data.
+ *prevHead = resources->llStep;
+ if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+ memset(localBuff, 0, NCCL_LL_BUFF_SIZE);
+ resources->llStep += NCCL_LL_CHUNKS;
+ *prevHead = resources->llStep;
+ resources->llLastCleaning = resources->llStep;
+ }
+ }
+ return ncclSuccess;
+}
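
The LL branch of netSendProxy above spins on flag1/flag2 of every fifo line before handing the data to the network. The sketch below shows how a line is validated, assuming a 4x32-bit data1/flag1/data2/flag2 layout; the struct and function names here are illustrative, not the actual ncclLLFifoLine definition.

#include <cstdint>
#include <cstdio>

struct LLLine { uint32_t data1, flag1, data2, flag2; };

// Returns true once both flags carry the expected step value, i.e. the two
// 32-bit data words of this line are complete and can be consumed.
bool lineReady(const volatile LLLine* line, uint32_t flag) {
  return line->flag1 == flag && line->flag2 == flag;
}

int main() {
  LLLine line = {0xdeadbeefu, 7, 0xcafef00du, 7};
  uint32_t expectedFlag = 7;                    // e.g. tail + 1 as in the proxy
  while (!lineReady(&line, expectedFlag)) { }   // would spin if not yet written
  std::printf("data: %08x %08x\n", line.data1, line.data2);
  return 0;
}
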
+
+ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
+ struct ncclRing* ring = args->ring;
+ struct netRecvResources* resources = (struct netRecvResources*) (ring->recv.transportResources);
+ int llMode = args->llMode;
+
+ volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head;
+ struct ncclRecvMem* localMem = resources->cudaSupport ? ring->devMemRecv : resources->hostRecvMem;
+ char* localBuff = llMode ? localMem->llBuff : localMem->buff;
+ char* nextBuff = (resources->cudaSupport == false && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
+ int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
+ uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail;
+
+ int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
+ int sliceSize = buffSize / args->substeps;
+
+ uint64_t head = llMode ? resources->llStep : 0ULL;
+ uint64_t tail = llMode ? resources->llStep : 0ULL;
+ uint64_t end = head + args->nsteps;
+
+ int idle = 0;
+ void* requests[args->substeps];
+
+ if (!args->needProxy) goto nextColl;
+
+ TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
+ TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
+
+ if (llMode == 0) {
+ // Waiting for next opCount is only needed before writing nextTail.
+ uint64_t* nextOpCount = resources->hostDevMem ? &resources->hostDevMem->opCount : &resources->hostRecvMem->opCount;
+ transportProxyWait([=] { return *nextOpCount >= args->opCount; });
+ }
+
+ while (head < end) {
+ idle++;
+ if ((tail < head + args->substeps) && (tail < *nextHead + args->substeps) && (tail < end)) {
+ int slot = tail%args->substeps;
+ NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+slot*sliceSize, sliceSize, ptrType, requests+slot));
+ tail++;
+ idle = 0;
+ }
+ if (tail > head) {
+ int done;
+ int slot = head%args->substeps;
+ int size;
+ NCCLCHECK(ncclNetTest(requests[slot], &done, &size));
+ if (done) {
+ if (nextBuff) memcpy(nextBuff+slot*sliceSize, localBuff+slot*sliceSize, size);
+ head++;
+ if (llMode == 0) {
+ if (ptrType == NCCL_PTR_CUDA) ncclNetFlush(resources->netRecvComm, localBuff+slot*sliceSize, size);
+ *nextTail = head;
+ }
+ idle = 0;
+ }
+ }
+ if (idle) transportProxyIdle(idle);
+ }
+
+ // Wait for last ack and reset
+ if (llMode == 0) {
+ transportProxyWait([=] { return *nextHead == head; });
+ *nextHead = 0;
+ }
+
+nextColl:
+ if (llMode) {
+ resources->llStep += args->nsteps;
+ if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+ resources->llStep += NCCL_LL_CHUNKS;
+ while (*nextHead < resources->llStep);
+ resources->llLastCleaning = resources->llStep;
+ }
+ }
+ return ncclSuccess;
+}
+
+struct ncclTransport netTransport = {
+ "NET",
+ netFillInfo,
+ netCanConnect,
+ netGetRings,
+ { netSendSetup, netSendConnect, netSendFree, netSendProxy },
+ { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy }
+};
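
netCanConnect above packs one 3-bit score per network interface into a single 64-bit ncclTvalue_t, and groupBestStart/groupBestEnd unpack them. The standalone sketch below only illustrates that encoding; the helper names are made up.

#include <cstdint>
#include <cstdio>

constexpr int BITS_PER_IF = 3;
constexpr uint64_t IF_MASK = (1u << BITS_PER_IF) - 1;       // 0b111

uint64_t packScores(const int* scores, int ndev) {
  uint64_t v = 0;
  for (int d = 0; d < ndev; ++d)
    v |= (uint64_t)(scores[d] & IF_MASK) << (BITS_PER_IF * d);
  return v;
}

int scoreOf(uint64_t v, int dev) {
  return (int)((v >> (BITS_PER_IF * dev)) & IF_MASK);
}

int main() {
  int scores[3] = {5, 1, 7};                 // scores must fit in 3 bits (0..7)
  uint64_t packed = packScores(scores, 3);
  for (int d = 0; d < 3; ++d)
    std::printf("if %d -> score %d\n", d, scoreOf(packed, d));
  return 0;
}
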
diff --git a/src/transport/net_ib.cu b/src/transport/net_ib.cu
new file mode 100644
index 0000000..7b1fc99
--- /dev/null
+++ b/src/transport/net_ib.cu
@@ -0,0 +1,926 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "core.h"
+#include "socket.h"
+#include "net.h"
+#include "topo.h"
+#include "utils.h"
+#include "param.h"
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <poll.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "ibvwrap.h"
+
+#define USE_RDMA_WRITE 1
+#define USE_RDMA_SEND_INLINE 0
+#define MAXNAMESIZE 64
+static char ncclIbIfName[MAX_IF_NAME_SIZE];
+static union socketAddress ncclIbIfAddr;
+static int ncclNIbDevs = -1;
+struct ncclIbDev {
+ int device;
+ uint8_t port;
+ ibv_context* context;
+ char devName[MAXNAMESIZE];
+};
+
+#define MAX_IB_PORT 15
+struct userIbDev {
+ char devName[MAXNAMESIZE];
+ uint16_t port_en;
+};
+
+#define MAX_IB_DEVS 16
+struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
+struct userIbDev userIbDevs[MAX_IB_DEVS];
+pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
+
+NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0);
+NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14);
+NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
+NCCL_PARAM(IbSl, "IB_SL", 0);
+NCCL_PARAM(IbTc, "IB_TC", 0);
+
+// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
+// allocated on separate pages as those pages will be marked DONTFORK
+// and if they are shared, that could cause a crash in a child process
+static ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
+ size_t page_size = sysconf(_SC_PAGESIZE);
+ void* p;
+ int size_aligned = ROUNDUP(size, page_size);
+ int ret = posix_memalign(&p, page_size, size_aligned);
+ if (ret != 0) return ncclSystemError;
+ memset(p, 0, size);
+ *ptr = p;
+ return ncclSuccess;
+}
+
+pthread_t ncclIbAsyncThread;
+static void* ncclIbAsyncThreadMain(void* args) {
+ struct ibv_context* context = (struct ibv_context*)args;
+ while (1) {
+ struct ibv_async_event event;
+ if (ncclSuccess != wrap_ibv_get_async_event(context, &event)) { break; }
+ char *str;
+ if (ncclSuccess != wrap_ibv_event_type_str(&str, event.event_type)) { break; }
+ if (event.event_type != IBV_EVENT_COMM_EST)
+ WARN("NET/IB : Got async event : %s", str);
+ if (ncclSuccess != wrap_ibv_ack_async_event(&event)) { break; }
+ }
+ return NULL;
+}
+
+static void initDevices() {
+ if(wrap_ibv_symbols() != ncclSuccess) { return; }
+ if (ncclNIbDevs == -1) {
+ pthread_mutex_lock(&ncclIbLock);
+ wrap_ibv_fork_init();
+ if (ncclNIbDevs == -1) {
+ ncclNIbDevs = 0;
+ if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
+ WARN("NET/IB : No IP interface found.");
+ return;
+ }
+ INFO(INIT|NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
+
+ // Detect IB cards
+ int nIbDevs;
+ struct ibv_device** devices;
+
+ // Check if user defined which IB device:port to use
+ char* userIbEnv = getenv("NCCL_IB_HCA");
+ struct netIf userIfs[MAX_IB_DEVS];
+ bool searchNot = userIbEnv && userIbEnv[0] == '^';
+ int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
+
+ if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return;
+
+ for (int d=0; d<nIbDevs; d++) {
+ struct ibv_context * context;
+ if (ncclSuccess != wrap_ibv_open_device(&context, devices[d])) {
+ WARN("NET/IB : Unable to open device %s", devices[d]->name);
+ continue;
+ }
+ int found = 0;
+ if (context) {
+ struct ibv_device_attr devAttr;
+ if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
+ WARN("NET/IB : Unable to query device %s", devices[d]->name);
+ continue;
+ }
+ for (int port = 1; port <= devAttr.phys_port_cnt; port++) {
+ struct ibv_port_attr portAttr;
+ if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) {
+ WARN("NET/IB : Unable to query port %d", port);
+ continue;
+ }
+ if (portAttr.state != IBV_PORT_ACTIVE) continue;
+ if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
+ && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
+
+ // check against user specified HCAs/ports
+ if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
+ continue;
+ }
+ INFO(INIT|NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
+ portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+ ncclIbDevs[ncclNIbDevs].device = d;
+ ncclIbDevs[ncclNIbDevs].port = port;
+ ncclIbDevs[ncclNIbDevs].context = context;
+ strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
+ ncclNIbDevs++;
+ found++;
+ pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
+ }
+
+ if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return; } }
+ }
+ }
+ if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return; };
+ }
+
+ pthread_mutex_unlock(&ncclIbLock);
+ }
+}
+
+ncclResult_t ncclIbDevices(int* ndev, int** scores) {
+ initDevices();
+ *ndev = ncclNIbDevs;
+ int cudaDev;
+ cudaGetDevice(&cudaDev);
+ char* cudaPath;
+ ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath);
+ int* sc;
+ NCCLCHECK(ncclCalloc(&sc, ncclNIbDevs));
+ char line[1024];
+ sprintf(line, "CUDA Dev %d, IB Ports : ", cudaDev);
+ for (int d=0; d<ncclNIbDevs; d++) {
+ char* mlxPath;
+ ncclResult_t err2 = getMlxPath(ncclIbDevs[d].devName, &mlxPath);
+ int distance = (err1 != ncclSuccess || err2 != ncclSuccess || mlxPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(mlxPath, cudaPath);
+ sprintf(line+strlen(line), "%s/%d(%s) ", ncclIbDevs[d].devName, ncclIbDevs[d].port, pathDists[distance]);
+ sc[d] = 1+PATH_SOC-distance;
+ if (err2 == ncclSuccess) free(mlxPath);
+ }
+ INFO(INIT|NET,"%s", line);
+ if (err1 == ncclSuccess) free(cudaPath);
+ *scores = sc;
+ return ncclSuccess;
+}
+
+// Detect whether GDR can work on a given NIC with the current CUDA device
+// Returns :
+// ncclSuccess : GDR works
+// ncclSystemError : no module or module loaded but not supported by GPU
+ncclResult_t ncclIbGdrSupport(int ibDev) {
+ static int moduleLoaded = -1;
+ if (moduleLoaded == -1) {
+ moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
+ }
+ if (moduleLoaded == 0) return ncclSystemError;
+ ncclResult_t ret = ncclSystemError;
+ void* ptr;
+ if (cudaMalloc(&ptr, sizeof(int)) == cudaSuccess) {
+ struct ibv_mr* mr;
+ struct ibv_pd* pd;
+ if (wrap_ibv_alloc_pd(&pd, ncclIbDevs[ibDev].context) == ncclSuccess) {
+ if ((mr = wrap_direct_ibv_reg_mr(pd, ptr, sizeof(int), IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)) != NULL) {
+ ret = ncclSuccess;
+ wrap_ibv_dereg_mr(mr);
+ }
+ wrap_ibv_dealloc_pd(pd);
+ }
+ cudaFree(ptr);
+ }
+ return ret;
+}
+
+NCCL_PARAM(IbGdrLevel, "IB_GDR_LEVEL", -2);
+NCCL_PARAM(IbCudaSupport, "IB_CUDA_SUPPORT", -2);
+
+ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
+ initDevices();
+ *supportedTypes = NCCL_PTR_HOST;
+
+ int cudaDev;
+ if (cudaGetDevice(&cudaDev) != cudaSuccess) return ncclSuccess;
+
+ int ibGdrLevel = PATH_PHB;
+ if (ncclParamIbCudaSupport() != -2) ibGdrLevel = ncclParamIbCudaSupport() ? PATH_SOC + 1 : 0;
+ if (ncclParamIbGdrLevel() != -2) ibGdrLevel = ncclParamIbGdrLevel();
+ if (ibGdrLevel > 0) {
+ int gdrSupport = ncclIbGdrSupport(dev);
+ if (gdrSupport > 0) {
+ INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (%s)", cudaDev, ncclIbDevs[dev].devName, gdrSupport == 1 ? "no module" : "not supported by GPU");
+ ibGdrLevel = 0;
+ }
+ }
+
+ if (ibGdrLevel <= 0) return ncclSuccess;
+
+ char* cudaPath;
+ if (getCudaPath(cudaDev, &cudaPath) != ncclSuccess) return ncclSuccess;
+ char* mlxPath;
+ if (getMlxPath(ncclIbDevs[dev].devName, &mlxPath) != ncclSuccess) { free(cudaPath); return ncclSuccess; }
+ int distance = (mlxPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(mlxPath, cudaPath);
+ free(mlxPath); free(cudaPath);
+ if (distance < ibGdrLevel) {
+ *supportedTypes |= NCCL_PTR_CUDA;
+ } else {
+ INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (distance %d >= %d)", cudaDev, ncclIbDevs[dev].devName, distance, ibGdrLevel);
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t GetSocketAddr(union socketAddress* addr) {
+ if (ncclNIbDevs == -1) initDevices();
+ memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
+ return ncclSuccess;
+}
+
+#define MAX_REQUESTS 128
+
+struct ncclIbQpInfo {
+ uint32_t lid;
+ uint8_t ib_port;
+ uint32_t qpn;
+
+ // For RoCE
+ uint64_t spn;
+ uint64_t iid;
+ enum ibv_mtu mtu;
+
+ // FIFO RDMA info
+ uint32_t fifoRkey;
+ uint64_t fifoAddr;
+};
+
+struct ncclIbHandle {
+ union socketAddress connectAddr;
+};
+
+struct ncclIbMr {
+ struct ibv_mr* mr;
+ int refcnt;
+};
+
+struct ncclIbVerbs {
+ struct ibv_pd* pd;
+ struct ibv_cq* cq;
+ struct ncclIbMr mrPool[MAX_REQUESTS];
+ int mrRotation;
+};
+
+struct ncclIbRequest {
+ int used;
+ int type;
+ struct ncclIbVerbs* verbs;
+ struct ncclIbMr * ibMr;
+ int done;
+ int size;
+ int free;
+};
+
+struct ncclIbListenComm {
+ int dev;
+ int fd;
+};
+
+struct ncclIbSendFifo {
+ uint64_t addr;
+ int size;
+ uint32_t seq;
+ uint32_t rkey;
+ uint32_t ready;
+};
+
+struct ncclIbSendComm {
+ struct ncclIbSendFifo fifo[MAX_REQUESTS];
+ struct ncclIbRequest reqs[MAX_REQUESTS];
+ uint32_t fifoHead;
+ int fd;
+ int ready;
+ struct ncclIbVerbs verbs;
+ struct ibv_qp* qp;
+ struct ibv_mr* fifoMr;
+};
+
+struct ncclIbGpuFlush {
+ int enabled;
+ int hostMem;
+ struct ibv_mr* hostMr;
+ struct ibv_sge sge;
+ struct ibv_qp* qp;
+};
+
+struct ncclIbRemFifo {
+ struct ncclIbSendFifo elems[MAX_REQUESTS];
+ uint64_t addr;
+ uint32_t rkey;
+ uint32_t tail;
+ uint32_t flags;
+ struct ibv_mr* mr;
+ struct ibv_sge sge;
+};
+
+struct ncclIbRecvComm {
+ struct ncclIbRemFifo remFifo;
+ struct ncclIbRequest reqs[MAX_REQUESTS];
+ int fd;
+ int ready;
+ struct ncclIbVerbs verbs;
+ struct ibv_qp* qp;
+ struct ncclIbGpuFlush gpuFlush;
+};
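+
+// How these structures fit together (informal summary): the sender keeps a local
+// fifo[] of ncclIbSendFifo slots plus fifoMr, which exposes that array to the
+// peer; the receiver mirrors it in remFifo and, for every Irecv, RDMA-writes one
+// slot with the destination address, rkey, size and sequence number. gpuFlush
+// holds a small loopback QP that ncclIbFlush uses to issue an RDMA read of GPU
+// memory into a one-byte host buffer, flushing GPU Direct RDMA data.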
+
+ncclResult_t ncclIbInitVerbs(ibv_context* ctx, struct ncclIbVerbs* verbs) {
+ NCCLCHECK(wrap_ibv_alloc_pd(&verbs->pd, ctx));
+ NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, MAX_REQUESTS, NULL, NULL, 0));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbDestroyVerbs(struct ncclIbVerbs* verbs) {
+ NCCLCHECK(wrap_ibv_destroy_cq(verbs->cq));
+ NCCLCHECK(wrap_ibv_dealloc_pd(verbs->pd));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int access_flags, struct ibv_qp** qp) {
+ struct ibv_qp_init_attr qpInitAttr;
+ memset(&qpInitAttr, 0, sizeof(struct ibv_qp_init_attr));
+ qpInitAttr.send_cq = verbs->cq;
+ qpInitAttr.recv_cq = verbs->cq;
+ qpInitAttr.qp_type = IBV_QPT_RC;
+ qpInitAttr.cap.max_send_wr = MAX_REQUESTS;
+ qpInitAttr.cap.max_recv_wr = MAX_REQUESTS;
+ qpInitAttr.cap.max_send_sge = 1;
+ qpInitAttr.cap.max_recv_sge = 1;
+ qpInitAttr.cap.max_inline_data = 0;
+ NCCLCHECK(wrap_ibv_create_qp(qp, verbs->pd, &qpInitAttr));
+ struct ibv_qp_attr qpAttr;
+ memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
+ qpAttr.qp_state = IBV_QPS_INIT;
+ qpAttr.pkey_index = 0;
+ qpAttr.port_num = ib_port;
+ qpAttr.qp_access_flags = access_flags;
+ NCCLCHECK(wrap_ibv_modify_qp(*qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbRtrQp(ibv_qp* qp, struct ncclIbQpInfo* info) {
+ struct ibv_qp_attr qpAttr;
+ memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
+ qpAttr.qp_state = IBV_QPS_RTR;
+ qpAttr.path_mtu = info->mtu;
+ qpAttr.dest_qp_num = info->qpn;
+ qpAttr.rq_psn = 0;
+ qpAttr.max_dest_rd_atomic = 1;
+ qpAttr.min_rnr_timer = 12;
+ if (info->lid == 0) {
+ qpAttr.ah_attr.is_global = 1;
+ qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->spn;
+ qpAttr.ah_attr.grh.dgid.global.interface_id = info->iid;
+ qpAttr.ah_attr.grh.flow_label = 0;
+ qpAttr.ah_attr.grh.sgid_index = ncclParamIbGidIndex();
+ qpAttr.ah_attr.grh.hop_limit = 255;
+ qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc();
+ } else {
+ qpAttr.ah_attr.is_global = 0;
+ qpAttr.ah_attr.dlid = info->lid;
+ }
+ qpAttr.ah_attr.sl = ncclParamIbSl();
+ qpAttr.ah_attr.src_path_bits = 0;
+ qpAttr.ah_attr.port_num = info->ib_port;
+ NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbRtsQp(ibv_qp* qp) {
+ struct ibv_qp_attr qpAttr;
+ memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
+ qpAttr.qp_state = IBV_QPS_RTS;
+ qpAttr.timeout = ncclParamIbTimeout();
+ qpAttr.retry_cnt = ncclParamIbRetryCnt();
+ qpAttr.rnr_retry = 7;
+ qpAttr.sq_psn = 0;
+ qpAttr.max_rd_atomic = 1;
+ NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC));
+ return ncclSuccess;
+}
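+
+// A queue pair only becomes usable after the standard RC sequence
+// RESET -> INIT -> RTR -> RTS; the three helpers above map onto it. A rough usage
+// sketch (hypothetical variable names) mirroring what ncclIbAccept() and
+// ncclSendCheck() do further down:
+//   struct ibv_qp* qp;
+//   NCCLCHECK(ncclIbCreateQp(ib_port, &verbs, IBV_ACCESS_REMOTE_WRITE, &qp)); // RESET -> INIT
+//   NCCLCHECK(ncclIbRtrQp(qp, &remoteQpInfo));                                // INIT  -> RTR
+//   NCCLCHECK(ncclIbRtsQp(qp));                                               // RTR   -> RTS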
+
+
+ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
+ struct ncclIbListenComm* comm;
+ NCCLCHECK(ncclCalloc(&comm, 1));
+ struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
+ static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large");
+ comm->dev = dev;
+ NCCLCHECK(GetSocketAddr(&(handle->connectAddr)));
+ NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
+ *listenComm = comm;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
+ struct ncclIbSendComm* comm;
+ NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
+
+ struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
+ NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));
+ *sendComm = comm;
+
+ // IB Setup
+ initDevices(); /* NOTE: This is needed for the ncclNet unit test, which bypasses NCCL initialization */
+ ibv_context* ctx = ncclIbDevs[dev].context;
+ NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs));
+ uint8_t ib_port = ncclIbDevs[dev].port;
+ NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, &comm->qp));
+
+ // Send my QP info to the receiver through the socket. We assume this small send won't block.
+ struct ibv_port_attr portAttr;
+ NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr));
+ struct ncclIbQpInfo qpInfo;
+ qpInfo.ib_port = ib_port;
+ qpInfo.qpn = comm->qp->qp_num;
+ qpInfo.mtu = portAttr.active_mtu;
+
+ // Prepare my fifo
+ NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+ qpInfo.fifoRkey = comm->fifoMr->rkey;
+ qpInfo.fifoAddr = (uint64_t)comm->fifo;
+
+ // RoCE support
+ qpInfo.lid = portAttr.lid;
+ if (qpInfo.lid) { // IB
+ INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
+ } else { // RoCE
+ union ibv_gid gid;
+ NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
+ qpInfo.spn = gid.global.subnet_prefix;
+ qpInfo.iid = gid.global.interface_id;
+ INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
+ }
+
+ NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo)));
+ return ncclSuccess;
+}
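+
+// Connection handshake, as implemented above and in ncclIbAccept/ncclSendCheck
+// below: the sender pushes its ncclIbQpInfo over the TCP socket right away; the
+// receiver reads it in ncclIbAccept, moves its own QP to RTR/RTS, and replies
+// with its own ncclIbQpInfo. The sender only consumes that reply lazily, on the
+// first ncclIbIsend, inside ncclSendCheck.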
+
+NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
+
+ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
+ struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
+ struct ncclIbRecvComm* rComm;
+ NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
+
+ struct sockaddr_in sockaddr;
+ socklen_t socklen = sizeof(struct sockaddr_in);
+ SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
+ struct ncclIbQpInfo remQpInfo;
+ NCCLCHECK(socketReceive(rComm->fd, &remQpInfo, sizeof(remQpInfo)));
+
+ // IB setup
+ ibv_context* ctx = ncclIbDevs[lComm->dev].context;
+ uint8_t ib_port = ncclIbDevs[lComm->dev].port;
+ struct ibv_port_attr portAttr;
+ NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr));
+ union ibv_gid gid;
+ NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
+
+ // QP Creation
+ NCCLCHECK(ncclIbInitVerbs(ctx, &rComm->verbs));
+ NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_REMOTE_WRITE, &rComm->qp));
+
+ // Adjust the MTU
+ remQpInfo.mtu = (enum ibv_mtu)std::min(remQpInfo.mtu, portAttr.active_mtu);
+
+ // Setup QP
+ struct ibv_qp* qp = rComm->qp;
+ NCCLCHECK(ncclIbRtrQp(qp, &remQpInfo));
+ NCCLCHECK(ncclIbRtsQp(qp));
+
+ // Retain remote fifo info and prepare my RDMA ops
+ rComm->remFifo.rkey = remQpInfo.fifoRkey;
+ rComm->remFifo.addr = remQpInfo.fifoAddr;
+ NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
+ rComm->remFifo.sge.length = sizeof(struct ncclIbSendFifo);
+ rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey;
+
+#if USE_RDMA_SEND_INLINE
+ // Determine whether the remFifo element data can be sent INLINE
+ struct ibv_qp_attr attr;
+ struct ibv_qp_init_attr init_attr;
+ NCCLCHECK(wrap_ibv_query_qp(qp, &attr, IBV_QP_CAP, &init_attr));
+ if (init_attr.cap.max_inline_data >= rComm->remFifo.sge.length) rComm->remFifo.flags = IBV_SEND_INLINE;
+#endif
+
+ // Allocate Flush dummy buffer for GPU Direct RDMA
+ rComm->gpuFlush.enabled = (ncclIbGdrSupport(lComm->dev) == 0) && (ncclParamIbGdrFlushDisable() == 0) ? 1 : 0;
+ if (rComm->gpuFlush.enabled) {
+ NCCLCHECK(wrap_ibv_reg_mr(&rComm->gpuFlush.hostMr, rComm->verbs.pd, &rComm->gpuFlush.hostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE));
+ rComm->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlush.hostMem;
+ rComm->gpuFlush.sge.length = 1;
+ rComm->gpuFlush.sge.lkey = rComm->gpuFlush.hostMr->lkey;
+ NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->gpuFlush.qp));
+ struct ncclIbQpInfo localQpInfo = {
+ .lid=portAttr.lid,
+ .ib_port=ib_port,
+ .qpn=rComm->gpuFlush.qp->qp_num,
+ .spn=gid.global.subnet_prefix,
+ .iid=gid.global.interface_id,
+ .mtu=portAttr.active_mtu
+ };
+ NCCLCHECK(ncclIbRtrQp(rComm->gpuFlush.qp, &localQpInfo));
+ NCCLCHECK(ncclIbRtsQp(rComm->gpuFlush.qp));
+ }
+
+ // Fill Handle
+ struct ncclIbQpInfo qpInfo = {
+ .lid=portAttr.lid,
+ .ib_port=ib_port,
+ .qpn=qp->qp_num,
+ .spn=gid.global.subnet_prefix,
+ .iid=gid.global.interface_id,
+ .mtu=remQpInfo.mtu
+ };
+
+ NCCLCHECK(socketSend(rComm->fd, &qpInfo, sizeof(qpInfo)));
+ *recvComm = rComm;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest** req) {
+ for (int i=0; i<MAX_REQUESTS; i++) {
+ struct ncclIbRequest* r = reqs+i;
+ if (r->used == 0) {
+ r->used = 1;
+ r->type = 0;
+ r->verbs = NULL;
+ r->ibMr = NULL;
+ r->done = 0;
+ r->size = -1;
+ r->free = 0;
+ *req = r;
+ return ncclSuccess;
+ }
+ }
+ WARN("NET/IB : unable to allocate requests");
+ *req = NULL;
+ return ncclInternalError;
+}
+
+ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
+ if (comm->ready == 0) {
+ struct ncclIbQpInfo remQpInfo;
+ struct ibv_qp* qp = comm->qp;
+ NCCLCHECK(socketReceive(comm->fd, &remQpInfo, sizeof(remQpInfo)));
+ NCCLCHECK(ncclIbRtrQp(qp, &remQpInfo));
+ NCCLCHECK(ncclIbRtsQp(qp));
+ int go = 1;
+ NCCLCHECK(socketSend(comm->fd, &go, sizeof(go)));
+ comm->ready = 1;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) {
+ if (comm->ready == 0) {
+ int go;
+ NCCLCHECK(socketReceive(comm->fd, &go, sizeof(go)));
+ comm->ready = 1;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbTest(void* request, int* done, int* size);
+
+#define REG_ALIGN (4096)
+
+// Cache previous MRs to avoid registering/unregistering for each Isend/Irecv
+ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct ncclIbMr** mrRet) {
+ uint64_t addr = (uint64_t)data;
+ int elem = -1;
+ assert(size > 0);
+
+ // Look for an already existing MR
+ for (int i=0; i<MAX_REQUESTS; i++) {
+ if (verbs->mrPool[i].mr == NULL) continue;
+ uint64_t regAddr = (uint64_t)verbs->mrPool[i].mr->addr;
+ uint64_t regSize = (uint64_t)verbs->mrPool[i].mr->length;
+ if (regAddr <= addr && addr+size <= regAddr+regSize) {
+ *mrRet = verbs->mrPool+i;
+ verbs->mrPool[i].refcnt++;
+ return ncclSuccess;
+ }
+ }
+
+ // Find an unused element
+ if (elem == -1) {
+ elem = (verbs->mrRotation++);
+ for (int i=0; i<MAX_REQUESTS; i++) {
+ elem %= MAX_REQUESTS;
+ if (verbs->mrPool[elem].refcnt > 0) elem++; else break;
+ }
+ if (verbs->mrPool[elem].refcnt > 0) {
+ WARN("NET/IB : memory register : no MR available");
+ return ncclInternalError;
+ }
+ }
+
+ assert(elem < MAX_REQUESTS);
+ assert(verbs->mrPool[elem].refcnt == 0);
+
+ // Deregister / register
+ uint64_t regAddr = addr & (~(REG_ALIGN-1));
+ uint64_t regSize = addr+size - regAddr;
+ regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN;
+ if (verbs->mrPool[elem].mr) NCCLCHECK(wrap_ibv_dereg_mr(verbs->mrPool[elem].mr));
+ NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+ *mrRet = verbs->mrPool+elem;
+ verbs->mrPool[elem].refcnt++;
+ TRACE(INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
+ return ncclSuccess;
+}
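+
+// Registration is page-granular. A hypothetical example with REG_ALIGN=4096:
+// for data=0x10010 and size=0x20, regAddr = 0x10010 & ~0xFFF = 0x10000 and
+// regSize = (0x10030 - 0x10000) rounded up = 0x1000, so the whole containing
+// page is registered and later lookups for nearby buffers on the same page hit
+// the cached MR instead of re-registering.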
+
+ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** request) {
+ struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
+ NCCLCHECK(ncclSendCheck(comm));
+
+ struct ncclIbRequest* req;
+ NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
+ req->type = type;
+ req->verbs = &comm->verbs;
+ req->size = size;
+
+ struct ibv_send_wr wr;
+ memset(&wr, 0, sizeof(wr));
+ wr.wr_id = (uint64_t)req;
+
+ struct ibv_sge sge;
+ if (size == 0) {
+ wr.sg_list = NULL;
+ wr.num_sge = 0;
+ } else {
+ NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
+ sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+ wr.sg_list = &sge;
+ wr.num_sge = 1;
+ }
+ wr.opcode = IBV_WR_SEND;
+ wr.send_flags = IBV_SEND_SIGNALED;
+
+ // Wait for receiver to have posted the recv
+ volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS);
+ volatile uint32_t * readyPtr = &slot->ready;
+ while (*readyPtr == 0) sched_yield();
+#if USE_RDMA_WRITE
+ __sync_synchronize(); // order the readyPtr load against rkey load below
+ // Sanity checks to catch user collective call count/size mismatches
+ // plus any potential programming errors
+ if (size > slot->size || slot->size <= 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) {
+ WARN("NET/IB : collective mismatch error local size %d remote %d addr %lx rkey %x seq %x/%x",
+ size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead);
+ return ncclInternalError;
+ }
+ wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+ wr.wr.rdma.remote_addr = slot->addr;
+ wr.wr.rdma.rkey = slot->rkey;
+ wr.imm_data = size; // Send the message size via imm_data
+ __sync_synchronize();
+#endif
+ // We must clear slot->ready; also reset the other fields to aid
+ // debugging and sanity checks.
+ slot->ready = 0;
+ slot->addr = 0ULL;
+ slot->rkey = slot->size = slot->seq = 0;
+ comm->fifoHead++;
+
+ struct ibv_send_wr* bad_wr;
+ NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
+ *request = req;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size) {
+ struct ibv_send_wr wr;
+ memset(&wr, 0, sizeof(wr));
+ struct ncclIbRequest* req;
+ NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
+ req->verbs = &comm->verbs;
+ req->free = 1; // Not a user request; free it as soon as it completes.
+ wr.wr_id = (uint64_t)req;
+
+ struct ncclIbSendFifo* localElem = comm->remFifo.elems + (comm->remFifo.tail % MAX_REQUESTS);
+ localElem->addr = addr;
+ localElem->rkey = rkey;
+ localElem->ready = 1;
+ localElem->size = size; // Sanity/Debugging
+ localElem->seq = comm->remFifo.tail; // Sanity/Debugging
+ wr.wr.rdma.remote_addr = comm->remFifo.addr + (comm->remFifo.tail % MAX_REQUESTS) * sizeof(struct ncclIbSendFifo);
+ wr.wr.rdma.rkey = comm->remFifo.rkey;
+ comm->remFifo.sge.addr = (uint64_t)localElem;
+ wr.sg_list = &comm->remFifo.sge;
+ wr.num_sge = 1;
+ wr.opcode = IBV_WR_RDMA_WRITE;
+ wr.send_flags = IBV_SEND_SIGNALED | comm->remFifo.flags; // IBV_SEND_INLINE
+
+ struct ibv_send_wr* bad_wr;
+ NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
+ comm->remFifo.tail++;
+
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void** request) {
+ struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
+ NCCLCHECK(ncclRecvCheck(comm));
+
+ struct ncclIbRequest* req;
+ NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
+ req->type = type;
+ req->verbs = &comm->verbs;
+ req->size = size;
+
+ struct ibv_recv_wr wr;
+ memset(&wr, 0, sizeof(wr));
+ wr.wr_id = (uint64_t)req;
+
+ struct ibv_sge sge;
+ if (size == 0) {
+ wr.sg_list = NULL;
+ wr.num_sge = 0;
+ req->ibMr = NULL;
+ } else {
+ NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
+ sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+ wr.sg_list = &sge;
+ wr.num_sge = 1;
+ }
+
+ struct ibv_recv_wr* bad_wr;
+ NCCLCHECK(wrap_ibv_post_recv(comm->qp, &wr, &bad_wr));
+ *request = req;
+
+ // Post to FIFO to notify sender
+ NCCLCHECK(ncclIbPostFifo(comm, req->ibMr->mr->rkey, (uint64_t)data, size));
+ return ncclSuccess;
+}
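+
+// End-to-end receive path, roughly: ncclIbIrecv registers (or reuses) an MR for
+// the destination buffer, posts a receive WR, then ncclIbPostFifo RDMA-writes
+// {addr, rkey, size, seq, ready=1} into the matching slot of the sender's fifo[].
+// ncclIbIsend spins on that slot; with USE_RDMA_WRITE it issues an
+// RDMA_WRITE_WITH_IMM straight into the advertised address, carrying the byte
+// count in imm_data, otherwise a plain IBV_WR_SEND matches the posted receive.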
+
+ncclResult_t ncclIbFlush(void* recvComm, void* data, int size) {
+ struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
+ if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess;
+
+ struct ncclIbRequest* req;
+ NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
+ req->verbs = &comm->verbs;
+ NCCLCHECK(ncclIbGetMr(&comm->verbs, data, 1, &req->ibMr));
+
+ struct ibv_send_wr wr;
+ memset(&wr, 0, sizeof(wr));
+ wr.wr_id = (uint64_t)req;
+
+ wr.wr.rdma.remote_addr = (uint64_t)data;
+ wr.wr.rdma.rkey = req->ibMr->mr->rkey;
+ wr.sg_list = &comm->gpuFlush.sge;
+ wr.num_sge = 1;
+ wr.opcode = IBV_WR_RDMA_READ;
+ wr.send_flags = IBV_SEND_SIGNALED;
+
+ struct ibv_send_wr* bad_wr;
+ NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &wr, &bad_wr));
+
+ int done = 0;
+ while (done == 0) {
+ NCCLCHECK((ncclResult_t)ncclIbTest(req, &done, NULL));
+ }
+
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbTest(void* request, int* done, int* size) {
+ struct ncclIbRequest *r = (struct ncclIbRequest*)request;
+ *done = 0;
+
+ while (1) {
+ if (r->done == 1) {
+ *done = 1;
+ if (size) *size = r->size;
+ r->used = 0;
+ return ncclSuccess;
+ }
+
+ int wrDone = 0;
+ struct ibv_wc wc;
+ NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 1, &wc, &wrDone));
+ if (wrDone == 0) return ncclSuccess;
+
+ if (wc.status != IBV_WC_SUCCESS) {
+ WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc.status, wc.opcode, wc.byte_len, wc.vendor_err);
+ return ncclSystemError;
+ }
+
+ struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc.wr_id;
+ if (doneReq) {
+ if (wc.opcode == IBV_WC_RECV) {
+ doneReq->size = wc.byte_len;
+#if USE_RDMA_WRITE
+ } else if (wc.opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+ doneReq->size = wc.imm_data;
+#endif
+ }
+ if (doneReq->ibMr != NULL) {
+ doneReq->ibMr->refcnt--;
+ if (doneReq->ibMr->refcnt < 0) WARN("NET/IB : doneReq %p MR %p refcount now %d", doneReq, doneReq->ibMr, doneReq->ibMr->refcnt);
+ }
+ doneReq->done = 1;
+ if (doneReq->free == 1) {
+ // This is an internal (FIFO post) req. Free it immediately.
+ doneReq->used = 0;
+ }
+ }
+ }
+}
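+
+// ncclIbTest drains at most the completions currently in the CQ: it returns with
+// *done=0 as soon as ibv_poll_cq comes back empty, and only reports a request as
+// complete (freeing its slot) once a completion carrying its wr_id has been seen.
+// Internal FIFO-post requests (free==1) are recycled here without ever being
+// handed back to the caller.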
+
+ncclResult_t ncclIbCloseSend(void* sendComm) {
+ struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
+ if (comm) {
+ close(comm->fd);
+ if (comm->qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qp));
+ if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
+ for (int i=0; i<MAX_REQUESTS; i++) {
+ if (comm->verbs.mrPool[i].mr != NULL) {
+ if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : TX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt);
+ NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr));
+ }
+ }
+ NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
+ free(comm);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbCloseRecv(void* recvComm) {
+ struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
+ if (comm) {
+ close(comm->fd);
+ if (comm->qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qp));
+ if (comm->gpuFlush.enabled) {
+ if (comm->gpuFlush.qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->gpuFlush.qp));
+ if (comm->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->gpuFlush.hostMr));
+ }
+ if (comm->remFifo.mr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remFifo.mr));
+ for (int i=0; i<MAX_REQUESTS; i++) {
+ if (comm->verbs.mrPool[i].mr != NULL) {
+ if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : RX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt);
+ NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr));
+ }
+ }
+ NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
+ free(comm);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbCloseListen(void* listenComm) {
+ struct ncclIbListenComm* comm = (struct ncclIbListenComm*)listenComm;
+ if (comm) {
+ close(comm->fd);
+ free(comm);
+ }
+ return ncclSuccess;
+}
+
+ncclNet_t ncclNetIb = {
+ "IB",
+ ncclIbDevices,
+ ncclIbPtrSupport,
+ ncclIbListen,
+ ncclIbConnect,
+ ncclIbAccept,
+ ncclIbIsend,
+ ncclIbIrecv,
+ ncclIbFlush,
+ ncclIbTest,
+ ncclIbCloseSend,
+ ncclIbCloseRecv,
+ ncclIbCloseListen
+};
+
+NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
+
+bool ncclIbSupport() {
+ if (ncclParamIbDisable()) return 0;
+ initDevices();
+ return ncclNIbDevs > 0;
+}
diff --git a/src/transport/net_socket.cu b/src/transport/net_socket.cu
new file mode 100644
index 0000000..e7682dd
--- /dev/null
+++ b/src/transport/net_socket.cu
@@ -0,0 +1,239 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "core.h"
+#include "socket.h"
+#include "net.h"
+#include "topo.h"
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <poll.h>
+
+/* Init functions */
+
+ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
+ *supportedTypes = NCCL_PTR_HOST;
+ return ncclSuccess;
+}
+
+#define MAX_IFS 16
+static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
+static union socketAddress ncclNetIfAddrs[MAX_IFS];
+static int ncclNetIfs = -1;
+pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
+
+static void initDevices() {
+ if (ncclNetIfs == -1) {
+ pthread_mutex_lock(&ncclSocketLock);
+ if (ncclNetIfs == -1) {
+ ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
+ INFO(INIT|NET,"NET/Socket : %d interfaces found", ncclNetIfs);
+ if (ncclNetIfs <= 0) {
+ WARN("NET/Socket : no interface found");
+ }
+ }
+ pthread_mutex_unlock(&ncclSocketLock);
+ }
+}
+
+ncclResult_t ncclSocketDevices(int* ndev, int** scores) {
+ initDevices();
+ *ndev = ncclNetIfs;
+ int cudaDev;
+ cudaGetDevice(&cudaDev);
+ char* cudaPath;
+ ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath);
+ int* sc;
+ NCCLCHECK(ncclCalloc(&sc, ncclNetIfs));
+ char line[1024];
+ sprintf(line, "CUDA Dev %d, IP Interfaces : ", cudaDev);
+ for (int i=0; i<ncclNetIfs; i++) {
+ char* sockPath;
+ ncclResult_t err2 = getSockPath(ncclNetIfNames+i*MAX_IF_NAME_SIZE, &sockPath);
+ int distance = (err1 != ncclSuccess || err2 != ncclSuccess || sockPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(sockPath, cudaPath);
+ sprintf(line+strlen(line), "%s(%s) ", ncclNetIfNames+i*MAX_IF_NAME_SIZE, pathDists[distance]);
+ sc[i] = 1+PATH_SOC-distance;
+ if (err2 == ncclSuccess) free(sockPath);
+ }
+ INFO(INIT|NET,"%s", line);
+ if (err1 == ncclSuccess) free(cudaPath);
+ *scores = sc;
+ return ncclSuccess;
+}
+
+static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
+ if (ncclNetIfs == -1) initDevices();
+ if (dev >= ncclNetIfs) return ncclInternalError;
+ memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
+ return ncclSuccess;
+}
+
+/* Communication functions */
+
+struct ncclSocketHandle {
+ union socketAddress connectAddr;
+};
+
+struct ncclSocketRequest {
+ int used;
+ int size;
+};
+
+struct ncclSocketReqs {
+ struct ncclSocketRequest* requests;
+};
+
+struct ncclSocketComm {
+ int fd;
+ struct ncclSocketReqs reqs;
+};
+
+ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {
+ NCCLCHECK(ncclCalloc(comm, 1));
+ (*comm)->fd = -1;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str) {
+ struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+ NCCLCHECK(GetSocketAddrFromString(&(handle->connectAddr), str));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
+ struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+ static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
+ // if dev >= 0, listen based on dev
+ if (dev >= 0) {
+ NCCLCHECK(GetSocketAddr(dev, &(handle->connectAddr)));
+ } else if (dev == findSubnetIf) {
+ // handle stores a remote address
+ // need to find a local addr that is in the same network as the remote addr
+ union socketAddress localAddr;
+ char ifName[MAX_IF_NAME_SIZE];
+ if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+ WARN("No usable listening interface found");
+ return ncclSystemError;
+ }
+ // pass the local address back
+ memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr));
+ } // Otherwise, handle stores a local address
+ struct ncclSocketComm* comm;
+ NCCLCHECK(ncclSocketNewComm(&comm));
+ NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
+ *listenComm = comm;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
+ struct ncclSocketComm* comm;
+ NCCLCHECK(ncclSocketNewComm(&comm));
+ struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+ NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));
+ *sendComm = comm;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
+ struct ncclSocketComm* lComm = (struct ncclSocketComm*)listenComm;
+ struct ncclSocketComm* rComm;
+ NCCLCHECK(ncclSocketNewComm(&rComm));
+ struct sockaddr_in sockaddr;
+ socklen_t socklen = sizeof(struct sockaddr_in);
+ SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
+ *recvComm = rComm;
+ return ncclSuccess;
+}
+
+#define MAX_REQUESTS 128
+
+ncclResult_t ncclSocketGetRequest(struct ncclSocketReqs* reqs, struct ncclSocketRequest** req) {
+ if (reqs->requests == NULL) {
+ NCCLCHECK(ncclCalloc(&reqs->requests, MAX_REQUESTS));
+ }
+ for (int i=0; i<MAX_REQUESTS; i++) {
+ struct ncclSocketRequest* r = reqs->requests+i;
+ if (r->used == 0) {
+ r->used = 1;
+ r->size = -1;
+ *req = r;
+ return ncclSuccess;
+ }
+ }
+ WARN("Socket : unable to allocate requests");
+ return ncclInternalError;
+}
+
+ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int type, void** request) {
+ if (type != NCCL_PTR_HOST) return ncclInternalError;
+ struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
+ *request = NULL;
+ NCCLCHECK(socketSend(comm->fd, &size, sizeof(int)));
+ NCCLCHECK(socketSend(comm->fd, data, size));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, int type, void** request) {
+ if (type != NCCL_PTR_HOST) return ncclInternalError;
+ struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
+ int recvSize;
+ NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int)));
+ if (recvSize > size) {
+ WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
+ return ncclInternalError;
+ }
+ NCCLCHECK(socketReceive(comm->fd, data, std::min(recvSize, size)));
+ struct ncclSocketRequest* recvReq = NULL;
+ NCCLCHECK(ncclSocketGetRequest(&comm->reqs, &recvReq));
+ recvReq->size = recvSize;
+ *request = recvReq;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size) {
+ // We don't support CUDA pointers, so we don't need a flush operation
+ return ncclInternalError;
+}
+
+ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
+ *done = 1;
+ struct ncclSocketRequest *r = (struct ncclSocketRequest*)request;
+ if (r) {
+ if (size) *size = r->size;
+ r->used = 0;
+ }
+ return ncclSuccess;
+}
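+
+// The socket transport is effectively synchronous: ncclSocketIsend and
+// ncclSocketIrecv perform blocking socketSend/socketReceive calls inline, so by
+// the time a request object exists the data has already been transferred and
+// ncclSocketTest can unconditionally report completion.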
+
+ncclResult_t ncclSocketClose(void* opaqueComm) {
+ struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm;
+ if (comm) {
+ free(comm->reqs.requests);
+ close(comm->fd);
+ free(comm);
+ }
+ return ncclSuccess;
+}
+
+ncclNet_t ncclNetSocket = {
+ "Socket",
+ ncclSocketDevices,
+ ncclSocketPtrSupport,
+ ncclSocketListen,
+ ncclSocketConnect,
+ ncclSocketAccept,
+ ncclSocketIsend,
+ ncclSocketIrecv,
+ ncclSocketFlush,
+ ncclSocketTest,
+ ncclSocketClose,
+ ncclSocketClose,
+ ncclSocketClose
+};
diff --git a/src/transport/p2p.cu b/src/transport/p2p.cu
new file mode 100644
index 0000000..35aebb4
--- /dev/null
+++ b/src/transport/p2p.cu
@@ -0,0 +1,611 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "utils.h"
+#include "topo.h"
+#include "transport.h"
+#include "param.h"
+#include <unistd.h>
+#include <cuda_runtime.h>
+#include "nvmlwrap.h"
+#include <ctype.h>
+#include "nvlink.h"
+
+struct p2pInfo {
+ int rank;
+ int cudaDev;
+ uint64_t hostHash;
+ uint64_t pidHash;
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+};
+
+struct p2pConnectInfo {
+ int direct;
+ union {
+ void* directPtr;
+ cudaIpcMemHandle_t devIpc;
+ };
+};
+
+#include <sys/types.h>
+
+/* Fill information necessary to exchange between ranks to choose whether or not
+ * to use this transport */
+ncclResult_t p2pFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
+ struct p2pInfo* info = (struct p2pInfo*)opaqueInfo;
+ static_assert(sizeof(struct p2pInfo) <= sizeof(ncclTinfo_t), "p2p Info too large");
+ info->rank = rank;
+ CUDACHECK(cudaGetDevice(&info->cudaDev));
+ info->hostHash=getHostHash();
+ info->pidHash=getPidHash();
+
+ // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
+ // cudaDev is a CUDA runtime dev number which could be different from the
+ // NVML device number. Then we get the busID from NVML to be sure it is
+ // consistent with NVML remote PCI bus Ids.
+ CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
+ nvmlDevice_t nvmlDevice;
+ NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
+ nvmlPciInfo_t pciInfo;
+ NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
+ strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
+ return ncclSuccess;
+}
+
+NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
+NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
+
+/* Determine if we can communicate with the peer through p2p */
+ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
+ // Do not use P2P across root complexes by default (provided CUDA permits it)
+ int p2pLevel = PATH_SOC;
+ if (ncclParamP2pDisable() == 1) p2pLevel = 0;
+ if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
+
+ *ret = 0;
+
+ if (p2pLevel == 0) return ncclSuccess;
+
+ struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
+ struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+
+ // Rule out different nodes
+ if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess;
+
+ // Do not detect topology if we're on the same GPU. Note this is not really supported.
+ if (myInfo->cudaDev == peerInfo->cudaDev) {
+ *ret = 1 + PATH_SOC;
+ return ncclSuccess;
+ }
+
+ // See if CUDA can do P2P
+ int p2p;
+ if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) {
+ INFO(INIT|P2P,"peer query failed between dev %d and dev %d",
+ myInfo->cudaDev, peerInfo->cudaDev);
+ return ncclSuccess;
+ }
+ if (p2p == 0) return ncclSuccess;
+
+ // Check for NVLink/NVswitch
+ int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
+ if (nvlinkp2p > 0) {
+ *ret = nvlinkp2p;
+ return ncclSuccess;
+ }
+
+ // Finally compute the PCI distance and compare with the p2pLevel.
+ char* myPath;
+ char* peerPath;
+ ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath);
+ ncclResult_t err2 = getCudaPath(peerInfo->cudaDev, &peerPath);
+ if (err1 == ncclSuccess && err2 == ncclSuccess) {
+ int distance = pciDistance(myPath, peerPath);
+ if (distance < p2pLevel) {
+ *ret = 1 + PATH_SOC - distance;
+ }
+ }
+ if (err1 == ncclSuccess) free(myPath);
+ if (err2 == ncclSuccess) free(peerPath);
+ return ncclSuccess;
+}
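+
+// The ncclTvalue_t returned here doubles as a connection score: 0 means "do not
+// use P2P", values coming from getNvlinkGpu() appear to encode the NVLink/NVswitch
+// link count, and PCI paths map to 1 + PATH_SOC - distance so that closer GPU
+// pairs rank higher when rings are built in p2pGetRings below.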
+
+static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
+ int nrings = 0;
+ ncclTvalue_t* line = matrix+current*n;
+ inTheRing[current] = 1;
+ int currentStep = (currentRing+1)*n-remaining;
+ rings[currentStep-1] = current;
+ if (remaining == 0) {
+ int looprank = rings[currentRing*n];
+ if (line[looprank] > 0) {
+ if (currentRing+1 == nRingsMax) {
+ nrings = 1;
+ } else {
+ line[looprank]--;
+ for (int i=0; i<n; i++) inTheRing[i] = 0;
+ if (connect) {
+ // First two slots are already set and we need to respect those constraints
+ inTheRing[rings[currentStep]] = 1;
+ nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, rings[currentStep+1], n-2, connect);
+ } else {
+ rings[(currentRing+1)*n] = 0;
+ nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, 0, n-1, connect);
+ }
+ line[looprank]++;
+ for (int i=0; i<n; i++) inTheRing[i] = 1;
+ }
+ }
+ } else {
+ int ringsSave[nRingsMax*n];
+ int maxStep = 0;
+ for (int i=0; i<n; i++) {
+ if (inTheRing[i] == 0 && line[i] > 0) {
+ line[i]--;
+ int nr = computeRingsRec(matrix, n, rings, currentRing, nRingsMax, inTheRing, i, remaining-1, connect);
+ if (nr > nrings) {
+ nrings = nr;
+ maxStep = (nr+currentRing)*n;
+ ringsSave[currentStep] = i;
+ // Save the rest of the rings
+ for (int r=currentStep+1; r<maxStep; r++) {
+ ringsSave[r] = rings[r];
+ }
+ if (nrings + currentRing == nRingsMax) {
+ // We found an optimal solution. Let's stop there.
+ break;
+ }
+ }
+ line[i]++;
+ }
+ }
+ for (int r=currentStep; r<maxStep; r++) {
+ rings[r] = ringsSave[r];
+ }
+ }
+ inTheRing[current] = 0;
+ return nrings;
+}
+
+static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) {
+ if (nrings == 0) return 0;
+ // Duplicate existing rings until we reach newNrings
+ if (newNrings > MAXRINGS) {
+ newNrings = MAXRINGS;
+ }
+ for (int r=nrings; r<newNrings; r++) {
+ for (int i=0; i<nranks; i++) rings[r*nranks+i] = rings[(r%nrings)*nranks+i];
+ }
+ return newNrings;
+}
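+
+// copyRings duplicates existing rings round-robin. A small hypothetical case:
+// nranks=4, one ring {0,1,2,3}, newNrings=2 yields a second identical ring
+// {0,1,2,3}, so twice as many rings run over the same physical path.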
+
+int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nringsMax, int connect) {
+ int* inTheRing = (int*)malloc(sizeof(int)*nranks);
+ if (inTheRing == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*nranks); return 0; }
+ for (int i=0; i<nranks; i++) inTheRing[i] = 0;
+ int nrings;
+ if (connect) {
+ inTheRing[rings[0]] = 1;
+ nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, rings[1], nranks-2, connect);
+ nrings = copyRings(nranks, rings, nrings, nringsMax);
+ } else {
+ rings[0] = 0;
+ nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, 0, nranks-1, connect);
+ }
+ free(inTheRing);
+ return nrings;
+}
+
+static inline int findConnect(int nranks, int* ranks) {
+ for (int i = 0; i<nranks; i++) {
+ if (ranks[i] != -1) return i;
+ }
+ return -1;
+}
+
+int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) {
+ if (nrings == 0) return 0;
+ if (nrings > MAXRINGS) {
+ WARN("Max rings reached, limiting to %d", MAXRINGS);
+ nrings = MAXRINGS;
+ }
+ // Find existing constraints / connections
+ int connect = 0;
+ for (int r=0; r<nrings; r++) {
+ int start = findConnect(nranks, prev+r*nranks);
+ int end = findConnect(nranks, next+r*nranks);
+ if (start != -1 && end != -1) {
+ rings[r*nranks] = end;
+ rings[r*nranks+1] = start;
+ connect = 1;
+ }
+ }
+
+ // Compute rings
+ ncclTvalue_t* matrix = (ncclTvalue_t*)malloc(sizeof(ncclTvalue_t)*nranks*nranks);
+ if (matrix == NULL) { WARN("malloc of %ld bytes failed", sizeof(ncclTvalue_t)*nranks*nranks); return 0; }
+ for (int i=0; i<nranks; i++) for (int j=0; j<nranks; j++)
+ matrix[i*nranks+j] = oversubscribe ? values[i*nranks+j]/CONNECT_NVLINK*2 : values[i*nranks+j]/CONNECT_NVLINK ;
+
+ int compNrings = p2pComputeRingsNvLink(matrix, nranks, rings, nrings, connect);
+
+ free(matrix);
+
+ if (oversubscribe || connect) return compNrings;
+
+ if (compNrings && compNrings < nrings && nranks <= 4) {
+ // Try to oversubscribe to get a better result
+ int *rings2 = (int *)malloc(sizeof(int)*MAXRINGS*nranks);
+ if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXRINGS*nranks); return 0; }
+ for (int i=0; i<MAXRINGS*nranks; i++) rings2[i] = -1;
+ int nThreads = *nthreads;
+ int compNrings2 = p2pComputeRingsNvLink(values, nranks, rings2, nrings, prev, next, 1, &nThreads);
+ if (compNrings2 > compNrings*2) {
+ // Oversubscription worked.
+ for (int i=0; i<compNrings2*nranks; i++) rings[i] = rings2[i];
+ compNrings = compNrings2;
+ }
+ free(rings2);
+ }
+
+ // Duplicate the rings for direct NVLink
+ compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
+
+ if (ncclCudaCompCap() == 6) *nthreads /= 2;
+ return compNrings;
+}
+
+int p2pComputeRingsSeqConnect(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
+ int nrings = nringsStart;
+ int connect = 0;
+ for (int r=0; r<nrings; r++) {
+ int start = findConnect(nranks, prev+r*nranks);
+ int end = findConnect(nranks, next+r*nranks);
+ if (start != -1 && end != -1) {
+ rings[r*nranks] = end;
+ rings[r*nranks+1] = start;
+ int cur = start;
+ for (int i=2; i<nranks; i++) {
+ int next = (cur+1) % nranks;
+ while (next == end || next == start) next = (next+1) % nranks;
+ if (values[cur*nranks+next] < minScore) {
+ return 0;
+ }
+ rings[r*nranks+i] = next;
+ cur = next;
+ }
+ connect = 1;
+ } else {
+ if (connect == 1 && r > 0) {
+ WARN("Connecting rings but did not find start/end for ring %d. Disabling other rings.", r);
+ return r;
+ } else {
+ return 0;
+ }
+ }
+ }
+ return nrings;
+}
+
+int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
+ for (int r=0; r<nringsStart; r++) {
+ for (int i=0; i<nranks; i++) {
+ rings[r*nranks+i] = i;
+ }
+ }
+ return nringsStart;
+}
+
+static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
+ for (int score = PATH_SOC+1; score >= minScore; score--) {
+ int best = -1;
+ int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end
+ for (int n = 0; n < nranks; n++) {
+ if (inRing[n]) continue;
+ if (values[rank*nranks+n] == score) {
+ if (end == -1) return n;
+ if (values[end*nranks+n] < worst_end_score) {
+ best = n;
+ worst_end_score = values[end*nranks+n];
+ }
+ }
+ }
+ if (best != -1) return best;
+ }
+ return -1;
+}
+
+int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int minScore) {
+ int connect = 0;
+ for (int r=0; r<nrings; r++) {
+ int start = findConnect(nranks, prev+r*nranks);
+ int end = findConnect(nranks, next+r*nranks);
+
+ int inRing[nranks];
+ for (int i=0; i<nranks; i++) inRing[i] = 0;
+
+ if (start == -1 && end == -1) {
+ if (connect == 1 && r > 0) {
+ WARN("Connecting ring %d : did not find start/end. Disabling other rings.", r);
+ return r;
+ }
+ end = 0;
+ inRing[end] = 1;
+ start = findClosestPci(values, inRing, end, -1, nranks, minScore);
+ if (start == -1) return r;
+ } else if (start == -1 || end == -1) {
+ WARN("Connecting ring %d : inconsistent start/end. Disabling other rings.", r);
+ return r;
+ } else {
+ connect = 1;
+ }
+ rings[r*nranks] = end;
+ rings[r*nranks+1] = start;
+ inRing[start] = inRing[end] = 1;
+ int cur = start;
+ for (int i=2; i<nranks; i++) {
+ int next = findClosestPci(values, inRing, cur, end, nranks, minScore);
+ if (next == -1) return r;
+
+ inRing[next] = 1;
+ rings[r*nranks+i] = next;
+ cur = next;
+ }
+ // Check the loop is closing
+ inRing[end] = 0;
+ if (findClosestPci(values, inRing, cur, end, nranks, minScore) != end) return r;
+
+ if (connect == 0) return 1;
+ }
+ return nrings;
+}
+
+ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
+ if (*nringsRet == 0) return ncclSuccess;
+ int *rings;
+ NCCLCHECK(ncclCalloc(&rings, MAXRINGS*nranks));
+ for (int i=0; i<MAXRINGS*nranks; i++) rings[i] = -1;
+ int nrings = *nringsRet;
+
+ // NVswitch
+ int nvswitchLinks = 0;
+ int directLinks = 0;
+ for (int rank=0; rank<nranks; rank++) {
+ for (int j=1; j<nranks; j++) {
+ int i = (rank + j) % nranks;
+ ncclTvalue_t links = values[rank*nranks+i]/CONNECT_NVSWITCH;
+ if (j>1 && links != nvswitchLinks) {
+ WARN("Internal error : NVswitch links mismatch");
+ return ncclInternalError;
+ }
+ nvswitchLinks = links;
+ }
+ }
+ if (nvswitchLinks) {
+ // NVSwitch : Connect existing rings
+ int nringsConnected = p2pComputeRingsSeqConnect(values, nranks, rings, nrings, prev, next, minScore, nthreads);
+ if (nringsConnected > 0) {
+ nrings = nringsConnected;
+ } else {
+ nrings = std::min(nrings, nvswitchLinks); // NVSwitch: Limit rings to number of NVLinks
+ // Or create new ones
+ nrings = p2pComputeRingsSeqNew(values, nranks, rings, nrings, prev, next, minScore, nthreads);
+ // And duplicate them
+ nrings = copyRings(nranks, rings, nrings, nrings*2);
+ }
+ goto end;
+ }
+
+ // point-to-point NVLink
+ for (int rank=0; rank<nranks; rank++) {
+ int links = 0;
+ for (int i=0; i<nranks; i++) {
+ ncclTvalue_t val = values[rank*nranks+i];
+ if (val >= CONNECT_NVSWITCH) continue;
+ links += val/CONNECT_NVLINK;
+ }
+ if (rank == 0) directLinks = links;
+ else directLinks = std::min(directLinks, links);
+ }
+ if (directLinks > 0) {
+ // NVLink : Connect rings or create new ones
+ nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
+ goto end;
+ }
+
+ // PCIe or QPI : Connect rings or create new ones
+ nrings = p2pComputeRingsPci(values, nranks, rings, *nringsRet, prev, next, minScore);
+
+end:
+ *nringsRet = nrings;
+ for (int ring = 0; ring<nrings; ring++) {
+ for (int index=0; index<nranks; index++) {
+ int prevIndex = (index - 1 + nranks) % nranks;
+ int nextIndex = (index + 1) % nranks;
+ int curRank = rings[ring*nranks+index];
+ int prevRank = rings[ring*nranks+prevIndex];
+ int nextRank = rings[ring*nranks+nextIndex];
+ if (prev[ring*nranks+curRank] == -1) prev[ring*nranks+curRank] = prevRank;
+ if (next[ring*nranks+curRank] == -1) next[ring*nranks+curRank] = nextRank;
+ }
+ }
+
+ free(rings);
+ return ncclSuccess;
+}
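+
+// The out-parameters follow NCCL's ring encoding: rings[] holds rank orderings
+// and is folded back into prev[]/next[] per rank. Hypothetical example with
+// nranks=4 and a single ring 0->1->2->3: next[0..3] = {1,2,3,0} and
+// prev[0..3] = {3,0,1,2}; entries that were already set on input (!= -1) are
+// left untouched by the loop above.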
+
+#define TRACE_DUMP_IPC(DEVIPC) \
+ do { \
+ unsigned long *devIpc = (unsigned long *) (DEVIPC); \
+ TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[0], devIpc[1], devIpc[2], devIpc[3]); \
+ TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
+ } while (0)
+
+/* Send: Create and return connect structures for this peer to connect to me */
+ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
+ struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+ struct p2pConnectInfo info;
+ if (myInfo->pidHash == peerInfo->pidHash) {
+ info.direct = 1;
+ info.directPtr = ring->devMemSend;
+ if (myInfo->cudaDev == peerInfo->cudaDev) {
+ INFO(INIT|P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
+ } else {
+ // Enable P2P access
+ cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
+ if (err == cudaErrorPeerAccessAlreadyEnabled) {
+ cudaGetLastError();
+ } else if (err != cudaSuccess) {
+ WARN("failed to peer with device %d: %d %s",
+ peerInfo->cudaDev, err, cudaGetErrorString(err));
+ return ncclInternalError;
+ }
+ INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
+ ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ }
+ } else {
+ info.direct = 0;
+ // Map IPC and enable P2P access
+ cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemSend);
+ if (err != cudaSuccess) {
+ WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
+ myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+ return ncclInternalError;
+ }
+ INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
+ ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ //TRACE_DUMP_IPC(&info.devIpc);
+ }
+ static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
+ memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
+ return ncclSuccess;
+}
+
+/* Create and return connect structures for this peer to connect to me */
+ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
+ struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+ struct p2pConnectInfo info;
+ if (myInfo->pidHash == peerInfo->pidHash) {
+ info.direct = 1;
+ info.directPtr = ring->devMemRecv;
+ if (myInfo->cudaDev == peerInfo->cudaDev) {
+ TRACE(INIT|P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
+ } else {
+ // Enable P2P access
+ cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
+ if (err == cudaErrorPeerAccessAlreadyEnabled) {
+ cudaGetLastError();
+ } else if (err != cudaSuccess) {
+ WARN("failed to peer with device %d: %d %s",
+ peerInfo->cudaDev, err, cudaGetErrorString(err));
+ return ncclInternalError;
+ }
+ TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ }
+ } else {
+ info.direct = 0;
+ // Map IPC and enable P2P access
+ cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemRecv);
+ if (err != cudaSuccess) {
+ WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
+ myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+ return ncclInternalError;
+ }
+ TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ //TRACE_DUMP_IPC(&info.devIpc);
+ }
+ static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
+ memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
+ return ncclSuccess;
+}
+
+/* Connect/Send to this peer */
+static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+ void** resources = &send->transportResources;
+ struct ncclRecvMem* remDevMem;
+ struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+ if (info->direct) {
+ remDevMem = (struct ncclRecvMem*)(info->directPtr);
+ send->conn.direct = 1;
+ *resources = NULL;
+ } else {
+ void* remPtr = NULL;
+ //TRACE_DUMP_IPC(&info->devIpc);
+ cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+ void** ipcPtrSave;
+ NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
+ *resources = ipcPtrSave;
+ *ipcPtrSave = remPtr;
+ remDevMem = (struct ncclRecvMem*)remPtr;
+ if (err != cudaSuccess) {
+ WARN("failed to open CUDA IPC handle : %d %s",
+ err, cudaGetErrorString(err));
+ return ncclUnhandledCudaError;
+ }
+ }
+
+ send->conn.buff = remDevMem->buff;
+ send->conn.llBuff = remDevMem->llBuff;
+ send->conn.tail = &remDevMem->tail;
+ send->conn.opCount = &remDevMem->opCount;
+ // send->conn->head should have been set to devMemSend already
+ return ncclSuccess;
+}
+
+/* Connect/Recv from this peer */
+ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ void** resources = &recv->transportResources;
+ struct ncclSendMem* remDevMem;
+ struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+ if (info->direct) {
+ remDevMem = (struct ncclSendMem*)(info->directPtr);
+ recv->conn.direct = 1;
+ recv->conn.ptrExchange = &remDevMem->ptrExchange;
+ *resources = NULL;
+ } else {
+ void* remPtr = NULL;
+ //TRACE_DUMP_IPC(&info->devIpc);
+ cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+ void** ipcPtrSave;
+ NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
+ *resources = ipcPtrSave;
+ *ipcPtrSave = remPtr;
+ remDevMem = (struct ncclSendMem*)remPtr;
+ if (err != cudaSuccess) {
+ WARN("failed to open CUDA IPC handle : %d %s",
+ err, cudaGetErrorString(err));
+ return ncclUnhandledCudaError;
+ }
+ }
+
+ // recv->conn->buff should have been set to devMemRecv already
+ // recv->conn->tail should have been set to devMemRecv already
+ // recv->conn->opCount should have been set to devMemRecv already
+ recv->conn.head = &remDevMem->head;
+ recv->conn.llHead = &remDevMem->llHead;
+ return ncclSuccess;
+}
+
+ncclResult_t p2pFree(void* resources) {
+ if (resources != NULL) {
+ void** ipcPtrSave = (void**) resources;
+ CUDACHECK(cudaIpcCloseMemHandle(*ipcPtrSave));
+ free(resources);
+ }
+ return ncclSuccess;
+}
+
+struct ncclTransport p2pTransport = {
+ "P2P",
+ p2pFillInfo,
+ p2pCanConnect,
+ p2pGetRings,
+ { p2pSendSetup, p2pSendConnect, p2pFree, NULL },
+ { p2pRecvSetup, p2pRecvConnect, p2pFree, NULL }
+};
diff --git a/src/transport/shm.cu b/src/transport/shm.cu
new file mode 100644
index 0000000..38b6c6a
--- /dev/null
+++ b/src/transport/shm.cu
@@ -0,0 +1,270 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "utils.h"
+#include "transport.h"
+#include "param.h"
+#include "shm.h"
+#include <unistd.h>
+#include <cuda_runtime.h>
+
+struct shmInfo {
+ int rank;
+ int cudaDev;
+ uint64_t hostHash;
+ uint64_t pidHash;
+};
+
+struct shmSendConnectInfo {
+ uint64_t pidHash;
+ int id;
+ int rank;
+ int shmSize;
+};
+
+struct shmRecvConnectInfo {
+ uint64_t pidHash;
+ int id;
+ int rank;
+ int shmSize;
+};
+
+struct shmSendResources {
+ int remShmSize;
+ struct ncclRecvMem* remHostMem;
+ struct ncclRecvMem* devRemHostMem;
+ int shmSize;
+ struct ncclSendMem* hostMem;
+ struct ncclSendMem* devHostMem;
+};
+
+struct shmRecvResources {
+ int remShmSize;
+ struct ncclSendMem* remHostMem;
+ struct ncclSendMem* devRemHostMem;
+ int shmSize;
+ struct ncclRecvMem* hostMem;
+ struct ncclRecvMem* devHostMem;
+};
+
+/* Fill information necessary to exchange between ranks to choose whether or not
+ * to use this transport */
+ncclResult_t shmFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
+ struct shmInfo* info = (struct shmInfo*)opaqueInfo;
+ static_assert(sizeof(struct shmInfo) <= sizeof(ncclTinfo_t), "shm Info too large");
+ info->rank = rank;
+ CUDACHECK(cudaGetDevice(&info->cudaDev));
+ info->hostHash=getHostHash();
+ info->pidHash=getPidHash();
+ return ncclSuccess;
+}
+
+NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
+
+/* Determine if we can communicate with the peer */
+ncclResult_t shmCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
+ struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
+ struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo;
+ *ret = ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) ? 0 : 1;
+ return ncclSuccess;
+}
+
+static inline int groupFirst(int nranks, int* groups, int group, int rankToAvoid) {
+ for (int rank = 0; rank<nranks; rank++) {
+ if ((groups[rank] == group) && (rank != rankToAvoid)) return rank;
+ }
+ return -1;
+}
+
+static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid) {
+ for (int rank = nranks-1; rank>=0; rank--) {
+ if ((groups[rank] == group) && (rank != rankToAvoid)) return rank;
+ }
+ return -1;
+}
+
+ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
+ if (*nringsRet == MAXRINGS) *nringsRet = 1;
+ int nGroups = groups[nranks-1] + 1;
+ int starts[nGroups];
+ int ends[nGroups];
+ for (int ring = 0; ring<*nringsRet; ring++) {
+ int startGroup = -1, endGroup = -1;
+ for (int group = 0; group<nGroups; group++) {
+ int start = -1;
+ int end = -1;
+ int nranksInGroup = 0;
+ for (int rank=0; rank<nranks; rank++) {
+ if (groups[rank] != group) continue;
+ nranksInGroup++;
+ if (prev[ring*nranks+rank] != -1) {
+ if (start != -1) {
+ WARN("Multiple starts found in group");
+ }
+ start = rank;
+ startGroup = group;
+ }
+ if (next[ring*nranks+rank] != -1) {
+ if (end != -1) {
+ WARN("Multiple ends found in group");
+ }
+ end = rank;
+ endGroup = group;
+ }
+ }
+ if (nranksInGroup == 1) {
+ start = end = groupFirst(nranks, groups, group, -1);
+ } else {
+ if (start == -1)
+ start = groupFirst(nranks, groups, group, end);
+ if (end == -1)
+ end = groupLast(nranks, groups, group, start);
+ }
+ if (start == -1 || end == -1) {
+ *nringsRet = ring;
+ return ncclSuccess;
+ }
+ starts[group] = start;
+ ends[group] = end;
+ }
+ if (endGroup == -1 || startGroup == -1) {
+ startGroup = 0;
+ endGroup = nGroups-1;
+ // Close the loop
+ next[ring*nranks+ends[endGroup]] = starts[startGroup];
+ prev[ring*nranks+starts[startGroup]] = ends[endGroup];
+ }
+ int group = startGroup;
+ for (int i=0; i<nGroups-2; i++) {
+ int nextGroup = (group+1)%nGroups;
+ if (nextGroup == endGroup) nextGroup = (nextGroup+1)%nGroups;
+ next[ring*nranks+ends[group]] = starts[nextGroup];
+ prev[ring*nranks+starts[nextGroup]] = ends[group];
+ group = nextGroup;
+ }
+ // Connect with the last
+ next[ring*nranks+ends[group]] = starts[endGroup];
+ prev[ring*nranks+starts[endGroup]] = ends[group];
+ }
+ return ncclSuccess;
+}
+
+#define MAX_SHM_NAME_LEN 1024
+
+/* Create and return connect structures for this peer to connect to me */
+ncclResult_t shmSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
+ struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo;
+
+ struct shmSendResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ ring->send.transportResources = resources;
+
+ struct shmRecvConnectInfo info;
+ char shmName[MAX_SHM_NAME_LEN];
+ sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
+ info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
+ TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
+ NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+
+ INFO(INIT|SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
+ static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
+ memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
+ return ncclSuccess;
+}
+
+ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
+ struct shmRecvResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ ring->recv.transportResources = resources;
+
+ struct shmSendConnectInfo info;
+
+ char shmName[MAX_SHM_NAME_LEN];
+ sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
+ info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+ TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
+ NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+
+ info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
+ static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
+ memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo));
+ return ncclSuccess;
+}
+
+/* Connect to this peer */
+ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+ // Setup device pointers
+ struct shmSendConnectInfo* info = (struct shmSendConnectInfo*)connectInfo;
+ struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
+
+ char shmName[MAX_SHM_NAME_LEN];
+ sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank);
+ resources->remShmSize = info->shmSize;
+ TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
+ NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+ // Remove the file to ensure proper clean-up
+ NCCLCHECK(shmUnlink(shmName));
+
+ send->transportResources = resources;
+ send->conn.buff = resources->devRemHostMem->buff;
+ send->conn.llBuff = resources->devRemHostMem->llBuff;
+ send->conn.tail = &resources->devRemHostMem->tail;
+ send->conn.opCount = &resources->devRemHostMem->opCount;
+
+ send->conn.head = &resources->devHostMem->head;
+ send->conn.llHead = &resources->devHostMem->llHead;
+ return ncclSuccess;
+}
+
+ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ // Setup device pointers
+ struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
+ struct shmRecvConnectInfo* info = (struct shmRecvConnectInfo*)connectInfo;
+
+ char shmName[MAX_SHM_NAME_LEN];
+ sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank);
+ resources->remShmSize = info->shmSize;
+ TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
+ NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+ NCCLCHECK(shmUnlink(shmName));
+ recv->conn.head = &resources->devRemHostMem->head;
+ recv->conn.llHead = &resources->devRemHostMem->llHead;
+
+ recv->conn.buff = resources->devHostMem->buff;
+ recv->conn.llBuff = resources->devHostMem->llBuff;
+ recv->conn.tail = &resources->devHostMem->tail;
+ recv->conn.opCount = &resources->devHostMem->opCount;
+ return ncclSuccess;
+}
+
+ncclResult_t shmSendFree(void* transportResources) {
+ struct shmSendResources* resources = (struct shmSendResources*)transportResources;
+ NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
+ NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
+ free(resources);
+ return ncclSuccess;
+}
+
+ncclResult_t shmRecvFree(void* transportResources) {
+ struct shmRecvResources* resources = (struct shmRecvResources*)transportResources;
+ NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
+ NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
+ free(resources);
+ return ncclSuccess;
+}
+
+struct ncclTransport shmTransport = {
+ "SHM",
+ shmFillInfo,
+ shmCanConnect,
+ shmGetRings,
+ { shmSendSetup, shmSendConnect, shmSendFree, NULL },
+ { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL }
+};
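
The table above is the full contract of the SHM transport: setup creates and publishes a local shared-memory segment, connect maps the segment published by the peer, and free unmaps both sides. A minimal driver sketch for the send side of one ring, assuming the per-ring connector lives in ring->send and that the connect info is exchanged with the peer out of band (the helper name is hypothetical, not part of this change):

    // Sketch only: the expected call order around shmSendSetup/shmSendConnect.
    static ncclResult_t shmWireUpSend(ncclTinfo_t* myInfo, ncclTinfo_t* peerInfo,
                                      struct ncclRing* ring) {
      struct ncclConnect local, remote;
      // Create our segment and describe it in 'local' for the peer's recv side.
      NCCLCHECK(shmSendSetup(myInfo, peerInfo, &local, ring));
      // ... exchange 'local' for the peer's 'remote' over the bootstrap network ...
      // Map the peer's segment and publish the device-visible pointers.
      NCCLCHECK(shmSendConnect(&remote, &ring->send));
      return ncclSuccess;
    }
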
diff --git a/test/include/test_utilities.h b/test/include/test_utilities.h
deleted file mode 100644
index 80aeaee..0000000
--- a/test/include/test_utilities.h
+++ /dev/null
@@ -1,438 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-
-#ifndef SRC_TEST_UTILITIES_H_
-#define SRC_TEST_UTILITIES_H_
-
-#include <curand.h>
-#include <cerrno>
-#include <string>
-
-#define CUDACHECK(cmd) do { \
- cudaError_t e = cmd; \
- if( e != cudaSuccess ) { \
- printf("Cuda failure %s:%d '%s'\n", \
- __FILE__,__LINE__,cudaGetErrorString(e)); \
- exit(EXIT_FAILURE); \
- } \
-} while(0)
-
-#define NCCLCHECK(cmd) do { \
- ncclResult_t r = cmd; \
- if (r!= ncclSuccess) { \
- printf("NCCL failure %s:%d '%s'\n", \
- __FILE__,__LINE__,ncclGetErrorString(r)); \
- exit(EXIT_FAILURE); \
- } \
-} while(0)
-
-template<typename T>
-void Randomize(T* const dest, const int N, const int randomSeed);
-
-template<typename T>
-void Accumulate(T* dest, const T* contrib, int N, ncclRedOp_t op);
-
-template<typename T>
-double CheckDelta(const T* results, const T* expected, int N);
-
-#define CURAND_CHK(cmd) \
- do { \
- curandStatus_t error = (cmd); \
- if (error != CURAND_STATUS_SUCCESS) { \
- printf("CuRAND error %i at %s:%i\n", error, __FILE__ , __LINE__); \
- exit(EXIT_FAILURE); \
- } \
- } while (false)
-
-
-template<typename T>
-void GenerateRandom(curandGenerator_t generator, T * const dest,
- const int N);
-
-template<>
-void GenerateRandom<char>(curandGenerator_t generator, char * const dest,
- const int N) {
- CURAND_CHK(curandGenerate(generator, (unsigned int*)dest,
- N * sizeof(char) / sizeof(int)));
-}
-
-template<>
-void GenerateRandom<int>(curandGenerator_t generator, int * const dest,
- const int N) {
- CURAND_CHK(curandGenerate(generator, (unsigned int*)dest, N));
-}
-
-template<>
-void GenerateRandom<float>(curandGenerator_t generator, float * const dest,
- const int N) {
- CURAND_CHK(curandGenerateUniform(generator, dest, N));
-}
-
-template<>
-void GenerateRandom<double>(curandGenerator_t generator, double * const dest,
- const int N) {
- CURAND_CHK(curandGenerateUniformDouble(generator, dest, N));
-}
-
-template<>
-void GenerateRandom<unsigned long long>(curandGenerator_t generator, unsigned long long * const dest,
- const int N) {
- CURAND_CHK(curandGenerateLongLong(generator, dest, N));
-}
-
-
-template<typename T>
-void Randomize(T* const dest, const int N, const int randomSeed) {
- curandGenerator_t gen;
- CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32));
- CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed));
- GenerateRandom<T>(gen, dest, N);
- CURAND_CHK(curandDestroyGenerator(gen));
- CUDACHECK(cudaDeviceSynchronize());
-}
-
-template<>
-void Randomize(unsigned long long* const dest, const int N, const int randomSeed) {
- curandGenerator_t gen;
- CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_QUASI_SOBOL64));
- GenerateRandom<unsigned long long>(gen, dest, N);
- CURAND_CHK(curandDestroyGenerator(gen));
- CUDACHECK(cudaDeviceSynchronize());
-}
-
-template<>
-void Randomize(long long* const dest, const int N, const int randomSeed) {
- curandGenerator_t gen;
- CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_QUASI_SOBOL64));
- GenerateRandom<unsigned long long>(gen, (unsigned long long *)dest, N);
- CURAND_CHK(curandDestroyGenerator(gen));
- CUDACHECK(cudaDeviceSynchronize());
-}
-
-#ifdef CUDA_HAS_HALF
-__global__ void halve(const float * src, half* dest, int N) {
- for(int tid = threadIdx.x + blockIdx.x*blockDim.x;
- tid < N; tid += blockDim.x * gridDim.x)
- dest[tid] = __float2half(src[tid]);
-}
-
-template<>
-void Randomize<half>(half* const dest, const int N, const int randomSeed) {
- curandGenerator_t gen;
- CURAND_CHK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32));
- CURAND_CHK(curandSetPseudoRandomGeneratorSeed(gen, randomSeed));
-
- float* temp;
- CUDACHECK(cudaMalloc(&temp, N*sizeof(float)));
- GenerateRandom<float>(gen, temp, N);
- halve<<<128, 512>>>(temp, dest, N);
- CURAND_CHK(curandDestroyGenerator(gen));
- CUDACHECK(cudaFree(temp));
- CUDACHECK(cudaDeviceSynchronize());
-}
-#endif
-
-void makeRandom(void* ptr, int n, ncclDataType_t type, int seed) {
- if (type == ncclChar)
- Randomize<char>((char*)ptr, n, seed);
- else if (type == ncclInt)
- Randomize<int>((int*)ptr, n, seed);
-#ifdef CUDA_HAS_HALF
- else if (type == ncclHalf)
- Randomize<half>((half*)ptr, n, seed);
-#endif
- else if (type == ncclFloat)
- Randomize<float>((float*)ptr, n, seed);
- else if (type == ncclDouble)
- Randomize<double>((double*)ptr, n, seed);
- else if (type == ncclInt64)
- Randomize<long long>((long long*)ptr, n, seed);
- else if (type == ncclUint64)
- Randomize<unsigned long long>((unsigned long long*)ptr, n, seed);
-
- return;
-}
-
-template<typename T, int OP> __global__ static
-void accumKern(T* acum, const T* contrib, int N) {
- int tid = threadIdx.x + blockIdx.x*blockDim.x;
- int offset = blockDim.x*gridDim.x;
- for(int i=tid; i<N; i+=offset) {
- T c = contrib[i];
- T a = acum[i];
- if(OP == ncclSum) {
- acum[i] = a+c;
- } else if(OP == ncclProd) {
- acum[i] = a*c;
- } else if(OP == ncclMax) {
- acum[i] = (a > c) ? a : c;
- } else if(OP == ncclMin) {
- acum[i] = (a < c) ? a : c;
- }
- }
-}
-
-#ifdef CUDA_HAS_HALF
-template<> __global__
-void accumKern<half, ncclSum>(half* acum, const half* contrib, int N) {
- int tid = threadIdx.x + blockIdx.x*blockDim.x;
- int offset = blockDim.x*gridDim.x;
- for(int i=tid; i<N; i+=offset) {
- float c = __half2float(contrib[i]);
- float a = __half2float(acum[i]);
- acum[i] = __float2half( a + c );
- }
-}
-
-template<> __global__
-void accumKern<half, ncclProd>(half* acum, const half* contrib, int N) {
- int tid = threadIdx.x + blockIdx.x*blockDim.x;
- int offset = blockDim.x*gridDim.x;
- for(int i=tid; i<N; i+=offset) {
- float c = __half2float(contrib[i]);
- float a = __half2float(acum[i]);
- acum[i] = __float2half( a * c );
- }
-}
-
-template<> __global__
-void accumKern<half, ncclMax>(half* acum, const half* contrib, int N) {
- int tid = threadIdx.x + blockIdx.x*blockDim.x;
- int offset = blockDim.x*gridDim.x;
- for(int i=tid; i<N; i+=offset) {
- float c = __half2float(contrib[i]);
- float a = __half2float(acum[i]);
- acum[i] = __float2half( (a>c) ? a : c );
- }
-}
-
-template<> __global__
-void accumKern<half, ncclMin>(half* acum, const half* contrib, int N) {
- int tid = threadIdx.x + blockIdx.x*blockDim.x;
- int offset = blockDim.x*gridDim.x;
- for(int i=tid; i<N; i+=offset) {
- float c = __half2float(contrib[i]);
- float a = __half2float(acum[i]);
- acum[i] = __float2half( (a<c) ? a : c );
- }
-}
-#endif
-
-template<typename T>
-void accVecType(void* out, void* in, int n, ncclRedOp_t op) {
- switch(op) {
- case ncclSum: accumKern<T, ncclSum> <<<256,256>>>((T*)out, (T*)in, n); break;
- case ncclProd: accumKern<T, ncclProd><<<256,256>>>((T*)out, (T*)in, n); break;
- case ncclMax: accumKern<T, ncclMax> <<<256,256>>>((T*)out, (T*)in, n); break;
- case ncclMin: accumKern<T, ncclMin> <<<256,256>>>((T*)out, (T*)in, n); break;
- default:
- printf("Unknown reduction operation.\n");
- exit(EXIT_FAILURE);
- }
-}
-
-template<typename T>
-void Accumulate(T* dest, const T* contrib, int N, ncclRedOp_t op) {
-
- T* devdest;
- CUDACHECK(cudaHostRegister(dest, N*sizeof(T), 0));
- CUDACHECK(cudaHostGetDevicePointer(&devdest, dest, 0));
- accVecType<T>((void*)devdest, (void*)contrib, N, op);
- CUDACHECK(cudaHostUnregister(dest));
-}
-
-void accVec(void* out, void* in, int n, ncclDataType_t type, ncclRedOp_t op) {
- switch (type) {
- case ncclChar: accVecType<char> (out, in, n, op); break;
- case ncclInt: accVecType<int> (out, in, n, op); break;
-#ifdef CUDA_HAS_HALF
- case ncclHalf: accVecType<half> (out, in, n, op); break;
-#endif
- case ncclFloat: accVecType<float> (out, in, n, op); break;
- case ncclDouble: accVecType<double> (out, in, n, op); break;
- case ncclInt64: accVecType<long long> (out, in, n, op); break;
- case ncclUint64: accVecType<unsigned long long> (out, in, n, op); break;
- default:
- printf("Unknown reduction type.\n");
- exit(EXIT_FAILURE);
- }
-}
-
-template<typename T> __device__
-double absDiff(T a, T b) {
- return fabs((double)(b - a));
-}
-
-#ifdef CUDA_HAS_HALF
-template<> __device__
-double absDiff<half>(half a, half b) {
- float x = __half2float(a);
- float y = __half2float(b);
- return fabs((double)(y-x));
-}
-#endif
-
-template<typename T, int BSIZE> __global__
-void deltaKern(const T* A, const T* B, int N, double* max) {
- __shared__ double temp[BSIZE];
- int tid = threadIdx.x;
- double locmax = 0.0;
- for(int i=tid; i<N; i+=blockDim.x) {
-
- double delta = absDiff(A[i], B[i]);
- if( delta > locmax )
- locmax = delta;
- }
-
- temp[tid] = locmax;
- for(int stride = BSIZE/2; stride > 1; stride>>=1) {
- __syncthreads();
- if( tid < stride )
- temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride];
- }
- __syncthreads();
- if( threadIdx.x == 0)
- *max = temp[0] > temp[1] ? temp[0] : temp[1];
-}
-
-template<typename T>
-double CheckDelta(const T* results, const T* expected, int N) {
- T* devexp;
- double maxerr;
- double* devmax;
- CUDACHECK(cudaHostRegister((void*)expected, N*sizeof(T), 0));
- CUDACHECK(cudaHostGetDevicePointer((void**)&devexp, (void*)expected, 0));
- CUDACHECK(cudaHostRegister((void*)&maxerr, sizeof(double), 0));
- CUDACHECK(cudaHostGetDevicePointer(&devmax, &maxerr, 0));
- deltaKern<T, 512><<<1, 512>>>(results, devexp, N, devmax);
- CUDACHECK(cudaHostUnregister(&maxerr));
- CUDACHECK(cudaHostUnregister((void*)expected));
- return maxerr;
-}
-
-void maxDiff(double* max, void* first, void* second, int n, ncclDataType_t type, cudaStream_t s) {
- switch (type) {
- case ncclChar: deltaKern<char, 512> <<<1,512,0,s>>>((char*)first, (char*)second, n, max); break;
- case ncclInt: deltaKern<int, 512> <<<1,512,0,s>>>((int*)first, (int*)second, n, max); break;
-#ifdef CUDA_HAS_HALF
- case ncclHalf: deltaKern<half, 512> <<<1,512,0,s>>>((half*)first, (half*)second, n, max); break;
-#endif
- case ncclFloat: deltaKern<float, 512> <<<1,512,0,s>>>((float*)first, (float*)second, n, max); break;
- case ncclDouble: deltaKern<double, 512> <<<1,512,0,s>>>((double*)first, (double*)second, n, max); break;
- case ncclInt64: deltaKern<long long, 512> <<<1,512,0,s>>>((long long*)first, (long long*)second, n, max); break;
- case ncclUint64: deltaKern<unsigned long long, 512><<<1,512,0,s>>>((unsigned long long*)first, (unsigned long long*)second, n, max); break;
- default:
- printf("Unknown reduction type.\n");
- exit(EXIT_FAILURE);
- }
-}
-
-std::string TypeName(const ncclDataType_t type) {
- switch (type) {
- case ncclChar: return "char";
- case ncclInt: return "int";
-#ifdef CUDA_HAS_HALF
- case ncclHalf: return "half";
-#endif
- case ncclFloat: return "float";
- case ncclDouble: return "double";
- case ncclInt64: return "int64";
- case ncclUint64: return "uint64";
- default: return "unknown";
- }
-}
-
-std::string OperationName(const ncclRedOp_t op) {
- switch (op) {
- case ncclSum: return "sum";
- case ncclProd: return "prod";
- case ncclMax: return "max";
- case ncclMin: return "min";
- default: return "unknown";
- }
-}
-
-ncclDataType_t strToType(const char* s) {
- if (strcmp(s, "char") == 0)
- return ncclChar;
- if (strcmp(s, "int") == 0)
- return ncclInt;
-#ifdef CUDA_HAS_HALF
- if (strcmp(s, "half") == 0)
- return ncclHalf;
-#endif
- if (strcmp(s, "float") == 0)
- return ncclFloat;
- if (strcmp(s, "double") == 0)
- return ncclDouble;
- if (strcmp(s, "int64") == 0)
- return ncclInt64;
- if (strcmp(s, "uint64") == 0)
- return ncclUint64;
-
- return nccl_NUM_TYPES;
-}
-
-size_t wordSize(ncclDataType_t type) {
- switch(type) {
- case ncclChar: return sizeof(char);
- case ncclInt: return sizeof(int);
-#ifdef CUDA_HAS_HALF
- case ncclHalf: return sizeof(short);
-#endif
- case ncclFloat: return sizeof(float);
- case ncclDouble: return sizeof(double);
- case ncclInt64: return sizeof(long long);
- case ncclUint64: return sizeof(unsigned long long);
- }
-
- return 0;
-}
-
-double deltaMaxValue(ncclDataType_t type, bool is_reduction) {
- if (is_reduction) {
- switch(type) {
-#ifdef CUDA_HAS_HALF
- case ncclHalf: return 5e-2;
-#endif
- case ncclFloat: return 1e-5;
- case ncclDouble: return 1e-12;
- }
- }
- return 1e-200;
-}
-
-ncclRedOp_t strToOp(const char* s) {
- if (strcmp(s, "sum") == 0)
- return ncclSum;
- if (strcmp(s, "prod") == 0)
- return ncclProd;
- if (strcmp(s, "max") == 0)
- return ncclMax;
- if (strcmp(s, "min") == 0)
- return ncclMin;
-
- return nccl_NUM_OPS;
-}
-
-int strToPosInt(const char* s) {
- errno = 0;
- long temp = strtol(s, NULL, 10);
- if (errno != 0 || temp > INT_MAX || temp < 0)
- return 0;
- return (int)temp;
-}
-
-int strToNonNeg(const char* s) {
- errno = 0;
- long temp = strtol(s, NULL, 10);
- if (errno != 0 || temp > INT_MAX || temp < 0)
- return -1;
- return (int)temp;
-}
-
-#endif // SRC_TEST_UTILITIES_H_
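
Taken together, the helpers in this header formed the tests' validation loop: Randomize fills device inputs, Accumulate/accVec builds the host reference, CheckDelta measures the worst-case error on the GPU, and deltaMaxValue supplies the per-type tolerance. A minimal sketch of the final comparison (the wrapper name is illustrative and never existed in the header):

    // Sketch: true when 'results' (device pointer) matches 'expected' (host
    // pointer) within the tolerance, e.g. 1e-5 for float reductions.
    template<typename T>
    bool withinTolerance(const T* results, const T* expected, int n,
                         ncclDataType_t type, bool is_reduction) {
      double maxErr = CheckDelta<T>(results, expected, n);
      return maxErr <= deltaMaxValue(type, is_reduction);
    }
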
diff --git a/test/mpi/mpi_test.cu b/test/mpi/mpi_test.cu
deleted file mode 100644
index fea6ae5..0000000
--- a/test/mpi/mpi_test.cu
+++ /dev/null
@@ -1,93 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-#include <sys/types.h>
-#include <unistd.h>
-#include <stdio.h>
-
-#include "nccl.h"
-#include "mpi.h"
-#include "test_utilities.h"
-
-#define SIZE 128
-#define NITERS 1
-
-int main(int argc, char *argv[]) {
- ncclUniqueId commId;
- int size, rank;
- ncclResult_t ret;
-
- MPI_Init(&argc, &argv);
- MPI_Comm_size(MPI_COMM_WORLD, &size);
- MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
- if (argc < size) {
- if (rank == 0)
- printf("Usage : %s <GPU list per rank>\n", argv[0]);
- exit(1);
- }
-
- int gpu = atoi(argv[rank+1]);
-
- // We have to set our device before NCCL init
- CUDACHECK(cudaSetDevice(gpu));
- MPI_Barrier(MPI_COMM_WORLD);
-
- // NCCL Communicator creation
- ncclComm_t comm;
- NCCLCHECK(ncclGetUniqueId(&commId));
- MPI_Bcast(&commId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, MPI_COMM_WORLD);
- ret = ncclCommInitRank(&comm, size, commId, rank);
- if (ret != ncclSuccess) {
- printf("NCCL Init failed (%d) '%s'\n", ret, ncclGetErrorString(ret));
- exit(1);
- }
-
- // CUDA stream creation
- cudaStream_t stream;
- CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-
- // Initialize input values
- int *dptr;
- CUDACHECK(cudaMalloc(&dptr, SIZE*2*sizeof(int)));
- int *val = (int*) malloc(SIZE*sizeof(int));
- for (int v=0; v<SIZE; v++) {
- val[v] = rank + 1;
- }
- CUDACHECK(cudaMemcpy(dptr, val, SIZE*sizeof(int), cudaMemcpyHostToDevice));
-
- // Compute final value
- int ref = size*(size+1)/2;
-
- // Run allreduce
- int errors = 0;
- for (int i=0; i<NITERS; i++) {
- NCCLCHECK(ncclAllReduce((const void*)dptr, (void*)(dptr+SIZE), SIZE, ncclInt, ncclSum, comm, stream));
- }
-
- // Check results
- CUDACHECK(cudaStreamSynchronize(stream));
- CUDACHECK(cudaMemcpy(val, (dptr+SIZE), SIZE*sizeof(int), cudaMemcpyDeviceToHost));
- for (int v=0; v<SIZE; v++) {
- if (val[v] != ref) {
- errors++;
- printf("[%d] Error at %d : got %d instead of %d\n", rank, v, val[v], ref);
- }
- }
- CUDACHECK(cudaFree(dptr));
-
- MPI_Allreduce(MPI_IN_PLACE, &errors, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD);
- if (rank == 0) {
- if (errors)
- printf("%d errors. Test FAILED.\n", errors);
- else
- printf("Test PASSED.\n");
- }
-
- MPI_Finalize();
- ncclCommDestroy(comm);
- return errors ? 1 : 0;
-}
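
The reference value in this test follows directly from the input pattern: every rank fills its buffer with rank + 1, so the summed result is 1 + 2 + ... + size = size*(size+1)/2 in each element. With 4 ranks, for example, every slot of the output buffer should read 10.
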
diff --git a/test/single/all_gather_scan.cu b/test/single/all_gather_scan.cu
deleted file mode 100644
index becf315..0000000
--- a/test/single/all_gather_scan.cu
+++ /dev/null
@@ -1,239 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of NVIDIA CORPORATION nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <float.h>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-void showUsage(const char* bin) {
- printf("\n"
- "Usage: %s <type> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
- "Where:\n"
-#ifdef CUDA_HAS_HALF
- " type = [char|int|half|float|double|int64|uint64]\n"
-#else
- " type = [char|int|float|double|int64|uint64]\n"
-#endif
- " n_min > 0\n"
- " n_max >= n_min\n"
- " delta > 0\n\n", bin);
- return;
-}
-
-int main(int argc, char* argv[]) {
- int nvis = 0;
- CUDACHECK(cudaGetDeviceCount(&nvis));
- if (nvis == 0) {
- printf("No GPUs found\n");
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- ncclDataType_t type;
- int n_min;
- int n_max;
- int delta;
- int gpus;
- int* list = NULL;
-
- if (argc < 4) {
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- type = strToType(argv[1]);
- if (type == nccl_NUM_TYPES) {
- printf("Invalid <type> '%s'\n", argv[1]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- n_min = strToPosInt(argv[2]);
- if (n_min < 1) {
- printf("Invalid <n_min> '%s'\n", argv[2]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- n_max = strToPosInt(argv[3]);
- if (n_max < n_min) {
- printf("Invalid <n_max> '%s'\n", argv[3]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- if (argc > 4) {
- delta = strToPosInt(argv[4]);
- if (delta < 1) {
- printf("Invalid <delta> '%s'\n", argv[4]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
- }
-
- if (argc > 5) {
- gpus = strToPosInt(argv[5]);
- if (gpus < 1) {
- printf("Invalid <gpus> '%s'\n", argv[5]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- gpus = nvis;
- }
-
- list = (int*)malloc(gpus*sizeof(int));
-
- if (argc > 6 && argc != 6+gpus) {
- printf("If given, GPU list must be fully specified.\n");
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- for(int g=0; g<gpus; ++g) {
- if(argc > 6) {
- list[g] = strToNonNeg(argv[6+g]);
- if (list[g] < 0) {
- printf("Invalid GPU%d '%s'\n", g, argv[6+g]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- } else if (list[g] >= nvis) {
- printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- list[g] = g % nvis;
- }
- }
-
- size_t word = wordSize(type);
- size_t max_input = n_max * word;
- size_t max_output = max_input * gpus;
- void* refout;
- CUDACHECK(cudaMallocHost(&refout, max_output));
-
- void **input, **output;
- double** localError;
- ncclComm_t* comm;
- cudaStream_t* stream;
-
- input = (void**)malloc(gpus*sizeof(void*));
- output = (void**)malloc(gpus*sizeof(void*));
- localError = (double**)malloc(gpus*sizeof(double*));
- comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
- stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
-
- for(int g=0; g<gpus; ++g) {
- char busid[32] = {0};
- CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
- printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
-
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaMalloc(&input[g], max_input));
- CUDACHECK(cudaMalloc(&output[g], max_output));
- CUDACHECK(cudaMallocHost(&localError[g], sizeof(double)));
- CUDACHECK(cudaStreamCreate(&stream[g]));
- makeRandom(input[g], n_max, type, 42+g);
-
- CUDACHECK(cudaMemcpy((char*)refout+max_input*g, input[g], max_input, cudaMemcpyDeviceToHost));
- }
-
- NCCLCHECK(ncclCommInitAll(comm, gpus, list));
-
- printf(" BYTES ERROR MSEC BW\n");
-
- for(int n=n_min; n<=n_max; n+=delta) {
- size_t out_bytes = word * n * gpus;
-
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaMemsetAsync(output[g], 0, out_bytes, stream[g]));
- CUDACHECK(cudaStreamSynchronize(stream[g]));
- }
-
- auto start = std::chrono::high_resolution_clock::now();
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- NCCLCHECK(ncclAllGather(input[g], n, type, output[g], comm[g], stream[g]));
- }
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamSynchronize(stream[g]));
- }
- auto stop = std::chrono::high_resolution_clock::now();
- double ms = std::chrono::duration_cast<std::chrono::duration<double>>
- (stop - start).count() * 1000.0;
-
- double max_error = 0.0;
- for(int slice=0; slice<gpus; ++slice) {
- void* refSlice = (void*)((char*)refout + slice*max_input);
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- void* mySlice = (void*)((char*)output[g] + slice*n*word);
- maxDiff(localError[g], mySlice, refSlice, n, type, stream[g]);
- }
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamSynchronize(stream[g]));
- max_error = max(max_error, *localError[g]);
- }
- }
-
- double mb = (double)(n*word * (gpus-1)) * 1.e-6;
- double algbw = mb / ms;
- printf("%12lu %5.0le %10.3lf %6.2lf\n",
- n*word, max_error, ms, algbw);
- }
-
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamDestroy(stream[g]));
- ncclCommDestroy(comm[g]);
- CUDACHECK(cudaFree(input[g]));
- CUDACHECK(cudaFree(output[g]));
- CUDACHECK(cudaFreeHost(localError[g]));
- }
-
- free(localError);
- free(output);
- free(input);
- free(comm);
- free(stream);
- CUDACHECK(cudaFreeHost(refout));
- exit(EXIT_SUCCESS);
-}
-
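
A quick units check on the bandwidth printed above: each rank receives gpus-1 remote slices of n*word bytes, so mb = n*word*(gpus-1)*1e-6 is in megabytes, and megabytes divided by milliseconds gives gigabytes per second. With illustrative numbers, a 64 MB slice gathered across 4 GPUs in 12 ms reports (64*3)/12 = 16 GB/s.
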
diff --git a/test/single/all_gather_test.cu b/test/single/all_gather_test.cu
deleted file mode 100644
index 40d2f31..0000000
--- a/test/single/all_gather_test.cu
+++ /dev/null
@@ -1,235 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-#include "nccl.h"
-#include "test_utilities.h"
-
-int errors = 0;
-double avg_bw = 0.0;
-int avg_count = 0;
-bool is_reduction = false;
-
-template<typename T>
-void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
- ncclComm_t* const comms, const std::vector<int>& dList) {
- // initialize data
- int nDev = 0;
- NCCLCHECK(ncclCommCount(comms[0], &nDev));
- cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
- T* buffer = (T*)malloc(nDev * N * sizeof(T));
- T* result = (T*)malloc(nDev * N * sizeof(T));
- memset(buffer, 0, nDev * N * sizeof(T));
- memset(result, 0, nDev * N * sizeof(T));
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamCreate(s+i));
- CUDACHECK(cudaMemset(recvbuff[i], 0, nDev * N * sizeof(T)));
- Randomize(sendbuff[i], N, i);
-
- CUDACHECK(cudaMemcpy(result + i * N, sendbuff[i], N * sizeof(T),
- cudaMemcpyDeviceToHost));
- }
-
- // warm up GPU
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclAllGather((const void*)sendbuff[i], std::min(32 * 1024, N), type,
- (void*)recvbuff[i], comms[i], s[i]));
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
- //for (int n = 1; n <= N; n = n << 1)
- {
- int n = N;
- printf("%12i %12i %6s", (int)(n * sizeof(T)), n, TypeName(type).c_str());
-
- auto start = std::chrono::high_resolution_clock::now();
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclAllGather((const void*)sendbuff[i], n, type, (void*)recvbuff[i], comms[i],
- s[i]));
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
- auto stop = std::chrono::high_resolution_clock::now();
-
- double elapsedSec =
- std::chrono::duration_cast<std::chrono::duration<double>>(
- stop - start).count();
- double algbw = (double)(n * sizeof(T)) / 1.0E9 * (double)(nDev - 1)
- / elapsedSec;
- double busbw = algbw;
-
- double maxDelta = 0.0;
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- double tmpDelta = CheckDelta<T>(recvbuff[i], result, nDev*N);
- maxDelta = std::max(tmpDelta, maxDelta);
- }
-
- printf(" %7.3f %5.2f %5.2f %7.0le\n", elapsedSec * 1.0E3, algbw, busbw,
- maxDelta);
-
- if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
- avg_bw += busbw;
- avg_count++;
-
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamDestroy(s[i]));
- }
- free(s);
- free(buffer);
- free(result);
-}
-
-template<typename T>
-void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
- const std::vector<int>& dList) {
- int nDev = 0;
- NCCLCHECK(ncclCommCount(comms[0], &nDev));
- T** sendbuff = (T**)malloc(nDev * sizeof(T*));
- T** recvbuff = (T**)malloc(nDev * sizeof(T*));
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaMalloc(sendbuff + i, N * sizeof(T)));
- CUDACHECK(cudaMalloc(recvbuff + i, nDev * N * sizeof(T)));
- }
-
- RunTest<T>(sendbuff, recvbuff, N, type, comms, dList);
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaFree(sendbuff[i]));
- CUDACHECK(cudaFree(recvbuff[i]));
- }
-
- free(sendbuff);
- free(recvbuff);
-}
-
-void usage() {
- printf("Tests nccl AllGather with user supplied arguments.\n"
- " Usage: all_reduce_test <data size in bytes> [number of GPUs] "
- "[GPU 0] [GPU 1] ...\n\n");
-}
-
-int main(int argc, char* argv[]) {
- int nVis = 0;
- CUDACHECK(cudaGetDeviceCount(&nVis));
-
- int N = 0;
- if (argc > 1) {
- int t = sscanf(argv[1], "%d", &N);
- if (t == 0) {
- printf("Error: %s is not an integer!\n\n", argv[1]);
- usage();
- exit(EXIT_FAILURE);
- }
- } else {
- printf("Error: must specify at least data size in bytes!\n\n");
- usage();
- exit(EXIT_FAILURE);
- }
-
- int nDev = nVis;
- if (argc > 2) {
- int t = sscanf(argv[2], "%d", &nDev);
- if (t == 0) {
- printf("Error: %s is not an integer!\n\n", argv[1]);
- usage();
- exit(EXIT_FAILURE);
- }
- }
- std::vector<int> dList(nDev);
- for (int i = 0; i < nDev; ++i)
- dList[i] = i % nVis;
-
-
- if (argc > 3) {
- if (argc - 3 != nDev) {
- printf("Error: insufficient number of GPUs in list\n\n");
- usage();
- exit(EXIT_FAILURE);
- }
-
- for (int i = 0; i < nDev; ++i) {
- int t = sscanf(argv[3 + i], "%d", dList.data() + i);
- if (t == 0) {
- printf("Error: %s is not an integer!\n\n", argv[2 + i]);
- usage();
- exit(EXIT_FAILURE);
- }
- }
- }
-
- ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
- NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
-
- printf("# Using devices\n");
- for (int g=0; g<nDev; ++g) {
- int cudaDev;
- int rank;
- cudaDeviceProp prop;
- NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
- NCCLCHECK(ncclCommUserRank(comms[g], &rank));
- CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
- printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
- prop.pciBusID, prop.name);
- }
- printf("\n");
-
- printf("# %10s %12s %6s %7s %5s %5s %7s\n",
- "bytes", "N", "type", "time", "algbw", "busbw", "delta");
-
- RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
- RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
-#ifdef CUDA_HAS_HALF
- RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
-#endif
- RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
- RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
- RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
- RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
-
- printf("\n");
-
- for(int i=0; i<nDev; ++i)
- ncclCommDestroy(comms[i]);
- free(comms);
-
- char* str = getenv("NCCL_TESTS_MIN_BW");
- double check_avg_bw = str ? atof(str) : -1;
- avg_bw /= avg_count;
-
- printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
- printf(" Avg bus bandwidth : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? "FAILED" : "OK"));
- printf("\n");
- if (errors || avg_bw < check_avg_bw)
- exit(EXIT_FAILURE);
- else
- exit(EXIT_SUCCESS);
-}
-
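
The NCCL_TESTS_MIN_BW check at the end turns this benchmark into a pass/fail gate: when the variable is unset, check_avg_bw stays at -1 and only out-of-bounds values can fail the run; when it is set, for example NCCL_TESTS_MIN_BW=5 ./all_gather_test 1048576, the process also exits with a failure if the average bus bandwidth falls below 5 GB/s.
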
diff --git a/test/single/all_reduce_scan.cu b/test/single/all_reduce_scan.cu
deleted file mode 100644
index f93a099..0000000
--- a/test/single/all_reduce_scan.cu
+++ /dev/null
@@ -1,247 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of NVIDIA CORPORATION nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <float.h>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-void showUsage(const char* bin) {
- printf("\n"
- "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
- "Where:\n"
-#ifdef CUDA_HAS_HALF
- " type = [char|int|half|float|double|int64|uint64]\n"
-#else
- " type = [char|int|float|double|int64|uint64]\n"
-#endif
- " op = [sum|prod|max|min]\n"
- " n_min > 0\n"
- " n_max >= n_min\n"
- " delta > 0\n\n", bin);
- return;
-}
-
-int main(int argc, char* argv[]) {
- int nvis = 0;
- CUDACHECK(cudaGetDeviceCount(&nvis));
- if (nvis == 0) {
- printf("No GPUs found\n");
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- ncclDataType_t type;
- ncclRedOp_t op;
- int n_min;
- int n_max;
- int delta;
- int gpus;
- int* list = NULL;
-
- if (argc < 5) {
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- type = strToType(argv[1]);
- if (type == nccl_NUM_TYPES) {
- printf("Invalid <type> '%s'\n", argv[1]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- op = strToOp(argv[2]);
- if (op == nccl_NUM_OPS) {
- printf("Invalid <op> '%s'\n", argv[2]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- n_min = strToPosInt(argv[3]);
- if (n_min < 1) {
- printf("Invalid <n_min> '%s'\n", argv[3]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- n_max = strToPosInt(argv[4]);
- if (n_max < n_min) {
- printf("Invalid <n_max> '%s'\n", argv[4]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- if (argc > 5) {
- delta = strToPosInt(argv[5]);
- if (delta < 1) {
- printf("Invalid <delta> '%s'\n", argv[5]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
- }
-
- if (argc > 6) {
- gpus = strToPosInt(argv[6]);
- if (gpus < 1) {
- printf("Invalid <gpus> '%s'\n", argv[6]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- gpus = nvis;
- }
-
- list = (int*)malloc(gpus*sizeof(int));
-
- if (argc > 7 && argc != 7+gpus) {
- printf("If given, GPU list must be fully specified.\n");
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- for(int g=0; g<gpus; ++g) {
- if(argc > 7) {
- list[g] = strToNonNeg(argv[7+g]);
- if (list[g] < 0) {
- printf("Invalid GPU%d '%s'\n", g, argv[7+g]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- } else if (list[g] >= nvis) {
- printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- list[g] = g % nvis;
- }
- }
-
- size_t word = wordSize(type);
- size_t max_size = n_max * word;
- void* refout;
- CUDACHECK(cudaMallocHost(&refout, max_size));
-
- void **input, **output;
- double** localError;
- ncclComm_t* comm;
- cudaStream_t* stream;
-
- input = (void**)malloc(gpus*sizeof(void*));
- output = (void**)malloc(gpus*sizeof(void*));
- localError = (double**)malloc(gpus*sizeof(double*));
- comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
- stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
-
- for(int g=0; g<gpus; ++g) {
- char busid[32] = {0};
- CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
- printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
-
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaMalloc(&input[g], max_size));
- CUDACHECK(cudaMalloc(&output[g], max_size));
- CUDACHECK(cudaMallocHost(&localError[g], sizeof(double)));
- CUDACHECK(cudaStreamCreate(&stream[g]));
- makeRandom(input[g], n_max, type, 42+g);
-
- if (g == 0)
- CUDACHECK(cudaMemcpy(refout, input[g], max_size, cudaMemcpyDeviceToHost));
- else
- accVec(refout, input[g], n_max, type, op);
- }
-
- NCCLCHECK(ncclCommInitAll(comm, gpus, list));
-
- printf(" BYTES ERROR MSEC ALGBW BUSBW\n");
-
- for(int n=n_min; n<=n_max; n+=delta) {
- size_t bytes = word * n;
-
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaMemsetAsync(output[g], 0, bytes, stream[g]));
- CUDACHECK(cudaStreamSynchronize(stream[g]));
- }
-
- auto start = std::chrono::high_resolution_clock::now();
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- NCCLCHECK(ncclAllReduce(input[g], output[g], n, type, op, comm[g], stream[g]));
- }
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamSynchronize(stream[g]));
- }
- auto stop = std::chrono::high_resolution_clock::now();
- double ms = std::chrono::duration_cast<std::chrono::duration<double>>
- (stop - start).count() * 1000.0;
-
- double max_error = 0.0;
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- maxDiff(localError[g], output[g], refout, n, type, stream[g]);
- }
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamSynchronize(stream[g]));
- max_error = max(max_error, *localError[g]);
- }
-
- double mb = (double)bytes * 1.e-6;
- double algbw = mb / ms;
- double busbw = algbw * (double)(2*gpus - 2) / (double)gpus;
- printf("%12lu %5.0le %10.3lf %6.2lf %6.2lf\n",
- n*word, max_error, ms, algbw, busbw);
- }
-
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamDestroy(stream[g]));
- ncclCommDestroy(comm[g]);
- CUDACHECK(cudaFree(input[g]));
- CUDACHECK(cudaFree(output[g]));
- CUDACHECK(cudaFreeHost(localError[g]));
- }
-
- free(localError);
- free(output);
- free(input);
- free(comm);
- free(stream);
- CUDACHECK(cudaFreeHost(refout));
- exit(EXIT_SUCCESS);
-}
-
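
The bus-bandwidth correction above reflects how much traffic a ring allreduce puts on each link: every byte crosses the ring 2*(gpus-1) times in total, spread over gpus links, hence the factor (2*gpus-2)/gpus. For 4 GPUs the factor is 1.5, so a measured algorithm bandwidth of 10 GB/s corresponds to 15 GB/s on the bus.
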
diff --git a/test/single/all_reduce_test.cu b/test/single/all_reduce_test.cu
deleted file mode 100644
index 1935a38..0000000
--- a/test/single/all_reduce_test.cu
+++ /dev/null
@@ -1,301 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-int csv = false;
-int errors = 0;
-double avg_bw = 0.0;
-int avg_count = 0;
-bool is_reduction = true;
-
-template<typename T>
-void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
- const ncclRedOp_t op, ncclComm_t* comms, const std::vector<int>& dList) {
- // initialize data
- T* buffer = (T*)malloc(N * sizeof(T));
- T* result = (T*)malloc(N * sizeof(T));
- memset(buffer, 0, N * sizeof(T));
- memset(result, 0, N * sizeof(T));
-
- int nDev = 0;
- NCCLCHECK(ncclCommCount(comms[0], &nDev));
- cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamCreate(s+i));
- CUDACHECK(cudaMemset(recvbuff[i], 0, N * sizeof(T)));
- Randomize(sendbuff[i], N, i);
- if(i == 0) {
- CUDACHECK(cudaMemcpy(result, sendbuff[i], N*sizeof(T), cudaMemcpyDeviceToHost));
- } else {
- Accumulate<T>(result, sendbuff[i], N, op);
- }
- }
-
- // warm up GPU
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024), type, op, comms[i], s[i]));
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
-// for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1)
- {
- int n = N;
- printf((csv) ? "%i,%i,%s,%s," : "%12i %12i %6s %6s",
- (int) (n * sizeof(T)), n, TypeName(type).c_str(),
- OperationName(op).c_str());
-
- // do out-of-place reduction first
- nvtxRangePushA("out of place");
- auto start = std::chrono::high_resolution_clock::now();
- //for (int i=0; i<100; i++) {
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
- comms[i], s[i]));
- }
- //}
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
- auto stop = std::chrono::high_resolution_clock::now();
- nvtxRangePop();
-
- nvtxRangePushA("out of place bookkeeping");
- double elapsedSec =
- std::chrono::duration_cast<std::chrono::duration<double>>(
- stop - start).count(); // / 100.0;
- double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
- double busbw = algbw * (double)(2 * nDev - 2) / (double)nDev;
-
- double maxDelta = 0.0;
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- double tmpDelta = CheckDelta<T>(recvbuff[i], result, N);
- maxDelta = std::max(tmpDelta, maxDelta);
- }
-
- printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le",
- elapsedSec * 1.0E3, algbw, busbw, maxDelta);
-
- if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
- avg_bw += busbw;
- avg_count++;
-
- nvtxRangePop();
- }
-
-
-// for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1)
- {
- int n = N;
- // now do in-place reduction
- nvtxRangePushA("in place");
- auto start = std::chrono::high_resolution_clock::now();
- //for (int i=0; i<100; i++) {
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclAllReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
- comms[i], s[i]));
- }
- //}
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
- auto stop = std::chrono::high_resolution_clock::now();
- nvtxRangePop();
-
- nvtxRangePushA("in place bookkeeping");
- double elapsedSec =
- std::chrono::duration_cast<std::chrono::duration<double>>(
- stop - start).count(); // / 100.0;
- double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
- double busbw = algbw * (double)(2 * nDev - 2) / (double)nDev;
-
- double maxDelta = 0.0;
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- double tmpDelta = CheckDelta<T>(sendbuff[i], result, N);
- maxDelta = std::max(tmpDelta, maxDelta);
- }
-
- printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le\n",
- elapsedSec * 1.0E3, algbw, busbw, maxDelta);
-
- if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
- avg_bw += busbw;
- avg_count++;
-
- nvtxRangePop();
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamDestroy(s[i]));
- }
- free(s);
- free(buffer);
- free(result);
-}
-
-template<typename T>
-void RunTests(const int N, const ncclDataType_t type, ncclComm_t* comms,
- const std::vector<int>& dList) {
- int nDev = 0;
- NCCLCHECK(ncclCommCount(comms[0], &nDev));
- T** sendbuff = (T**)malloc(nDev * sizeof(T*));
- T** recvbuff = (T**)malloc(nDev * sizeof(T*));
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaMalloc(sendbuff + i, N * sizeof(T)));
- CUDACHECK(cudaMalloc(recvbuff + i, N * sizeof(T)));
- }
-
- for (ncclRedOp_t op : { ncclSum, ncclProd, ncclMax, ncclMin }) {
-// for (ncclRedOp_t op : { ncclSum }) {
- RunTest<T>(sendbuff, recvbuff, N, type, op, comms, dList);
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaFree(sendbuff[i]));
- CUDACHECK(cudaFree(recvbuff[i]));
- }
-
- free(sendbuff);
- free(recvbuff);
-}
-
-void usage() {
- printf("Tests nccl AllReduce with user supplied arguments.\n"
- " Usage: all_reduce_test <data size in bytes> [number of GPUs] "
- "[GPU 0] [GPU 1] ...\n\n");
-}
-
-int main(int argc, char* argv[]) {
- int nVis = 0;
- CUDACHECK(cudaGetDeviceCount(&nVis));
-
- int N = 0;
- if (argc > 1) {
- int t = sscanf(argv[1], "%d", &N);
- if (t == 0) {
- printf("Error: %s is not an integer!\n\n", argv[1]);
- usage();
- exit(EXIT_FAILURE);
- }
- } else {
- printf("Error: must specify at least data size in bytes!\n\n");
- usage();
- exit(EXIT_FAILURE);
- }
-
- int nDev = nVis;
- if (argc > 2) {
- int t = sscanf(argv[2], "%d", &nDev);
- if (t == 0) {
- printf("Error: %s is not an integer!\n\n", argv[1]);
- usage();
- exit(EXIT_FAILURE);
- }
- }
- std::vector<int> dList(nDev);
- for (int i = 0; i < nDev; ++i)
- dList[i] = i % nVis;
-
- if (argc > 3) {
- if (argc - 3 != nDev) {
- printf("Error: insufficient number of GPUs in list\n\n");
- usage();
- exit(EXIT_FAILURE);
- }
-
- for (int i = 0; i < nDev; ++i) {
- int t = sscanf(argv[3 + i], "%d", dList.data() + i);
- if (t == 0) {
- printf("Error: %s is not an integer!\n\n", argv[2 + i]);
- usage();
- exit(EXIT_FAILURE);
- }
- }
- }
-
- ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
- NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
-
- if (!csv) {
- printf("# Using devices\n");
- for (int g = 0; g < nDev; ++g) {
- int cudaDev;
- int rank;
- cudaDeviceProp prop;
- NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
- NCCLCHECK(ncclCommUserRank(comms[g], &rank));
- CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
- printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
- prop.pciBusID, prop.name);
- }
- printf("\n");
-
- printf("# %10s %12s %6s %6s out-of-place in-place\n", "", "", "", "");
- printf("# %10s %12s %6s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n", "bytes", "N", "type", "op",
- "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
- }
- else {
- printf("B,N,type,op,oop_time,oop_algbw,oop_busbw,oop_res,ip_time,ip_algbw,ip_busbw,ip_res\n");
- }
-
- RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
- RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
-#ifdef CUDA_HAS_HALF
- RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
-#endif
- RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
- RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
- RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
- RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
-
- printf("\n");
-
- for(int i=0; i<nDev; ++i)
- ncclCommDestroy(comms[i]);
- free(comms);
-
- char* str = getenv("NCCL_TESTS_MIN_BW");
- double check_avg_bw = str ? atof(str) : -1;
- avg_bw /= avg_count;
-
- printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
- printf(" Avg bus bandwidth : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? "FAILED" : "OK"));
- printf("\n");
- if (errors || avg_bw < check_avg_bw)
- exit(EXIT_FAILURE);
- else
- exit(EXIT_SUCCESS);
-}
-
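
The second timing block above measures the in-place variant, where the input and output arguments alias the same device buffer, which ncclAllReduce supports. A minimal sketch of that call shape (buffer, count, communicator and stream are placeholders assumed to be initialized elsewhere):

    // Sketch: in-place allreduce, send and receive pointers are identical.
    float* buf;          // device buffer holding 'count' floats
    int count = 1 << 20;
    NCCLCHECK(ncclAllReduce((const void*)buf, (void*)buf, count,
                            ncclFloat, ncclSum, comm, stream));
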
diff --git a/test/single/broadcast_scan.cu b/test/single/broadcast_scan.cu
deleted file mode 100644
index ea11c7d..0000000
--- a/test/single/broadcast_scan.cu
+++ /dev/null
@@ -1,232 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of NVIDIA CORPORATION nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <float.h>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-void showUsage(const char* bin) {
- printf("\n"
- "Usage: %s <type> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
- "Where:\n"
-#ifdef CUDA_HAS_HALF
- " type = [char|int|half|float|double|int64|uint64]\n"
-#else
- " type = [char|int|float|double|int64|uint64]\n"
-#endif
- " n_min > 0\n"
- " n_max >= n_min\n"
- " delta > 0\n\n", bin);
- return;
-}
-
-int main(int argc, char* argv[]) {
- int nvis = 0;
- CUDACHECK(cudaGetDeviceCount(&nvis));
- if (nvis == 0) {
- printf("No GPUs found\n");
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- ncclDataType_t type;
- int n_min;
- int n_max;
- int delta;
- int gpus;
- int* list = NULL;
-
- if (argc < 4) {
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- type = strToType(argv[1]);
- if (type == nccl_NUM_TYPES) {
- printf("Invalid <type> '%s'\n", argv[1]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- n_min = strToPosInt(argv[2]);
- if (n_min < 1) {
- printf("Invalid <n_min> '%s'\n", argv[2]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- n_max = strToPosInt(argv[3]);
- if (n_max < n_min) {
- printf("Invalid <n_max> '%s'\n", argv[3]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- if (argc > 4) {
- delta = strToPosInt(argv[4]);
- if (delta < 1) {
- printf("Invalid <delta> '%s'\n", argv[4]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
- }
-
- if (argc > 5) {
- gpus = strToPosInt(argv[5]);
- if (gpus < 1) {
- printf("Invalid <gpus> '%s'\n", argv[5]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- gpus = nvis;
- }
-
- list = (int*)malloc(gpus*sizeof(int));
-
- if (argc > 6 && argc != 6+gpus) {
- printf("If given, GPU list must be fully specified.\n");
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- for(int g=0; g<gpus; ++g) {
- if(argc > 6) {
- list[g] = strToNonNeg(argv[6+g]);
- if (list[g] < 0) {
- printf("Invalid GPU%d '%s'\n", g, argv[6+g]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- } else if (list[g] >= nvis) {
- printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- list[g] = g % nvis;
- }
- }
-
- size_t word = wordSize(type);
- size_t max_size = n_max * word;
- void* refout;
- CUDACHECK(cudaMallocHost(&refout, max_size));
-
- void** io;
- double* localError;
- ncclComm_t* comm;
- cudaStream_t* stream;
-
- io = (void**)malloc(gpus*sizeof(void*));
- CUDACHECK(cudaMallocHost(&localError, gpus*sizeof(double)));
- comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
- stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
-
- for(int g=0; g<gpus; ++g) {
- char busid[32] = {0};
- CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
- printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
-
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamCreate(&stream[g]));
- CUDACHECK(cudaMalloc(&io[g], max_size));
- if(g == 0) {
- makeRandom(io[g], n_max, type, 42+g);
- CUDACHECK(cudaMemcpy(refout, io[g], max_size, cudaMemcpyDeviceToHost));
- }
- }
-
- NCCLCHECK(ncclCommInitAll(comm, gpus, list));
-
- printf(" BYTES ERROR MSEC BW\n");
-
- for(int n=n_min; n<=n_max; n+=delta) {
- size_t bytes = word * n;
-
- for(int g=1; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaMemsetAsync(io[g], 0, bytes, stream[g]));
- }
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamSynchronize(stream[0]));
- }
-
- auto start = std::chrono::high_resolution_clock::now();
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- NCCLCHECK(ncclBcast(io[g], n, type, 0, comm[g], stream[g]));
- }
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamSynchronize(stream[g]));
- }
- auto stop = std::chrono::high_resolution_clock::now();
- double ms = std::chrono::duration_cast<std::chrono::duration<double>>
- (stop - start).count() * 1000.0;
-
- for(int g=1; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- maxDiff(localError+g, io[g], refout, n, type, stream[g]);
- }
- double maxError = 0.0;
- for(int g=1; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamSynchronize(stream[g]));
- maxError = max(maxError, localError[g]);
- }
-
- double mb = (double)bytes * 1.e-6;
- double algbw = mb / ms;
- printf("%12lu %5.0le %10.3lf %6.2lf\n",
- n*word, maxError, ms, algbw);
- }
-
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamDestroy(stream[g]));
- ncclCommDestroy(comm[g]);
- CUDACHECK(cudaFree(io[g]));
- }
-
- free(io);
- free(comm);
- free(stream);
- CUDACHECK(cudaFreeHost(refout));
- CUDACHECK(cudaFreeHost(localError));
- exit(EXIT_SUCCESS);
-}
-
diff --git a/test/single/broadcast_test.cu b/test/single/broadcast_test.cu
deleted file mode 100644
index 6b1e04f..0000000
--- a/test/single/broadcast_test.cu
+++ /dev/null
@@ -1,235 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-#include "nccl.h"
-#include "test_utilities.h"
-
-int errors = 0;
-double avg_bw = 0.0;
-int avg_count = 0;
-bool is_reduction = false;
-
-template<typename T>
-void RunTest(T** buff, const int N, const ncclDataType_t type, const int root,
- ncclComm_t* const comms, const std::vector<int>& dList) {
- // initialize data
- int nDev = 0;
- NCCLCHECK(ncclCommCount(comms[0], &nDev));
- cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
- T* buffer = (T*)malloc(N * sizeof(T));
- T* result = (T*)malloc(N * sizeof(T));
- memset(result, 0, N * sizeof(T));
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamCreate(s+i));
-
- if (i == root) {
- Randomize(buff[root], N, root);
- CUDACHECK(cudaMemcpy(result, buff[root], N * sizeof(T),
- cudaMemcpyDeviceToHost));
- } else {
- CUDACHECK(cudaMemset(buff[i], 0, N * sizeof(T)));
- }
-
- CUDACHECK(cudaDeviceSynchronize());
- }
-
- // warm up GPU
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclBcast((void*)buff[i], std::min(32 * 1024, N), type, root, comms[i], s[i]));
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
-// for (int n = 1; n <= N; n = n << 1)
- {
- int n = N;
- printf("%12i %12i %6s %4i", (int)(n * sizeof(T)), n,
- TypeName(type).c_str(), root);
-
- auto start = std::chrono::high_resolution_clock::now();
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclBcast((void*)buff[i], n, type, root, comms[i], s[i]));
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
- auto stop = std::chrono::high_resolution_clock::now();
-
- double elapsedSec =
- std::chrono::duration_cast<std::chrono::duration<double>>(
- stop - start).count();
- double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
- double busbw = algbw;
-
- double maxDelta = 0.0;
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- double tmpDelta = CheckDelta<T>(buff[i], result, n);
- maxDelta = std::max(tmpDelta, maxDelta);
- }
-
- printf(" %7.3f %5.2f %5.2f %7.0le\n", elapsedSec * 1.0E3, algbw, busbw,
- maxDelta);
-
- if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
- avg_bw += busbw;
- avg_count++;
-
- }
-
- for(int i=0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamDestroy(s[i]));
- }
- free(s);
- free(buffer);
- free(result);
-}
-
-template<typename T>
-void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
- const std::vector<int>& dList) {
- int nDev = 0;
- NCCLCHECK(ncclCommCount(comms[0], &nDev));
- T** buff = (T**)malloc(nDev * sizeof(T*));
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaMalloc(buff + i, N * sizeof(T)));
- }
-
- //for (int root = 1; root < 2; ++root) {
- for (int root = 0; root < nDev; ++root) {
- RunTest<T>(buff, N, type, root, comms, dList);
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaFree(buff[i]));
- }
-
- free(buff);
-}
-
-void usage() {
- printf("Tests nccl Broadcast with user supplied arguments.\n"
- " Usage: broadcast_test <data size in bytes> [number of GPUs] "
- "[GPU 0] [GPU 1] ...\n\n");
-}
-
-int main(int argc, char* argv[]) {
- int nVis = 0;
- CUDACHECK(cudaGetDeviceCount(&nVis));
-
- unsigned long long N = 0;
- if (argc > 1) {
- int t = sscanf(argv[1], "%llu", &N);
- if (t == 0) {
- printf("Error: %s is not an integer!\n\n", argv[1]);
- usage();
- exit(EXIT_FAILURE);
- }
- } else {
- printf("Error: must specify at least data size in bytes!\n\n");
- usage();
- exit(EXIT_FAILURE);
- }
-
- int nDev = nVis;
- if (argc > 2) {
- int t = sscanf(argv[2], "%d", &nDev);
- if (t == 0) {
-      printf("Error: %s is not an integer!\n\n", argv[2]);
- usage();
- exit(EXIT_FAILURE);
- }
- }
- std::vector<int> dList(nDev);
- for (int i = 0; i < nDev; ++i)
- dList[i] = i % nVis;
-
- if (argc > 3) {
- if (argc - 3 != nDev) {
- printf("Error: insufficient number of GPUs in list\n\n");
- usage();
- exit(EXIT_FAILURE);
- }
-
- for (int i = 0; i < nDev; ++i) {
- int t = sscanf(argv[3 + i], "%d", dList.data() + i);
- if (t == 0) {
-        printf("Error: %s is not an integer!\n\n", argv[3 + i]);
- usage();
- exit(EXIT_FAILURE);
- }
- }
- }
-
-  ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
- NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
-
- printf("# Using devices\n");
- for (int g = 0; g < nDev; ++g) {
- int cudaDev;
- int rank;
- cudaDeviceProp prop;
- NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
- NCCLCHECK(ncclCommUserRank(comms[g], &rank));
- CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
- printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
- prop.pciBusID, prop.name);
- }
- printf("\n");
-
- printf("# %10s %12s %6s %4s %7s %5s %5s %7s\n",
- "bytes", "N", "type", "root", "time", "algbw", "busbw", "delta");
-
- RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
- RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
-#ifdef CUDA_HAS_HALF
- RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
-#endif
- RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
- RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
- RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
- RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
-
- printf("\n");
-
- for(int i = 0; i < nDev; ++i)
- ncclCommDestroy(comms[i]);
- free(comms);
-
- char* str = getenv("NCCL_TESTS_MIN_BW");
- double check_avg_bw = str ? atof(str) : -1;
- avg_bw /= avg_count;
-
- printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
- printf(" Avg bus bandwidth : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? "FAILED" : "OK"));
- printf("\n");
- if (errors || avg_bw < check_avg_bw)
- exit(EXIT_FAILURE);
- else
- exit(EXIT_SUCCESS);
-}
-
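
For readers who still need a single-process broadcast harness after this removal, a minimal sketch of the same launch pattern is shown below: one communicator per GPU via ncclCommInitAll, the collective issued on every rank, then a per-stream synchronize. This is illustrative only and not part of this commit; the device count, buffer size and the CUDA_CHECK/NCCL_CHECK macros are placeholder assumptions.

    // Minimal single-process ncclBcast sketch (illustrative, not part of this commit).
    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>
    #include <nccl.h>

    #define CUDA_CHECK(cmd) do { cudaError_t e = (cmd); if (e != cudaSuccess) {        \
        printf("CUDA failure: %s\n", cudaGetErrorString(e)); exit(EXIT_FAILURE); } } while(0)
    #define NCCL_CHECK(cmd) do { ncclResult_t r = (cmd); if (r != ncclSuccess) {        \
        printf("NCCL failure: %s\n", ncclGetErrorString(r)); exit(EXIT_FAILURE); } } while(0)

    int main() {
      const int nDev = 2, root = 0;      // assumed device count and root rank
      const size_t count = 1 << 20;      // elements per buffer (assumption)
      int devs[nDev] = {0, 1};
      float* buff[nDev];
      cudaStream_t streams[nDev];
      ncclComm_t comms[nDev];

      NCCL_CHECK(ncclCommInitAll(comms, nDev, devs));   // one communicator per GPU
      for (int i = 0; i < nDev; ++i) {
        CUDA_CHECK(cudaSetDevice(devs[i]));
        CUDA_CHECK(cudaMalloc(&buff[i], count * sizeof(float)));
        CUDA_CHECK(cudaStreamCreate(&streams[i]));
      }
      // Every rank, the root included, issues the collective on its own stream.
      for (int i = 0; i < nDev; ++i) {
        CUDA_CHECK(cudaSetDevice(devs[i]));
        NCCL_CHECK(ncclBcast(buff[i], count, ncclFloat, root, comms[i], streams[i]));
      }
      for (int i = 0; i < nDev; ++i) {
        CUDA_CHECK(cudaSetDevice(devs[i]));
        CUDA_CHECK(cudaStreamSynchronize(streams[i]));  // wait for the broadcast to finish
      }
      for (int i = 0; i < nDev; ++i) {
        CUDA_CHECK(cudaSetDevice(devs[i]));
        CUDA_CHECK(cudaFree(buff[i]));
        CUDA_CHECK(cudaStreamDestroy(streams[i]));
        ncclCommDestroy(comms[i]);
      }
      return 0;
    }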
diff --git a/test/single/reduce_scan.cu b/test/single/reduce_scan.cu
deleted file mode 100644
index f42643e..0000000
--- a/test/single/reduce_scan.cu
+++ /dev/null
@@ -1,238 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of NVIDIA CORPORATION nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <float.h>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-void showUsage(const char* bin) {
- printf("\n"
- "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
- "Where:\n"
-#ifdef CUDA_HAS_HALF
- " type = [char|int|half|float|double|int64|uint64]\n"
-#else
- " type = [char|int|float|double|int64|uint64]\n"
-#endif
- " op = [sum|prod|max|min]\n"
- " n_min > 0\n"
- " n_max >= n_min\n"
- " delta > 0\n\n", bin);
- return;
-}
-
-int main(int argc, char* argv[]) {
- int nvis = 0;
- CUDACHECK(cudaGetDeviceCount(&nvis));
- if (nvis == 0) {
- printf("No GPUs found\n");
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- ncclDataType_t type;
- ncclRedOp_t op;
- int n_min;
- int n_max;
- int delta;
- int gpus;
- int* list = NULL;
-
- if (argc < 5) {
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- type = strToType(argv[1]);
- if (type == nccl_NUM_TYPES) {
- printf("Invalid <type> '%s'\n", argv[1]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- op = strToOp(argv[2]);
- if (op == nccl_NUM_OPS) {
- printf("Invalid <op> '%s'\n", argv[2]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- n_min = strToPosInt(argv[3]);
- if (n_min < 1) {
- printf("Invalid <n_min> '%s'\n", argv[3]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- n_max = strToPosInt(argv[4]);
- if (n_max < n_min) {
- printf("Invalid <n_max> '%s'\n", argv[4]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- if (argc > 5) {
- delta = strToPosInt(argv[5]);
- if (delta < 1) {
- printf("Invalid <delta> '%s'\n", argv[5]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
- }
-
- if (argc > 6) {
- gpus = strToPosInt(argv[6]);
- if (gpus < 1) {
- printf("Invalid <gpus> '%s'\n", argv[6]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- gpus = nvis;
- }
-
- list = (int*)malloc(gpus*sizeof(int));
-
- if (argc > 7 && argc != 7+gpus) {
- printf("If given, GPU list must be fully specified.\n");
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- for(int g=0; g<gpus; ++g) {
- if(argc > 7) {
- list[g] = strToNonNeg(argv[7+g]);
- if (list[g] < 0) {
- printf("Invalid GPU%d '%s'\n", g, argv[7+g]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- } else if (list[g] >= nvis) {
- printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- list[g] = g % nvis;
- }
- }
-
- size_t word = wordSize(type);
- size_t max_size = n_max * word;
- void* refout;
- CUDACHECK(cudaMallocHost(&refout, max_size));
-
- void** input;
- void* output; // always goes on rank 0
- double* maxError;
- ncclComm_t* comm;
- cudaStream_t* stream;
-
- input = (void**)malloc(gpus*sizeof(void*));
- comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
- stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
-
- for(int g=0; g<gpus; ++g) {
- char busid[32] = {0};
- CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
- printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
-
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamCreate(&stream[g]));
- CUDACHECK(cudaMalloc(&input[g], max_size));
- makeRandom(input[g], n_max, type, 42+g);
-
- if (g == 0) {
- CUDACHECK(cudaMalloc(&output, max_size));
- CUDACHECK(cudaMallocHost(&maxError, sizeof(double)));
- CUDACHECK(cudaMemcpy(refout, input[g], max_size, cudaMemcpyDeviceToHost));
- } else {
- accVec(refout, input[g], n_max, type, op);
- }
- }
-
- NCCLCHECK(ncclCommInitAll(comm, gpus, list));
-
- printf(" BYTES ERROR MSEC BW\n");
-
- for(int n=n_min; n<=n_max; n+=delta) {
- size_t bytes = word * n;
-
- CUDACHECK(cudaSetDevice(list[0]));
- CUDACHECK(cudaMemsetAsync(output, 0, bytes, stream[0]));
- for(int g=0; g<gpus; ++g)
- CUDACHECK(cudaStreamSynchronize(stream[0]));
-
- auto start = std::chrono::high_resolution_clock::now();
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- NCCLCHECK(ncclReduce(input[g], output, n, type, op, 0, comm[g], stream[g]));
- }
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamSynchronize(stream[g]));
- }
- auto stop = std::chrono::high_resolution_clock::now();
- double ms = std::chrono::duration_cast<std::chrono::duration<double>>
- (stop - start).count() * 1000.0;
-
- CUDACHECK(cudaSetDevice(list[0]));
- maxDiff(maxError, output, refout, n, type, stream[0]);
- CUDACHECK(cudaStreamSynchronize(stream[0]));
-
- double mb = (double)bytes * 1.e-6;
- double algbw = mb / ms;
- printf("%12lu %5.0le %10.3lf %6.2lf\n",
- n*word, *maxError, ms, algbw);
- }
-
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamDestroy(stream[g]));
- ncclCommDestroy(comm[g]);
- CUDACHECK(cudaFree(input[g]));
- if(g == 0) {
- CUDACHECK(cudaFree(output));
- CUDACHECK(cudaFreeHost(maxError));
- }
- }
-
- free(input);
- free(comm);
- free(stream);
- CUDACHECK(cudaFreeHost(refout));
- exit(EXIT_SUCCESS);
-}
-
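
The rooted reduction removed above follows the same launch shape; a short illustrative fragment, assuming the per-device comms, streams, devs, sendbuff/recvbuff and count from the previous sketch:

    // Illustrative fragment: reduce every rank's sendbuff into recvbuff on rank `root`.
    // comms, streams, devs, sendbuff, recvbuff, count, nDev and root are assumed as above.
    for (int i = 0; i < nDev; ++i) {
      CUDA_CHECK(cudaSetDevice(devs[i]));
      NCCL_CHECK(ncclReduce(sendbuff[i], recvbuff[i], count, ncclFloat, ncclSum,
                            root, comms[i], streams[i]));
    }
    for (int i = 0; i < nDev; ++i) {
      CUDA_CHECK(cudaSetDevice(devs[i]));
      CUDA_CHECK(cudaStreamSynchronize(streams[i]));
    }
    // Only recvbuff on the root rank is guaranteed to hold the reduced result.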
diff --git a/test/single/reduce_scatter_scan.cu b/test/single/reduce_scatter_scan.cu
deleted file mode 100644
index 8c37508..0000000
--- a/test/single/reduce_scatter_scan.cu
+++ /dev/null
@@ -1,249 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of NVIDIA CORPORATION nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <float.h>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-void showUsage(const char* bin) {
- printf("\n"
- "Usage: %s <type> <op> <n_min> <n_max> [delta] [gpus] [gpu0 [gpu1 [...]]]\n"
- "Where:\n"
-#ifdef CUDA_HAS_HALF
- " type = [char|int|half|float|double|int64|uint64]\n"
-#else
- " type = [char|int|float|double|int64|uint64]\n"
-#endif
- " op = [sum|prod|max|min]\n"
- " n_min > 0\n"
- " n_max >= n_min\n"
- " delta > 0\n\n", bin);
- return;
-}
-
-int main(int argc, char* argv[]) {
- int nvis = 0;
- CUDACHECK(cudaGetDeviceCount(&nvis));
- if (nvis == 0) {
- printf("No GPUs found\n");
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- ncclDataType_t type;
- ncclRedOp_t op;
- int n_min;
- int n_max;
- int delta;
- int gpus;
- int* list = NULL;
-
- if (argc < 5) {
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- type = strToType(argv[1]);
- if (type == nccl_NUM_TYPES) {
- printf("Invalid <type> '%s'\n", argv[1]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- op = strToOp(argv[2]);
- if (op == nccl_NUM_OPS) {
- printf("Invalid <op> '%s'\n", argv[2]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- n_min = strToPosInt(argv[3]);
- if (n_min < 1) {
- printf("Invalid <n_min> '%s'\n", argv[3]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- n_max = strToPosInt(argv[4]);
- if (n_max < n_min) {
- printf("Invalid <n_max> '%s'\n", argv[4]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- if (argc > 5) {
- delta = strToPosInt(argv[5]);
- if (delta < 1) {
- printf("Invalid <delta> '%s'\n", argv[5]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- delta = (n_max == n_min) ? 1 : (n_max - n_min+9) / 10;
- }
-
- if (argc > 6) {
- gpus = strToPosInt(argv[6]);
- if (gpus < 1) {
- printf("Invalid <gpus> '%s'\n", argv[6]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- gpus = nvis;
- }
-
- list = (int*)malloc(gpus*sizeof(int));
-
- if (argc > 7 && argc != 7+gpus) {
- printf("If given, GPU list must be fully specified.\n");
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
-
- for(int g=0; g<gpus; ++g) {
- if(argc > 7) {
- list[g] = strToNonNeg(argv[7+g]);
- if (list[g] < 0) {
- printf("Invalid GPU%d '%s'\n", g, argv[7+g]);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- } else if (list[g] >= nvis) {
- printf("GPU%d (%d) exceeds visible devices (%d)\n", g, list[g], nvis);
- showUsage(argv[0]);
- exit(EXIT_FAILURE);
- }
- } else {
- list[g] = g % nvis;
- }
- }
-
- size_t word = wordSize(type);
- size_t max_output = n_max * word;
- size_t max_input = gpus * max_output;
- void* refout;
- CUDACHECK(cudaMallocHost(&refout, max_input)); // contains entire reduction
-
- void **input, **output;
- double** localError;
- ncclComm_t* comm;
- cudaStream_t* stream;
-
- input = (void**)malloc(gpus*sizeof(void*));
- output = (void**)malloc(gpus*sizeof(void*));
- localError = (double**)malloc(gpus*sizeof(double*));
- comm = (ncclComm_t*)malloc(gpus*sizeof(ncclComm_t));
- stream = (cudaStream_t*)malloc(gpus*sizeof(cudaStream_t));
-
- for(int g=0; g<gpus; ++g) {
- char busid[32] = {0};
- CUDACHECK(cudaDeviceGetPCIBusId(busid, 32, list[g]));
- printf("# Rank %d using device %d [%s]\n", g, list[g], busid);
-
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaMalloc(&input[g], max_input));
- CUDACHECK(cudaMalloc(&output[g], max_output));
- CUDACHECK(cudaMallocHost(&localError[g], sizeof(double)));
- CUDACHECK(cudaStreamCreate(&stream[g]));
- makeRandom(input[g], n_max*gpus, type, 42+g);
-
- if (g == 0)
- CUDACHECK(cudaMemcpy(refout, input[g], max_input, cudaMemcpyDeviceToHost));
- else
- accVec(refout, input[g], n_max*gpus, type, op);
- }
-
- NCCLCHECK(ncclCommInitAll(comm, gpus, list));
-
- printf(" BYTES ERROR MSEC ALGBW BUSBW\n");
-
- for(int n=n_min; n<=n_max; n+=delta) {
- size_t bytes = word * n;
-
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaMemsetAsync(output[g], 0, bytes, stream[g]));
- CUDACHECK(cudaStreamSynchronize(stream[g]));
- }
-
- auto start = std::chrono::high_resolution_clock::now();
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- NCCLCHECK(ncclReduceScatter(input[g], output[g], n, type, op, comm[g], stream[g]));
- }
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamSynchronize(stream[g]));
- }
- auto stop = std::chrono::high_resolution_clock::now();
- double ms = std::chrono::duration_cast<std::chrono::duration<double>>
- (stop - start).count() * 1000.0;
-
- double max_error = 0.0;
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- void* myRef = (void*)((char*)refout + g*bytes);
- maxDiff(localError[g], output[g], myRef, n, type, stream[g]);
- }
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamSynchronize(stream[g]));
- max_error = max(max_error, *localError[g]);
- }
-
- double mb = (double)bytes * 1.e-6;
- double algbw = mb / ms;
- double busbw = algbw * (double)(gpus - 1);
- printf("%12lu %5.0le %10.3lf %6.2lf %6.2lf\n",
- n*word, max_error, ms, algbw, busbw);
- }
-
- for(int g=0; g<gpus; ++g) {
- CUDACHECK(cudaSetDevice(list[g]));
- CUDACHECK(cudaStreamDestroy(stream[g]));
- ncclCommDestroy(comm[g]);
- CUDACHECK(cudaFree(input[g]));
- CUDACHECK(cudaFree(output[g]));
- CUDACHECK(cudaFreeHost(localError[g]));
- }
-
- free(localError);
- free(output);
- free(input);
- free(comm);
- free(stream);
- CUDACHECK(cudaFreeHost(refout));
- exit(EXIT_SUCCESS);
-}
-
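
The ERROR/MSEC/ALGBW/BUSBW columns in this scan test come from plain arithmetic on the measured wall time; restated compactly for clarity, using the same variable names as the deleted code:

    // Bandwidth bookkeeping used by the scan tests above (restated, not new behaviour).
    // bytes : per-rank output size for this step; ms : measured wall time in milliseconds.
    double mb    = (double)bytes * 1.e-6;        // megabytes moved per rank
    double algbw = mb / ms;                      // MB per ms, i.e. GB/s algorithm bandwidth
    double busbw = algbw * (double)(gpus - 1);   // bus-bandwidth factor this test applies to reduce-scatter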
diff --git a/test/single/reduce_scatter_test.cu b/test/single/reduce_scatter_test.cu
deleted file mode 100644
index b702800..0000000
--- a/test/single/reduce_scatter_test.cu
+++ /dev/null
@@ -1,285 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-#include "nccl.h"
-#include "test_utilities.h"
-
-int errors = 0;
-double avg_bw = 0.0;
-int avg_count = 0;
-bool is_reduction = true;
-
-template<typename T>
-void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
- const ncclRedOp_t op, ncclComm_t* const comms, const std::vector<int>& dList) {
- // initialize data
- int nDev = 0;
- NCCLCHECK(ncclCommCount(comms[0], &nDev));
- cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
-
- T* buffer = (T*)malloc(N * nDev * sizeof(T));
- T* result = (T*)malloc(N * nDev * sizeof(T));
- memset(buffer, 0, N * nDev * sizeof(T));
- memset(result, 0, N * nDev * sizeof(T));
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamCreate(s+i));
- CUDACHECK(cudaMemset(recvbuff[i], 0, N * sizeof(T)));
- Randomize(sendbuff[i], N * nDev, i);
-
- if (i == 0) {
- CUDACHECK(cudaMemcpy(result, sendbuff[i], N * nDev * sizeof(T),
- cudaMemcpyDeviceToHost));
- } else {
- Accumulate<T>(result, sendbuff[i], N * nDev, op);
- }
- }
-
- // warm up GPU
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i],
- std::min(N, 1024 * 1024), type, op, comms[i], s[i]));
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
-// for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1)
- {
- int n = N;
- printf("%12i %12i %6s %6s", (int)(n * sizeof(T)), n,
- TypeName(type).c_str(), OperationName(op).c_str());
-
- // do out-of-place reduction first
- auto start = std::chrono::high_resolution_clock::now();
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)recvbuff[i], n, type,
- op, comms[i], s[i]));
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
- auto stop = std::chrono::high_resolution_clock::now();
-
- double elapsedSec =
- std::chrono::duration_cast<std::chrono::duration<double>>(
- stop - start).count();
- double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
- double busbw = algbw * (double)(nDev - 1);
-
- double maxDelta = 0.0;
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- double tmpDelta = CheckDelta<T>(recvbuff[i], result+i*n, n);
- maxDelta = std::max(tmpDelta, maxDelta);
- }
-
- printf(" %7.3f %5.2f %5.2f %7.0le", elapsedSec * 1.0E3, algbw, busbw,
- maxDelta);
-
- if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
- avg_bw += busbw;
- avg_count++;
- }
-
- {
- // now do in-place reduction
- int n = N;
-
- auto start = std::chrono::high_resolution_clock::now();
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclReduceScatter((const void*)sendbuff[i], (void*)sendbuff[i], n, type,
- op, comms[i], s[i]));
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
- auto stop = std::chrono::high_resolution_clock::now();
-
- double elapsedSec =
- std::chrono::duration_cast<std::chrono::duration<double>>(
- stop - start).count();
- double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
- double busbw = algbw * (double)(nDev - 1);
-
- double maxDelta = 0.0;
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- double tmpDelta = CheckDelta<T>(sendbuff[i], result+i*n, n);
- maxDelta = std::max(tmpDelta, maxDelta);
- }
-
- printf(" %7.3f %5.2f %5.2f %7.0le\n", elapsedSec * 1.0E3, algbw, busbw,
- maxDelta);
-
- if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
- avg_bw += busbw;
- avg_count++;
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamDestroy(s[i]));
- }
- free(s);
- free(buffer);
- free(result);
-}
-
-template<typename T>
-void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
- const std::vector<int>& dList) {
- int nDev = 0;
- NCCLCHECK(ncclCommCount(comms[0], &nDev));
- T** sendbuff = (T**)malloc(nDev * sizeof(T*));
- T** recvbuff = (T**)malloc(nDev * sizeof(T*));
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaMalloc(sendbuff + i, N * nDev * sizeof(T)));
- CUDACHECK(cudaMalloc(recvbuff + i, N * sizeof(T)));
- }
-
- for (ncclRedOp_t op : { ncclSum, ncclProd, ncclMax, ncclMin }) {
-// for (ncclRedOp_t op : { ncclSum }) {
- RunTest<T>(sendbuff, recvbuff, N, type, op, comms, dList);
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaFree(sendbuff[i]));
- CUDACHECK(cudaFree(recvbuff[i]));
- }
-
- free(sendbuff);
- free(recvbuff);
-}
-
-void usage() {
- printf("Tests nccl ReduceScatter with user supplied arguments.\n"
-         " Usage: reduce_scatter_test <data size in bytes> [number of GPUs] "
- "[GPU 0] [GPU 1] ...\n\n");
-}
-
-int main(int argc, char* argv[]) {
- int nVis = 0;
- CUDACHECK(cudaGetDeviceCount(&nVis));
-
- int N = 0;
- if (argc > 1) {
- int t = sscanf(argv[1], "%d", &N);
- if (t == 0) {
- printf("Error: %s is not an integer!\n\n", argv[1]);
- usage();
- exit(EXIT_FAILURE);
- }
- } else {
- printf("Error: must specify at least data size in bytes!\n\n");
- usage();
- exit(EXIT_FAILURE);
- }
-
- int nDev = nVis;
- if (argc > 2) {
- int t = sscanf(argv[2], "%d", &nDev);
- if (t == 0) {
-      printf("Error: %s is not an integer!\n\n", argv[2]);
- usage();
- exit(EXIT_FAILURE);
- }
- }
- std::vector<int> dList(nDev);
- for (int i = 0; i < nDev; ++i)
- dList[i] = i % nVis;
-
- if (argc > 3) {
- if (argc - 3 != nDev) {
- printf("Error: insufficient number of GPUs in list\n\n");
- usage();
- exit(EXIT_FAILURE);
- }
-
- for (int i = 0; i < nDev; ++i) {
- int t = sscanf(argv[3 + i], "%d", dList.data() + i);
- if (t == 0) {
-        printf("Error: %s is not an integer!\n\n", argv[3 + i]);
- usage();
- exit(EXIT_FAILURE);
- }
- }
- }
-
- ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
- NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
-
- printf("# Using devices\n");
- for (int g = 0; g < nDev; ++g) {
- int cudaDev;
- int rank;
- cudaDeviceProp prop;
- NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
- NCCLCHECK(ncclCommUserRank(comms[g], &rank));
- CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
- printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
- prop.pciBusID, prop.name);
- }
- printf("\n");
-
- printf("# %10s %12s %6s %6s out-of-place "
- "in-place\n", "", "", "", "");
- printf("# %10s %12s %6s %6s %7s %5s %5s %7s %7s %5s %5s %7s\n",
- "bytes", "N", "type", "op", "time", "algbw", "busbw", "delta", "time",
- "algbw", "busbw", "delta");
-
- RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
- RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
-#ifdef CUDA_HAS_HALF
- RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
-#endif
- RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
- RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
- RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
- RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
-
- printf("\n");
-
- for(int i=0; i<nDev; ++i)
- ncclCommDestroy(comms[i]);
- free(comms);
-
- char* str = getenv("NCCL_TESTS_MIN_BW");
- double check_avg_bw = str ? atof(str) : -1;
- avg_bw /= avg_count;
-
- printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
- printf(" Avg bus bandwidth : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? "FAILED" : "OK"));
- printf("\n");
- if (errors || avg_bw < check_avg_bw)
- exit(EXIT_FAILURE);
- else
- exit(EXIT_SUCCESS);
-}
-
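
Note that the per-device launch loops in these single-process tests predate NCCL 2.x group semantics; when one thread drives several communicators today, the launches typically need to be wrapped in ncclGroupStart()/ncclGroupEnd(). A hedged sketch, reusing the assumed setup from the earlier broadcast example (sendbuff holding nDev * recvcount elements per rank):

    // Sketch: per-device reduce-scatter launches wrapped in an NCCL 2.x group call.
    // comms, streams, devs, sendbuff, recvbuff, recvcount and nDev are assumptions, as before.
    NCCL_CHECK(ncclGroupStart());
    for (int i = 0; i < nDev; ++i) {
      NCCL_CHECK(ncclReduceScatter(sendbuff[i], recvbuff[i], recvcount, ncclFloat,
                                   ncclSum, comms[i], streams[i]));
    }
    NCCL_CHECK(ncclGroupEnd());
    for (int i = 0; i < nDev; ++i) {
      CUDA_CHECK(cudaSetDevice(devs[i]));
      CUDA_CHECK(cudaStreamSynchronize(streams[i]));
    }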
diff --git a/test/single/reduce_test.cu b/test/single/reduce_test.cu
deleted file mode 100644
index 6abb49c..0000000
--- a/test/single/reduce_test.cu
+++ /dev/null
@@ -1,299 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENCE.txt for license information
- ************************************************************************/
-
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-#include "nccl.h"
-#include "test_utilities.h"
-#include <nvToolsExt.h>
-
-int csv = false;
-int errors = 0;
-double avg_bw = 0.0;
-int avg_count = 0;
-bool is_reduction = true;
-
-template<typename T>
-void RunTest(T** sendbuff, T** recvbuff, const int N, const ncclDataType_t type,
- const ncclRedOp_t op, int root, ncclComm_t* const comms,
- const std::vector<int>& dList) {
-
- // initialize data
- T* buffer = (T*)malloc(N * sizeof(T));
- T* result = (T*)malloc(N * sizeof(T));
- memset(buffer, 0, N * sizeof(T));
- memset(result, 0, N * sizeof(T));
-
- int nDev = 0;
- NCCLCHECK(ncclCommCount(comms[0], &nDev));
- cudaStream_t* s = (cudaStream_t*)malloc(sizeof(cudaStream_t)*nDev);
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamCreate(s+i));
- CUDACHECK(cudaMemset(recvbuff[i], 0, N * sizeof(T)));
- Randomize(sendbuff[i], N, i);
- if(i == 0) {
- CUDACHECK(cudaMemcpy(result, sendbuff[i], N*sizeof(T), cudaMemcpyDeviceToHost));
- } else {
- Accumulate<T>(result, sendbuff[i], N, op);
- }
- }
-
- // warm up GPU
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], std::min(N, 1024 * 1024),
- type, op, root, comms[i], s[i]));
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
-// for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1)
- {
- int n = N;
- printf((csv) ? "%i,%i,%s,%s,%d," : "%12i %12i %6s %6s %4d",
- (int) (n * sizeof(T)), n, TypeName(type).c_str(),
- OperationName(op).c_str(), root);
-
- // do out-of-place reduction first
- nvtxRangePushA("out of place");
- auto start = std::chrono::high_resolution_clock::now();
- //for (int i=0; i<100; i++) {
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)recvbuff[i], n, type, op,
- root, comms[i], s[i]));
- }
- //}
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
- auto stop = std::chrono::high_resolution_clock::now();
- nvtxRangePop();
-
- nvtxRangePushA("out of place bookkeeping");
- double elapsedSec =
- std::chrono::duration_cast<std::chrono::duration<double>>(
- stop - start).count(); // / 100.0;
- double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
- double busbw = algbw;
-
- CUDACHECK(cudaSetDevice(dList[root]));
- double maxDelta = CheckDelta<T>(recvbuff[root], result, N);
-
- printf((csv)?"%f,%f,%f,%le,":" %7.3f %5.2f %5.2f %7.0le",
- elapsedSec * 1.0E3, algbw, busbw, maxDelta);
-
- if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
- avg_bw += busbw;
- avg_count++;
-
- nvtxRangePop();
- }
-
-
-// for (int n = 0; n <= N; n = (n > 0) ? n << 1 : 1)
- {
- int n = N;
- // now do in-place reduction
- nvtxRangePushA("in place");
- auto start = std::chrono::high_resolution_clock::now();
- //for (int i=0; i<100; i++) {
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- NCCLCHECK(ncclReduce((const void*)sendbuff[i], (void*)sendbuff[i], n, type, op,
- root, comms[i], s[i]));
- }
- //}
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamSynchronize(s[i]));
- }
-
- auto stop = std::chrono::high_resolution_clock::now();
- nvtxRangePop();
-
- nvtxRangePushA("in place bookkeeping");
- double elapsedSec =
- std::chrono::duration_cast<std::chrono::duration<double>>(
- stop - start).count(); // / 100.0;
- double algbw = (double)(n * sizeof(T)) / 1.0E9 / elapsedSec;
- double busbw = algbw;
-
- CUDACHECK(cudaSetDevice(dList[root]));
- double maxDelta = CheckDelta<T>(sendbuff[root], result, N);
-
-    printf((csv)?"%f,%f,%f,%le\n":" %7.3f %5.2f %5.2f %7.0le\n",
- elapsedSec * 1.0E3, algbw, busbw, maxDelta);
-
- if (maxDelta > deltaMaxValue(type, is_reduction)) errors++;
- avg_bw += busbw;
- avg_count++;
-
- nvtxRangePop();
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaStreamDestroy(s[i]));
- }
- free(s);
- free(buffer);
- free(result);
-}
-
-template<typename T>
-void RunTests(const int N, const ncclDataType_t type, ncclComm_t* const comms,
- const std::vector<int>& dList) {
- int nDev = 0;
- NCCLCHECK(ncclCommCount(comms[0], &nDev));
- T** sendbuff = (T**)malloc(nDev * sizeof(T*));
- T** recvbuff = (T**)malloc(nDev * sizeof(T*));
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaMalloc(sendbuff + i, N * sizeof(T)));
- CUDACHECK(cudaMalloc(recvbuff + i, N * sizeof(T)));
- }
-
- for (ncclRedOp_t op : { ncclSum, ncclProd, ncclMax, ncclMin }) {
-// for (ncclRedOp_t op : { ncclSum }) {
- for(int root=0; root<nDev; ++root) {
- RunTest<T>(sendbuff, recvbuff, N, type, op, root, comms, dList);
- }
- }
-
- for (int i = 0; i < nDev; ++i) {
- CUDACHECK(cudaSetDevice(dList[i]));
- CUDACHECK(cudaFree(sendbuff[i]));
- CUDACHECK(cudaFree(recvbuff[i]));
- }
-
- free(sendbuff);
- free(recvbuff);
-}
-
-void usage() {
- printf("Tests nccl Reduce with user supplied arguments.\n"
- " Usage: reduce_test <data size in bytes> [number of GPUs] "
- "[GPU 0] [GPU 1] ...\n\n");
-}
-
-int main(int argc, char* argv[]) {
- int nVis = 0;
- CUDACHECK(cudaGetDeviceCount(&nVis));
-
- int N = 0;
- if (argc > 1) {
- int t = sscanf(argv[1], "%d", &N);
- if (t == 0) {
- printf("Error: %s is not an integer!\n\n", argv[1]);
- usage();
- exit(EXIT_FAILURE);
- }
- } else {
- printf("Error: must specify at least data size in bytes!\n\n");
- usage();
- exit(EXIT_FAILURE);
- }
-
- int nDev = nVis;
- if (argc > 2) {
- int t = sscanf(argv[2], "%d", &nDev);
- if (t == 0) {
-      printf("Error: %s is not an integer!\n\n", argv[2]);
- usage();
- exit(EXIT_FAILURE);
- }
- }
- std::vector<int> dList(nDev);
- for (int i = 0; i < nDev; ++i)
- dList[i] = i % nVis;
-
- if (argc > 3) {
- if (argc - 3 != nDev) {
- printf("Error: insufficient number of GPUs in list\n\n");
- usage();
- exit(EXIT_FAILURE);
- }
-
- for (int i = 0; i < nDev; ++i) {
- int t = sscanf(argv[3 + i], "%d", dList.data() + i);
- if (t == 0) {
-        printf("Error: %s is not an integer!\n\n", argv[3 + i]);
- usage();
- exit(EXIT_FAILURE);
- }
- }
- }
-
- ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nDev);
- NCCLCHECK(ncclCommInitAll(comms, nDev, dList.data()));
-
- if (!csv) {
- printf("# Using devices\n");
- for (int g = 0; g < nDev; ++g) {
- int cudaDev;
- int rank;
- cudaDeviceProp prop;
- NCCLCHECK(ncclCommCuDevice(comms[g], &cudaDev));
- NCCLCHECK(ncclCommUserRank(comms[g], &rank));
- CUDACHECK(cudaGetDeviceProperties(&prop, cudaDev));
- printf("# Rank %2d uses device %2d [0x%02x] %s\n", rank, cudaDev,
- prop.pciBusID, prop.name);
- }
- printf("\n");
-
- printf("# %10s %12s %6s %6s %4s out-of-place in-place\n", "", "", "", "", "");
- printf("# %10s %12s %6s %6s %4s %7s %5s %5s %7s %7s %5s %5s %7s\n",
- "bytes", "N", "type", "op", "root",
- "time", "algbw", "busbw", "res", "time", "algbw", "busbw", "res");
- }
- else {
- printf("B,N,type,op,root,oop_time,oop_algbw,oop_busbw,oop_res,ip_time,ip_algbw,ip_busbw,ip_res\n");
- }
-
- RunTests<char>(N / sizeof(char), ncclChar, comms, dList);
- RunTests<int>(N / sizeof(int), ncclInt, comms, dList);
-#ifdef CUDA_HAS_HALF
- RunTests<half>(N / sizeof(half), ncclHalf, comms, dList);
-#endif
- RunTests<float>(N / sizeof(float), ncclFloat, comms, dList);
- RunTests<double>(N / sizeof(double), ncclDouble, comms, dList);
- RunTests<long long>(N / sizeof(long long), ncclInt64, comms, dList);
- RunTests<unsigned long long>(N / sizeof(unsigned long long), ncclUint64, comms, dList);
-
- printf("\n");
-
- for(int i = 0; i < nDev; ++i)
- ncclCommDestroy(comms[i]);
- free(comms);
-
- char* str = getenv("NCCL_TESTS_MIN_BW");
- double check_avg_bw = str ? atof(str) : -1;
- avg_bw /= avg_count;
-
- printf(" Out of bounds values : %d %s\n", errors, errors ? "FAILED" : "OK");
- printf(" Avg bus bandwidth : %g %s\n", avg_bw, check_avg_bw == -1 ? "" : (avg_bw < check_avg_bw ? "FAILED" : "OK"));
- printf("\n");
- if (errors || avg_bw < check_avg_bw)
- exit(EXIT_FAILURE);
- else
- exit(EXIT_SUCCESS);
-}
-
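
The *_test.cu drivers removed here (broadcast, reduce-scatter and reduce) all end with the same pass/fail gate driven by the NCCL_TESTS_MIN_BW environment variable; restated in one place as a fragment using the globals defined at the top of each test:

    // Pass/fail gate shared by the deleted *_test.cu drivers (restated).
    const char* str = getenv("NCCL_TESTS_MIN_BW");
    double check_avg_bw = str ? atof(str) : -1;   // -1 disables the bandwidth floor
    avg_bw /= avg_count;                          // mean bus bandwidth over all runs
    // Fail if any result exceeded the per-type delta bound, or if the mean
    // bandwidth fell below the requested floor.
    exit((errors || avg_bw < check_avg_bw) ? EXIT_FAILURE : EXIT_SUCCESS);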