Merge branch 'nccl' into pmaster2

author: Marcin Junczys-Dowmunt <marcinjd@microsoft.com> 2018-12-07 00:22:00 +0300
committer: Marcin Junczys-Dowmunt <marcinjd@microsoft.com> 2018-12-07 00:22:00 +0300
commit: d43ffce6080170e970befe5c565bd9d2c05c4e03 (patch)
tree: 346097ecf1890efedf73ca78779d959b19f72f86
parent: 3efcfdf7f471f80fa73757615116dcc72e0aec05 (diff)
parent: 1b2968c8b9465ce2225f304f5deea7f642f3e533 (diff)
90 files changed, 12583 insertions, 16 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1489f797..c585b9f4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -91,16 +91,6 @@ if(CUDA_FOUND)
     endif(CUDNN_FOUND)
   endif(USE_CUDNN)
 
-  if(USE_NCCL)
-    find_package(NCCL)
-    if(NCCL_FOUND)
-      include_directories(${NCCL_INCLUDE_DIR})
-      set(EXT_LIBS ${EXT_LIBS} ${NCCL_LIBRARIES})
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_NCCL")
-      LIST(APPEND CUDA_NVCC_FLAGS -DUSE_NCCL; )
-    endif(NCCL_FOUND)
-  endif(USE_NCCL)
-
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCUDA_FOUND")
   list(APPEND CUDA_NVCC_FLAGS -DCUDA_FOUND; )
 
@@ -108,6 +98,20 @@ if(CUDA_FOUND)
     list(APPEND CUDA_NVCC_FLAGS -DBOOST_PP_VARIADICS=0; )
   endif()
 
+  if(USE_NCCL)
+    # Build bundled NCCL via its own Makefile; pass CMake's CUDA toolkit, as NCCL defaults CUDA_HOME to /usr/local/cuda.
+    set(NCCL_STATIC "${CMAKE_CURRENT_SOURCE_DIR}/src/3rd_party/nccl/build/lib/libnccl_static.a")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_NCCL")
+    list(APPEND CUDA_NVCC_FLAGS -DUSE_NCCL; )
+    add_custom_command(OUTPUT ${NCCL_STATIC}
+                       COMMAND make "CUDA_HOME=${CUDA_TOOLKIT_ROOT_DIR}"
+                       WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/src/3rd_party/nccl" VERBATIM)
+    add_custom_target(nccl_target DEPENDS ${NCCL_STATIC})
+    add_library(nccl STATIC IMPORTED)
+    set_target_properties(nccl PROPERTIES IMPORTED_LOCATION ${NCCL_STATIC})
+    add_dependencies(nccl nccl_target)
+  endif(USE_NCCL)
+
 else(CUDA_FOUND)
   message(FATAL_ERROR "CUDA has not been found, set -DCOMPILE_CUDA=off to avoid this check and to compile the CPU version only")
 endif(CUDA_FOUND)
diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt
index f3129577..f7eabf54 100644
--- a/src/3rd_party/CMakeLists.txt
+++ b/src/3rd_party/CMakeLists.txt
@@ -31,7 +31,6 @@ if(USE_SENTENCEPIECE)
   endif()
 endif(USE_SENTENCEPIECE)
 
-
 include_directories(./SQLiteCpp/include)
 include_directories(./CLI)
 include_directories(./pathie-cpp/include)
diff --git a/src/3rd_party/nccl/.gitignore b/src/3rd_party/nccl/.gitignore
new file mode 100644
index 00000000..87ad949e
--- /dev/null
+++ b/src/3rd_party/nccl/.gitignore
@@ -0,0 +1,4 @@
+# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
+/build
+*.gcov
+/coverage/
diff --git a/src/3rd_party/nccl/LICENSE.txt b/src/3rd_party/nccl/LICENSE.txt
new file mode 100644
index 00000000..3593a7aa
--- /dev/null
+++ b/src/3rd_party/nccl/LICENSE.txt
@@ -0,0 +1,31 @@
+
+ Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+    Laboratory, the U.S. Department of Energy, nor the names of their
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ The U.S. Department of Energy funded the development of this software
+ under subcontract 7078610 with Lawrence Berkeley National Laboratory.
+
diff --git a/src/3rd_party/nccl/Makefile b/src/3rd_party/nccl/Makefile
new file mode 100644
index 00000000..605e3bfa
--- /dev/null
+++ b/src/3rd_party/nccl/Makefile
@@ -0,0 +1,31 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+.PHONY : all clean
+
+default : src.build
+install : src.install
+BUILDDIR ?= $(abspath ./build)
+ABSBUILDDIR := $(abspath $(BUILDDIR))
+TARGETS := src pkg
+clean: ${TARGETS:%=%.clean}
+test.build: src.build
+LICENSE_FILES := LICENSE.txt
+LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%)
+lic: $(LICENSE_TARGETS)
+
+${BUILDDIR}/%.txt: %.txt
+	@printf "Copying    %-35s > %s\n" $< $@
+	mkdir -p ${BUILDDIR}
+	cp $< $@
+
+src.%:
+	${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR}
+
+pkg.%:
+	${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}
+
+pkg.debian.prep: lic
+pkg.txz.prep: lic
diff --git a/src/3rd_party/nccl/README.md b/src/3rd_party/nccl/README.md
new file mode 100644
index 00000000..fa514532
--- /dev/null
+++ b/src/3rd_party/nccl/README.md
@@ -0,0 +1,92 @@
+# NCCL
+
+Optimized primitives for collective multi-GPU communication.
+
+## Introduction
+
+NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
+
+For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html).
+
+## What's inside
+
+At present, the library implements the following collective operations:
+
+- all-reduce
+- all-gather
+- reduce-scatter
+- reduce
+- broadcast
+
+These operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
+
+## Requirements
+
+NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. For PCIe based platforms, best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported.
+
+## Build
+
+Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds.
+
+To build the library :
+
+```shell
+$ cd nccl
+$ make -j src.build
+```
+
+If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with :
+
+```shell
+$ make src.build CUDA_HOME=<path to cuda install>
+```
+
+NCCL will be compiled and installed in `build/` unless `BUILDDIR` is set.
+
+By default, NCCL is compiled for all supported architectures. To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform :
+```shell
+$ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70"
+```
+
+## Install
+
+To install NCCL on the system, create a package then install it as root.
+
+Debian/Ubuntu :
+```shell
+$ # Install tools to create debian packages
+$ sudo apt install build-essential devscripts debhelper
+$ # Build NCCL deb package
+$ make pkg.debian.build
+$ ls build/pkg/deb/
+```
+
+RedHat/CentOS :
+```shell
+$ # Install tools to create rpm packages
+$ sudo yum install rpm-build rpmdevtools
+$ # Build NCCL rpm package
+$ make pkg.redhat.build
+$ ls build/pkg/rpm/
+```
+
+OS-agnostic tarball :
+```shell
+$ make pkg.txz.build
+$ ls build/pkg/txz/
+```
+
+## Tests
+
+Tests for NCCL are maintained separately at https://github.com/nvidia/nccl-tests.
+
+```shell
+$ git clone https://github.com/NVIDIA/nccl-tests.git
+$ cd nccl-tests
+$ make
+$ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
+```
+
+## Copyright
+
+All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
diff --git a/src/3rd_party/nccl/makefiles/common.mk b/src/3rd_party/nccl/makefiles/common.mk
new file mode 100644
index 00000000..83a2a395
--- /dev/null
+++ b/src/3rd_party/nccl/makefiles/common.mk
@@ -0,0 +1,87 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+CUDA_HOME ?= /usr/local/cuda
+PREFIX ?= /usr/local
+VERBOSE ?= 0
+KEEP ?= 0
+DEBUG ?= 0
+TRACE ?= 0
+PROFAPI ?= 0
+
+NVCC = $(CUDA_HOME)/bin/nvcc
+
+CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
+CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
+CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
+CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
+#$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR})
+
+
+# Better define NVCC_GENCODE in your environment to the minimal set
+# of archs to reduce compile time.
+CUDA8_GENCODE = -gencode=arch=compute_30,code=sm_30 \
+                -gencode=arch=compute_35,code=sm_35 \
+                -gencode=arch=compute_50,code=sm_50 \
+                -gencode=arch=compute_60,code=sm_60 \
+                -gencode=arch=compute_61,code=sm_61
+CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
+
+CUDA8_PTX     = -gencode=arch=compute_61,code=compute_61
+CUDA9_PTX     = -gencode=arch=compute_70,code=compute_70
+
+# Include Volta support if we're using CUDA9 or above
+ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0)
+  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
+else
+  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
+endif
+#$(info NVCC_GENCODE is ${NVCC_GENCODE})
+
+CXXFLAGS   := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
+CXXFLAGS   += -Wall -Wno-sign-compare
+NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
+# NOTE: CUDA_LIB is passed as a single -L path (addprefix is not used here), so it must name exactly one directory
+NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
+
+########## GCOV ##########
+GCOV ?= 0 # disable by default.
+GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # only gcov=1 and debug =1
+CXXFLAGS  += ${GCOV_FLAGS}
+NVCUFLAGS += ${GCOV_FLAGS:%=-Xcompiler %}
+LDFLAGS   += ${GCOV_FLAGS}
+NVLDFLAGS   += ${GCOV_FLAGS:%=-Xcompiler %}
+# $(warning GCOV_FLAGS=${GCOV_FLAGS})
+########## GCOV ##########
+
+ifeq ($(DEBUG), 0)
+NVCUFLAGS += -O3
+CXXFLAGS  += -O3 -g
+else
+NVCUFLAGS += -O0 -G -g
+CXXFLAGS  += -O0 -g -ggdb3
+endif
+
+ifneq ($(VERBOSE), 0)
+NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
+CXXFLAGS  += -Wall -Wextra
+else
+.SILENT:
+endif
+
+ifneq ($(TRACE), 0)
+CXXFLAGS  += -DENABLE_TRACE
+endif
+
+ifneq ($(KEEP), 0)
+NVCUFLAGS += -keep
+endif
+
+ifneq ($(PROFAPI), 0)
+CXXFLAGS += -DPROFAPI
+endif
diff --git a/src/3rd_party/nccl/makefiles/formatting.mk b/src/3rd_party/nccl/makefiles/formatting.mk
new file mode 100644
index 00000000..4a4ab885
--- /dev/null
+++ b/src/3rd_party/nccl/makefiles/formatting.mk
@@ -0,0 +1,33 @@
+#
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting
+# As this file defines a new target (format), it should be included at least after the definition of the
+# default target.
+
+ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none
+ASTYLEDIR := $(BUILDDIR)/contrib
+ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz
+ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle
+ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/
+ASTYLEVER := 3.1
+ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz"
+
+$(ASTYLEDIR) :
+	@mkdir -p $(ASTYLEDIR)
+
+$(ASTYLETAR) : $(ASTYLEDIR)
+	@wget -q -O $(ASTYLETAR) $(ASTYLEURL)
+
+$(ASTYLEBLD) : $(ASTYLETAR)
+	@cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR)
+
+$(ASTYLEBIN) : $(ASTYLEBLD)
+	${MAKE} -C $(ASTYLEBLD)
+
+.PHONY : format
+format : $(ASTYLEBIN)
+	@$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT)
diff --git a/src/3rd_party/nccl/makefiles/version.mk b/src/3rd_party/nccl/makefiles/version.mk
new file mode 100644
index 00000000..f9cee6a5
--- /dev/null
+++ b/src/3rd_party/nccl/makefiles/version.mk
@@ -0,0 +1,6 @@
+##### version
+NCCL_MAJOR   := 2
+NCCL_MINOR   := 3
+NCCL_PATCH   := 7
+NCCL_SUFFIX  :=
+PKG_REVISION := 1
diff --git a/src/3rd_party/nccl/pkg/Makefile b/src/3rd_party/nccl/pkg/Makefile
new file mode 100644
index 00000000..04b23da7
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/Makefile
@@ -0,0 +1,26 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+.PHONY : all clean
+
+default : build
+build : debian.build txz.build
+
+BUILDDIR ?= $(abspath ../build)
+ABSBUILDDIR := $(abspath $(BUILDDIR))
+TARGETS := debian txz
+all:   ${TARGETS:%=%.build}
+prep:  ${TARGETS:%=%.prep}
+build: ${TARGETS:%=%.build}
+clean: ${TARGETS:%=%.clean}
+
+%.prep:
+	${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR}
+
+%.build:
+	${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR}
+
+%.clean:
+	${MAKE} -C $* clean
diff --git a/src/3rd_party/nccl/pkg/debian/.gitignore b/src/3rd_party/nccl/pkg/debian/.gitignore
new file mode 100644
index 00000000..1e97a9fe
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/debian/.gitignore
@@ -0,0 +1,7 @@
+/*.debhelper.log
+/*.debhelper
+/*.substvars
+/tmp/
+/files
+/libnccl1/
+/libnccl-dev/
diff --git a/src/3rd_party/nccl/pkg/debian/Makefile b/src/3rd_party/nccl/pkg/debian/Makefile
new file mode 100644
index 00000000..439635f9
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/debian/Makefile
@@ -0,0 +1,58 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../makefiles/common.mk
+include ../../makefiles/version.mk
+BUILDDIR ?= $(abspath ../../build)
+DEBPREPDIR := $(BUILDDIR)/debian
+PKGDIR  := $(BUILDDIR)/pkg/deb/
+
+DEBGEN_IN  := $(wildcard *.in)
+DEBGEN     := $(DEBGEN_IN:.in=)
+DEBFILES   := compat copyright libnccl-dev.install rules $(DEBGEN)
+DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))
+
+PKG_TIMESTAMP  := $(shell date -R)
+ARCH           := $(shell uname -m)
+PKG_ARCH       ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g")
+PKG_MULTIARCH  ?= $(shell $(CXX) -print-multiarch)
+ifeq ($(PKG_MULTIARCH),)
+# Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it
+PKG_MULTIARCH  := $(ARCH)-linux-gnu
+endif
+
+prep : $(DEBTARGETS)
+	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
+
+build : prep
+	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
+	@printf "Building Debian package\n"
+	(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
+	mkdir -p $(PKGDIR)
+	mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/
+
+clean:
+	rm -Rf $(DEBPREPDIR) $(PKGDIR)
+
+$(DEBPREPDIR)/% : %.in
+	@printf "Generating %-35s > %s\n" $< $@
+	mkdir -p $(DEBPREPDIR)
+	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
+	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
+	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
+	    -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
+	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
+	    -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
+	    $< > $@
+
+$(DEBPREPDIR)/% : %
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(DEBPREPDIR)
+	cp -f $< $@
diff --git a/src/3rd_party/nccl/pkg/debian/changelog.in b/src/3rd_party/nccl/pkg/debian/changelog.in
new file mode 100644
index 00000000..f280ae3f
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/debian/changelog.in
@@ -0,0 +1,5 @@
+nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
+
+  * Automatic Debian package from build
+
+ -- cudatools <cudatools@nvidia.com>  ${pkg:Timestamp}
diff --git a/src/3rd_party/nccl/pkg/debian/compat b/src/3rd_party/nccl/pkg/debian/compat
new file mode 100644
index 00000000..ec635144
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/debian/compat
@@ -0,0 +1 @@
+9
diff --git a/src/3rd_party/nccl/pkg/debian/control.in b/src/3rd_party/nccl/pkg/debian/control.in
new file mode 100644
index 00000000..c8f52660
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/debian/control.in
@@ -0,0 +1,30 @@
+Source: nccl
+Section: libs
+Maintainer: cudatools <cudatools@nvidia.com>
+Priority: optional
+Build-depends: debhelper(>=9)
+Standards-Version: 3.9.5
+
+Package: libnccl${nccl:Major}
+Section: libs
+Architecture: ${pkg:Arch}
+Depends: ${misc:Depends}, ${shlibs:Depends}
+Description: NVIDIA Collectives Communication Library (NCCL) Runtime
+ NCCL (pronounced "Nickel") is a stand-alone library of standard collective
+ communication routines for GPUs, implementing all-reduce, all-gather, reduce,
+ broadcast, and reduce-scatter.
+ It has been optimized to achieve high bandwidth on any platform using PCIe,
+ NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
+ sockets.
+
+Package: libnccl-dev
+Section: libdevel
+Architecture: ${pkg:Arch}
+Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
+Description: NVIDIA Collectives Communication Library (NCCL) Development Files
+ NCCL (pronounced "Nickel") is a stand-alone library of standard collective
+ communication routines for GPUs, implementing all-reduce, all-gather, reduce,
+ broadcast, and reduce-scatter.
+ It has been optimized to achieve high bandwidth on any platform using PCIe,
+ NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
+ sockets.
diff --git a/src/3rd_party/nccl/pkg/debian/copyright b/src/3rd_party/nccl/pkg/debian/copyright
new file mode 120000
index 00000000..1ef648f6
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/debian/copyright
@@ -0,0 +1 @@
+../../LICENSE.txt
+\ No newline at end of file
diff --git a/src/3rd_party/nccl/pkg/debian/gbp.conf b/src/3rd_party/nccl/pkg/debian/gbp.conf
new file mode 100644
index 00000000..9a9d7ca7
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/debian/gbp.conf
@@ -0,0 +1,9 @@
+[DEFAULT]
+debian-branch   = master
+upstream-branch = master
+
+ignore-new = True
+
+[git-buildpackage]
+
+no-purge = True
diff --git a/src/3rd_party/nccl/pkg/debian/libnccl-dev.install.in b/src/3rd_party/nccl/pkg/debian/libnccl-dev.install.in
new file mode 100644
index 00000000..9cedf3e2
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/debian/libnccl-dev.install.in
@@ -0,0 +1,3 @@
+include/nccl.h /usr/include
+lib/libnccl.so /usr/lib/${pkg:MultiArch}
+lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
diff --git a/src/3rd_party/nccl/pkg/debian/libnccl2.install.in b/src/3rd_party/nccl/pkg/debian/libnccl2.install.in
new file mode 100644
index 00000000..7c5367c2
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/debian/libnccl2.install.in
@@ -0,0 +1,2 @@
+lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch}
+lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch}
diff --git a/src/3rd_party/nccl/pkg/debian/rules b/src/3rd_party/nccl/pkg/debian/rules
new file mode 100755
index 00000000..23b90a9e
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/debian/rules
@@ -0,0 +1,13 @@
+#!/usr/bin/make -f
+
+%:
+	dh $@ --parallel
+
+override_dh_auto_install:
+	PREFIX=debian/tmp dh_auto_install
+
+override_dh_auto_test:
+	# Do not make test
+
+override_dh_auto_clean:
+	# Do not make clean
diff --git a/src/3rd_party/nccl/pkg/debian/source/format b/src/3rd_party/nccl/pkg/debian/source/format
new file mode 100644
index 00000000..89ae9db8
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/debian/source/format
@@ -0,0 +1 @@
+3.0 (native)
diff --git a/src/3rd_party/nccl/pkg/redhat/Makefile b/src/3rd_party/nccl/pkg/redhat/Makefile
new file mode 100644
index 00000000..ffcc973b
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/redhat/Makefile
@@ -0,0 +1,62 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../makefiles/common.mk
+include ../../makefiles/version.mk
+BUILDDIR ?= $(abspath ../../build)
+RPMPREPDIR := $(BUILDDIR)/redhat
+PKGDIR  := $(BUILDDIR)/pkg/rpm/
+
+RPMGEN_IN  := $(wildcard *.in)
+RPMGEN     := $(RPMGEN_IN:.in=)
+RPMFILES   := $(RPMGEN)
+RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES))
+
+PKG_TIMESTAMP  := $(shell date -R)
+ARCH           := $(shell uname -m)
+PKG_ARCH       ?= $(shell uname -m)
+PKG_MULTIARCH  ?= $(shell $(CXX) -print-multiarch)
+ifeq ($(PKG_MULTIARCH),)
+# Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it
+PKG_MULTIARCH  := $(ARCH)-linux-gnu
+endif
+
+prep : $(RPMTARGETS)
+	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
+
+build : prep
+	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
+	$(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR)
+	@printf "Building Redhat package\n"
+	mkdir -p $(PKGDIR)
+	rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \
+                 --define "_rpmdir $(PKGDIR)" \
+                 --define "_builddir $(PKGDIR)/build/" \
+                 --define "_buildrootdir $(PKGDIR)/buildroot/" \
+                 -bb $(BUILDDIR)/redhat/nccl.spec
+
+clean:
+	rm -Rf $(RPMPREPDIR) $(PKGDIR)
+
+$(RPMPREPDIR)/% : %.in
+	@printf "Generating %-35s > %s\n" $< $@
+	mkdir -p $(RPMPREPDIR)
+	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
+	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
+	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
+	    -e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
+	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
+	    -e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
+	    $< > $@
+
+$(RPMPREPDIR)/% : %
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(RPMPREPDIR)
+	cp -f $< $@
diff --git a/src/3rd_party/nccl/pkg/redhat/nccl.spec.in b/src/3rd_party/nccl/pkg/redhat/nccl.spec.in
new file mode 100644
index 00000000..65a2c601
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/redhat/nccl.spec.in
@@ -0,0 +1,73 @@
+Name:           libnccl
+Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
+Release:        ${pkg:Revision}
+Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime
+
+Group:          Development/Libraries
+License:        BSD
+URL:            http://developer.nvidia.com/nccl
+Source0:        nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz
+
+%description
+NCCL (pronounced "Nickel") is a stand-alone library of standard collective
+communication routines for GPUs, implementing all-reduce, all-gather, reduce,
+broadcast, and reduce-scatter.
+It has been optimized to achieve high bandwidth on any platform using PCIe,
+NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
+sockets.
+
+%package devel
+Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime
+Group:          Development/Libraries
+%description devel
+NCCL development files
+
+%package static
+Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime
+Group:          Development/Libraries
+%description static
+NCCL static library
+
+%define debug_package %{nil}
+
+%prep
+%setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q
+
+%build
+
+%install
+rm -rf $RPM_BUILD_ROOT
+install -m 755 -d $RPM_BUILD_ROOT
+install -m 755 -d $RPM_BUILD_ROOT/%{_libdir}
+install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}
+ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major}
+
+# devel
+install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
+install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
+ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
+
+# static
+install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir}
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files devel
+%doc LICENSE.txt
+%defattr(-,root,root,-)
+%{_includedir}/nccl.h
+%{_libdir}/libnccl.so
+
+%files static
+%doc LICENSE.txt
+%defattr(-,root,root,-)
+%{_libdir}/libnccl_static.a
+
+%files
+%doc LICENSE.txt
+%defattr(-,root,root,-)
+%{_libdir}/libnccl.so.${nccl:Major}
+%{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch}
+
+%changelog
diff --git a/src/3rd_party/nccl/pkg/srctxz/Makefile b/src/3rd_party/nccl/pkg/srctxz/Makefile
new file mode 100644
index 00000000..1cb7c06a
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/srctxz/Makefile
@@ -0,0 +1,39 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../makefiles/common.mk
+include ../../makefiles/version.mk
+BUILDDIR ?= $(abspath ../../build)
+TXZPREPDIR  := $(BUILDDIR)/srctxz
+PKGDIR  := $(BUILDDIR)/pkg/srctxz/
+
+TXZGEN_IN  := $(wildcard *.in)
+TXZGEN     := $(TXZGEN_IN:.in=)
+TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
+
+PKG_REVISION   ?= 3
+PKG_ARCH       := $(shell uname -m)
+
+prep: $(TXZTARGETS)
+
+build: prep
+	$(MAKE) -C ../../src clean
+	@printf "Building source tar.xz package\n"
+	(cd $(BUILDDIR); bash srctxz/create_srctxz.sh)
+	mkdir -p $(PKGDIR)
+	mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)
+
+clean:
+	rm -Rf $(TXZPREPDIR) $(PKGDIR)
+
+$(TXZPREPDIR)/% : %.in
+	@printf "Generating %-35s > %s\n" $< $@
+	mkdir -p $(TXZPREPDIR)
+	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	    $< > $@
diff --git a/src/3rd_party/nccl/pkg/srctxz/create_srctxz.sh.in b/src/3rd_party/nccl/pkg/srctxz/create_srctxz.sh.in
new file mode 100644
index 00000000..0b8e6d2b
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/srctxz/create_srctxz.sh.in
@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# To run from $BUILDDIR/
+
+cd ..
+NCCLDIR=`basename $PWD`
+
+echo "Checking for unclean directory ..."
+git clean -x -i
+echo "Clean done."
+echo "Checking for uncommited files ..."
+if [ "`git status -s | wc -l`" != "0" ]; then
+  git status -s
+  echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)"
+  read
+fi
+
+cd ..
+NCCL_MAJOR=${nccl:Major}
+NCCL_MINOR=${nccl:Minor}
+NCCL_PATCH=${nccl:Patch}
+NCCL_SUFFIX=${nccl:Suffix}
+
+NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}"
+
+tar --exclude build \
+    --exclude ".git*" \
+    --exclude pkg/srctxz \
+    --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR
diff --git a/src/3rd_party/nccl/pkg/txz/Makefile b/src/3rd_party/nccl/pkg/txz/Makefile
new file mode 100644
index 00000000..fa587ef1
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/txz/Makefile
@@ -0,0 +1,43 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../makefiles/common.mk
+include ../../makefiles/version.mk
+BUILDDIR ?= $(abspath ../../build)
+TXZPREPDIR  := $(BUILDDIR)/txz
+PKGDIR  := $(BUILDDIR)/pkg/txz/
+
+TXZGEN_IN  := $(wildcard *.in)
+TXZGEN     := $(TXZGEN_IN:.in=)
+TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
+
+PKG_ARCH   := $(shell uname -m)
+
+prep: $(TXZTARGETS)
+	$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
+
+build: prep
+	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
+	@printf "Building tar.xz package\n"
+	(cd $(BUILDDIR); bash txz/create_txz.sh)
+	mkdir -p $(PKGDIR)
+	mv $(BUILDDIR)/../nccl*.txz $(PKGDIR)
+
+clean:
+	rm -Rf $(TXZPREPDIR) $(PKGDIR)
+
+$(TXZPREPDIR)/% : %.in
+	@printf "Generating %-35s > %s\n" $< $@
+	mkdir -p $(TXZPREPDIR)
+	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	    -e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
+	    -e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
+	    -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
+	    -e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
+	    $< > $@
diff --git a/src/3rd_party/nccl/pkg/txz/create_txz.sh.in b/src/3rd_party/nccl/pkg/txz/create_txz.sh.in
new file mode 100644
index 00000000..73922e09
--- /dev/null
+++ b/src/3rd_party/nccl/pkg/txz/create_txz.sh.in
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# To run from $BUILDDIR/
+
+BUILDDIR=`basename $PWD`
+
+cd ..
+NCCL_MAJOR=${nccl:Major}
+NCCL_MINOR=${nccl:Minor}
+NCCL_PATCH=${nccl:Patch}
+NCCL_SUFFIX=${nccl:Suffix}
+CUDA_MAJOR=${cuda:Major}
+CUDA_MINOR=${cuda:Minor}
+PKG_REVISION=${pkg:Revision}
+PKG_ARCH=${pkg:Arch}
+
+NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}"
+
+tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt
diff --git a/src/3rd_party/nccl/src/Makefile b/src/3rd_party/nccl/src/Makefile
new file mode 100644
index 00000000..5d304441
--- /dev/null
+++ b/src/3rd_party/nccl/src/Makefile
@@ -0,0 +1,107 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../makefiles/common.mk
+include ../makefiles/version.mk
+
+##### src files
+INCEXPORTS  := nccl.h
+LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
+		misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
+		transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
+                collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
+
+##### lib files
+LIBNAME     := libnccl.so
+STATICLIBNAME := libnccl_static.a
+##### dirs
+BUILDDIR ?= $(abspath ../build)
+INCDIR := $(BUILDDIR)/include
+LIBDIR := $(BUILDDIR)/lib
+OBJDIR := $(BUILDDIR)/obj
+##### target files
+INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
+LIBSONAME  := $(LIBNAME:%=%.$(NCCL_MAJOR))
+LIBTARGET  := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
+STATICLIBTARGET := $(STATICLIBNAME)
+LIBOBJ     := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
+DEPFILES   := $(LIBOBJ:%.o=%.d)
+LDFLAGS    += -L${CUDA_LIB} -lcudart_static -lrt
+
+DEVICELIB  := $(BUILDDIR)/obj/collectives/device/colldevice.a
+
+##### rules; the goals below name actions, not files, so they are declared phony
+.PHONY : build lib staticlib devicelib clean install
+build : lib staticlib
+
+lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
+
+staticlib : $(LIBDIR)/$(STATICLIBTARGET)
+# Device code is built by a sub-make in collectives/device, after nccl.h has been generated.
+devicelib: $(INCDIR)/nccl.h
+	$(MAKE) -C collectives/device
+
+-include $(DEPFILES)
+$(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
+
+$(INCDIR)/nccl.h : nccl.h.in
+# NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z)), built here by concatenating MAJOR, MINOR and the zero-padded two-digit PATCH
+	@$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
+	mkdir -p $(INCDIR)
+	@printf "Generating %-35s > %s\n" $< $@
+	sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+	    -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+	    -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+	    -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	    -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
+	    $< > $@
+
+$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib
+	@printf "Linking    %-35s > %s\n" $(LIBTARGET) $@
+	mkdir -p $(LIBDIR)
+	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
+	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
+	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
+# Static library = host objects + every member of the device archive; start from a fresh archive so no stale or duplicate member survives.
+$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
+	@printf "Archiving  %-35s > %s\n" $(STATICLIBTARGET) $@
+	mkdir -p $(LIBDIR)
+	$(eval TMP := $(shell mktemp -d))
+	rm -f $@
+	cd $(TMP) && ar x $(DEVICELIB) && cd -
+	ar cr $@ $(LIBOBJ) $(TMP)/*.o
+	rm -Rf $(TMP)
+
+$(INCDIR)/%.h : %.h
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(INCDIR)
+	cp -f $< $@
+
+$(OBJDIR)/%.o : %.cu
+	@printf "Compiling  %-35s > %s\n" $< $@
+	mkdir -p `dirname $@`
+	$(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
+	@$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
+	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
+	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
+                sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
+	@rm -f $(@:%.o=%.d.tmp)
+
+clean :
+	rm -rf ${INCDIR} ${LIBDIR} ${OBJDIR}
+	$(MAKE) -C collectives/device clean
+
+install : lib
+	mkdir -p $(PREFIX)/lib
+	mkdir -p $(PREFIX)/include
+	cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
+	cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
+
+FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
+# Note that formatting.mk defines a new target so in order to not overwrite the default target,
+# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
+# as the BUILDDIR variable.
+include ../makefiles/formatting.mk
diff --git a/src/3rd_party/nccl/src/bootstrap.cu b/src/3rd_party/nccl/src/bootstrap.cu
new file mode 100644
index 00000000..8593726b
--- /dev/null
+++ b/src/3rd_party/nccl/src/bootstrap.cu
@@ -0,0 +1,249 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "core.h"
+#include "utils.h"
+#include "bootstrap.h"
+#include "net.h"
+#include <unistd.h>
+#include <sys/types.h>
+
+// Always use sockets for bootstrap
+ncclNet_t* ncclBootstrapNet = &ncclNetSocket;
+
+static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t bootstrapCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
+
+// Additional sync functions based on async + test for bootstrap, using host ptrs.
+static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) {
+  // Blocking send of a host buffer: issue isend, then poll bootstrapTest until the request is done.
+  void* req;
+  NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &req));
+  for (int finished = 0; !finished; ) NCCLCHECK(bootstrapTest(req, &finished, NULL));
+  return ncclSuccess;
+}
+static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) {
+  // Blocking receive into a host buffer: issue irecv, then poll bootstrapTest until the request is done.
+  void* req;
+  NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &req));
+  for (int finished = 0; !finished; ) NCCLCHECK(bootstrapTest(req, &finished, NULL));
+  return ncclSuccess;
+}
+
+struct extId {
+  ncclNetHandle_t extHandleRoot;
+  void* extListenComm;
+  uint64_t hostHash;
+  pid_t pid;
+  int fd;
+  pthread_t boostrapThread;
+};
+
+struct extInfo {
+  int rank;
+  int nranks;
+  ncclNetHandle_t extHandleListenFromRoot;
+  ncclNetHandle_t extHandleRing;
+};
+
+#include <sys/resource.h>
+
+static ncclResult_t setFilesLimit() {
+  struct rlimit nofile;  // raise the soft RLIMIT_NOFILE limit up to the current hard limit
+  SYSCHECK(getrlimit(RLIMIT_NOFILE, &nofile), "getrlimit");
+  nofile.rlim_cur = nofile.rlim_max;
+  SYSCHECK(setrlimit(RLIMIT_NOFILE, &nofile), "setrlimit");
+  return ncclSuccess;
+}
+// Thread body for the bootstrap root: gathers one extInfo from every rank, then sends each rank the ring handle of its successor.
+static void *bootstrapRoot(void* commId) {
+  struct extInfo info;
+  struct extId* id = (struct extId*)commId;
+  ncclNetHandle_t *extHandleBstrap = NULL; // for initial rank <-> root information exchange
+  ncclNetHandle_t *extHandleRing = NULL; // for bootstrap ring creation
+  ncclNetHandle_t zero = { 0 }; // for sanity checking
+  void* tmpComm;
+  ncclResult_t res;
+  setFilesLimit();
+
+  /* Receive addresses from all ranks; info arrives over the socket, so nranks and rank are checked before use */
+  int nranks = 0, c = 0;
+  do {
+    NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpComm), res, out);
+    NCCLCHECKGOTO(bootstrapRecv(tmpComm, &info, sizeof(info)), res, out);
+    NCCLCHECKGOTO(bootstrapCloseRecv(tmpComm), res, out);
+
+    if (c == 0) {
+      extHandleBstrap = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
+      extHandleRing = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
+      if (extHandleBstrap == NULL || extHandleRing == NULL) {
+        WARN("Bootstrap thread : failed to allocate memory");
+        goto out;
+      }
+      nranks = info.nranks;
+    }
+
+    if (nranks != info.nranks) {
+      WARN("Bootstrap Root : mismatch in rank count from procs %d : %d", nranks, info.nranks);
+      goto out;
+    }
+    if (info.rank < 0 || info.rank >= nranks) { WARN("Bootstrap Root : rank %d is outside [0, %d)", info.rank, nranks); goto out; }
+    if (memcmp(&zero, &extHandleBstrap[info.rank], sizeof(ncclNetHandle_t)) != 0) {
+      WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
+      goto out;
+    }
+
+    // Save the connection handle for connecting back to the ranks
+    memcpy(&extHandleBstrap[info.rank], info.extHandleListenFromRoot, sizeof(ncclNetHandle_t));
+    // Save the connection handle for the AllGather ring
+    memcpy(&extHandleRing[info.rank], info.extHandleRing, sizeof(ncclNetHandle_t));
+
+    ++c;
+  } while (c < nranks);
+
+  // Send the connect handle for the next rank in the AllGather ring
+  for (int r=0; r<nranks; ++r) {
+    int next = (r+1) % nranks;
+    void *tmpSendComm;
+    NCCLCHECKGOTO(bootstrapConnect(0, extHandleBstrap[r], &tmpSendComm), res, out);
+    NCCLCHECKGOTO(bootstrapSend(tmpSendComm, &extHandleRing[next], sizeof(ncclNetHandle_t)), res, out);
+    NCCLCHECKGOTO(bootstrapCloseSend(tmpSendComm), res, out);
+  }
+
+out:
+  bootstrapCloseListen(id->extListenComm);
+  free(commId);
+  free(extHandleBstrap); // free(NULL) is a no-op, so exits taken before allocation are fine
+  free(extHandleRing);
+  return NULL;
+}
+// Opens the root listen socket, records its handle inside the unique id, and starts the bootstrapRoot thread on a heap copy of that id.
+ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
+  struct extId* id = (struct extId*)commId; // the opaque unique id is used as storage for an extId
+  id->hostHash = getHostHash();
+  NCCLCHECK(bootstrapListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
+  ncclUniqueId* threadIdCopy;
+  NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
+  memcpy(threadIdCopy, id, sizeof(ncclUniqueId)); // the thread owns this copy: bootstrapRoot ends with free(commId)
+  pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy); // NOTE(review): return value unchecked; thread is not joined or detached in this function
+  return ncclSuccess;
+}
+
+ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
+  static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
+  extId* rootId = (extId*)out;
+  // Without NCCL_COMM_ID this process records its own pid and creates the root itself.
+  char* commIdEnv = getenv("NCCL_COMM_ID");
+  if (commIdEnv == NULL) {
+    rootId->pid = getpid();
+    NCCLCHECK(bootstrapCreateRoot(out, false));
+    return ncclSuccess;
+  }
+  // Otherwise the id only carries the address parsed from the environment, and pid is set to -1.
+  if (ncclSocketCreateHandle(&rootId->extHandleRoot, commIdEnv) != 0) {
+    WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
+    return ncclInvalidArgument;
+  }
+  rootId->pid = -1;
+  return ncclSuccess;
+}
+
+struct extState {
+  void* extBstrapRingRecvComm;
+  void* extBstrapRingSendComm;
+  ncclNetHandle_t extBstrapRootHandle;
+  int rank;
+  int nranks;
+  int dev;
+};
+// Per-rank bootstrap setup: opens two listen sockets, reports them to the root, then connects this rank into the AllGather ring.
+ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
+  struct extId* id = (struct extId*)commId;
+  bool idFromEnv = id->pid < 0; // bootstrapGetUniqueId stores pid -1 when the id came from NCCL_COMM_ID
+  struct extState* state;
+  NCCLCHECK(ncclCalloc(&state, 1));
+  state->rank = rank;
+  state->nranks = nranks;
+  *commState = state; // NOTE(review): if a later NCCLCHECK bails out (presumably an early return), state and the listen comms are not released here
+  void* extBstrapRootListenComm; // comm on which we accept root's connections
+
+  struct extInfo info = { 0 };
+  info.rank = rank;
+  info.nranks = nranks;
+  void *tmpSendComm, *extBstrapRingListenComm, *tmpRecvComm;
+  // Pass the remote address to listen via info
+  if (idFromEnv) {
+    memcpy(&info.extHandleListenFromRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+    memcpy(&info.extHandleRing, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+  }
+  // listen will return the local address via info (specify interface type 'findSubnetIf')
+  state->dev = idFromEnv ? findSubnetIf : 0;
+  NCCLCHECK(bootstrapListen(state->dev, &info.extHandleListenFromRoot, &extBstrapRootListenComm));
+  NCCLCHECK(bootstrapListen(state->dev, &info.extHandleRing, &extBstrapRingListenComm)); // AllGather Ring
+
+  memcpy(&state->extBstrapRootHandle, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+  // send info on my listening sockets to root
+  NCCLCHECK(bootstrapConnect(state->dev, id->extHandleRoot, &tmpSendComm));
+  NCCLCHECK(bootstrapSend(tmpSendComm, &info, sizeof(info)));
+  NCCLCHECK(bootstrapCloseSend(tmpSendComm));
+
+  // get info on my "next" rank in the bootstrap ring from root
+  ncclNetHandle_t extHandleNext;
+  NCCLCHECK(bootstrapAccept(extBstrapRootListenComm, &tmpRecvComm));
+  NCCLCHECK(bootstrapRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
+  NCCLCHECK(bootstrapCloseRecv(tmpRecvComm));
+
+  NCCLCHECK(bootstrapConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
+  // Accept the connect request from the previous rank in the AllGather ring
+  NCCLCHECK(bootstrapAccept(extBstrapRingListenComm, &state->extBstrapRingRecvComm));
+  NCCLCHECK(bootstrapCloseListen(extBstrapRingListenComm)); // both listen comms are only needed during setup
+  NCCLCHECK(bootstrapCloseListen(extBstrapRootListenComm));
+
+  return ncclSuccess;
+}
+// Ring AllGather over the bootstrap sockets: allData is treated as nranks consecutive slices of `size` bytes, indexed by rank.
+ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
+  struct extState* state = (struct extState*)commState;
+  char* data = (char*)allData; // byte pointer so slice offsets can be computed as index*size
+  int rank = state->rank;
+  int nranks = state->nranks;
+
+  TRACE(INIT, "rank %d nranks %d size %d", rank, nranks, size);
+
+  /* Simple ring based AllGather
+   * At each step i receive data from (rank-i-1) from left
+   * and send previous step's data from (rank-i) to right
+   */
+  for (int i=0; i<nranks-1; i++) {
+    int rslice = (rank - i - 1 + nranks) % nranks; // "+ nranks" keeps the operand of % non-negative
+    int sslice = (rank - i + nranks) % nranks; // at i == 0 this is the caller's own slice (presumably filled in by the caller)
+
+    // Send slice to the right
+    NCCLCHECK(bootstrapSend(state->extBstrapRingSendComm, data+sslice*size, size));
+    // Recv slice from the left
+    NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
+  }
+
+  TRACE(INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
+  return ncclSuccess;
+}
+// Closes both bootstrap ring connections and frees the state object passed in as commState.
+ncclResult_t bootstrapClose(void* commState) {
+  struct extState* state = (struct extState*)commState;
+  // Close both ends and free the state even if the first close fails, so an error does not leak the rest.
+  ncclResult_t sendRes = bootstrapCloseSend(state->extBstrapRingSendComm);
+  ncclResult_t recvRes = bootstrapCloseRecv(state->extBstrapRingRecvComm);
+
+  free(state);
+  // Report the first failure, if any.
+  return (sendRes != ncclSuccess) ? sendRes : recvRes;
+}
diff --git a/src/3rd_party/nccl/src/collectives/all_gather.cu b/src/3rd_party/nccl/src/collectives/all_gather.cu
new file mode 100644
index 00000000..e19feff0
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/all_gather.cu
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "common_coll.h"
+#include "enqueue.h"
+#include "collectives.h"
+
+ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  const size_t sendBytes = count*ncclTypeSize(datatype);
+  INFO(COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  if (comm->nRanks != 1) {
+    // Multi-rank path: the payload is handed on as a byte count typed ncclInt8; the total is sendBytes per rank.
+    NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, sendBytes*comm->nRanks, proxyPatternRing, comm));
+    NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, sendBytes, ncclInt8, op, root, comm, stream, sendBytes*comm->nRanks, 1));
+  } else if (sendbuff != recvbuff) {
+    CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, sendBytes, cudaMemcpyDeviceToDevice, stream));
+  }
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
+  return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype,
+          ncclSum, 0, comm, stream);
+}
diff --git a/src/3rd_party/nccl/src/collectives/all_reduce.cu b/src/3rd_party/nccl/src/collectives/all_reduce.cu
new file mode 100644
index 00000000..77ae4c84
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/all_reduce.cu
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "common_coll.h"
+#include "enqueue.h"
+#include "collectives.h"
+
+ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  const size_t totalBytes = count*ncclTypeSize(datatype);
+  INFO(COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  if (comm->nRanks != 1) {
+    // Multi-rank path: unlike AllGather/Broadcast, the element count and real datatype are passed to saveKernel.
+    NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, totalBytes, proxyPatternRing, comm));
+    NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, totalBytes, comm->nRanks));
+  } else if (sendbuff != recvbuff) {
+    CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, totalBytes, cudaMemcpyDeviceToDevice, stream));
+  }
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
+  return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype,
+          op, 0, comm, stream);
+}
diff --git a/src/3rd_party/nccl/src/collectives/broadcast.cu b/src/3rd_party/nccl/src/collectives/broadcast.cu
new file mode 100644
index 00000000..0e5ec7b1
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/broadcast.cu
@@ -0,0 +1,42 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "common_coll.h"
+#include "enqueue.h"
+#include "collectives.h"
+
+ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  const size_t totalBytes = count*ncclTypeSize(datatype);
+  INFO(COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  if (comm->nRanks != 1) {
+    // Multi-rank path: the payload is handed on as a byte count typed ncclInt8, with the proxy pattern derived from root.
+    NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, totalBytes, proxyPatternFrom(root), comm));
+    NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, totalBytes, ncclInt8, op, root, comm, stream, totalBytes, 1));
+  } else if (sendbuff != recvbuff) {
+    CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, totalBytes, cudaMemcpyDeviceToDevice, stream));
+  }
+
+  return ncclSuccess;
+}
+
+/* Deprecated original "in place" function, similar to MPI */
+NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream) {
+  return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype,
+          ncclSum, root, comm, stream);
+}
+
+NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream) {
+  return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype,
+          ncclSum, root, comm, stream);
+}
diff --git a/src/3rd_party/nccl/src/collectives/collectives.h b/src/3rd_party/nccl/src/collectives/collectives.h
new file mode 100644
index 00000000..4a5cb7a9
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/collectives.h
@@ -0,0 +1,66 @@
+/*************************************************************************
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_COLLECTIVES_H_
+#define NCCL_COLLECTIVES_H_
+
+typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
+// Flattened index over (collective, reduction op, data type, ll); ll varies fastest. Arguments are parenthesised so expressions are safe.
+#define FUNC_INDEX(coll, redop, dtype, ll) (((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(ll))
+
+#define NCCL_COLL_NAME(coll, op, dtype) \
+  coll##_##op##_##dtype
+
+#define NCCL_KERN_NAME(coll, op, dtype) \
+  coll##Kernel_##op##_##dtype
+
+/* Declare all collective operations */
+#define DECL_COLL4(coll, op, dtype) \
+  extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
+  extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll); \
+
+#define DECL_COLL3(coll, op, dtype) \
+  DECL_COLL4(coll##LL, op, dtype) \
+  DECL_COLL4(coll, op, dtype)
+
+#define DECL_COLL2(coll, op) \
+  DECL_COLL3(coll, op, i8) \
+  DECL_COLL3(coll, op, u8) \
+  DECL_COLL3(coll, op, i32) \
+  DECL_COLL3(coll, op, u32) \
+  DECL_COLL3(coll, op, i64) \
+  DECL_COLL3(coll, op, u64) \
+  DECL_COLL3(coll, op, f16) \
+  DECL_COLL3(coll, op, f32) \
+  DECL_COLL3(coll, op, f64)
+
+#define DECL_COLL(coll) \
+  DECL_COLL2(coll, sum) \
+  DECL_COLL2(coll, prod) \
+  DECL_COLL2(coll, min) \
+  DECL_COLL2(coll, max)
+
+#define DECL_ALL_COLLS \
+  DECL_COLL2(ncclBroadcast, copy) \
+  DECL_COLL(ncclReduce) \
+  DECL_COLL2(ncclAllGather, copy) \
+  DECL_COLL(ncclReduceScatter) \
+  DECL_COLL(ncclAllReduce) \
+
+DECL_ALL_COLLS
+
+#define ALLREDUCE_SUBSTEPS 2
+#define ALLREDUCE_BUFCHUNKS 2
+#define ALLGATHER_SUBSTEPS 2
+#define ALLGATHER_BUFCHUNKS 2
+#define REDUCESCATTER_SUBSTEPS 2
+#define REDUCESCATTER_BUFCHUNKS 2
+#define BROADCAST_SUBSTEPS 8
+#define BROADCAST_BUFCHUNKS 2
+#define REDUCE_SUBSTEPS 8
+#define REDUCE_BUFCHUNKS 2
+
+#endif
diff --git a/src/3rd_party/nccl/src/collectives/device/Makefile b/src/3rd_party/nccl/src/collectives/device/Makefile
new file mode 100644
index 00000000..e2bcd490
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/Makefile
@@ -0,0 +1,86 @@
+#
+# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+include ../../../makefiles/common.mk
+include ../../../makefiles/version.mk
+
+BUILDDIR ?= $(abspath ../../../build)
+OBJDIR := $(BUILDDIR)/obj/collectives/device
+
+LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
+
+LIBOBJ     := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \
+              $(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \
+              $(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \
+              $(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \
+              $(OBJDIR)/functions.o
+
+LIBSRCFILES += functions.cu
+
+DEPFILES   := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
+DEPENDFILES := $(DEPFILES:%.d=%.dep)
+STATICLIB  := $(OBJDIR)/colldevice.a
+DEVOBJ     := $(OBJDIR)/devlink.o
+
+NVCUFLAGS  += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden"
+
+
+all: $(STATICLIB)
+
+# Dummy rule so that the extra dependency (%.dep) files are preserved by make
+all_deps: $(DEPENDFILES)
+
+-include $(DEPFILES)
+
+$(STATICLIB): $(LIBOBJ) $(DEVOBJ)
+	@printf "Archiving  %-35s > %s\n" objects $@
+	ar cr $@ $^
+
+# We do not want make to build *.d when running make clean.
+# So we only provide targets for .dep which will produce .dep and .d,
+# with only .d being included, and .dep keeping track of what needs to
+# be regenerated.
+$(OBJDIR)/%.dep : %.cu
+	@mkdir -p $(OBJDIR)
+	@$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp
+	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@
+	@sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \
+                sed -e 's/^ *//' -e 's/$$/:/' >> $@
+	@rm -f $@.tmp
+	@cp $@ $(@:.dep=.d)
+
+# Compiled kernels and collectives with relocatable device code ...
+$(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
+	@printf "Compiling  %-35s > %s\n" $< $@
+	mkdir -p `dirname $@`
+	$(NVCC) $(NVCUFLAGS) -dc $< -o $@
+
+$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep
+	@printf "Compiling  %-35s > %s\n" $< $@
+	mkdir -p `dirname $@`
+	$(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@
+
+$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep
+	@printf "Compiling  %-35s > %s\n" $< $@
+	mkdir -p `dirname $@`
+	$(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@
+
+$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep
+	@printf "Compiling  %-35s > %s\n" $< $@
+	mkdir -p `dirname $@`
+	$(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@
+
+$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep
+	@printf "Compiling  %-35s > %s\n" $< $@
+	mkdir -p `dirname $@`
+	$(NVCC) -DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@
+
+# ... and create the device-side linked object with all those.
+$(DEVOBJ) : $(LIBOBJ)
+	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
+
+clean:
+	rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(STATICLIB) test
diff --git a/src/3rd_party/nccl/src/collectives/device/all_gather.cu b/src/3rd_party/nccl/src/collectives/device/all_gather.cu
new file mode 100644
index 00000000..0f572ce7
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/all_gather.cu
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "common.h"
+#include "all_gather.h"
+#include "collectives.h"
+
+#define UNROLL 4
+
+#if NCCL_OP == 0
+IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
+#endif
diff --git a/src/3rd_party/nccl/src/collectives/device/all_gather.h b/src/3rd_party/nccl/src/collectives/device/all_gather.h
new file mode 100644
index 00000000..a30e5755
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/all_gather.h
@@ -0,0 +1,269 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "primitives.h"
+#include "collectives.h"
+
+// Increase Step and poffset/noffset for buffer sync
+#define NEXT_STEP \
+  step++; \
+  poffset = noffset; \
+  noffset += sliceSize; \
+  if (noffset == buffSize) noffset = 0;
+
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int nthreads = blockDim.x - 1;
+  const int bid = args->bid;
+  __shared__ T* sharedNextOutput;
+  struct ncclComm* comm = args->comm;
+  struct ncclRing* ring = comm->rings+blockIdx.x;
+  int prevdirect = ring->recv.conn.direct;
+  int nextdirect = ring->send.conn.direct;
+
+  WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
+  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
+  PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
+
+  typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
+
+  const ssize_t size = args->N;
+  const int nranks = comm->nRanks;
+  const int buffSize = ring->buffSize / sizeof(T);
+  const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS;
+  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+
+  if (tid == 0) {
+    // Update in case we skipped some collectives
+    *ring->recv.conn.opCount = args->opCount;
+    // Wait for next to be ready
+    WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
+    waitOpCountNext.wait(args->opCount);
+    if (prevdirect) {
+      *ring->recv.conn.ptrExchange = args->ThisOutput;
+    }
+    if (nextdirect) {
+      void* volatile* ptr = &(ring->devMemSend->ptrExchange);
+      while (*ptr == nullptr);
+      sharedNextOutput = (T*)*ptr;
+      *ptr = nullptr;
+    }
+  }
+  __syncthreads();
+
+  uint64_t step = 0ULL;
+  int poffset, noffset = 0;
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
+  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
+    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+    /////////////// begin AllGather steps ///////////////
+    ssize_t offset;
+    int maxOffset = min(chunkSize, size-chunkOffset);
+    int rankDest;
+
+    // step 0: push data to next GPU
+    rankDest = ring->devUserRanks[0];
+    offset = chunkOffset + rankDest * size;
+
+    if (thisInput + chunkOffset == thisOutput + offset) { // In place
+      Prims::Copy(tid, nthreads,
+          thisInput  + chunkOffset,
+          nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
+          sliceSize, maxOffset,
+          step,
+          waitDoneFromNext,
+          postReadyToNext);
+    } else {
+      Prims::DoubleCopy(tid, nthreads,
+          thisInput  + chunkOffset,
+          thisOutput + offset,
+          nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
+          sliceSize, maxOffset,
+          step,
+          waitDoneFromNext,
+          postReadyToNext);
+    }
+
+    NEXT_STEP; // Increases step, poffset, noffset
+
+    // k-2 steps: copy to next GPU
+    if (prevdirect) {
+      for (int j=1; j<nranks-1; ++j) {
+        rankDest = ring->devUserRanks[nranks-j];
+        offset = chunkOffset + rankDest * size;
+
+        Prims::Copy(tid, nthreads,
+            thisOutput + offset,
+            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
+            sliceSize, maxOffset,
+            step,
+            waitDoneFromNext, waitReadyFromPrev,
+            postReadyToNext, postDoneToPrev);
+
+        NEXT_STEP;
+      }
+      Prims::Copy(tid, nthreads,
+          NULL,
+          NULL,
+          0, 0,
+          step,
+          waitReadyFromPrev,
+          postDoneToPrev);
+    } else {
+      for (int j=1; j<nranks-1; ++j) {
+        rankDest = ring->devUserRanks[nranks-j];
+        offset = chunkOffset + rankDest * size;
+
+        Prims::DoubleCopy(tid, nthreads,
+            prevInput + poffset,
+            thisOutput + offset,
+            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
+            sliceSize, maxOffset,
+            step,
+            waitDoneFromNext, waitReadyFromPrev,
+            postReadyToNext, postDoneToPrev);
+
+        NEXT_STEP;
+      }
+
+      // Make final copy from buffer to dest.
+      rankDest = ring->devUserRanks[1];
+      offset = chunkOffset + rankDest * size;
+
+      // Here we need to copy from buffer to this output.
+      Prims::Copy(tid, nthreads,
+          prevInput + poffset,
+          thisOutput + offset,
+          sliceSize, maxOffset,
+          step,
+          waitReadyFromPrev,
+          postDoneToPrev);
+    }
+  }
+
+  if (tid == 0) {
+    waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS));
+    *ring->send.conn.head = 0ULL;
+    *ring->recv.conn.tail = 0ULL;
+    __threadfence_system();
+    *ring->recv.conn.opCount = args->opCount+1;
+  }
+}
+
+#include "ll_kernel.h"
+
+#define NEXT_STEP_LL \
+  poffset = noffset; \
+  pflag = nflag; \
+  noffset += NCCL_LL_SLICE_LINES; \
+  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
+  nflag++; \
+  step++;
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int bid = args->bid;
+  const int llNthreads = args->nThreads;
+  struct ncclComm* comm = args->comm;
+  struct ncclRing* ring = comm->rings+blockIdx.x;
+  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
+  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
+  volatile int * sizesFifo = ring->send.conn.llFifo;
+  uint64_t sendHead = sendHeadPtr[0];
+
+  typedef LLPrimitives<T, FUNC> LL;
+
+  const ssize_t size = args->N;
+  //const int rank = comm->rank;
+  const int nranks = comm->nRanks;
+  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+  const ssize_t loopSize = args->nRings*chunkSize;
+
+  uint64_t step = ring->send.conn.llStep;
+  uint32_t pflag, nflag = step + 1;
+  int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
+  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    if (size-gridOffset < loopSize) {
+      chunkSize = args->lastChunkSize;
+    }
+    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+    /////////////// begin AllGather steps ///////////////
+    ssize_t offset;
+    int maxOffset = min(chunkSize, size-chunkOffset);
+    int rankDest;
+
+    // step 0: push data to next GPU
+    rankDest = ring->devUserRanks[0];
+    offset = chunkOffset + rankDest * size;
+
+    WAIT_NEXT;
+    if (thisInput + chunkOffset == thisOutput + offset) { // In place
+      LL::ReduceCopy(
+          thisInput  + chunkOffset,
+          nextOutput + noffset,
+          maxOffset, nflag, llNthreads);
+    } else {
+      LL::ReduceCopy(
+          thisInput  + chunkOffset,
+          thisOutput + offset,
+          nextOutput + noffset,
+          maxOffset, nflag, llNthreads);
+    }
+    POST_SIZE;
+
+    NEXT_STEP_LL;
+
+    // k-2 steps: copy to next GPU
+    for (int j=1; j<nranks-1; ++j) {
+      rankDest = ring->devUserRanks[nranks-j];
+      offset = chunkOffset + rankDest * size;
+
+      WAIT_NEXT;
+      LL::ReduceCopy(
+          prevInput  + poffset,
+          thisOutput + offset,
+          nextOutput + noffset,
+          maxOffset, pflag, nflag, llNthreads);
+      POST_SIZE;
+      ACK_PREV;
+
+      NEXT_STEP_LL;
+    }
+
+    // step k-1: final store
+    rankDest = ring->devUserRanks[1];
+    offset = chunkOffset + rankDest * size;
+
+    LL::ReduceCopy(
+        prevInput  + poffset,
+        thisOutput + offset,
+        maxOffset, pflag, llNthreads);
+    ACK_PREV;
+  }
+
+  FIFO_CLEANING_AND_SAVE_STEP(nflag);
+}
diff --git a/src/3rd_party/nccl/src/collectives/device/all_reduce.cu b/src/3rd_party/nccl/src/collectives/device/all_reduce.cu
new file mode 100644
index 00000000..caa1479c
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/all_reduce.cu
@@ -0,0 +1,21 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "common.h"
+#include "all_reduce.h"
+#include "collectives.h"
+
+#define UNROLL 4 // forwarded as the UNROLL template argument by the IMPL_COLL4* macros in common.h
+
+#if NCCL_OP == 0 // one reduction operator is instantiated per value of NCCL_OP (NOTE(review): presumably set by the build; an undefined NCCL_OP evaluates as 0 here)
+IMPL_COLL2(ncclAllReduce, sum,  FuncSum,  ncclCollAllReduce, ncclSum);
+#elif NCCL_OP == 1
+IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
+#elif NCCL_OP == 2
+IMPL_COLL2(ncclAllReduce, min,  FuncMin,  ncclCollAllReduce, ncclMin);
+#elif NCCL_OP == 3
+IMPL_COLL2(ncclAllReduce, max,  FuncMax,  ncclCollAllReduce, ncclMax);
+#endif // NCCL_OP
diff --git a/src/3rd_party/nccl/src/collectives/device/all_reduce.h b/src/3rd_party/nccl/src/collectives/device/all_reduce.h
new file mode 100644
index 00000000..d7abc644
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/all_reduce.h
@@ -0,0 +1,332 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "primitives.h"
+#include "collectives.h"
+
+// Increase step and rotate poffset/noffset by one slice for buffer sync (noffset wraps at buffSize).
+#define NEXT_STEP do { /* one statement, so it is safe inside an unbraced if/else */ \
+  step++; \
+  poffset = noffset; \
+  noffset += sliceSize; \
+  if (noffset == buffSize) noffset = 0; } while (0)
+
+template<int UNROLL, class FUNC, typename T> // AllReduce over one ring: element type T, reduction functor FUNC
+__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int nthreads = blockDim.x - 1; // worker count leaves out one thread of the block (NOTE(review): presumably reserved by Primitives for flag handling - confirm in primitives.h)
+  const int bid = args->bid;
+  __shared__ T* sharedNextOutput; // written by tid 0 below, read by every thread after __syncthreads()
+  struct ncclComm* comm = args->comm;
+  struct ncclRing* ring = comm->rings+blockIdx.x; // each thread block works on the ring selected by its block index
+  int prevdirect = ring->recv.conn.direct;
+  int nextdirect = ring->send.conn.direct;
+
+  WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
+  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
+  PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
+
+  typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
+
+  const ssize_t size = args->N; // upper bound for element offsets into thisInput/thisOutput
+  //const int rank = comm->rank;
+  const int nranks = comm->nRanks;
+  const int buffSize = ring->buffSize / sizeof(T); // ring buffer capacity in elements of T
+  const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS; // elements of T per buffer slice
+  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+
+  if (tid == 0) {
+    // Update in case we skipped some collectives
+    *ring->recv.conn.opCount = args->opCount;
+    // Wait for next to be ready
+    WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
+    waitOpCountNext.wait(args->opCount);
+    if (prevdirect) {
+      *ring->recv.conn.ptrExchange = args->ThisOutput; // hand our output pointer over through the recv connection
+    }
+    if (nextdirect) {
+      void* volatile* ptr = &(ring->devMemSend->ptrExchange);
+      while (*ptr == nullptr); // spin until a pointer has been posted
+      sharedNextOutput = (T*)*ptr;
+      *ptr = nullptr; // clear the exchange slot again
+    }
+  }
+  __syncthreads();
+
+  uint64_t step = 0ULL;
+  int poffset, noffset = 0; // poffset is first assigned by NEXT_STEP, before it is read
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
+  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) { // one pass covers nranks*loopSize elements
+    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings));
+    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); // NOTE(review): ALIGN_SIZE is defined elsewhere; presumably rounds chunkSize up to a multiple of the second argument
+    ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
+
+    /////////////// begin AllReduce steps ///////////////
+    ssize_t offset;
+    int maxOffset;
+    int slice;
+
+    // step 0: push data to next GPU
+    slice = ring->devUserRanks[nranks-1];
+    offset = chunkOffset + slice * chunkSize;
+    maxOffset = min(chunkSize, size-offset); // number of valid elements left in this slice
+
+    Prims::Copy(tid, nthreads,
+        thisInput  + offset,
+        nextOutput + noffset,
+        sliceSize, maxOffset,
+        step,
+        waitDoneFromNext,
+        postReadyToNext);
+
+    NEXT_STEP; // Increases step, poffset, noffset
+
+    // k-2 steps: reduce and copy to next GPU
+    for (int j=2; j<nranks; ++j) {
+      slice = ring->devUserRanks[nranks-j];
+      offset = chunkOffset + slice * chunkSize;
+      maxOffset = min(chunkSize, size-offset);
+
+      Prims::Reduce(tid, nthreads,
+          prevInput  + poffset,
+          thisInput  + offset,
+          nextOutput + noffset,
+          sliceSize, maxOffset,
+          step,
+          waitDoneFromNext, waitReadyFromPrev,
+          postReadyToNext, postDoneToPrev);
+
+      NEXT_STEP;
+    }
+
+    // step k-1: reduce this buffer and data, which will produce the final
+    // result that we store in this data and push to the next GPU
+    slice = ring->devUserRanks[0];
+    offset = chunkOffset + slice * chunkSize;
+    maxOffset = min(chunkSize, size-offset);
+
+    Prims::ReduceCopy(tid, nthreads,
+        prevInput  + poffset,
+        thisInput  + offset,
+        nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset), // direct mode targets the pointer obtained through ptrExchange
+        thisOutput + offset,
+        sliceSize, maxOffset,
+        step,
+        waitDoneFromNext, waitReadyFromPrev,
+        postReadyToNext, postDoneToPrev);
+
+    NEXT_STEP;
+
+    // k-2 steps: copy to next GPU
+    if (prevdirect) { // the source of each copy is thisOutput, not the receive buffer
+      for (int j=1; j<nranks-1; ++j) {
+        slice = ring->devUserRanks[nranks - j];
+        offset = chunkOffset + slice * chunkSize;
+        maxOffset = min(chunkSize, size-offset);
+
+        Prims::Copy(tid, nthreads,
+            thisOutput + offset,
+            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
+            sliceSize, maxOffset,
+            step,
+            waitDoneFromNext, waitReadyFromPrev,
+            postReadyToNext, postDoneToPrev);
+
+        NEXT_STEP;
+      }
+      Prims::Copy(tid, nthreads, // zero-length copy: only the prev-side wait/post flags are exercised
+          NULL,
+          NULL,
+          0, 0,
+          step,
+          waitReadyFromPrev,
+          postDoneToPrev);
+    } else {
+      for (int j=1; j<nranks-1; ++j) {
+        slice = ring->devUserRanks[nranks - j];
+        offset = chunkOffset + slice * chunkSize;
+        maxOffset = min(chunkSize, size-offset);
+
+        Prims::DoubleCopy(tid, nthreads,
+            prevInput + poffset,
+            thisOutput + offset,
+            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
+            sliceSize, maxOffset,
+            step,
+            waitDoneFromNext, waitReadyFromPrev,
+            postReadyToNext, postDoneToPrev);
+
+        NEXT_STEP;
+      }
+
+      // Make final copy from buffer to dest.
+      slice = ring->devUserRanks[1];
+      offset = chunkOffset + slice * chunkSize;
+      maxOffset = min(chunkSize, size-offset);
+
+      // Here we need to copy from buffer to this output.
+      Prims::Copy(tid, nthreads,
+          prevInput + poffset,
+          thisOutput + offset,
+          sliceSize, maxOffset,
+          step,
+          waitReadyFromPrev,
+          postDoneToPrev);
+    }
+  }
+
+  if (tid == 0) {
+    // Wait for next to have consumed all data before we reset the flag
+    waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS));
+    *ring->send.conn.head = 0ULL;
+    *ring->recv.conn.tail = 0ULL;
+    __threadfence_system(); // the flag resets above are fenced before opCount+1 is published below
+    *ring->recv.conn.opCount = args->opCount+1;
+  }
+}
+
+#include "ll_kernel.h"
+
+#define NEXT_STEP_LL do { /* advance LL offsets, flags and step as one statement (safe in unbraced if/else) */ \
+  poffset = noffset; \
+  pflag = nflag; \
+  noffset += NCCL_LL_SLICE_LINES; \
+  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
+  nflag++; \
+  step++; } while (0)
+
+template<int UNUSED, class FUNC, typename T> // LL variant of AllReduce working on ncclLLFifoLine buffers; the first template argument is not used
+__device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x; // not read directly below (NOTE(review): presumably consumed by the WAIT_NEXT/POST_SIZE/ACK_PREV macros from ll_kernel.h - confirm)
+  const int bid = args->bid;
+  const int llNthreads = args->nThreads;
+  struct ncclComm* comm = args->comm;
+  struct ncclRing* ring = comm->rings+blockIdx.x;
+  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; // likewise not read directly in this function
+  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
+  volatile int * sizesFifo = ring->send.conn.llFifo; // likewise not read directly in this function
+  uint64_t sendHead = sendHeadPtr[0];
+
+  typedef LLPrimitives<T, FUNC> LL;
+
+  const ssize_t size = args->N;
+  //const int rank = comm->rank;
+  const int nranks = comm->nRanks;
+  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); // elements of T per slice, counting sizeof(uint64_t) bytes per line
+  const ssize_t loopSize = args->nRings*nranks*chunkSize;
+
+  uint64_t step = ring->send.conn.llStep; // continues from the step stored in the send connection
+  uint32_t pflag, nflag = step + 1; // pflag is first assigned by NEXT_STEP_LL
+  int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); // poffset is first assigned by NEXT_STEP_LL
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
+  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    if (size-gridOffset < loopSize) { // fewer than loopSize elements remain: switch to the chunk size taken from args
+      chunkSize = args->lastChunkSize;
+    }
+    ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
+
+    /////////////// begin AllReduce steps ///////////////
+    ssize_t offset;
+    int maxOffset;
+    int slice;
+
+    // step 0: push data to next GPU
+    slice = ring->devUserRanks[nranks-1];
+    offset = chunkOffset + slice * chunkSize;
+    maxOffset = min(chunkSize, size-offset);
+
+    WAIT_NEXT;
+    LL::ReduceCopy(
+        thisInput  + offset,
+        nextOutput + noffset,
+        maxOffset, nflag, llNthreads);
+    POST_SIZE;
+
+    NEXT_STEP_LL;
+
+    // k-2 steps: reduce and copy to next GPU
+    for (int j=2; j<nranks; ++j) {
+      slice = ring->devUserRanks[nranks-j];
+      offset = chunkOffset + slice * chunkSize;
+      maxOffset = min(chunkSize, size-offset);
+
+      WAIT_NEXT;
+      LL::ReduceCopy(
+          thisInput  + offset,
+          prevInput  + poffset,
+          nextOutput + noffset,
+          maxOffset, pflag, nflag, llNthreads);
+      POST_SIZE;
+      ACK_PREV;
+
+      NEXT_STEP_LL;
+    }
+
+    // step k-1: reduce this buffer and data, which will produce the final
+    // result that we store in this data and push to the next GPU
+    slice = ring->devUserRanks[0];
+    offset = chunkOffset + slice * chunkSize;
+    maxOffset = min(chunkSize, size-offset);
+
+    WAIT_NEXT;
+    LL::ReduceCopy(
+        thisInput  + offset,
+        prevInput  + poffset,
+        thisOutput + offset,
+        nextOutput + noffset,
+        maxOffset, pflag, nflag, llNthreads);
+    POST_SIZE;
+    ACK_PREV;
+
+    NEXT_STEP_LL;
+
+    // k-2 steps: copy to next GPU
+    for (int j=1; j<nranks-1; ++j) {
+      slice = ring->devUserRanks[nranks - j];
+      offset = chunkOffset + slice * chunkSize;
+      maxOffset = min(chunkSize, size-offset);
+
+      WAIT_NEXT;
+      LL::ReduceCopy(
+          prevInput + poffset,
+          thisOutput + offset,
+          nextOutput + noffset,
+          maxOffset, pflag, nflag, llNthreads);
+      POST_SIZE;
+      ACK_PREV;
+
+      NEXT_STEP_LL;
+    }
+
+    // Make final copy from buffer to dest.
+    slice = ring->devUserRanks[1];
+    offset = chunkOffset + slice * chunkSize;
+    maxOffset = min(chunkSize, size-offset);
+
+    // Here we need to copy from buffer to this output.
+    LL::ReduceCopy(
+        prevInput + poffset,
+        thisOutput + offset,
+        maxOffset, pflag, llNthreads);
+    ACK_PREV;
+  }
+
+  FIFO_CLEANING_AND_SAVE_STEP(nflag); // macro from ll_kernel.h (NOTE(review): presumably persists step for the next LL collective - confirm)
+}
diff --git a/src/3rd_party/nccl/src/collectives/device/broadcast.cu b/src/3rd_party/nccl/src/collectives/device/broadcast.cu
new file mode 100644
index 00000000..4125de41
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/broadcast.cu
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "common.h"
+#include "broadcast.h"
+#include "collectives.h"
+
+#define UNROLL 4 // forwarded as the UNROLL template argument by the IMPL_COLL4* macros in common.h
+
+#if NCCL_OP == 0 // broadcast is emitted only in the NCCL_OP == 0 compilation
+IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8); // single (copy, i8) variant
+#endif // NCCL_OP == 0
diff --git a/src/3rd_party/nccl/src/collectives/device/broadcast.h b/src/3rd_party/nccl/src/collectives/device/broadcast.h
new file mode 100644
index 00000000..c2f6d001
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/broadcast.h
@@ -0,0 +1,228 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "primitives.h"
+#include "collectives.h"
+
+// Increase step and move boffset one slice forward for buffer sync (boffset wraps at buffSize).
+#define NEXT_STEP do { /* one statement, so it is safe inside an unbraced if/else */ \
+  step++; \
+  boffset += sliceSize; \
+  if (boffset == buffSize) boffset = 0; } while (0)
+
+template<int UNROLL, class FUNC, typename T> // Broadcast over one ring: data flows from args->root to every other rank
+__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int nthreads = blockDim.x - 1; // worker count leaves out one thread of the block (NOTE(review): presumably reserved by Primitives for flag handling - confirm in primitives.h)
+  const int bid = args->bid;
+  __shared__ T* sharedNextOutput; // written by tid 0 below, read by every thread after __syncthreads()
+  struct ncclComm* comm = args->comm;
+  struct ncclRing* ring = comm->rings+blockIdx.x;
+  int prevdirect = ring->recv.conn.direct;
+  int nextdirect = ring->send.conn.direct;
+
+  WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
+  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
+  PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS);
+
+  typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;
+
+  const ssize_t size = args->N;
+  const int buffSize = ring->buffSize / sizeof(T); // ring buffer capacity in elements of T
+  const int sliceSize = buffSize / BROADCAST_BUFCHUNKS; // elements of T per buffer slice
+  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+  const int rank = ring->devUserRanks[0]; // entry 0 of the ring's rank list is used as this rank
+  const int nextRank = ring->devUserRanks[1]; // entry 1 is used as the next rank in the ring
+  const int root = args->root;
+
+  if (tid == 0) {
+    // Update in case we skipped some collectives
+    *ring->recv.conn.opCount = args->opCount;
+    if (nextRank != root) {
+      // Wait for next to be ready
+      WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
+      waitOpCountNext.wait(args->opCount);
+    }
+    if (rank != root && prevdirect) {
+      *ring->recv.conn.ptrExchange = args->ThisOutput; // hand our output pointer over through the recv connection
+    }
+    if (nextRank != root && nextdirect) {
+      void* volatile* ptr = &(ring->devMemSend->ptrExchange);
+      while (*ptr == nullptr); // spin until a pointer has been posted
+      sharedNextOutput = (T*)*ptr;
+      *ptr = nullptr; // clear the exchange slot again
+    }
+  }
+  __syncthreads();
+
+  uint64_t step = 0ULL;
+  int boffset = 0;
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
+  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
+    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); // NOTE(review): ALIGN_SIZE is defined elsewhere; presumably rounds chunkSize up to a multiple of the second argument
+    ssize_t offset = gridOffset + bid*chunkSize;
+    int maxOffset = min(chunkSize, size-offset);
+
+    if (rank == root) { // root: sends only
+      if (thisInput == thisOutput) { // in place: no local copy into thisOutput is needed
+        Prims::Copy(tid, nthreads,
+            thisInput  + offset,
+            nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
+            sliceSize, maxOffset,
+            step,
+            waitDoneFromNext,
+            postReadyToNext);
+      } else {
+        Prims::DoubleCopy(tid, nthreads,
+            thisInput  + offset,
+            thisOutput + offset,
+            nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
+            sliceSize, maxOffset,
+            step,
+            waitDoneFromNext,
+            postReadyToNext);
+      }
+    } else if (nextRank == root) { // rank just before the root: receives only, nothing is forwarded
+      if (prevdirect) maxOffset = 0; // Only wait for signals
+      Prims::Copy(tid, nthreads,
+          prevInput  + boffset,
+          thisOutput + offset,
+          sliceSize, maxOffset,
+          step,
+          waitReadyFromPrev,
+          postDoneToPrev);
+    } else { // any other rank: receives and forwards
+      if (prevdirect) {
+        Prims::Copy(tid, nthreads,
+            thisOutput + offset,
+            nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
+            sliceSize, maxOffset,
+            step,
+            waitDoneFromNext, waitReadyFromPrev,
+            postReadyToNext, postDoneToPrev);
+      } else {
+        Prims::DoubleCopy(tid, nthreads,
+            prevInput + boffset,
+            thisOutput + offset,
+            nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
+            sliceSize, maxOffset,
+            step,
+            waitDoneFromNext, waitReadyFromPrev,
+            postReadyToNext, postDoneToPrev);
+      }
+    }
+    NEXT_STEP; // Increases step, boffset
+  }
+
+  if (tid == 0) {
+    if (nextRank != root) {
+      // Wait for next to have consumed data before resetting the flag
+      waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1));
+      *ring->send.conn.head = 0ULL;
+    }
+    *ring->recv.conn.tail = 0ULL;
+    __threadfence_system(); // the flag resets above are fenced before opCount+1 is published below
+    *ring->recv.conn.opCount = args->opCount+1;
+  }
+}
+
+#include "ll_kernel.h"
+
+#define NEXT_STEP_LL do { /* advance LL offset, flag and step as one statement (safe in unbraced if/else) */ \
+  boffset += NCCL_LL_SLICE_LINES; \
+  if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
+  flag++; \
+  step++; } while (0)
+
+template<int UNUSED, class FUNC, typename T> // LL variant of Broadcast working on ncclLLFifoLine buffers; the first template argument is not used
+__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x; // not read directly below (NOTE(review): presumably consumed by the WAIT_NEXT/POST_SIZE/ACK_PREV macros from ll_kernel.h - confirm)
+  const int bid = args->bid;
+  const int llNthreads = args->nThreads;
+  struct ncclComm* comm = args->comm;
+  struct ncclRing* ring = comm->rings+blockIdx.x;
+  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; // likewise not read directly in this function
+  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
+  volatile int * sizesFifo = ring->send.conn.llFifo; // likewise not read directly in this function
+  uint64_t sendHead = sendHeadPtr[0];
+  const int rank = comm->rank; // NOTE(review): ncclBroadcastKernel reads ring->devUserRanks[0] instead - presumably the same value; confirm
+  const int nextRank = ring->devUserRanks[1];
+  const int root = args->root;
+
+  typedef LLPrimitives<T, FUNC> LL;
+
+  const ssize_t size = args->N;
+  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); // elements of T per slice, counting sizeof(uint64_t) bytes per line
+  const ssize_t loopSize = args->nRings*chunkSize;
+
+  uint64_t step = ring->send.conn.llStep; // continues from the step stored in the send connection
+  uint32_t flag = step + 1;
+  int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
+  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    if (size-gridOffset < loopSize) { // fewer than loopSize elements remain: switch to the chunk size taken from args
+      chunkSize = args->lastChunkSize;
+    }
+    ssize_t offset = gridOffset + bid*chunkSize;
+
+    int maxOffset = min(chunkSize, size-offset);
+    if (rank == root) { // root: sends only
+      WAIT_NEXT;
+      if (thisInput == thisOutput) { // in place: no local copy into thisOutput is needed
+        LL::ReduceCopy(
+            thisInput + offset,
+            nextOutput + boffset,
+            maxOffset, flag, llNthreads);
+      } else {
+        LL::ReduceCopy(
+            thisInput + offset,
+            thisOutput + offset,
+            nextOutput + boffset,
+            maxOffset, flag, llNthreads);
+      }
+      POST_SIZE;
+      NEXT_STEP_LL;
+    } else if (nextRank == root) { // rank just before the root: receives only, nothing is forwarded
+      LL::ReduceCopy(
+          prevInput + boffset,
+          thisOutput + offset,
+          maxOffset, flag, llNthreads);
+      NEXT_STEP_LL; // here the step is advanced before ACK_PREV, unlike the all-reduce LL kernel
+      ACK_PREV;
+    } else { // any other rank: receives and forwards
+      WAIT_NEXT;
+      LL::ReduceCopy(
+          prevInput + boffset,
+          thisOutput + offset,
+          nextOutput + boffset,
+          maxOffset, flag, flag, llNthreads);
+      POST_SIZE;
+      NEXT_STEP_LL;
+      ACK_PREV;
+    }
+  }
+
+  // We need everyone to acknowledge data even if they didn't receive anything
+  // so that the next collective can start right away.
+  ACK_PREV;
+
+  FIFO_CLEANING_AND_SAVE_STEP(flag); // macro from ll_kernel.h (NOTE(review): presumably persists step for the next LL collective - confirm)
+}
diff --git a/src/3rd_party/nccl/src/collectives/device/common.h b/src/3rd_party/nccl/src/collectives/device/common.h
new file mode 100644
index 00000000..c9889133
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/common.h
@@ -0,0 +1,90 @@
+/*************************************************************************
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_DEVICE_COMMON_H_
+#define NCCL_DEVICE_COMMON_H_
+
+#include "../collectives.h"
+#include "core.h"
+#include "nccl.h"
+
+typedef void(*ncclKern_t)(struct CollectiveArgs* args); // signature of the per-collective device functions stored in ncclFuncs
+extern __device__ ncclKern_t ncclFuncs[]; // device-side table indexed by ncclColl::funcIndex (see IMPL_COLL4K)
+
+static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
+  const size_t nWords = size/sizeof(int);  // copy granularity is one int; a trailing partial word is not copied
+  int* out = (int*)dst; int* in = (int*)src;
+  __syncthreads();  // block-wide barrier before the copy starts
+  for (int w = tid; w < nWords; w += blockDim.x) out[w] = in[w];  // thread tid copies words tid, tid+blockDim.x, ...
+  __syncthreads();  // block-wide barrier once the copy is complete
+}
+static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
+  load_parallel(localColl, hostColl, sizeof(*localColl), tid);  // whole block copies the descriptor into localColl
+  if (0 == tid) { hostColl->active = 0; }  // a single thread then clears the source slot's active field
+}
+
+/* Functions for aggregation case: a __device__ wrapper named via NCCL_COLL_NAME that forwards to coll##Kernel<UNROLL, ncclFunc<ctype>, ctype> */
+#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
+__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
+  coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(args); \
+}
+/* Kernels with the first operation inlined: a __global__ entry point that runs operations until one with active == 2 has been executed */
+#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \
+__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
+__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
+  int tid = threadIdx.x; \
+  int bid = blockIdx.x; \
+  __shared__ struct ncclColl localColl; /* per-block copy of the operation being executed */ \
+ \
+  struct ncclComm* comm = firstColl.args.comm; \
+  struct ncclRing* ring = comm->rings+bid; \
+  struct ncclColl* c; \
+  if (bid == 0) { \
+    /* To optimize for latency, (only) the first operation is passed as argument.*/ \
+    c = &firstColl; \
+  } else { \
+    c = &localColl; \
+    load_coll(c, ring->devCollectives+ring->collFifoHead, tid); /* other blocks fetch theirs from the ring's FIFO head */ \
+  } \
+  while (1) { \
+    if (tid < c->nThreads) { /* threads at or beyond nThreads skip the collective body */ \
+      if (c->funcIndex == fIndex) { /* direct call when the operation is this kernel's own collective */ \
+        coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
+      } else { /* any other operation is dispatched through the ncclFuncs table */ \
+        ncclFuncs[c->funcIndex](&c->args); \
+      } \
+    } \
+    int nextIndex = c->nextIndex; \
+    if (tid == 0) ring->collFifoHead = nextIndex; \
+ \
+    if (c->active == 2) { /* active == 2 ends the kernel after this operation */ \
+      return; \
+    } \
+ \
+    /* Load next collective operation*/ \
+    c = &localColl; /* for bid 0 */ \
+    load_coll(c, ring->devCollectives+nextIndex, tid); \
+  } \
+}
+
+#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) /* LL and regular variants: device function + kernel each */ \
+  IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \
+  IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \
+  IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
+  IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0))
+
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) /* expands IMPL_COLL3 once per supported element type */ \
+  IMPL_COLL3(coll, op, ncclFunc, i8,  int8_t,   ncclColl, ncclOp, ncclInt8) \
+  IMPL_COLL3(coll, op, ncclFunc, u8,  uint8_t,  ncclColl, ncclOp, ncclUint8) \
+  IMPL_COLL3(coll, op, ncclFunc, i32, int32_t,  ncclColl, ncclOp, ncclInt32) \
+  IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \
+  IMPL_COLL3(coll, op, ncclFunc, i64, int64_t,  ncclColl, ncclOp, ncclInt64) \
+  IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \
+  IMPL_COLL3(coll, op, ncclFunc, f16, half,     ncclColl, ncclOp, ncclFloat16) \
+  IMPL_COLL3(coll, op, ncclFunc, f32, float,    ncclColl, ncclOp, ncclFloat32) \
+  IMPL_COLL3(coll, op, ncclFunc, f64, double,   ncclColl, ncclOp, ncclFloat64)
+
+#endif
diff --git a/src/3rd_party/nccl/src/collectives/device/common_kernel.h b/src/3rd_party/nccl/src/collectives/device/common_kernel.h
new file mode 100644
index 00000000..0eaa0610
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/common_kernel.h
@@ -0,0 +1,372 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_COMMON_KERNEL_H_
+#define NCCL_COMMON_KERNEL_H_
+
+#include "core.h"
+#include <cstdio>
+#include <cstdint>
+
+#include <cuda_runtime.h>
+
+// min() overload for an int paired with an ssize_t; the smaller value is returned as int
+static __device__ int min(int a, ssize_t b) { if (b > a) return a; return b; }
+
+typedef uint64_t PackType; // 64-bit word in which the MULTI functors below pack one or more T elements
+
+// unpack x and y to elements of type T and apply FUNC to each element (primary template: operator() is only declared here; the per-type specializations below define it)
+template<class FUNC, typename T>
+struct MULTI {
+  __device__ PackType operator()(const PackType x, const PackType y) const;
+};
+
+template<class FUNC>
+struct MULTI<FUNC, int8_t> { // int8_t lanes are processed as two uint32_t words
+  static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
+      "PackType must be twice the size of uint32_t.");
+  union converter {
+    PackType storage;
+    struct {
+      uint32_t a, b;
+    };
+  };
+
+  __device__ PackType operator()(const PackType x, const PackType y) const {
+    converter lhs, rhs, out;
+    lhs.storage = x;
+    rhs.storage = y;
+
+    // FUNC is handed a whole 32-bit word of packed bytes per call
+    out.a = FUNC()(lhs.a, rhs.a);
+    out.b = FUNC()(lhs.b, rhs.b);
+
+    return out.storage;
+  }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, uint8_t> { // uint8_t lanes are processed as two uint32_t words
+  static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
+      "PackType must be twice the size of uint32_t.");
+  union converter {
+    PackType storage;
+    struct {
+      uint32_t a, b;
+    };
+  };
+
+  __device__ PackType operator()(const PackType x, const PackType y) const {
+    converter lhs, rhs, out;
+    lhs.storage = x;
+    rhs.storage = y;
+
+    // FUNC is handed a whole 32-bit word of packed bytes per call
+    out.a = FUNC()(lhs.a, rhs.a);
+    out.b = FUNC()(lhs.b, rhs.b);
+
+    return out.storage;
+  }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, int32_t> { // two int32_t elements per PackType
+  static_assert(sizeof(PackType) == 2 * sizeof(int32_t),
+      "PackType must be twice the size of int.");
+  union converter {
+    PackType storage;
+    struct {
+      int32_t a, b;
+    };
+  };
+
+  __device__ PackType operator()(const PackType x, const PackType y) const {
+    converter lhs, rhs, out;
+    lhs.storage = x;
+    rhs.storage = y;
+
+    out.a = FUNC()(lhs.a, rhs.a);  // first element
+    out.b = FUNC()(lhs.b, rhs.b);  // second element
+
+    return out.storage;
+  }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, uint32_t> { // two uint32_t elements per PackType
+  static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
+      "PackType must be twice the size of int.");
+  union converter {
+    PackType storage;
+    struct {
+      uint32_t a, b;
+    };
+  };
+
+  __device__ PackType operator()(const PackType x, const PackType y) const {
+    converter lhs, rhs, out;
+    lhs.storage = x;
+    rhs.storage = y;
+
+    out.a = FUNC()(lhs.a, rhs.a);  // first element
+    out.b = FUNC()(lhs.b, rhs.b);  // second element
+
+    return out.storage;
+  }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, half> { // four half values per PackType, handled as two half2 pairs
+  static_assert(sizeof(PackType) == 4 * sizeof(half),
+      "PackType must be four times the size of half.");
+
+  struct PackHalf2 {
+    half2 a, b;
+  };
+
+  __device__ PackType operator()(const PackType x, const PackType y) const {
+    struct PackHalf2 lhs, rhs, out;
+    lhs = *(reinterpret_cast<const struct PackHalf2*>(&x));  // reinterpret the 64-bit words as half2 pairs
+    rhs = *(reinterpret_cast<const struct PackHalf2*>(&y));
+
+    out.a = FUNC()(lhs.a, rhs.a);
+    out.b = FUNC()(lhs.b, rhs.b);
+
+    return *(reinterpret_cast<PackType*>(&out));
+  }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, float> { // two float elements per PackType
+  static_assert(sizeof(PackType) == 2 * sizeof(float),
+      "PackType must be twice the size of float.");
+  union converter {
+    PackType storage;
+    struct {
+      float a, b;
+    };
+  };
+
+  __device__ PackType operator()(const PackType x, const PackType y) const {
+    converter lhs, rhs, out;
+    lhs.storage = x;
+    rhs.storage = y;
+
+    out.a = FUNC()(lhs.a, rhs.a);  // first element
+    out.b = FUNC()(lhs.b, rhs.b);  // second element
+
+    return out.storage;
+  }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, double> { // one double per PackType, converted with the longlong/double bit-cast intrinsics
+  static_assert(sizeof(PackType) == sizeof(double),
+      "PackType must be the same size as double.");
+  __device__ PackType operator()(const PackType x, const PackType y) const {
+    const double reduced = FUNC()(__longlong_as_double(x), __longlong_as_double(y));
+    return __double_as_longlong(reduced);
+  }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, uint64_t> { // one uint64_t per PackType: FUNC is applied to the words as they are
+  static_assert(sizeof(PackType) == sizeof(uint64_t),
+      "PackType must be the same size as uint64_t.");
+  __device__ PackType operator()(const PackType x, const PackType y) const {
+    uint64_t reduced = FUNC()(x, y);
+    return reduced;
+  }
+};
+
+template<class FUNC>
+struct MULTI<FUNC, int64_t> { // one int64_t per PackType: operands are converted to signed before FUNC runs
+  static_assert(sizeof(PackType) == sizeof(int64_t),
+      "PackType must be the same size as int64_t.");
+  __device__ PackType operator()(const PackType x, const PackType y) const {
+    int64_t reduced = FUNC()(static_cast<int64_t>(x), static_cast<int64_t>(y));
+    return reduced;
+  }
+};
+
+#define ALIGNUP(x, a)   ((((x)-1) & ~((a)-1)) + (a)) // rounds x up to a multiple of a; the mask arithmetic assumes a is a power of two
+
+template<typename T>
+__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
+  const size_t addr = reinterpret_cast<size_t>(ptr);  // pointer value as an integer, rounded up by ALIGNUP below
+  return reinterpret_cast<volatile T*>(ALIGNUP(addr, align));
+}
+
+template<typename T> inline __device__
+T vFetch(const volatile T* ptr) {  // read one element through a volatile pointer
+  return ptr[0];
+}
+
+template<typename T> inline __device__
+void vStore(volatile T* ptr, const T val) {  // write one element through a volatile pointer
+  ptr[0] = val;
+}
+
+#if CUDART_VERSION < 9000 // this branch copies the half through its .x member
+template<> inline __device__
+half vFetch<half>(const volatile half* ptr) {
+  half r;
+  r.x = ptr->x;
+  return r;
+}
+
+template<> inline __device__
+void vStore<half>(volatile half* ptr, const half val) {
+  ptr->x = val.x;
+}
+#else // CUDART_VERSION >= 9000: whole-object copy through a pointer with volatile cast away
+template<> inline __device__
+half vFetch<half>(const volatile half* ptr) {
+  half r;
+  r = ((half*)ptr)[0]; // NOTE(review): the cast drops the volatile qualifier for this access
+  return r;
+}
+
+template<> inline __device__
+void vStore<half>(volatile half* ptr, const half val) {
+  ((half*)ptr)[0] = val; // NOTE(review): the cast drops the volatile qualifier for this access
+}
+#endif // CUDART_VERSION < 9000
+
+template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
+__device__ inline void ReduceCopy(
+    const int tid, const int nthreads,
+    const volatile T * __restrict__ const src0,
+    const volatile T * __restrict__ const src1,
+    volatile T * __restrict__ const dest0,
+    volatile T * __restrict__ const dest1, const int N) {
+  // Element-wise path over N elements of T.
+  // Thread tid handles elements tid, tid+nthreads, tid+2*nthreads, ...
+  // src1 is only read when TWO_INPUTS is set (values are combined with FUNC);
+  // dest1 is only written when TWO_OUTPUTS is set.
+  for (int i = tid; i < N; i += nthreads) {
+    T acc = vFetch(&src0[i]);
+    if (TWO_INPUTS) acc = FUNC()(acc, vFetch(&src1[i]));
+    vStore(&dest0[i], acc);
+    if (TWO_OUTPUTS) vStore(&dest1[i], acc);
+  }
+}
+
+typedef ulong2 Pack128; // pair of 64-bit words moved together by Fetch128/Store128
+
+template<class FUNC, typename T> // applies MULTI<FUNC, T> to each 64-bit half of a Pack128
+struct MULTI128 {
+  __device__ void operator()(Pack128& x, Pack128& y) { // the result is written back into x; y is only read
+    x.x = MULTI<FUNC, T>()(x.x, y.x);
+    x.y = MULTI<FUNC, T>()(x.y, y.y);
+  }
+};
+
+inline __device__ void Fetch128(Pack128& v, Pack128* p) { // volatile global load of two u64 values from p into v.x and v.y
+  asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
+}
+inline __device__ void Store128(Pack128* p, Pack128& v) { // volatile global store of v.x and v.y to p
+  asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory");
+}
+
+#define WARP_SIZE 32 // warp width used for the index arithmetic in this file
+template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL> // 128-bit path: w = warp index, nw = warp count, t = lane, N = number of Pack128 items
+__device__ inline void ReduceCopy128b( const int w, const int nw, const int t,
+    Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1,
+    const int N) {
+  Pack128 t0[UNROLL];
+  Pack128 t1[UNROLL];
+  const Pack128* src0_end = src0 + N;
+  const int inc = nw * UNROLL * WARP_SIZE; // Pack128 items covered by all warps per loop iteration
+  const int offset = w * UNROLL * WARP_SIZE + t; // first Pack128 item handled by this thread
+  src0 += offset;  if (TWO_INPUTS)  src1 += offset;
+  dest0 += offset; if (TWO_OUTPUTS) dest1 += offset;
+
+  while (src0 < src0_end) { // NOTE(review): cannot terminate if inc == 0 (nw == 0) while this thread starts below src0_end
+#pragma unroll
+    for (int u = 0; u < UNROLL; ++u) {
+      Fetch128(t0[u], src0+u*WARP_SIZE);
+      if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE);
+    }
+#pragma unroll
+    for (int u = 0; u < UNROLL; ++u) {
+      if (TWO_INPUTS) MULTI128<FUNC, T>()(t0[u], t1[u]); // combined value ends up in t0[u]
+      Store128(dest0+u*WARP_SIZE, t0[u]);
+      if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]);
+    }
+    src0 += inc;  if (TWO_INPUTS)  src1 += inc;
+    dest0 += inc; if (TWO_OUTPUTS) dest1 += inc;
+  }
+}
+
+template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1> // copies/reduces N elements of T: scalar preamble, 128-bit bulk, scalar tail
+__device__ inline void ReduceOrCopy(const int tid, const int nthreads,
+    volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
+    const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
+    int N) {
+  int Nrem = N;
+  if (Nrem <= 0) return;
+
+  int Npreamble = (Nrem<alignof(Pack128)) ? Nrem : AlignUp(dest0, alignof(Pack128)) - dest0; // elements of T before dest0 reaches Pack128 alignment, or everything when N is small
+
+  // stage 0: check if we'll be able to use the fast, 128-bit aligned path.
+  // If not, we'll just use the slow preamble path for the whole operation
+  bool alignable = (((AlignUp(src0,  alignof(Pack128)) == src0  + Npreamble)) &&
+          (!HAS_DEST1 || (AlignUp(dest1, alignof(Pack128)) == dest1 + Npreamble)) &&
+          (!HAS_SRC1  || (AlignUp(src1,  alignof(Pack128)) == src1  + Npreamble)));
+
+  if (!alignable) {
+    Npreamble = Nrem;
+  }
+
+  // stage 1: preamble: handle any elements up to the point of everything coming
+  // into alignment
+  ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Npreamble);
+
+  Nrem -= Npreamble;
+  if (Nrem == 0) return;
+
+  dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
+  src0  += Npreamble; if (HAS_SRC1)  { src1  += Npreamble; }
+
+  // stage 2: fast path: use 128b loads/stores to do the bulk of the work,
+  // assuming the pointers we have are all 128-bit alignable.
+  int w = tid / WARP_SIZE;       // Warp number
+  int nw = nthreads / WARP_SIZE; // Number of warps
+  int t = tid % WARP_SIZE;       // Thread (inside the warp)
+
+  const int PackFactor = sizeof(Pack128) / sizeof(T); // elements of T per Pack128
+
+  // stage 2a: main loop
+  int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads))
+      * (UNROLL * nthreads); // round down
+
+  ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a);
+
+  int Ndone2a = Nalign2a * PackFactor; // Nalign2a counts Pack128 items; convert back to elements of T
+  Nrem -= Ndone2a;
+  if (Nrem == 0) return;
+  dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
+  src0  += Ndone2a; if (HAS_SRC1)  { src1  += Ndone2a; }
+
+  // stage 2b: slightly less optimized for section when we don't have full
+  // UNROLLs
+
+  int Nalign2b = Nrem / PackFactor;
+
+  ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, 1>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b);
+
+  int Ndone2b = Nalign2b * PackFactor;
+  Nrem -= Ndone2b;
+  if (Nrem == 0) return;
+  dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
+  src0  += Ndone2b; if (HAS_SRC1)  { src1  += Ndone2b; }
+
+  // stage 2c: tail
+  ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Nrem);
+}
+
+#endif // NCCL_COMMON_KERNEL_H_
diff --git a/src/3rd_party/nccl/src/collectives/device/functions.cu b/src/3rd_party/nccl/src/collectives/device/functions.cu
new file mode 100644
index 00000000..16f1865d
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/functions.cu
@@ -0,0 +1,64 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "collectives.h"
+#include "common.h"
+
+// Expands to the two consecutive table entries generated for one
+// (collective, op, dtype) triple: NCCL_COLL_NAME(coll, ...) followed by
+// NCCL_COLL_NAME(coll##LL, ...).
+// The last line deliberately has no trailing backslash, so the macro body
+// cannot absorb whatever line happens to follow the definition.
+#define NCCL_FUNC4(coll, op, dtype) \
+  NCCL_COLL_NAME(coll, op, dtype), \
+  NCCL_COLL_NAME(coll##LL, op, dtype)
+
+// Must be consistent with ncclDataType_t
+// Expands to one NCCL_FUNC4 group per data type, listed in the order
+// i8, u8, i32, u32, i64, u64, f16, f32, f64.
+#define NCCL_FUNCS3A(coll, op) \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  u8), \
+  NCCL_FUNC4(coll, op, i32), \
+  NCCL_FUNC4(coll, op, u32), \
+  NCCL_FUNC4(coll, op, i64), \
+  NCCL_FUNC4(coll, op, u64), \
+  NCCL_FUNC4(coll, op, f16), \
+  NCCL_FUNC4(coll, op, f32), \
+  NCCL_FUNC4(coll, op, f64)
+// Same number of slots as NCCL_FUNCS3A, but every slot holds the i8 entry.
+// NOTE(review): presumably because the collectives built from this macro
+// (see NCCL_FUNCS2B) only move bytes, so the element type does not matter --
+// confirm.
+#define NCCL_FUNCS3B(coll, op) \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8), \
+  NCCL_FUNC4(coll, op,  i8)
+
+// Must be consistent with ncclRedOp_t
+// One NCCL_FUNCS3A group per reduction operator, in the order
+// sum, prod, max, min.
+#define NCCL_FUNCS2A(coll) \
+  NCCL_FUNCS3A(coll, sum ), \
+  NCCL_FUNCS3A(coll, prod), \
+  NCCL_FUNCS3A(coll, max ), \
+  NCCL_FUNCS3A(coll, min )
+// Same number of operator slots as NCCL_FUNCS2A, but every slot holds the
+// `copy` entries; used below for the collectives that do not reduce
+// (ncclBroadcast and ncclAllGather).
+#define NCCL_FUNCS2B(coll) \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy)
+
+// Must be consistent with ncclColl_t
+// Brace-enclosed initializer listing every collective in the order
+// Broadcast, Reduce, AllGather, ReduceScatter, AllReduce.
+#define NCCL_FUNCS() { \
+  NCCL_FUNCS2B(ncclBroadcast), \
+  NCCL_FUNCS2A(ncclReduce), \
+  NCCL_FUNCS2B(ncclAllGather), \
+  NCCL_FUNCS2A(ncclReduceScatter), \
+  NCCL_FUNCS2A(ncclAllReduce) }
+
+// Device-resident table of kernel entry points. Its layout must be consistent
+// with the ncclFuncSet enum: collective, then reduction op, then data type,
+// with two entries (regular and LL) per combination. NCCL_FUNCS() expands to
+// the complete brace-enclosed initializer in exactly that order.
+__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2] = NCCL_FUNCS();
diff --git a/src/3rd_party/nccl/src/collectives/device/ll_kernel.h b/src/3rd_party/nccl/src/collectives/device/ll_kernel.h
new file mode 100644
index 00000000..5ec3c9a8
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/ll_kernel.h
@@ -0,0 +1,154 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_LL_KERNEL_H_
+#define NCCL_LL_KERNEL_H_
+
+// Polls one FIFO line until its payload is valid, then returns the payload.
+//
+// The line is fetched with a single volatile 4x32-bit global load that yields
+// {data1, flag1, data2, flag2}. The load is repeated until BOTH flag words
+// equal `flag`; the result is then assembled with data1 as the low 32 bits
+// and data2 as the high 32 bits.
+//
+// src  : FIFO line to read.
+// flag : flag value that marks the line as ready.
+static __device__ uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) {
+  uint32_t data1, flag1, data2, flag2;
+  do {
+    asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
+  } while ((flag1 != flag) || (flag2 != flag));
+  uint64_t val64 = data1 + (((uint64_t)data2) << 32);
+  return val64;
+}
+
+// Writes one FIFO line with a single volatile 4x32-bit global store laid out
+// as {low 32 bits of val, flag, high 32 bits of val, flag} -- the same layout
+// that readLL() expects.
+//
+// dst  : FIFO line to write.
+// val  : 64-bit payload.
+// flag : flag value stored next to each payload half.
+static __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
+  asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
+}
+
+// Loads 64 bits from `src`. The copy goes through memcpy, which handles
+// misaligned pointers.
+static __device__ uint64_t readAL(uint64_t* src) {
+  uint64_t word;
+  memcpy(&word, src, sizeof(word));
+  return word;
+}
+// Stores the 64-bit `val` at `dst`. The copy goes through memcpy, which
+// handles misaligned pointers.
+static __device__ void storeAL(uint64_t* dst, uint64_t val) {
+  memcpy(dst, &val, sizeof(val));
+}
+
+// Reduce/copy primitives that move data between plain T buffers and FIFO
+// lines (union ncclLLFifoLine).
+//
+// Every FIFO line carries 64 bits of payload plus flag words (see readLL and
+// storeLL), so line number k corresponds to the k-th uint64_t of user data.
+//
+// T    : element type of the plain buffers.
+// FUNC : element-wise reduction functor, applied when both sources are given.
+template <typename T, class FUNC>
+class LLPrimitives {
+ private:
+  // Shared implementation behind all public ReduceCopy overloads.
+  //
+  // HAS_SRC1 / HAS_SRC2 / HAS_DST1 / HAS_DST2 select at compile time which of
+  // the four pointers are used; pointers whose HAS_* is 0 are never
+  // dereferenced.
+  //
+  // src1, dst1   : plain buffers of T (accessed via readAL/storeAL, so they
+  //                may be misaligned).
+  // src2, dst2   : FIFO lines.
+  // size         : number of T elements to process; <= 0 is a no-op.
+  // iflag, oflag : flag awaited on src2 lines / written to dst2 lines.
+  // nthreads     : stride between the 64-bit words handled by one thread.
+  template <int HAS_SRC1, int HAS_SRC2, int HAS_DST1, int HAS_DST2>
+  static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
+    if (size <= 0) return;
+    // Number of complete 64-bit words contained in `size` elements.
+    size_t size64 = size * sizeof(T) / sizeof(uint64_t);
+    uint64_t* src1A = (uint64_t*)src1;
+    uint64_t* dst1A = (uint64_t*)dst1;
+    int offset = threadIdx.x;
+    // Do multiples of 64 bits
+#pragma unroll 1
+    for (; offset < size64; offset += nthreads) {
+      uint64_t val;
+      if (HAS_SRC1) {
+        val = readAL(src1A+offset);
+        // MULTI is declared outside this file; presumably it applies FUNC to
+        // every T packed in the 64-bit word -- confirm.
+        if (HAS_SRC2) val = MULTI<FUNC, T>()(readLL(src2+offset, iflag), val);
+      } else if (HAS_SRC2) {
+        val = readLL(src2+offset, iflag);
+      }
+      if (HAS_DST1) storeAL(dst1A+offset, val);
+      if (HAS_DST2) storeLL(dst2+offset, val, oflag);
+    }
+    // Finish last word
+    int sizeDone = size64*(sizeof(uint64_t)/sizeof(T));
+    int sizeRem = size - sizeDone;
+    // The partial trailing word is handled by thread 0 alone.
+    if (threadIdx.x == 0 && sizeRem) {
+      const T* src1B = src1 + sizeDone;
+      T* dst1B = dst1 + sizeDone;
+
+      // Only the first sizeRem elements of this word are filled in below, yet
+      // the whole word may be handed to storeLL(). Start from zero so that no
+      // indeterminate bytes are read or pushed into the FIFO.
+      uint64_t lastVal = 0;
+      T* vals = (T*)&lastVal;
+
+      if (HAS_SRC2) {
+        uint64_t lastVal2 = readLL(src2+size64, iflag);
+        T* src2B = (T*)&lastVal2;
+        for (int offset = 0; offset < sizeRem; offset++) {
+          vals[offset] = HAS_SRC1 ? FUNC()(src2B[offset], src1B[offset]) : src2B[offset];
+        }
+      } else if (HAS_SRC1) {
+        for (int offset = 0; offset < sizeRem; offset++) {
+          vals[offset] = src1B[offset];
+        }
+      }
+      if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag);
+      if (HAS_DST1) {
+        // Element-wise store so that nothing past `size` elements is written.
+        for (int offset = 0; offset < sizeRem; offset++) {
+          dst1B[offset] = vals[offset];
+        }
+      }
+    }
+  }
+ public:
+  // buffer -> FIFO
+  static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) {
+    return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads);
+  }
+
+  // FIFO -> buffer
+  static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) {
+    return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads);
+  }
+
+  // reduce(buffer, FIFO) -> FIFO
+  static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
+    return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads);
+  }
+
+  // reduce(buffer, FIFO) -> buffer
+  static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) {
+    return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads);
+  }
+
+  // buffer -> buffer and FIFO
+  static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) {
+    return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads);
+  }
+
+  // FIFO -> buffer and FIFO
+  static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
+    return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads);
+  }
+
+  // reduce(buffer, FIFO) -> buffer and FIFO
+  static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
+    return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, oflag, nthreads);
+  }
+};
+
+// Common macros
+
+// Maps a step counter to its slot index, i.e. step modulo NCCL_LL_CHUNKS.
+// The argument is parenthesized so that expression arguments such as
+// STEP_TO_SLOT(a + b) reduce the whole expression, not just its last operand.
+#define STEP_TO_SLOT(step) \
+  ((step) % NCCL_LL_CHUNKS)
+
+// Flow control towards the receiver. Thread 0 re-reads sendHeadPtr[0] into
+// sendHead for as long as sendHead + NCCL_LL_CHUNKS <= step; afterwards all
+// llNthreads threads meet on named barrier 1.
+// Expects tid, sendHead, sendHeadPtr, step and llNthreads in the enclosing
+// scope. Expands to several statements, so it must not be used as the
+// unbraced body of an if/else.
+#define WAIT_NEXT \
+  if (tid == 0) { \
+    while (sendHead + NCCL_LL_CHUNKS <= step) { \
+      sendHead = sendHeadPtr[0]; \
+    } \
+  } \
+  asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads));
+
+// Thread 0 records the size of the current step in
+// sizesFifo[step % NCCL_LL_CHUNKS] when sizesFifo is non-null: -1 if
+// maxOffset <= 0, otherwise maxOffset*2*sizeof(T).
+// NOTE(review): the factor 2 presumably reflects that every 8 payload bytes
+// occupy a 16-byte FIFO line (see readLL/storeLL) -- confirm.
+// Expects tid, sizesFifo, step, maxOffset and the type T in scope.
+#define POST_SIZE \
+  if (tid == 0 && sizesFifo) sizesFifo[step % NCCL_LL_CHUNKS] = (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T));
+
+// All llNthreads threads meet on named barrier 1, then thread 0 writes the
+// current step to recvHeadPtr[0].
+// Expects tid, llNthreads, recvHeadPtr and step in scope. Expands to several
+// statements, so it must not be used as the unbraced body of an if/else.
+#define ACK_PREV \
+  asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)); \
+  if (tid == 0) recvHeadPtr[0] = step;
+
+// End-of-collective bookkeeping: optionally resets the receive FIFO, then
+// saves the step counter in ring->send.conn.llStep.
+//
+// When step has advanced more than NCCL_LL_CLEAN_FREQ past
+// ring->send.conn.llLastCleaning, every line of prevInput is overwritten with
+// {0, flag, 0, flag} (each thread handling lines tid + i*llNthreads), the
+// writes are fenced with __threadfence_system(), step is advanced by
+// NCCL_LL_CHUNKS and acknowledged via ACK_PREV, all threads spin until
+// sendHeadPtr[0] reaches step, and thread 0 records the new llLastCleaning.
+//
+// Expects step, ring, prevInput, tid, llNthreads, sendHeadPtr and recvHeadPtr
+// in the enclosing scope.
+// NOTE(review): the definition ends in `while (0);` -- the trailing semicolon
+// means `FIFO_CLEANING_AND_SAVE_STEP(x);` expands to two statements, which
+// defeats the do/while(0) idiom inside an unbraced if/else.
+#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \
+  if (step > ring->send.conn.llLastCleaning + NCCL_LL_CLEAN_FREQ) { \
+    /* Reset all flags */ \
+    static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \
+    static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \
+    const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \
+    for (int i=0; i<NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*llNthreads); i++) { \
+      prevInput[tid+i*llNthreads].i4 = resetLine.i4; \
+    } \
+    __threadfence_system(); \
+    /* Restart from the same slot, only make sure sender waits for data to be reset */ \
+    step += NCCL_LL_CHUNKS; \
+    ACK_PREV; \
+    while (sendHeadPtr[0] < step); \
+    if (tid == 0) ring->send.conn.llLastCleaning = step; \
+  } \
+  ring->send.conn.llStep = step; \
+} while (0);
+
+#endif
diff --git a/src/3rd_party/nccl/src/collectives/device/primitives.h b/src/3rd_party/nccl/src/collectives/device/primitives.h
new file mode 100644
index 00000000..8df152e7
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/primitives.h
@@ -0,0 +1,226 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PRIMITIVES_H_
+#define NCCL_PRIMITIVES_H_
+
+#include <type_traits>
+#include "reduce_kernel.h" // for reduction funcs
+
+
+/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy.
+ *
+ * In order to reduce the repetition of template arguments, the operations
+ * are bundled as static methods of the Primitives class.
+ *
+ * Each primitive operation copies/reduces a contiguous buffer and syncs
+ * an optional set of flags against a sub-step counter. The sync value is
+ * based on the step parameter. Sync flags must be of type WaitFlag or
+ * PostFlag. The primitive routines wait for all WaitFlag args to attain
+ * at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of
+ * corresponding substep by previous step) before executing the transfer.
+ * After each substep is transferred, all PostFlag arguments get updated to
+ * the value SUBSTEPS*step+substep+1.
+ */
+
+
+// Handle on a 64-bit progress counter that can be polled until it reaches a
+// requested value. `shift` is added to the counter before every comparison.
+class WaitFlag {
+  volatile uint64_t * const flag;
+  const int shift;
+ public:
+  __device__ __forceinline__
+  WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { }
+  // Busy-waits until (*flag + shift) is no longer below `val`. The counter is
+  // re-read through the volatile pointer on every iteration.
+  __device__ __forceinline__
+  void wait(uint64_t val) {
+    for (;;) {
+      const uint64_t observed = *flag + shift;
+      if (observed >= val) break;
+    }
+  }
+};
+
+
+// Handle used to publish progress: a 64-bit counter plus an optional FIFO of
+// per-step sizes.
+//
+// flag      : counter written by post().
+// shift     : subtracted from the value before it is written.
+// fifo      : optional size FIFO; may be NULL, in which case postSize() does
+//             nothing.
+// fifo_size : number of slots in `fifo`; only used when fifo is non-NULL.
+class PostFlag {
+  volatile uint64_t * const flag;
+  const int shift;
+  volatile int * const fifo;
+  const int fifo_size;
+ public:
+  __device__ __forceinline__
+  PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { }
+  // Writes (val - shift) to the counter.
+  __device__ __forceinline__
+  void post(uint64_t val) { *flag = (val - shift); }
+  // Records `size` in slot step % fifo_size of the size FIFO, if there is one.
+  __device__ __forceinline__
+  void postSize(uint64_t step, int size) { if (fifo != NULL) fifo[step%fifo_size] = size; }
+};
+
+
+// AnyAre<T>(args...) reports whether at least one argument has exactly type T.
+// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
+template<typename T> __device__ __forceinline__
+bool AnyAre() {
+  // Empty argument list: nothing can match.
+  return false;
+}
+
+template<typename T, typename FIRST_T, typename... TAIL_Ts>
+__device__ __forceinline__
+bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
+  // Check the type of the head argument, then recurse on the remaining ones.
+  if (std::is_same<T, FIRST_T>::value) return true;
+  return AnyAre<T>(tail...);
+}
+
+
+// Wait on all WaitFlags, ignore PostFlags
+// The argument list is walked recursively, one flag per call; overload
+// resolution on the type of the head argument decides whether to wait.
+
+// Base case: no flags left.
+__device__ __forceinline__
+void WaitOnFlags(uint64_t val) { }
+
+// Head is a WaitFlag: block until it reaches `val`, then handle the rest.
+template <typename... TAIL_Ts> __device__ __forceinline__
+void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
+  flag.wait(val);
+  WaitOnFlags(val, tail...);
+}
+
+// Head is a PostFlag: skip it.
+template <typename... TAIL_Ts> __device__ __forceinline__
+void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) {
+  WaitOnFlags(val, tail...);
+}
+
+
+// Post all PostFlags, ignore WaitFlags
+// The argument list is walked recursively, one flag per call; overload
+// resolution on the type of the head argument decides whether to post.
+
+// Base case: no flags left.
+__device__ __forceinline__
+void PostToFlags(uint64_t val) { }
+
+// Head is a WaitFlag: skip it.
+template <typename... TAIL_Ts> __device__ __forceinline__
+void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
+  PostToFlags(val, tail...);
+}
+
+// Head is a PostFlag: publish `val` through it, then handle the rest.
+template <typename... TAIL_Ts> __device__ __forceinline__
+void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) {
+  flag.post(val);
+  PostToFlags(val, tail...);
+}
+
+
+// Post sizes for PostFlags, ignore WaitFlags
+// The argument list is walked recursively, one flag per call; overload
+// resolution on the type of the head argument decides whether to post.
+
+// Base case: no flags left.
+__device__ __forceinline__
+void PostSizeToFlags(uint64_t step, int size) { }
+
+// Head is a WaitFlag: skip it.
+template <typename... TAIL_Ts> __device__ __forceinline__
+void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) {
+  PostSizeToFlags(step, size, tail...);
+}
+
+// Head is a PostFlag: record `size` for `step` in its size FIFO (a no-op when
+// the flag has no FIFO), then handle the rest.
+template <typename... TAIL_Ts> __device__ __forceinline__
+void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) {
+  flag.postSize(step, size);
+  PostSizeToFlags(step, size, tail...);
+}
+
+
+// Pointer-offset helper with a syntax that doesn't break for nullptr_t.
+// Generic form: returns ptr advanced by i elements.
+template <typename Tptr> __device__ __forceinline__
+Tptr ptradd(Tptr ptr, int i) {
+  Tptr const shifted = ptr + i;
+  return shifted;
+}
+
+// nullptr_t form: the offset is ignored and the result is still null.
+__device__ __forceinline__
+nullptr_t ptradd(nullptr_t ptr, int i) {
+  return ptr;
+}
+
+
+// Implementation of primitive types
+//
+// UNROLL   : unroll factor forwarded to ReduceOrCopy.
+// SUBSTEPS : number of slices every operation is split into; flags are
+//            waited on / posted once per slice.
+// T        : element type.
+// REDOP    : reduction functor used when a second source is given.
+template <int UNROLL, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> >
+class Primitives {
+ private:
+  // Common implementation behind Copy, DoubleCopy, Reduce and ReduceCopy.
+  //
+  // tid, nthreads : threads with tid < nthreads move data; the remaining
+  //                 thread(s) only take part in the block-wide barrier and
+  //                 perform the posting.
+  // src1, dst1    : mandatory source / destination.
+  // src2, dst2    : optional second source / destination; pass nullptr to
+  //                 disable (checked at compile time via the type).
+  // len           : length of the whole operation; each slice covers
+  //                 len / SUBSTEPS elements.
+  // maxoffset     : number of valid elements; slices beyond it are clamped
+  //                 down to size 0.
+  // step          : step counter from which flag values are derived.
+  // flags         : any mix of WaitFlag and PostFlag objects.
+  template <typename SRC2_T, // either T* or nullptr_t
+      typename DST2_T, // either T* or nullptr_t
+      typename... SYNC_Ts> // either WaitFlag or PostFlag
+  static __device__ __forceinline__ void
+  GenericOp(const int tid, const int nthreads,
+      const T*     src1,
+      const SRC2_T src2,
+      T*     dst1,
+      DST2_T dst2,
+      int len, int maxoffset, uint64_t step, SYNC_Ts... flags) {
+
+    enum { noSrc2 = std::is_same<SRC2_T, nullptr_t>::value };
+    enum { noDst2 = std::is_same<DST2_T, nullptr_t>::value };
+    static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value,
+        "src2 must be of type T* or nullptr_t");
+    static_assert(noDst2 || std::is_same<DST2_T, T*>::value,
+        "dst2 must be of type T* or nullptr_t");
+
+    // Without a second source REDOP is not used; FuncSum<T> is substituted.
+    using OpType = typename std::conditional<noSrc2, FuncSum<T>, REDOP>::type;
+
+    int sliceSize = len / SUBSTEPS;
+    int sliceOffset = 0;
+
+#pragma unroll 1
+    for (int sub=0; sub<SUBSTEPS; ++sub) {
+      // Clamp the slice to the valid range [0, maxoffset).
+      int realSize = max(0, min(sliceSize, maxoffset-sliceOffset));
+      if (tid < nthreads) {
+        if (AnyAre<WaitFlag>(flags...)) {
+          // Only thread 0 polls the flags; the named barrier (id 1, nthreads
+          // participants) then releases the other worker threads.
+          if (tid == 0) {
+            WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
+          }
+          asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+        }
+        ReduceOrCopy
+        <
+        UNROLL,
+        OpType,
+        T,
+        !std::is_same<DST2_T, nullptr_t>::value, // HAS_DEST1
+        !std::is_same<SRC2_T, nullptr_t>::value  // HAS_SRC1
+        >
+        (
+            tid, nthreads,
+            ptradd(dst1, sliceOffset),
+            ptradd(dst2, sliceOffset),
+            ptradd(src1, sliceOffset),
+            ptradd(src2, sliceOffset),
+            realSize
+        );
+        if (AnyAre<PostFlag>(flags...)) {
+          // Pairs with the __syncthreads() in the else branch below.
+          __syncthreads();
+        }
+      } else {
+        if (AnyAre<PostFlag>(flags...)) {
+          // Wait for the worker threads to finish the slice, then publish its
+          // size and, after a system-wide fence, the new flag value.
+          __syncthreads();
+          PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...);
+          __threadfence_system();
+          PostToFlags(SUBSTEPS*step + sub + 1, flags...);
+        }
+      }
+      sliceOffset += sliceSize;
+    }
+  }
+
+ public:
+  // dst = src
+  template <typename... SYNC_Ts>
+  static __device__ __forceinline__ void
+  Copy(const int tid, const int nthreads, const T* src, T* dst,
+      int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
+    GenericOp(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
+  }
+
+  // One source, two destinations (dst1 and dst2).
+  template <typename... SYNC_Ts>
+  static __device__ __forceinline__ void
+  DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2,
+      int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
+    GenericOp(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
+  }
+
+  // Two sources (src1 and src2) combined with REDOP, one destination.
+  template <typename... SYNC_Ts>
+  static __device__ __forceinline__ void
+  Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst,
+      int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
+    GenericOp(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...);
+  }
+
+  // Two sources combined with REDOP, two destinations.
+  template <typename... SYNC_Ts>
+  static __device__ __forceinline__ void
+  ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2,
+      int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
+    GenericOp(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...);
+  }
+};
+
+#endif // end include guard
diff --git a/src/3rd_party/nccl/src/collectives/device/reduce.cu b/src/3rd_party/nccl/src/collectives/device/reduce.cu
new file mode 100644
index 00000000..bd1d23ce
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/reduce.cu
@@ -0,0 +1,21 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "common.h"
+#include "reduce.h"
+#include "collectives.h"
+
+// Unroll factor; presumably picked up by the IMPL_COLL2 expansion below, which
+// is defined outside this file -- TODO confirm.
+#define UNROLL 4
+
+// NCCL_OP selects which single reduction operator this translation unit
+// instantiates for ncclReduce.
+// NOTE(review): index 2 maps to min and 3 to max here, whereas the function
+// table in functions.cu lists the operators as sum, prod, max, min. Each line
+// passes a matching name/functor/enum triple, so NCCL_OP looks like a pure
+// build-partition index -- confirm against the build rules.
+#if NCCL_OP == 0
+IMPL_COLL2(ncclReduce, sum,  FuncSum,  ncclCollReduce, ncclSum);
+#elif NCCL_OP == 1
+IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd);
+#elif NCCL_OP == 2
+IMPL_COLL2(ncclReduce, min,  FuncMin,  ncclCollReduce, ncclMin);
+#elif NCCL_OP == 3
+IMPL_COLL2(ncclReduce, max,  FuncMax,  ncclCollReduce, ncclMax);
+#endif
diff --git a/src/3rd_party/nccl/src/collectives/device/reduce.h b/src/3rd_party/nccl/src/collectives/device/reduce.h
new file mode 100644
index 00000000..f5694b14
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/reduce.h
@@ -0,0 +1,190 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "primitives.h"
+#include "collectives.h"
+
+// Increase Step and boffset for buffer sync
+// boffset advances by one slice and wraps to 0 once it reaches buffSize.
+// Expects step, boffset, sliceSize and buffSize in the enclosing scope.
+// Expands to several statements, so it must not be used as the unbraced body
+// of an if/else.
+#define NEXT_STEP \
+  step++; \
+  boffset += sliceSize; \
+  if (boffset == buffSize) boffset = 0;
+
+// Reduce kernel: data is passed from rank to rank along the ring and combined
+// with each rank's own input until it reaches the root, which writes the
+// result to its output buffer.
+//
+// UNROLL : unroll factor forwarded to Primitives.
+// FUNC   : reduction functor.
+// T      : element type.
+// args   : collective arguments (element count N, root, input/output
+//          pointers, opCount, ring count, ...).
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclReduceKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  // One thread of the block is not counted as a worker; inside Primitives the
+  // threads with tid >= nthreads take the flag-posting branch.
+  const int nthreads = blockDim.x - 1;
+  const int bid = args->bid;
+  struct ncclComm* comm = args->comm;
+  struct ncclRing* ring = comm->rings+blockIdx.x;
+
+  WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
+  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
+  PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS);
+
+  typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;
+
+  const ssize_t size = args->N;
+  const int nranks = comm->nRanks;
+  // Sizes below are expressed in elements of T, not bytes.
+  const int buffSize = ring->buffSize / sizeof(T);
+  const int sliceSize = buffSize / REDUCE_BUFCHUNKS;
+  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+  const int rank = ring->devUserRanks[0];
+  const int prevRank = ring->devUserRanks[nranks-1];
+  const int root = args->root;
+
+  if (tid == 0) {
+    // Update in case we skipped some collectives
+    *ring->recv.conn.opCount = args->opCount;
+
+    if (rank != root) {
+      // Wait for next to be ready
+      WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
+      waitOpCountNext.wait(args->opCount);
+    }
+  }
+  __syncthreads();
+
+  uint64_t step = 0ULL;
+  int boffset = 0;
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
+  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    // Per-ring chunk for this iteration, aligned via ALIGN_SIZE.
+    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
+    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t offset = gridOffset + bid*chunkSize;
+    int maxOffset = min(chunkSize, size-offset);
+    if (prevRank == root) {
+      // First rank after the root: nothing to receive, just forward our own
+      // input to the next rank.
+      Prims::Copy(tid, nthreads,
+          thisInput + offset,
+          nextOutput + boffset,
+          sliceSize, maxOffset,
+          step,
+          waitDoneFromNext,
+          postReadyToNext);
+    } else if (rank == root) {
+      // Root: combine the received data with our input into the user output.
+      Prims::Reduce(tid, nthreads,
+          prevInput  + boffset,
+          thisInput + offset,
+          thisOutput + offset,
+          sliceSize, maxOffset,
+          step,
+          waitReadyFromPrev,
+          postDoneToPrev);
+    } else {
+      // Intermediate rank: combine the received data with our input and
+      // forward the result to the next rank.
+      Prims::Reduce(tid, nthreads,
+          prevInput + boffset,
+          thisInput + offset,
+          nextOutput + boffset,
+          sliceSize, maxOffset,
+          step,
+          waitDoneFromNext, waitReadyFromPrev,
+          postReadyToNext, postDoneToPrev);
+    }
+    NEXT_STEP; // Increases step, boffset
+  }
+
+  if (tid == 0) {
+    if (rank != root) {
+      // Wait for next to have consumed data before resetting the flag
+      waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1));
+      *ring->send.conn.head = 0ULL;
+    }
+    *ring->recv.conn.tail = 0ULL;
+    // Make the flag resets visible before the incremented opCount is written.
+    __threadfence_system();
+    *ring->recv.conn.opCount = args->opCount+1;
+  }
+}
+
+#include "ll_kernel.h"
+
+// Advances the LL state by one step: boffset moves forward by one slice of
+// FIFO lines and wraps to 0 at NCCL_LL_BUFF_LINES; flag and step are both
+// incremented.
+// Expects boffset, flag and step in the enclosing scope. Expands to several
+// statements, so it must not be used as the unbraced body of an if/else.
+#define NEXT_STEP_LL \
+  boffset += NCCL_LL_SLICE_LINES; \
+  if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
+  flag++; \
+  step++;
+
+// Reduce kernel built on the LL primitives (LLPrimitives and the FIFO-line
+// macros from ll_kernel.h). The data flow is the same as in ncclReduceKernel.
+//
+// UNUSED : template parameter kept for signature compatibility; not
+//          referenced in the body.
+// FUNC   : reduction functor.
+// T      : element type.
+// args   : collective arguments.
+//
+// Note: the WAIT_NEXT, POST_SIZE, ACK_PREV, NEXT_STEP_LL and
+// FIFO_CLEANING_AND_SAVE_STEP macros read and modify the locals declared here
+// by name (tid, llNthreads, recvHeadPtr, sendHeadPtr, sizesFifo, sendHead,
+// step, flag, boffset, maxOffset, ring, prevInput), so these names must not
+// be changed.
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int bid = args->bid;
+  const int llNthreads = args->nThreads;
+  struct ncclComm* comm = args->comm;
+  struct ncclRing* ring = comm->rings+blockIdx.x;
+  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
+  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
+  volatile int * sizesFifo = ring->send.conn.llFifo;
+  uint64_t sendHead = sendHeadPtr[0];
+  const int nranks = comm->nRanks;
+  const int rank = comm->rank;
+  const int prevRank = ring->devUserRanks[nranks-1];
+  const int root = args->root;
+
+  typedef LLPrimitives<T, FUNC> LL;
+
+  const ssize_t size = args->N;
+  // Number of T elements that fit in one slice (each line carries 8 bytes of
+  // payload).
+  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+  const ssize_t loopSize = args->nRings*chunkSize;
+
+  // Resume from the step saved by the previous collective on this ring.
+  uint64_t step = ring->send.conn.llStep;
+  uint32_t flag = step + 1;
+  int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
+  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    // The final, partial iteration uses the precomputed lastChunkSize.
+    if (size-gridOffset < loopSize) {
+      chunkSize = args->lastChunkSize;
+    }
+    ssize_t offset = gridOffset + bid*chunkSize;
+
+    int maxOffset = min(chunkSize, size-offset);
+    if (prevRank == root) {
+      // First rank after the root: send our own input to the next rank.
+      WAIT_NEXT;
+      LL::ReduceCopy(
+          thisInput + offset,
+          nextOutput + boffset,
+          maxOffset, flag, llNthreads);
+      POST_SIZE;
+      NEXT_STEP_LL;
+    } else if (rank == root) {
+      // Root: combine the received lines with our input into the user output.
+      LL::ReduceCopy(
+          thisInput + offset,
+          prevInput  + boffset,
+          thisOutput + offset,
+          maxOffset, flag, llNthreads);
+      NEXT_STEP_LL;
+      ACK_PREV;
+    } else {
+      // Intermediate rank: combine the received lines with our input and
+      // forward the result to the next rank.
+      WAIT_NEXT;
+      LL::ReduceCopy(
+          thisInput + offset,
+          prevInput + boffset,
+          nextOutput + boffset,
+          maxOffset, flag, flag, llNthreads);
+      POST_SIZE;
+      NEXT_STEP_LL;
+      ACK_PREV;
+    }
+  }
+
+  // We need everyone to acknowledge data even if they didn't receive anything
+  // so that the next collective can start right away.
+  ACK_PREV;
+
+  FIFO_CLEANING_AND_SAVE_STEP(flag);
+}
diff --git a/src/3rd_party/nccl/src/collectives/device/reduce_kernel.h b/src/3rd_party/nccl/src/collectives/device/reduce_kernel.h
new file mode 100644
index 00000000..0cb8f139
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/reduce_kernel.h
@@ -0,0 +1,364 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+
+#ifndef NCCL_REDUCE_KERNEL_H_
+#define NCCL_REDUCE_KERNEL_H_
+
+#include "common_kernel.h"
+#include <limits>
+
+// Binary functor that ignores both operands and always yields zero.
+template<typename T>
+struct FuncNull {
+  __device__ T operator()(const T x, const T y) const {
+    const T zero = 0;
+    return zero;
+  }
+};
+
+// Binary functor returning x + y (generic version; specializations exist for
+// some element types).
+template<typename T>
+struct FuncSum {
+  __device__ T operator()(const T x, const T y) const {
+    const T total = x + y;
+    return total;
+  }
+};
+
+// Binary functor returning x * y (generic version; specializations exist for
+// some element types).
+template<typename T>
+struct FuncProd {
+  __device__ T operator()(const T x, const T y) const {
+    const T product = x * y;
+    return product;
+  }
+};
+
+// Binary functor returning the larger operand; x is returned unless x < y.
+template<typename T>
+struct FuncMax {
+  __device__ T operator()(const T x, const T y) const {
+    if (x < y) return y;
+    return x;
+  }
+};
+
+// Binary functor returning the smaller operand; y is returned unless x < y.
+template<typename T>
+struct FuncMin {
+  __device__ T operator()(const T x, const T y) const {
+    if (x < y) return x;
+    return y;
+  }
+};
+
+// FuncSum specialization for int8_t.
+// The uint32_t overload adds four 8-bit lanes packed in one 32-bit word; the
+// int8_t overload adds a single element.
+template<>
+struct FuncSum<int8_t> {
+  union converter { uint32_t storage; char4 a; };
+  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
+    // One vadd4 instruction covers the whole word.
+    int32_t rv, z=0;
+    asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+    return rv;
+#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
+    // Four vadd instructions, one per byte lane (.b0 .. .b3).
+    int32_t rv;
+    asm("vadd.s32.s32.s32 %0,    %1.b0, %2.b0;    \n\t"
+        "vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+        "vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+        "vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+    return rv;
+#else
+    // Fallback: unpack through the char4 union and add lane by lane.
+    converter cx, cy, cr;
+    cx.storage = x;
+    cy.storage = y;
+    cr.a.x = cx.a.x + cy.a.x;
+    cr.a.y = cx.a.y + cy.a.y;
+    cr.a.z = cx.a.z + cy.a.z;
+    cr.a.w = cx.a.w + cy.a.w;
+    return cr.storage;
+#endif
+  }
+  __device__ int8_t operator()(const int8_t x, const int8_t y) const {
+    return x+y;
+  }
+};
+// FuncSum specialization for uint8_t.
+// The uint32_t overload adds four 8-bit lanes packed in one 32-bit word; the
+// uint8_t overload adds a single element.
+template<>
+struct FuncSum<uint8_t> {
+  union converter { uint32_t storage; uchar4 a; };
+  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
+    // One vadd4 instruction covers the whole word.
+    int32_t rv, z=0;
+    asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+    return rv;
+#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
+    // Four vadd instructions, one per byte lane (.b0 .. .b3).
+    int32_t rv;
+    asm("vadd.u32.u32.u32 %0,    %1.b0, %2.b0;    \n\t"
+        "vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+        "vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+        "vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+    return rv;
+#else
+    // Fallback: unpack through the uchar4 union and add lane by lane.
+    converter cx, cy, cr;
+    cx.storage = x;
+    cy.storage = y;
+    cr.a.x = cx.a.x + cy.a.x;
+    cr.a.y = cx.a.y + cy.a.y;
+    cr.a.z = cx.a.z + cy.a.z;
+    cr.a.w = cx.a.w + cy.a.w;
+    return cr.storage;
+#endif
+  }
+  __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
+    return x+y;
+  }
+};
+
+// Multiplies four 8-bit lanes packed in x with the matching lanes of y and
+// returns the four 8-bit products packed the same way.
+static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
+  /* This can be used both for signed and unsigned 8-bit multiplication */
+#if (__CUDA_ARCH__ >= 300)
+  // Each vmad multiplies one byte lane; the products of lanes 3 and 2 are
+  // shifted left by 16 and passed as the addend of the lane 1 and lane 0
+  // products. The masks 0xff00ff00 / 0x00ff00ff then keep one byte per lane
+  // before the two halves are OR-ed together.
+  uint32_t rv;
+  asm("{ .reg .u32 t0, t1, t2, t3;\n\t"
+      " vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t"
+      " vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t"
+      " shl.b32          t3, t3, 16;\n\t"
+      " shl.b32          t2, t2, 16;\n\t"
+      " vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t"
+      " shl.b32          t1, t1, 8;\n\t"
+      " vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t"
+      " and.b32          t1, t1, 0xff00ff00;\n\t"
+      " and.b32          t0, t0, 0x00ff00ff;\n\t"
+      " or.b32           %0,  t0, t1;\n\t"
+      "}" : "=r"(rv) : "r"(x), "r"(y));
+  return rv;
+#else
+  // Fallback: unpack through a char4 union and multiply lane by lane.
+  union converter { uint32_t storage; char4 a; };
+  converter cx, cy, cr;
+  cx.storage = x;
+  cy.storage = y;
+  cr.a.x = cx.a.x * cy.a.x;
+  cr.a.y = cx.a.y * cy.a.y;
+  cr.a.z = cx.a.z * cy.a.z;
+  cr.a.w = cx.a.w * cy.a.w;
+  return cr.storage;
+#endif
+}
+
+// FuncProd specialization for int8_t: the uint32_t overload multiplies four
+// packed 8-bit lanes via mulChar4, the int8_t overload a single element.
+template<>
+struct FuncProd<int8_t> {
+  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+    const uint32_t packed = mulChar4(x, y);
+    return packed;
+  }
+  __device__ int8_t operator()(const int8_t x, const int8_t y) const {
+    const int8_t product = x*y;
+    return product;
+  }
+};
+// FuncProd specialization for uint8_t: the uint32_t overload multiplies four
+// packed 8-bit lanes via mulChar4, the uint8_t overload a single element.
+template<>
+struct FuncProd<uint8_t> {
+  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+    const uint32_t packed = mulChar4(x, y);
+    return packed;
+  }
+  __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
+    const uint8_t product = x*y;
+    return product;
+  }
+};
+
+// FuncMax specialization for int8_t.
+// The uint32_t overload takes the lane-wise maximum of four 8-bit lanes
+// packed in one 32-bit word; the int8_t overload handles a single element.
+template<>
+struct FuncMax<int8_t> {
+  union converter { uint32_t storage; char4 a; };
+  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
+    // One vmax4 instruction covers the whole word.
+    int32_t rv, z=0;
+    asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+    return rv;
+#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
+    // Four vmax instructions, one per byte lane (.b0 .. .b3).
+    int32_t rv;
+    asm("vmax.s32.s32.s32 %0,    %1.b0, %2.b0;    \n\t"
+        "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+        "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+        "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+    return rv;
+#else
+    // Fallback: unpack through the char4 union and compare lane by lane.
+    converter cx, cy, cr;
+    cx.storage = x;
+    cy.storage = y;
+    cr.a.x = max(cx.a.x, cy.a.x);
+    cr.a.y = max(cx.a.y, cy.a.y);
+    cr.a.z = max(cx.a.z, cy.a.z);
+    cr.a.w = max(cx.a.w, cy.a.w);
+    return cr.storage;
+#endif
+  }
+  __device__ int8_t operator()(const int8_t x, const int8_t y) const {
+    return (x>y) ? x : y;
+  }
+};
+// FuncMax specialization for uint8_t.
+// The uint32_t overload takes the lane-wise maximum of four 8-bit lanes
+// packed in one 32-bit word; the uint8_t overload handles a single element.
+template<>
+struct FuncMax<uint8_t> {
+  union converter { uint32_t storage; uchar4 a; };
+  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
+    // One vmax4 instruction covers the whole word.
+    int32_t rv, z=0;
+    asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+    return rv;
+#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
+    // Four vmax instructions, one per byte lane (.b0 .. .b3).
+    int32_t rv;
+    asm("vmax.u32.u32.u32 %0,    %1.b0, %2.b0;    \n\t"
+        "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+        "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+        "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+    return rv;
+#else
+    // Fallback: unpack through the uchar4 union and compare lane by lane.
+    converter cx, cy, cr;
+    cx.storage = x;
+    cy.storage = y;
+    cr.a.x = max(cx.a.x, cy.a.x);
+    cr.a.y = max(cx.a.y, cy.a.y);
+    cr.a.z = max(cx.a.z, cy.a.z);
+    cr.a.w = max(cx.a.w, cy.a.w);
+    return cr.storage;
+#endif
+  }
+  __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
+    return (x>y) ? x : y;
+  }
+};
+
+// FuncMin specialization for int8_t.
+// The uint32_t overload takes the lane-wise minimum of four 8-bit lanes
+// packed in one 32-bit word; the int8_t overload handles a single element.
+template<>
+struct FuncMin<int8_t> {
+  union converter { uint32_t storage; char4 a; };
+  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
+    // One vmin4 instruction covers the whole word.
+    int32_t rv, z=0;
+    asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+    return rv;
+#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
+    // Four vmin instructions, one per byte lane (.b0 .. .b3).
+    int32_t rv;
+    asm("vmin.s32.s32.s32 %0,    %1.b0, %2.b0;    \n\t"
+        "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+        "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+        "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+    return rv;
+#else
+    // Fallback: unpack through the char4 union and compare lane by lane.
+    converter cx, cy, cr;
+    cx.storage = x;
+    cy.storage = y;
+    cr.a.x = min(cx.a.x, cy.a.x);
+    cr.a.y = min(cx.a.y, cy.a.y);
+    cr.a.z = min(cx.a.z, cy.a.z);
+    cr.a.w = min(cx.a.w, cy.a.w);
+    return cr.storage;
+#endif
+  }
+  __device__ int8_t operator()(const int8_t x, const int8_t y) const {
+    return (x<y) ? x : y;
+  }
+};
+// FuncMin specialization for uint8_t.
+// The uint32_t overload takes the lane-wise minimum of four 8-bit lanes
+// packed in one 32-bit word; the uint8_t overload handles a single element.
+template<>
+struct FuncMin<uint8_t> {
+  union converter { uint32_t storage; uchar4 a; };
+  __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
+#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
+    // One vmin4 instruction covers the whole word.
+    int32_t rv, z=0;
+    asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
+    return rv;
+#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
+    // Four vmin instructions, one per byte lane (.b0 .. .b3).
+    int32_t rv;
+    asm("vmin.u32.u32.u32 %0,    %1.b0, %2.b0;    \n\t"
+        "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
+        "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
+        "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
+    return rv;
+#else
+    // Fallback: unpack through the uchar4 union and compare lane by lane.
+    converter cx, cy, cr;
+    cx.storage = x;
+    cy.storage = y;
+    cr.a.x = min(cx.a.x, cy.a.x);
+    cr.a.y = min(cx.a.y, cy.a.y);
+    cr.a.z = min(cx.a.z, cy.a.z);
+    cr.a.w = min(cx.a.w, cy.a.w);
+    return cr.storage;
+#endif
+  }
+  __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
+    return (x<y) ? x : y;
+  }
+};
+
+// FuncSum specialization for half.
+// The half2 overload adds two packed half values at once; the half overload
+// adds a single value. When __CUDA_ARCH__ is >= 530 and not 610 the half
+// intrinsics (__hadd2 / __hadd) are used; otherwise the operands are converted
+// to float, added, and converted back.
+template<>
+struct FuncSum<half> {
+  __device__ half2 operator()(const half2 x, const half2 y) const {
+#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
+    return __hadd2(x, y);
+#else
+    float2 fx, fy, fr;
+    fx = __half22float2(x);
+    fy = __half22float2(y);
+    fr.x = fx.x + fy.x;
+    fr.y = fx.y + fy.y;
+    return __float22half2_rn(fr);
+#endif
+  }
+  __device__ half operator()(const half x, const half y) const {
+#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
+    return __hadd(x, y);
+#else
+    return __float2half( __half2float(x) + __half2float(y) );
+#endif
+  }
+};
+
+// FuncProd specialization for half.
+// The half2 overload multiplies two packed half values at once; the half
+// overload multiplies a single value. When __CUDA_ARCH__ is >= 530 and not 610
+// the half intrinsics (__hmul2 / __hmul) are used; otherwise the operands are
+// converted to float, multiplied, and converted back.
+template<>
+struct FuncProd<half> {
+  __device__ half2 operator()(const half2 x, const half2 y) const {
+#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
+    return __hmul2(x, y);
+#else
+    float2 fx, fy, fr;
+    fx = __half22float2(x);
+    fy = __half22float2(y);
+    fr.x = fx.x * fy.x;
+    fr.y = fx.y * fy.y;
+    return __float22half2_rn(fr);
+#endif
+  }
+  __device__ half operator()(const half x, const half y) const {
+#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
+    return __hmul(x, y);
+#else
+    return __float2half( __half2float(x) * __half2float(y) );
+#endif
+  }
+};
+
+// FuncMax specialization for half. Both overloads convert their operands to
+// float, take the maximum with fmaxf, and convert the result back.
+template<>
+struct FuncMax<half> {
+  // Lane-wise maximum of two packed half values.
+  __device__ half2 operator()(const half2 x, const half2 y) const {
+    const float2 lhs = __half22float2(x);
+    const float2 rhs = __half22float2(y);
+    float2 larger;
+    larger.x = fmaxf(lhs.x, rhs.x);
+    larger.y = fmaxf(lhs.y, rhs.y);
+    return __float22half2_rn(larger);
+  }
+  // Maximum of two single half values.
+  __device__ half operator()(const half x, const half y) const {
+    const float lhs = __half2float(x);
+    const float rhs = __half2float(y);
+    return __float2half(fmaxf(lhs, rhs));
+  }
+};
+
+// FuncMin specialization for half. Both overloads convert their operands to
+// float, take the minimum with fminf, and convert the result back.
+template<>
+struct FuncMin<half> {
+  // Lane-wise minimum of two packed half values.
+  __device__ half2 operator()(const half2 x, const half2 y) const {
+    const float2 lhs = __half22float2(x);
+    const float2 rhs = __half22float2(y);
+    float2 smaller;
+    smaller.x = fminf(lhs.x, rhs.x);
+    smaller.y = fminf(lhs.y, rhs.y);
+    return __float22half2_rn(smaller);
+  }
+  // Minimum of two single half values.
+  __device__ half operator()(const half x, const half y) const {
+    const float lhs = __half2float(x);
+    const float rhs = __half2float(y);
+    return __float2half(fminf(lhs, rhs));
+  }
+};
+#endif // NCCL_REDUCE_KERNEL_H_
diff --git a/src/3rd_party/nccl/src/collectives/device/reduce_scatter.cu b/src/3rd_party/nccl/src/collectives/device/reduce_scatter.cu
new file mode 100644
index 00000000..b16053c4
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/reduce_scatter.cu
@@ -0,0 +1,21 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "common.h"
+#include "reduce_scatter.h"
+#include "collectives.h"
+
+#define UNROLL 4  // NOTE(review): presumably the unroll factor picked up by the IMPL_COLL2 expansion - confirm in common.h
+
+#if NCCL_OP == 0  // NCCL_OP selects which single reduction operator this translation unit instantiates
+IMPL_COLL2(ncclReduceScatter, sum,  FuncSum,  ncclCollReduceScatter, ncclSum);
+#elif NCCL_OP == 1
+IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd);
+#elif NCCL_OP == 2
+IMPL_COLL2(ncclReduceScatter, min,  FuncMin,  ncclCollReduceScatter, ncclMin);
+#elif NCCL_OP == 3
+IMPL_COLL2(ncclReduceScatter, max,  FuncMax,  ncclCollReduceScatter, ncclMax);
+#endif  // any other NCCL_OP value instantiates nothing
diff --git a/src/3rd_party/nccl/src/collectives/device/reduce_scatter.h b/src/3rd_party/nccl/src/collectives/device/reduce_scatter.h
new file mode 100644
index 00000000..cad011b2
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/device/reduce_scatter.h
@@ -0,0 +1,217 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "primitives.h"
+#include "collectives.h"
+
+// Advance one step for buffer sync: bump `step`, keep the current slot in `poffset`, move `noffset` one `sliceSize` ahead and wrap it to 0 at `buffSize`. Uses the caller's locals step/poffset/noffset/sliceSize/buffSize.
+#define NEXT_STEP \
+  step++; \
+  poffset = noffset; \
+  noffset += sliceSize; \
+  if (noffset == buffSize) noffset = 0;
+
+template<int UNROLL, class FUNC, typename T>  // Ring ReduceScatter device routine: per chunk, one Copy to the next rank, nranks-2 Reduce-and-forward steps, then a final Reduce into thisOutput.
+__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int nthreads = blockDim.x - 1;  // the primitives are handed one thread fewer than the block holds
+  const int bid = args->bid;
+  struct ncclComm* comm = args->comm;
+  struct ncclRing* ring = comm->rings+blockIdx.x;  // each block works on the ring matching its block index
+
+  WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
+  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
+  PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
+  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
+
+  typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;
+
+  const ssize_t size = args->N;
+  const int nranks = comm->nRanks;
+  const int buffSize = ring->buffSize / sizeof(T);  // ring buffer capacity expressed in elements of T
+  const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS;
+  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;  // elements covered by all rings in one outer iteration
+
+  if (tid == 0) {
+    // Update in case we skipped some collectives
+    *ring->recv.conn.opCount = args->opCount;
+    // Wait for next to be ready
+    WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
+    waitOpCountNext.wait(args->opCount);
+  }
+  __syncthreads();  // the rest of the block proceeds only after thread 0 finished the opCount handshake
+
+  uint64_t step = 0ULL;
+  int poffset, noffset = 0;  // poffset is first assigned by NEXT_STEP, before its first use
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
+  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));  // at most one slice, or an even share of what is left
+    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+    /////////////// begin ReduceScatter steps ///////////////
+    ssize_t offset;
+    int maxOffset = min(chunkSize, size-chunkOffset);  // NOTE(review): presumably the number of valid elements in this chunk - confirm against Primitives
+    int rankDest;
+
+    // step 0: push data to next GPU
+    rankDest = ring->devUserRanks[nranks-1];
+    offset = chunkOffset + rankDest * size;  // input holds one `size`-element segment per destination rank
+
+    Prims::Copy(tid, nthreads,
+        thisInput  + offset,
+        nextOutput + noffset,
+        sliceSize, maxOffset,
+        step,
+        waitDoneFromNext,
+        postReadyToNext);
+
+    NEXT_STEP; // Increases step, poffset, noffset
+
+    // k-2 steps: reduce and copy to next GPU
+    for (int j=2; j<nranks; ++j) {  // runs nranks-2 times; skipped entirely when nranks == 2
+      rankDest = ring->devUserRanks[nranks-j];
+      offset = chunkOffset + rankDest * size;
+
+      Prims::Reduce(tid, nthreads,
+          prevInput  + poffset,
+          thisInput  + offset,
+          nextOutput + noffset,
+          sliceSize, maxOffset,
+          step,
+          waitDoneFromNext, waitReadyFromPrev,
+          postReadyToNext, postDoneToPrev);
+
+      NEXT_STEP;
+    }
+
+    // step k-1: reduce this buffer and data, which will produce the final
+    // result that we store in this data and push to the next GPU
+    rankDest = ring->devUserRanks[0];
+    offset = chunkOffset + rankDest * size;
+
+    Prims::Reduce(tid, nthreads,
+        prevInput  + poffset,
+        thisInput  + offset,
+        thisOutput + chunkOffset,
+        sliceSize, maxOffset,
+        step,
+        waitReadyFromPrev,
+        postDoneToPrev);
+  }
+
+  if (tid == 0) {  // teardown by thread 0: wait on the next rank's head, zero head/tail, then publish opCount+1
+    waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS));
+    *ring->send.conn.head = 0ULL;
+    *ring->recv.conn.tail = 0ULL;
+    __threadfence_system();  // fence placed between the head/tail resets and the opCount update
+    *ring->recv.conn.opCount = args->opCount+1;
+  }
+}
+
+#include "ll_kernel.h"
+
+#define NEXT_STEP_LL /* LL variant: current offset/flag become the "previous" ones, then offset (wrapping at NCCL_LL_BUFF_LINES), flag and step advance */ \
+  poffset = noffset; \
+  pflag = nflag; \
+  noffset += NCCL_LL_SLICE_LINES; \
+  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
+  nflag++; \
+  step++;
+
+template<int UNUSED, class FUNC, typename T>  // Low-latency (LL) ring ReduceScatter: same step sequence as the regular kernel, built on LLPrimitives and the ll_kernel.h macros.
+__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int bid = args->bid;
+  const int llNthreads = args->nThreads;
+  struct ncclComm* comm = args->comm;
+  struct ncclRing* ring = comm->rings+blockIdx.x;
+  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
+  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
+  volatile int * sizesFifo = ring->send.conn.llFifo;
+  uint64_t sendHead = sendHeadPtr[0];  // NOTE(review): tid and these four locals are not used by name below; presumably the ll_kernel.h macros read them - confirm
+
+  typedef LLPrimitives<T, FUNC> LL;
+
+  const ssize_t size = args->N;
+  //const int rank = comm->rank;
+  const int nranks = comm->nRanks;
+  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);  // elements per slice: each LL line contributes 8 payload bytes
+  const ssize_t loopSize = args->nRings*chunkSize;
+
+  uint64_t step = ring->send.conn.llStep;  // resume from the step saved on the connection
+  uint32_t pflag, nflag = step + 1;  // pflag is first assigned by NEXT_STEP_LL, before its first use
+  int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
+  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
+
+  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+    if (size-gridOffset < loopSize) {
+      chunkSize = args->lastChunkSize;  // partial final iteration: use the chunk size supplied in args
+    }
+    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+    /////////////// begin ReduceScatter steps ///////////////
+    ssize_t offset;
+    int maxOffset = min(chunkSize, size-chunkOffset);  // NOTE(review): presumably the number of valid elements in this chunk - confirm against LLPrimitives
+    int rankDest;
+
+    // step 0: push data to next GPU
+    rankDest = ring->devUserRanks[nranks-1];
+    offset = chunkOffset + rankDest * size;  // input holds one `size`-element segment per destination rank
+
+    WAIT_NEXT;  // WAIT_NEXT / POST_SIZE / ACK_PREV are defined in ll_kernel.h (not visible in this file)
+    LL::ReduceCopy(
+        thisInput  + offset,
+        nextOutput + noffset,
+        maxOffset, nflag, llNthreads);
+    POST_SIZE;
+
+    NEXT_STEP_LL;
+
+    // k-2 steps: reduce and copy to next GPU
+    for (int j=2; j<nranks; ++j) {  // runs nranks-2 times; skipped entirely when nranks == 2
+      rankDest = ring->devUserRanks[nranks-j];
+      offset = chunkOffset + rankDest * size;
+
+      WAIT_NEXT;
+      LL::ReduceCopy(
+          thisInput  + offset,
+          prevInput  + poffset,
+          nextOutput + noffset,
+          maxOffset, pflag, nflag, llNthreads);
+      POST_SIZE;
+      ACK_PREV;
+
+      NEXT_STEP_LL;
+    }
+
+    // step k-1: reduce this buffer and data, which will produce the final
+    // result that we store in this data
+    rankDest = ring->devUserRanks[0];
+    offset = chunkOffset + rankDest * size;
+
+    LL::ReduceCopy(
+        thisInput  + offset,
+        prevInput  + poffset,
+        thisOutput + chunkOffset,
+        maxOffset, pflag, llNthreads);
+    ACK_PREV;
+  }
+
+  FIFO_CLEANING_AND_SAVE_STEP(nflag);  // NOTE(review): defined in ll_kernel.h; by its name it cleans the FIFO and stores the step for the next call - confirm
+}
diff --git a/src/3rd_party/nccl/src/collectives/reduce.cu b/src/3rd_party/nccl/src/collectives/reduce.cu
new file mode 100644
index 00000000..76d4a194
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/reduce.cu
@@ -0,0 +1,33 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "common_coll.h"
+#include "enqueue.h"
+#include "collectives.h"
+
+ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  const size_t nbytes = count*ncclTypeSize(datatype);
+  INFO(COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  if (comm->nRanks != 1) {
+    // Several ranks: register the proxy work towards `root`, then queue the Reduce kernel.
+    NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm));
+    NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1));
+  } else if (sendbuff != recvbuff) {
+    // Single rank, out of place: the reduction is just an asynchronous device-to-device copy.
+    CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
+  }
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  // Exported entry point: hands ncclReduceFunc and every argument, unchanged, to ncclEnqueueCheck.
+  return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype, op, root, comm, stream);
+}
diff --git a/src/3rd_party/nccl/src/collectives/reduce_scatter.cu b/src/3rd_party/nccl/src/collectives/reduce_scatter.cu
new file mode 100644
index 00000000..af9d78b2
--- /dev/null
+++ b/src/3rd_party/nccl/src/collectives/reduce_scatter.cu
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "common_coll.h"
+#include "enqueue.h"
+#include "collectives.h"
+
+ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
+  const size_t nbytes = count*ncclTypeSize(datatype);
+  INFO(COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
+  if (comm->nRanks != 1) {
+    const size_t totalBytes = nbytes*comm->nRanks;  // per-rank byte count scaled by the number of ranks
+    NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, totalBytes, proxyPatternRing, comm));
+    NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, totalBytes, 1));
+  } else if (sendbuff != recvbuff) {
+    CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));  // single rank, out of place: plain copy
+  }
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
+ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+  // Exported entry point: ReduceScatter has no root, so 0 is passed in the root slot of ncclEnqueueCheck.
+  return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream);
+}
diff --git a/src/3rd_party/nccl/src/include/bootstrap.h b/src/3rd_party/nccl/src/include/bootstrap.h
new file mode 100644
index 00000000..278593c8
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/bootstrap.h
@@ -0,0 +1,17 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_BOOTSTRAP_H_
+#define NCCL_BOOTSTRAP_H_
+
+#include "nccl.h"
+
+ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
+ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
+ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
+ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
+ncclResult_t bootstrapClose(void* commState);
+#endif
diff --git a/src/3rd_party/nccl/src/include/common_coll.h b/src/3rd_party/nccl/src/include/common_coll.h
new file mode 100644
index 00000000..cd97bbda
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/common_coll.h
@@ -0,0 +1,195 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COMMON_COLL_H_
+#define COMMON_COLL_H_
+
+#include "core.h"
+#include "enqueue.h"
+#include "collectives/collectives.h"
+
+// Validates that `pointer` is known to CUDA and, when it is device memory, that it lives on comm->cudaDev; returns ncclInvalidArgument otherwise.
+static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
+  cudaPointerAttributes attr;
+  if (cudaPointerGetAttributes(&attr, pointer) != cudaSuccess || attr.devicePointer == NULL) {
+    WARN("%s : %s is not a valid pointer", opname, ptrname);
+    return ncclInvalidArgument;
+  }
+#if CUDART_VERSION >= 10000  // keyed on the runtime headers (which define the struct) so host-only compilers pick the same field as nvcc
+  if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+#else
+  if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+#endif
+    WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
+    return ncclInvalidArgument;
+  }
+  return ncclSuccess;
+}
+
+// Rejects a NULL `ptr`; `opname` and `ptrname` are only used to build the warning.
+static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
+  // Common case first: a non-NULL argument needs no diagnostic.
+  if (ptr != NULL) return ncclSuccess;
+  WARN("%s : %s argument is NULL", opname, ptrname);
+  return ncclInvalidArgument;
+}
+
+// Validates the arguments of a collective call: non-NULL comm, root/type/op in range and, when comm->checkPointers is set, the send/recv buffers. Returns ncclInvalidArgument on the first failure.
+static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
+  NCCLCHECK(PtrCheck(comm, opname, "comm"));
+  // Cheap range checks first. Valid roots are 0..nRanks-1, so the message reports nRanks-1 as the upper bound.
+  if (root < 0 || root >= comm->nRanks) {
+    WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks-1);
+    return ncclInvalidArgument;
+  }
+  if (type < 0 || type >= ncclNumTypes) {
+    WARN("%s : invalid type %d", opname, type);
+    return ncclInvalidArgument;
+  }
+  if (op < 0 || op >= ncclNumOps) {
+    WARN("%s : invalid reduction operation %d", opname, op);
+    return ncclInvalidArgument;
+  }
+  if (comm->checkPointers) {
+    // Check CUDA device pointers: sendbuff is skipped on non-root ranks of Broadcast, recvbuff on non-root ranks of Reduce.
+    if (strcmp(opname, "Broadcast") != 0 || comm->rank == root) {
+      NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname));
+    }
+    if (strcmp(opname, "Reduce") != 0 || comm->rank == root) {
+      NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname));
+    }
+  }
+  return ncclSuccess;
+}
+
+// Size in bytes of one element of `type`; -1 when the value is not one of the types listed below.
+static __inline__ int ncclTypeSize(ncclDataType_t type) {
+  switch (type) {
+    // 8-bit integers
+    case ncclInt8: case ncclUint8:
+      return 1;
+    // half-precision float
+    case ncclFloat16:
+      return 2;
+    // 32-bit integers and float
+    case ncclInt32: case ncclUint32: case ncclFloat32:
+      return 4;
+    // 64-bit integers and double
+    case ncclInt64: case ncclUint64: case ncclFloat64:
+      return 8;
+    default:
+      return -1;
+  }
+}
+
+// In : comm, nbytes ; Out : nrings, nthreads, ll (1 = use the low-latency path, 0 = regular path)
+// - We start with the minimum number of threads possible (64) and see if the size fits in LL;
+//   If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default)
+// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads
+//   This ensures we don't use a large number of rings with a small number of threads
+// - We use the NCCL_LL_RING_THRESHOLD as the per-thread threshold before we reach the max number of threads;
+//   we use NCCL_THREAD_THRESHOLD when we reach the max
+// - If, even with the max number of LL threads, the size still cannot fit in LL, then we use the non-LL setting
+// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too
+static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) {
+  *ll = 0;
+  int llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */
+  if (comm->llThreshold >= 0) { /* user sets total LL threshold */
+    if (nbytes > comm->llThreshold) { /* non-LL */
+      *nthreads = comm->nThreads+1;  // non-LL launches get one thread more than comm->nThreads
+      *nrings = comm->nRings;
+      return;
+    } else {
+      llEnforced = 1; /* user wants to use LL */
+    }
+  }
+  int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */
+  size_t nr;  // written inside the loop before *ll can become 1, so the early return below reads a set value
+  int ll_max_nthreads = std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* respect user's setting or platform's default setting */
+  int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS;
+  ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD);
+  while (nt < ll_max_nthreads && *ll == 0) {  // try 64, 128, ... threads, stopping before ll_max_nthreads
+    nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks));
+    if (nr <= maxRings) { /* avoid using few threads but many rings */
+      nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;  // clamp the ring count to [1, comm->nRings]
+      *ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1;
+    }
+    if (*ll == 0) {
+      nt = nt << 1;
+    }
+  }
+  if (*ll == 1) {
+    *nthreads = nt;
+    *nrings = (int)nr;
+    return; /* we can use smaller number of threads to make LL work, stop here */
+  }
+  nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */
+  nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
+  *ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1;  // too large for LL unless the user's threshold enforced it
+  *nthreads = *ll ? ll_max_nthreads : comm->nThreads+1;
+  *nrings = *ll ? (int)nr : comm->nRings;
+}
+
+static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, size_t nbytes, int loopFactor) {  // Queues one collective: fills one ncclColl FIFO entry per block and grows the pending launch parameters in comm->myParams.
+  int llMode, nBlocks, nThreads;
+  ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode);
+  comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads);  // block size only ever grows to the largest request so far
+  if (comm->userStreamSet == false) {
+    comm->userStream = stream;  // the first queued operation fixes the stream; later ones must match it
+    comm->userStreamSet = true;
+  } else if (stream != comm->userStream) {
+    WARN("Error : mixing different streams within a group call is not supported.");
+    return ncclInvalidUsage;
+  }
+  int lastChunkSize = 0;  // only computed in LL mode; stays 0 otherwise
+  if (llMode == 1) {
+    int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype);
+    const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize;
+    lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor);  // remainder after the last full loop, split across blocks
+    ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype));
+  }
+  for (int bid=0; bid<nBlocks; bid++) {
+    struct ncclRing* ring = comm->rings+(comm->myParams->gridDim.x % comm->nRings);  // rings are assigned round-robin by the running block count
+    if (ring->collCount == NCCL_MAX_OPS) {
+      WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
+      return ncclInvalidUsage;
+    }
+
+    comm->myParams->gridDim.x++;
+
+    int opIndex = ring->collFifoTail;
+    struct ncclColl* c = ring->collectives+opIndex;
+    volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
+    while (activePtr[0] != 0) sched_yield();  // wait until this FIFO slot is no longer marked active
+
+    struct CollectiveArgs* args = &c->args;
+    args->root = root;
+    args->N = count;
+    args->ThisInput = sendbuff;
+    args->ThisOutput = recvbuff;
+    args->comm = comm->devComm;  // the kernel receives the device copy of the communicator
+    args->opCount = comm->opCount;
+    args->bid = bid;
+    args->nRings = nBlocks;
+    args->nThreads = nThreads;
+    args->lastChunkSize = lastChunkSize;
+
+    c->nThreads = nThreads;
+    c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode);
+    c->active = 1;
+    opIndex = (opIndex+1)%NCCL_MAX_OPS;  // the FIFO wraps after NCCL_MAX_OPS entries
+    c->nextIndex = opIndex;
+    ring->collFifoTail = opIndex;
+    ring->collCount++;
+  }
+  /*if (llMode == 0)*/ comm->opCount++;
+  return ncclSuccess;
+}
+
+extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl);
+
+#endif
diff --git a/src/3rd_party/nccl/src/include/core.h b/src/3rd_party/nccl/src/include/core.h
new file mode 100644
index 00000000..2dd63d60
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/core.h
@@ -0,0 +1,386 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CORE_H_
+#define NCCL_CORE_H_
+
+#define NCCL_MAX_OPS 2048
+
+#include "nccl.h"
+#include "transport.h"
+#include "debug.h"
+#include <cstdio>
+#include <algorithm> // std::min/std::max
+#include <unistd.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+
+#if __CUDACC_VER_MAJOR__ < 9
+struct cudaLaunchParams {
+  void *func;
+  dim3 gridDim;
+  dim3 blockDim;
+  void **args;
+  size_t sharedMem;
+  cudaStream_t stream;
+};
+#endif
+
+#define MAXRINGS 16
+#define MAXTHREADS 256
+#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
+
+// Rings / LL tuning
+#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings
+#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL for Volta and above
+#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
+#define NCCL_LL_MAX_NTHREADS 256
+#define NCCL_LL_MIN_NTHREADS 64
+
+#define DIVUP(x, y) /* x divided by y, rounded up (integer arithmetic) */ \
+    (((x)+(y)-1)/(y))
+#define ROUNDUP(x, y) /* x rounded up to the next multiple of y */ \
+    (DIVUP((x), (y))*(y))
+
+#define ALIGN_SIZE(size, align) /* statement macro: rounds the lvalue `size` up to a multiple of `align` in place; the expansion already ends in ';' */ \
+  size = ((size + (align) - 1) / (align)) * (align);
+
+union ncclLLFifoLine {
+  /* Flags have to be *after* data, because otherwise, an incomplete receive
+     from the network may receive the flag but not the data.
+     Note this is assuming that either we receive contiguous chunks of data
+     (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
+  struct {
+    uint32_t data1;
+    uint32_t flag1;
+    uint32_t data2;
+    uint32_t flag2;
+  };
+  uint64_t v[2];
+  int4 i4;
+};
+
+struct ncclConnInfo {
+  // Regular comm mechanism
+  char *buff;         // Local for recv, remote for send
+  uint64_t *tail;     // Local for recv, remote for send
+  uint64_t *head;     // Local for send, remote for recv
+  uint64_t *opCount;  // Local for recv, remote for send
+
+  int direct;         // Direct communication
+  void **ptrExchange; // Pointer exchange for direct communication
+
+  int *fifo;          // Size fifo for proxy
+
+  // Low latency mechanism
+  char *llBuff;       // Local for recv, remote for send
+  uint64_t *llHead;   // Local for send, remote for recv
+  int *llFifo;        // LL Size fifo for proxy
+  uint64_t llStep;    // Keep where we are
+  uint64_t llLastCleaning;
+};
+
+struct ncclConnector {
+  struct transportProxyInfo* proxyInfo;
+  struct ncclTransport* transport;
+  void* transportResources; // Host-side resources
+  struct ncclConnInfo conn;
+};
+
+#define CACHE_LINE_SIZE 128
+#define MEM_ALIGN 4096
+#define SIZES_FIFO_SIZE 32
+#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
+
+#define NCCL_LL_CHUNKS 8
+#define NUM_LINES_PER_THREAD 8
+#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 256K
+#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t)))
+#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS)
+#define NCCL_LL_CLEAN_FREQ 0x10000000
+
+struct ncclSendMem {  // head, ptrExchange and llHead, each placed at its own CACHE_LINE_SIZE offset from the start of the struct
+  union {
+    struct {
+      uint64_t head;
+      char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];  // fills the rest of head's CACHE_LINE_SIZE slot
+      void* ptrExchange;
+      char pad2[CACHE_LINE_SIZE-sizeof(void*)];  // fills the rest of ptrExchange's CACHE_LINE_SIZE slot
+      uint64_t llHead;
+    };
+    char pad3[MEM_ALIGN];  // union arm that makes the struct occupy at least MEM_ALIGN bytes
+  };
+};
+
+struct ncclRecvMem {  // control words and size FIFOs in a MEM_ALIGN-sized header, followed by the LL buffer and the regular buffer
+  union {
+    struct {
+      uint64_t tail;
+      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];  // fills the rest of tail's CACHE_LINE_SIZE slot
+      uint64_t opCount;
+      char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)];  // fills the rest of opCount's CACHE_LINE_SIZE slot
+      int sizesFifo[SIZES_FIFO_SIZE];
+      int llSizesFifo[SIZES_FIFO_SIZE];
+    };
+    char pad5[MEM_ALIGN];  // union arm that makes the header occupy at least MEM_ALIGN bytes
+  };
+  char llBuff[NCCL_LL_BUFF_SIZE];
+  char buff[1]; // Actually larger than that
+};
+
+struct ncclRing {
+  union {
+    struct {
+      int id;
+      int nthreads;
+      // Per ring resources
+      struct ncclSendMem* devMemSend;   // CUDA-size resources
+      struct ncclRecvMem* devMemRecv;   // CUDA-size resources
+      int buffSize;
+      int devMemSendSize;    // Keep the size for IPCs
+      int devMemRecvSize;    // Keep the size for IPCs
+      struct ncclConnector send;
+      struct ncclConnector recv;
+
+      // Maps an internal nccl index to user-specified rank order. This is necessary
+      // since we need to know how the user expects data to be ordered across
+      // devices. Ordered from current device.
+      int* userRanks;
+      int* devUserRanks;
+
+      // Operation list for aggregation
+      struct ncclColl* collectives;
+      struct ncclColl* devCollectives;
+      int collStart;
+      int collCount;
+      int collFifoHead; // Only used by GPU
+      int collFifoTail; // Only used by CPU
+    };
+    int data[0x80];
+  };
+};
+static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size");
+
+/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
+/* to make sure reads to host from the CUDA kernel are aligned. */
+/* Make sure to adjust padding at the end of ncclColl. */
+struct CollectiveArgs {
+  struct ncclComm* comm;
+  uint64_t opCount;
+
+  // local and remote input, output, and buffer
+  const void * ThisInput;
+  void * ThisOutput;
+
+  // general parameters
+  size_t N;
+  uint32_t root;
+  uint8_t bid;
+  uint8_t nRings;
+  uint16_t nThreads;
+
+  int lastChunkSize;
+};
+struct ncclColl {
+  union {
+    struct {
+      struct CollectiveArgs args;
+      uint16_t nThreads;
+      uint16_t funcIndex;
+      uint16_t nextIndex;
+      uint8_t  active;
+    };
+    int data[0x10];
+  };
+};
+static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
+
+struct ncclComm {
+  struct ncclRing rings[MAXRINGS];
+
+  int rank;    // my rank in the communicator
+  int nRanks;  // number of GPUs in communicator
+  int cudaDev; // my cuda device index
+
+  enum { GROUP, PARALLEL } launchMode;
+  cudaStream_t userStream;
+  bool userStreamSet;
+  cudaEvent_t doneEvent;
+  bool checkPointers;
+
+  // Counter to make sure collectives match (needed for bcast/reduce
+  // where syncs are not symmetric).
+  uint64_t opCount;
+
+  // Rings for collectives
+  int nRings;
+  int nThreads;
+
+  // Low-latency algorithm threshold
+  ssize_t llThreshold;
+  ssize_t threadThreshold;
+
+  // An internal CUDA stream for NCCL kernel CGMD launches
+  int groupCudaStream;
+  cudaStream_t groupStream;
+
+  // Device copy of the communicator
+  struct ncclComm *devComm;
+
+  // Intra-process sync
+  int intraRank;
+  int intraRanks;
+  int* intraBarrier;
+  int intraPhase;
+
+  // Storage for deferred intra-process launch
+  struct cudaLaunchParams * intraParams;
+  struct cudaLaunchParams *myParams;
+  int* intraCudaDevs;
+  int* intraCGMode; // Whether we can use CUDA9 CGMD or not
+  int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
+  struct ncclColl args;
+  void* argsptr;
+};
+
+// Check CUDA calls: on failure CUDACHECK logs the CUDA error string and returns ncclUnhandledCudaError from the enclosing function; CUDACHECKGOTO stores that code in `res` and jumps to `label` instead.
+#define CUDACHECK(cmd) do {                                 \
+    cudaError_t e = cmd;                                    \
+    if( e != cudaSuccess ) {                                \
+        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
+        return ncclUnhandledCudaError;                      \
+    }                                                       \
+} while(false)
+
+#define CUDACHECKGOTO(cmd, res, label) do {                 \
+    cudaError_t e = cmd;                                    \
+    if( e != cudaSuccess ) {                                \
+        WARN("Cuda failure '%s'", cudaGetErrorString(e));   \
+        res = ncclUnhandledCudaError;                       \
+        goto label;                                         \
+    }                                                       \
+} while(false)
+
+#include <errno.h>
+// Check system calls. SYSCHECK repeats `call` while it returns -1 with errno EINTR/EWOULDBLOCK/EAGAIN; any other errno makes the enclosing function return ncclSystemError. SYSCHECKNTIMES caps the attempts at `times`, sleeps `usec` between them and also tolerates errno `exptype`. NOTE(review): every macro here ends in `while (0);`, so `MACRO(...);` expands to two statements - keep invocations inside braces in if/else bodies.
+#define SYSCHECK(call, name) do { \
+  int ret = -1; \
+  while (ret == -1) { \
+    SYSCHECKVAL(call, name, ret); \
+    if (ret == -1) { \
+      INFO(ALL,"Got %s, retrying", strerror(errno));   \
+    }\
+  } \
+} while (0);
+
+#define SYSCHECKVAL(call, name, retval) do { \
+  retval = call; \
+  if (retval == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { \
+    WARN("Call to " name " failed : %s", strerror(errno)); \
+    return ncclSystemError; \
+  } \
+} while (0);
+
+#define SYSCHECKNTIMES(call, name, times, usec, exptype) do { \
+  int ret = -1; \
+  int count = 0; \
+  while (ret == -1 && count < times) { \
+    SYSCHECKVALEXP(call, name, ret, exptype); \
+    count++; \
+    if (ret == -1) { \
+      usleep(usec); \
+    }\
+  } \
+  if (ret == -1) { \
+    WARN("Call to " name " timeout : %s", strerror(errno)); \
+    return ncclSystemError; \
+  } \
+} while (0);
+
+#define SYSCHECKVALEXP(call, name, retval, exptype) do { \
+  retval = call; \
+  if (retval == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN && errno != exptype) { \
+    WARN("Call to " name " failed : %s", strerror(errno)); \
+    return ncclSystemError; \
+  } \
+} while (0);
+
+// Propagate errors up: NCCLCHECK logs file:line and returns any non-ncclSuccess result from the enclosing function; NCCLCHECKGOTO stores the result in `res` and jumps to `label` instead. NOTE(review): both end in `while (0);` - see the caveat on the SYSCHECK macros.
+#define NCCLCHECK(call) do { \
+  ncclResult_t res = call; \
+  if (res != ncclSuccess) { \
+    /* Print the back trace*/ \
+    INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    return res; \
+  } \
+} while (0);
+
+#define NCCLCHECKGOTO(call, res, label) do { \
+  res = call; \
+  if (res != ncclSuccess) { \
+    /* Print the back trace*/ \
+    INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res);    \
+    goto label; \
+  } \
+} while (0);
+
+#ifdef PROFAPI  // PROFAPI build: `func` is exported as a weak extern "C" symbol and p<func> is declared as an alias of it
+#define NCCL_API(ret, func, args...)        \
+    __attribute__ ((visibility("default"))) \
+    __attribute__ ((alias(#func)))          \
+    ret p##func (args);                     \
+    extern "C"                              \
+    __attribute__ ((visibility("default"))) \
+    __attribute__ ((weak))                  \
+    ret func(args)
+#else  // default build: `func` is declared extern "C" with default visibility
+#define NCCL_API(ret, func, args...)        \
+    extern "C"                              \
+    __attribute__ ((visibility("default"))) \
+    ret func(args)
+#endif // end PROFAPI
+
+int ncclCudaCompCap();
+
+#include <sys/mman.h>
+static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
+  CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
+  // Zero the mapped host allocation; memset returns its destination, which is stored unchanged as the device-side pointer.
+  *devPtr = memset(*ptr, 0, size);
+  return ncclSuccess;
+}
+
+static inline ncclResult_t ncclCudaHostFree(void* ptr) {  // releases `ptr` with cudaFreeHost
+  CUDACHECK(cudaFreeHost(ptr));  // on a CUDA error the macro returns ncclUnhandledCudaError from here
+  return ncclSuccess;
+}
+
+// Allocates a zero-initialized host array of `nelem` elements of T into *ptr; returns ncclSystemError (leaving *ptr untouched) if the allocation fails.
+template <typename T>
+static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
+  void* p = calloc(nelem, sizeof(T));  // calloc zero-fills and fails cleanly when nelem*sizeof(T) would overflow
+  if (p == NULL) {
+    WARN("Failed to malloc %zu elements of %zu bytes", nelem, sizeof(T));
+    return ncclSystemError;
+  }
+  *ptr = (T*)p;
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {  // cudaMalloc of nelem*sizeof(T) bytes into *ptr, then cudaMemset of the same range to 0
+  CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
+  CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));  // NOTE(review): if this fails the macro returns without freeing *ptr
+  return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {  // blocking cudaMemcpy of `nelem` elements of T from src to dst
+  CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));  // cudaMemcpyDefault: no explicit copy direction is given
+  return ncclSuccess;
+}
+
+#endif // end include guard
diff --git a/src/3rd_party/nccl/src/include/debug.h b/src/3rd_party/nccl/src/include/debug.h
new file mode 100644
index 00000000..83ae98b9
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/debug.h
@@ -0,0 +1,179 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_DEBUG_H_
+#define NCCL_DEBUG_H_
+
+#include <pthread.h>
+#include <stdio.h>
+#include <chrono>
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <limits.h>
+#include <string.h>
+#include "nccl.h"
+// Kernel thread id of the caller, obtained through the raw gettid syscall.
+// The expansion is fully parenthesised so the cast binds to the whole call
+// regardless of the expression the macro is used in.
+#define gettid() ((pid_t) syscall(SYS_gettid))
+
+// Verbosity levels, compared numerically by the logging macros in this header:
+// WARN() fires for levels >= WARN, INFO() for levels >= INFO, TRACE() only at
+// exactly TRACE; at exactly ABORT, WARN() additionally calls abort().
+typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4, TRACE=5} DebugLevel;
+// Subsystem bit flags that INFO()/TRACE() test against ncclDebugMask; ALL (~0)
+// has every bit set.
+typedef enum {INIT=1, COLL=2, P2P=4, SHM=8, NET=16, ALL=~0} SubSys;
+extern DebugLevel ncclDebugLevel;            // current verbosity, assigned by initDebug()
+extern uint64_t ncclDebugMask;               // enabled SubSys bits, adjusted by initDebug()
+extern pthread_mutex_t ncclDebugOutputLock;  // held while a log line is written
+extern FILE *ncclDebugFile;                  // destination stream of the logging macros
+extern ncclResult_t getHostName(char* hostname, int maxlen);  // defined elsewhere; fills `hostname`
+
+/* WARN(fmt, ...): printf-style warning, emitted when ncclDebugLevel >= WARN.
+ * The line is prefixed with host:pid:tid, the current CUDA device and the call
+ * site (__FILE__:__LINE__), written to ncclDebugFile while holding
+ * ncclDebugOutputLock, and flushed. When the level is exactly ABORT, a notice
+ * is also printed to stderr and the process is terminated with abort().
+ * The return values of getHostName() and cudaGetDevice() are not checked. */
+#define WARN(...) do {                                           \
+  if (ncclDebugLevel >= WARN) {                                  \
+    char hostname[1024];                                         \
+    getHostName(hostname, 1024);                                 \
+    /* -1 is reported if cudaGetDevice fails without setting cudaDev */ \
+    int cudaDev = -1;                                            \
+    cudaGetDevice(&cudaDev);                                     \
+    pthread_mutex_lock(&ncclDebugOutputLock);                    \
+    fprintf(ncclDebugFile,"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); \
+    fprintf(ncclDebugFile,__VA_ARGS__);                          \
+    fprintf(ncclDebugFile,"\n");                                 \
+    fflush(ncclDebugFile);                                       \
+    pthread_mutex_unlock(&ncclDebugOutputLock);                  \
+    if (ncclDebugLevel == ABORT) { fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); abort(); } \
+  }                                                              \
+} while(0)
+
+/* INFO(FLAGS, fmt, ...): printf-style informational message. It is emitted
+ * when ncclDebugLevel >= INFO and at least one bit of FLAGS (a SubSys mask) is
+ * set in ncclDebugMask. The line is prefixed with host:pid:tid and the current
+ * CUDA device, written to ncclDebugFile while holding ncclDebugOutputLock, and
+ * flushed. The return values of getHostName() and cudaGetDevice() are not
+ * checked. */
+#define INFO(FLAGS, ...) do {                                    \
+  if (ncclDebugLevel >= INFO && ((FLAGS) & ncclDebugMask)) {     \
+    char hostname[1024];                                         \
+    getHostName(hostname, 1024);                                 \
+    /* -1 is reported if cudaGetDevice fails without setting cudaDev */ \
+    int cudaDev = -1;                                            \
+    cudaGetDevice(&cudaDev);                                     \
+    pthread_mutex_lock(&ncclDebugOutputLock);                    \
+    fprintf(ncclDebugFile,"%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); \
+    fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
+    fflush(ncclDebugFile);                                       \
+    pthread_mutex_unlock(&ncclDebugOutputLock);                  \
+  }                                                              \
+} while(0)
+
+/* TRACE(FLAGS, fmt, ...): printf-style trace message, compiled in only when
+ * ENABLE_TRACE is defined; otherwise the macro expands to nothing.
+ * It is emitted when ncclDebugLevel is exactly TRACE and at least one bit of
+ * FLAGS is set in ncclDebugMask. Each line carries host:pid:tid, the current
+ * CUDA device, the time since ncclEpoch in milliseconds, and the calling
+ * function and line. */
+#ifdef ENABLE_TRACE
+#define TRACE(FLAGS, ...) do {                                   \
+  if (ncclDebugLevel == TRACE && ((FLAGS) & ncclDebugMask)) {    \
+    char hostname[1024];                                         \
+    getHostName(hostname, 1024);                                 \
+    /* -1 is reported if cudaGetDevice fails without setting cudaDev */ \
+    int cudaDev = -1;                                            \
+    cudaGetDevice(&cudaDev);                                     \
+    pthread_mutex_lock(&ncclDebugOutputLock);                    \
+    auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; \
+    /* seconds as double, scaled by 1000 to milliseconds */      \
+    double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000; \
+    fprintf(ncclDebugFile,"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, __func__, __LINE__); \
+    fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \
+    fflush(ncclDebugFile);                                       \
+    pthread_mutex_unlock(&ncclDebugOutputLock);                  \
+  }                                                              \
+} while(0)
+
+// Reference time point for TRACE timestamps; assigned in initDebug().
+extern std::chrono::high_resolution_clock::time_point ncclEpoch;
+
+#else
+#define TRACE(...)
+#endif
+
+#include <stdlib.h>
+
+/* Initialises NCCL debug logging from the environment.
+ *
+ *   NCCL_DEBUG        - VERSION, WARN, INFO, ABORT or TRACE (case-insensitive).
+ *                       Unset selects NONE; an unrecognised value leaves
+ *                       ncclDebugLevel unchanged.
+ *   NCCL_DEBUG_SUBSYS - comma-separated list of INIT, COLL, P2P, SHM, NET, ALL.
+ *                       Each name sets its bit in ncclDebugMask; a leading '^'
+ *                       clears the bit instead. Unknown names are ignored.
+ *   NCCL_DEBUG_FILE   - log file path, honoured only when the level is above
+ *                       VERSION. "%h" expands to the host name, "%p" to the
+ *                       pid and "%%" to '%'; any other "%x" is copied as is.
+ *                       The expanded path is truncated to PATH_MAX characters.
+ *
+ * Also initialises ncclDebugOutputLock and, with ENABLE_TRACE, ncclEpoch.
+ */
+static inline void initDebug() {
+  // INFO() below takes this lock, so it has to be initialised before anything
+  // in this function can log.
+  pthread_mutex_init(&ncclDebugOutputLock, NULL);
+
+  const char* nccl_debug = getenv("NCCL_DEBUG");
+  if (nccl_debug == NULL) {
+    ncclDebugLevel = NONE;
+  } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
+    ncclDebugLevel = VERSION;
+  } else if (strcasecmp(nccl_debug, "WARN") == 0) {
+    ncclDebugLevel = WARN;
+  } else if (strcasecmp(nccl_debug, "INFO") == 0) {
+    ncclDebugLevel = INFO;
+  } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
+    ncclDebugLevel = ABORT;
+  } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
+    ncclDebugLevel = TRACE;
+  }
+
+  /* Parse the NCCL_DEBUG_SUBSYS env var
+   * This can be a comma separated list such as INIT,COLL
+   * or ^INIT,COLL etc
+   */
+  const char* nccl_debug_subsys = getenv("NCCL_DEBUG_SUBSYS");
+  // Tokenising writes NUL bytes into its input, and the string returned by
+  // getenv() must not be modified, so work on a private copy.
+  char* subsys_list = (nccl_debug_subsys != NULL) ? strdup(nccl_debug_subsys) : NULL;
+  if (subsys_list != NULL) {
+    char* save = NULL;
+    for (char* subsys = strtok_r(subsys_list, ",", &save); subsys != NULL;
+         subsys = strtok_r(NULL, ",", &save)) {
+      int invert = 0;
+      uint64_t mask = 0;
+      if (subsys[0] == '^') { invert = 1; subsys++; }
+      if (strcasecmp(subsys, "INIT") == 0) {
+        mask = INIT;
+      } else if (strcasecmp(subsys, "COLL") == 0) {
+        mask = COLL;
+      } else if (strcasecmp(subsys, "P2P") == 0) {
+        mask = P2P;
+      } else if (strcasecmp(subsys, "SHM") == 0) {
+        mask = SHM;
+      } else if (strcasecmp(subsys, "NET") == 0) {
+        mask = NET;
+      } else if (strcasecmp(subsys, "ALL") == 0) {
+        mask = ALL;
+      }
+      if (mask) {
+        if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
+      }
+    }
+    free(subsys_list);
+  }
+
+  /* Parse and expand the NCCL_DEBUG_FILE path and
+   * then create the debug file. But don't bother unless the
+   * NCCL_DEBUG level is > VERSION
+   */
+  const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
+  if (ncclDebugLevel > VERSION && nccl_debug_file != NULL) {
+    char debug_fn[PATH_MAX+1] = "";
+    size_t len = 0;  // characters stored in debug_fn so far; never exceeds PATH_MAX
+    for (const char* src = nccl_debug_file; *src != '\0' && len < PATH_MAX; ) {
+      if (*src != '%') {
+        debug_fn[len++] = *src++;
+        continue;
+      }
+      src++;  // step over the '%'
+      const size_t room = PATH_MAX + 1 - len;  // space left, terminator included
+      int written = 0;                         // snprintf result for %h / %p
+      switch (*src) {
+        case '%': // Double %
+          debug_fn[len++] = '%';
+          src++;
+          break;
+        case 'h': { // %h = hostname
+          char hostname[1024];
+          getHostName(hostname, 1024);
+          written = snprintf(debug_fn + len, room, "%s", hostname);
+          src++;
+          break;
+        }
+        case 'p': // %p = pid
+          written = snprintf(debug_fn + len, room, "%d", getpid());
+          src++;
+          break;
+        case '\0': // Lone trailing %: keep it and stay on the terminator
+          debug_fn[len++] = '%';
+          break;
+        default: // Echo everything we don't understand
+          debug_fn[len++] = '%';
+          if (len < PATH_MAX) debug_fn[len++] = *src;
+          src++;
+          break;
+      }
+      if (written > 0) {
+        // snprintf returns the untruncated length; advance only by what fit.
+        len += ((size_t)written < room) ? (size_t)written : room - 1;
+      }
+    }
+    debug_fn[len] = '\0';
+    if (debug_fn[0] != '\0') {
+      FILE *file = fopen(debug_fn, "w");
+      if (file != NULL) {
+        // Logged before the switch-over, so this line goes to the previous
+        // ncclDebugFile rather than into the new file.
+        INFO(ALL,"DEBUG file is '%s'", debug_fn);
+        ncclDebugFile = file;
+      }
+    }
+  }
+
+#ifdef ENABLE_TRACE
+  ncclEpoch = std::chrono::high_resolution_clock::now();
+#endif
+}
+
+#endif
diff --git a/src/3rd_party/nccl/src/include/enqueue.h b/src/3rd_party/nccl/src/include/enqueue.h
new file mode 100644
index 00000000..69d0463d
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/enqueue.h
@@ -0,0 +1,26 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ENQUEUE_H_
+#define NCCL_ENQUEUE_H_
+
+#include "core.h"
+#include "group.h"
+
+typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+
+ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
+    void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
+ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
+ncclResult_t ncclCpuBarrierOut(ncclComm_t comm);
+ncclResult_t ncclBarrierEnqueue(ncclComm_t comm);
+ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm);
+ncclResult_t ncclEnqueueEvents(ncclComm_t comm);
+
+#endif // End include guard
diff --git a/src/3rd_party/nccl/src/include/group.h b/src/3rd_party/nccl/src/include/group.h
new file mode 100644
index 00000000..76da30f8
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/group.h
@@ -0,0 +1,24 @@
+/*************************************************************************
+ * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_GROUP_H_
+#define NCCL_GROUP_H_
+
+#include "nccl.h"
+#include "core.h"
+
+bool ncclAsyncMode();
+ncclResult_t ncclAsyncErrCheck(ncclResult_t ret);
+
+typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
+
+ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
+
+typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+
+ncclResult_t ncclAsyncColl(ncclComm_t comm);
+#endif
diff --git a/src/3rd_party/nccl/src/include/ibvwrap.h b/src/3rd_party/nccl/src/include/ibvwrap.h
new file mode 100644
index 00000000..4f3e8311
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/ibvwrap.h
@@ -0,0 +1,1109 @@
+/*************************************************************************
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2004, 2011-2012 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
+ *
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_IBVWRAP_H_
+#define NCCL_IBVWRAP_H_
+
+#include "core.h"
+#include <sys/types.h>
+#include <unistd.h>
+
+// Dynamically handle dependencies on IB verbs
+
+// __attribute_const expands to GCC's `const` function attribute when the
+// compiler reports __GNUC__ >= 3, and to nothing otherwise. It is applied to
+// the rate-conversion prototypes declared later in this header.
+#if __GNUC__ >= 3
+#  define __attribute_const __attribute__((const))
+#else
+#  define __attribute_const
+#endif
+
+/* 16-byte identifier that can be accessed either as raw bytes (`raw`) or as
+ * two 64-bit halves (`global.subnet_prefix`, `global.interface_id`) overlaying
+ * the same storage. */
+union ibv_gid {
+	uint8_t			raw[16];
+	struct {
+		uint64_t	subnet_prefix;
+		uint64_t	interface_id;
+	} global;
+};
+
+#ifndef container_of
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:        the pointer to the member.
+ * @type:       the type of the container struct this is embedded in.
+ * @member:     the name of the member within the struct.
+ *
+ * The result is computed by subtracting offsetof(type, member) from @ptr, so
+ * @ptr must really point at that member of a @type object. Only defined here
+ * when no container_of macro exists already.
+ */
+#define container_of(ptr, type, member) \
+	((type *) ((uint8_t *)(ptr) - offsetof(type, member)))
+#endif
+
+/* True when field `fld` of `type` starts at an offset below `sz`, i.e. an
+ * object of only `sz` bytes still reaches the start of that field. */
+#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz))
+
+/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/
+//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1;
+
+enum ibv_node_type {
+	IBV_NODE_UNKNOWN	= -1,
+	IBV_NODE_CA 		= 1,
+	IBV_NODE_SWITCH,
+	IBV_NODE_ROUTER,
+	IBV_NODE_RNIC,
+
+	/* Leave a gap for future node types before starting with
+	 * experimental node types.
+	 */
+	IBV_EXP_NODE_TYPE_START	= 32,
+	IBV_EXP_NODE_MIC	= IBV_EXP_NODE_TYPE_START
+};
+
+enum ibv_transport_type {
+	IBV_TRANSPORT_UNKNOWN	= -1,
+	IBV_TRANSPORT_IB	= 0,
+	IBV_TRANSPORT_IWARP,
+
+	/* Leave a gap for future transport types before starting with
+	 * experimental transport types.
+	 */
+	IBV_EXP_TRANSPORT_TYPE_START	= 32,
+	IBV_EXP_TRANSPORT_SCIF		= IBV_EXP_TRANSPORT_TYPE_START
+};
+
+enum ibv_device_cap_flags {
+	IBV_DEVICE_RESIZE_MAX_WR	= 1,
+	IBV_DEVICE_BAD_PKEY_CNTR	= 1 <<  1,
+	IBV_DEVICE_BAD_QKEY_CNTR	= 1 <<  2,
+	IBV_DEVICE_RAW_MULTI		= 1 <<  3,
+	IBV_DEVICE_AUTO_PATH_MIG	= 1 <<  4,
+	IBV_DEVICE_CHANGE_PHY_PORT	= 1 <<  5,
+	IBV_DEVICE_UD_AV_PORT_ENFORCE	= 1 <<  6,
+	IBV_DEVICE_CURR_QP_STATE_MOD	= 1 <<  7,
+	IBV_DEVICE_SHUTDOWN_PORT	= 1 <<  8,
+	IBV_DEVICE_INIT_TYPE		= 1 <<  9,
+	IBV_DEVICE_PORT_ACTIVE_EVENT	= 1 << 10,
+	IBV_DEVICE_SYS_IMAGE_GUID	= 1 << 11,
+	IBV_DEVICE_RC_RNR_NAK_GEN	= 1 << 12,
+	IBV_DEVICE_SRQ_RESIZE		= 1 << 13,
+	IBV_DEVICE_N_NOTIFY_CQ		= 1 << 14,
+	IBV_DEVICE_XRC			= 1 << 20,
+	IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29
+};
+
+enum ibv_atomic_cap {
+	IBV_ATOMIC_NONE,
+	IBV_ATOMIC_HCA,
+	IBV_ATOMIC_GLOB
+};
+
+struct ibv_device_attr {
+	char			fw_ver[64];
+	uint64_t		node_guid;
+	uint64_t		sys_image_guid;
+	uint64_t		max_mr_size;
+	uint64_t		page_size_cap;
+	uint32_t		vendor_id;
+	uint32_t		vendor_part_id;
+	uint32_t		hw_ver;
+	int			max_qp;
+	int			max_qp_wr;
+	int			device_cap_flags;
+	int			max_sge;
+	int			max_sge_rd;
+	int			max_cq;
+	int			max_cqe;
+	int			max_mr;
+	int			max_pd;
+	int			max_qp_rd_atom;
+	int			max_ee_rd_atom;
+	int			max_res_rd_atom;
+	int			max_qp_init_rd_atom;
+	int			max_ee_init_rd_atom;
+	enum ibv_atomic_cap	atomic_cap;
+	int			max_ee;
+	int			max_rdd;
+	int			max_mw;
+	int			max_raw_ipv6_qp;
+	int			max_raw_ethy_qp;
+	int			max_mcast_grp;
+	int			max_mcast_qp_attach;
+	int			max_total_mcast_qp_attach;
+	int			max_ah;
+	int			max_fmr;
+	int			max_map_per_fmr;
+	int			max_srq;
+	int			max_srq_wr;
+	int			max_srq_sge;
+	uint16_t		max_pkeys;
+	uint8_t			local_ca_ack_delay;
+	uint8_t			phys_port_cnt;
+};
+
+/* MTU codes. The enumerator values are small codes (1..5), not byte counts;
+ * the size in bytes appears only in each enumerator's name. */
+enum ibv_mtu {
+	IBV_MTU_256  = 1,
+	IBV_MTU_512  = 2,
+	IBV_MTU_1024 = 3,
+	IBV_MTU_2048 = 4,
+	IBV_MTU_4096 = 5
+};
+
+enum ibv_port_state {
+	IBV_PORT_NOP		= 0,
+	IBV_PORT_DOWN		= 1,
+	IBV_PORT_INIT		= 2,
+	IBV_PORT_ARMED		= 3,
+	IBV_PORT_ACTIVE		= 4,
+	IBV_PORT_ACTIVE_DEFER	= 5
+};
+
+enum {
+	IBV_LINK_LAYER_UNSPECIFIED,
+	IBV_LINK_LAYER_INFINIBAND,
+	IBV_LINK_LAYER_ETHERNET,
+
+	/* Leave a gap for future link layer types before starting with
+	 * experimental link layer.
+	 */
+	IBV_EXP_LINK_LAYER_START	= 32,
+	IBV_EXP_LINK_LAYER_SCIF		= IBV_EXP_LINK_LAYER_START
+};
+
+enum ibv_port_cap_flags {
+	IBV_PORT_SM				= 1 <<  1,
+	IBV_PORT_NOTICE_SUP			= 1 <<  2,
+	IBV_PORT_TRAP_SUP			= 1 <<  3,
+	IBV_PORT_OPT_IPD_SUP			= 1 <<  4,
+	IBV_PORT_AUTO_MIGR_SUP			= 1 <<  5,
+	IBV_PORT_SL_MAP_SUP			= 1 <<  6,
+	IBV_PORT_MKEY_NVRAM			= 1 <<  7,
+	IBV_PORT_PKEY_NVRAM			= 1 <<  8,
+	IBV_PORT_LED_INFO_SUP			= 1 <<  9,
+	IBV_PORT_SYS_IMAGE_GUID_SUP		= 1 << 11,
+	IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP	= 1 << 12,
+	IBV_PORT_EXTENDED_SPEEDS_SUP		= 1 << 14,
+	IBV_PORT_CM_SUP				= 1 << 16,
+	IBV_PORT_SNMP_TUNNEL_SUP		= 1 << 17,
+	IBV_PORT_REINIT_SUP			= 1 << 18,
+	IBV_PORT_DEVICE_MGMT_SUP		= 1 << 19,
+	IBV_PORT_VENDOR_CLASS			= 1 << 24,
+	IBV_PORT_CLIENT_REG_SUP			= 1 << 25,
+	IBV_PORT_IP_BASED_GIDS			= 1 << 26,
+};
+
+struct ibv_port_attr {
+	enum ibv_port_state	state;
+	enum ibv_mtu		max_mtu;
+	enum ibv_mtu		active_mtu;
+	int			gid_tbl_len;
+	uint32_t		port_cap_flags;
+	uint32_t		max_msg_sz;
+	uint32_t		bad_pkey_cntr;
+	uint32_t		qkey_viol_cntr;
+	uint16_t		pkey_tbl_len;
+	uint16_t		lid;
+	uint16_t		sm_lid;
+	uint8_t			lmc;
+	uint8_t			max_vl_num;
+	uint8_t			sm_sl;
+	uint8_t			subnet_timeout;
+	uint8_t			init_type_reply;
+	uint8_t			active_width;
+	uint8_t			active_speed;
+	uint8_t			phys_state;
+	uint8_t			link_layer;
+	uint8_t			reserved;
+};
+
+enum ibv_event_type {
+	IBV_EVENT_CQ_ERR,
+	IBV_EVENT_QP_FATAL,
+	IBV_EVENT_QP_REQ_ERR,
+	IBV_EVENT_QP_ACCESS_ERR,
+	IBV_EVENT_COMM_EST,
+	IBV_EVENT_SQ_DRAINED,
+	IBV_EVENT_PATH_MIG,
+	IBV_EVENT_PATH_MIG_ERR,
+	IBV_EVENT_DEVICE_FATAL,
+	IBV_EVENT_PORT_ACTIVE,
+	IBV_EVENT_PORT_ERR,
+	IBV_EVENT_LID_CHANGE,
+	IBV_EVENT_PKEY_CHANGE,
+	IBV_EVENT_SM_CHANGE,
+	IBV_EVENT_SRQ_ERR,
+	IBV_EVENT_SRQ_LIMIT_REACHED,
+	IBV_EVENT_QP_LAST_WQE_REACHED,
+	IBV_EVENT_CLIENT_REREGISTER,
+	IBV_EVENT_GID_CHANGE,
+
+	/* new experimental events start here leaving enough
+	 * room for 14 events which should be enough
+	 */
+	IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32,
+	IBV_EXP_EVENT_DCT_ACCESS_ERR,
+	IBV_EXP_EVENT_DCT_REQ_ERR,
+};
+
+/* Asynchronous event record: `event_type` identifies the event and `element`
+ * is a union naming the object it refers to.
+ * NOTE(review): which union member is valid presumably follows from
+ * event_type - confirm against the verbs documentation. */
+struct ibv_async_event {
+	union {
+		struct ibv_cq  *cq;
+		struct ibv_qp  *qp;
+		struct ibv_srq *srq;
+		struct ibv_exp_dct *dct;
+		int		port_num;
+		/* For source compatibility with the legacy API */
+		uint32_t	xrc_qp_num;
+	} element;
+	enum ibv_event_type	event_type;
+};
+
+enum ibv_wc_status {
+	IBV_WC_SUCCESS,
+	IBV_WC_LOC_LEN_ERR,
+	IBV_WC_LOC_QP_OP_ERR,
+	IBV_WC_LOC_EEC_OP_ERR,
+	IBV_WC_LOC_PROT_ERR,
+	IBV_WC_WR_FLUSH_ERR,
+	IBV_WC_MW_BIND_ERR,
+	IBV_WC_BAD_RESP_ERR,
+	IBV_WC_LOC_ACCESS_ERR,
+	IBV_WC_REM_INV_REQ_ERR,
+	IBV_WC_REM_ACCESS_ERR,
+	IBV_WC_REM_OP_ERR,
+	IBV_WC_RETRY_EXC_ERR,
+	IBV_WC_RNR_RETRY_EXC_ERR,
+	IBV_WC_LOC_RDD_VIOL_ERR,
+	IBV_WC_REM_INV_RD_REQ_ERR,
+	IBV_WC_REM_ABORT_ERR,
+	IBV_WC_INV_EECN_ERR,
+	IBV_WC_INV_EEC_STATE_ERR,
+	IBV_WC_FATAL_ERR,
+	IBV_WC_RESP_TIMEOUT_ERR,
+	IBV_WC_GENERAL_ERR
+};
+const char *ibv_wc_status_str(enum ibv_wc_status status);
+
+enum ibv_wc_opcode {
+	IBV_WC_SEND,
+	IBV_WC_RDMA_WRITE,
+	IBV_WC_RDMA_READ,
+	IBV_WC_COMP_SWAP,
+	IBV_WC_FETCH_ADD,
+	IBV_WC_BIND_MW,
+/*
+ * Set value of IBV_WC_RECV so consumers can test if a completion is a
+ * receive by testing (opcode & IBV_WC_RECV).
+ */
+	IBV_WC_RECV			= 1 << 7,
+	IBV_WC_RECV_RDMA_WITH_IMM
+};
+
+enum ibv_wc_flags {
+	IBV_WC_GRH		= 1 << 0,
+	IBV_WC_WITH_IMM		= 1 << 1
+};
+
+struct ibv_wc {
+	uint64_t		wr_id;
+	enum ibv_wc_status	status;
+	enum ibv_wc_opcode	opcode;
+	uint32_t		vendor_err;
+	uint32_t		byte_len;
+	uint32_t		imm_data;	/* in network byte order */
+	uint32_t		qp_num;
+	uint32_t		src_qp;
+	int			wc_flags;
+	uint16_t		pkey_index;
+	uint16_t		slid;
+	uint8_t			sl;
+	uint8_t			dlid_path_bits;
+};
+
+enum ibv_access_flags {
+	IBV_ACCESS_LOCAL_WRITE		= 1,
+	IBV_ACCESS_REMOTE_WRITE		= (1<<1),
+	IBV_ACCESS_REMOTE_READ		= (1<<2),
+	IBV_ACCESS_REMOTE_ATOMIC	= (1<<3),
+	IBV_ACCESS_MW_BIND		= (1<<4)
+};
+
+struct ibv_pd {
+	struct ibv_context     *context;
+	uint32_t		handle;
+};
+
+enum ibv_xrcd_init_attr_mask {
+	IBV_XRCD_INIT_ATTR_FD	    = 1 << 0,
+	IBV_XRCD_INIT_ATTR_OFLAGS   = 1 << 1,
+	IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2
+};
+
+struct ibv_xrcd_init_attr {
+	uint32_t comp_mask;
+	int	 fd;
+	int	 oflags;
+};
+
+struct ibv_xrcd {
+	struct ibv_context     *context;
+};
+
+enum ibv_rereg_mr_flags {
+	IBV_REREG_MR_CHANGE_TRANSLATION	= (1 << 0),
+	IBV_REREG_MR_CHANGE_PD		= (1 << 1),
+	IBV_REREG_MR_CHANGE_ACCESS	= (1 << 2),
+	IBV_REREG_MR_KEEP_VALID		= (1 << 3)
+};
+
+struct ibv_mr {
+	struct ibv_context     *context;
+	struct ibv_pd	       *pd;
+	void		       *addr;
+	size_t			length;
+	uint32_t		handle;
+	uint32_t		lkey;
+	uint32_t		rkey;
+};
+
+enum ibv_mw_type {
+	IBV_MW_TYPE_1			= 1,
+	IBV_MW_TYPE_2			= 2
+};
+
+struct ibv_mw {
+	struct ibv_context     *context;
+	struct ibv_pd	       *pd;
+	uint32_t		rkey;
+};
+
+struct ibv_global_route {
+	union ibv_gid		dgid;
+	uint32_t		flow_label;
+	uint8_t			sgid_index;
+	uint8_t			hop_limit;
+	uint8_t			traffic_class;
+};
+
+struct ibv_grh {
+	uint32_t		version_tclass_flow;
+	uint16_t		paylen;
+	uint8_t			next_hdr;
+	uint8_t			hop_limit;
+	union ibv_gid		sgid;
+	union ibv_gid		dgid;
+};
+
+/* Link rate codes. The numeric values are NOT ordered by speed (for example
+ * IBV_RATE_5_GBPS is 5 while IBV_RATE_10_GBPS is 3), so two rates cannot be
+ * ranked by comparing the enumerators directly. */
+enum ibv_rate {
+	IBV_RATE_MAX      = 0,
+	IBV_RATE_2_5_GBPS = 2,
+	IBV_RATE_5_GBPS   = 5,
+	IBV_RATE_10_GBPS  = 3,
+	IBV_RATE_20_GBPS  = 6,
+	IBV_RATE_30_GBPS  = 4,
+	IBV_RATE_40_GBPS  = 7,
+	IBV_RATE_60_GBPS  = 8,
+	IBV_RATE_80_GBPS  = 9,
+	IBV_RATE_120_GBPS = 10,
+	IBV_RATE_14_GBPS  = 11,
+	IBV_RATE_56_GBPS  = 12,
+	IBV_RATE_112_GBPS = 13,
+	IBV_RATE_168_GBPS = 14,
+	IBV_RATE_25_GBPS  = 15,
+	IBV_RATE_100_GBPS = 16,
+	IBV_RATE_200_GBPS = 17,
+	IBV_RATE_300_GBPS = 18
+};
+
+/**
+ * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the
+ * base rate of 2.5 Gbit/sec.  For example, IBV_RATE_5_GBPS will be
+ * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec.
+ * @rate: rate to convert.
+ */
+int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const;
+
+/**
+ * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum.
+ * @mult: multiple to convert.
+ */
+enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const;
+
+/**
+ * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec.
+ * For example, IBV_RATE_5_GBPS will return the value 5000.
+ * @rate: rate to convert.
+ */
+int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const;
+
+/**
+ * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum.
+ * @mbps: value to convert.
+ */
+enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const;
+
+struct ibv_ah_attr {
+	struct ibv_global_route	grh;
+	uint16_t		dlid;
+	uint8_t			sl;
+	uint8_t			src_path_bits;
+	uint8_t			static_rate;
+	uint8_t			is_global;
+	uint8_t			port_num;
+};
+
+enum ibv_srq_attr_mask {
+	IBV_SRQ_MAX_WR	= 1 << 0,
+	IBV_SRQ_LIMIT	= 1 << 1
+};
+
+struct ibv_srq_attr {
+	uint32_t		max_wr;
+	uint32_t		max_sge;
+	uint32_t		srq_limit;
+};
+
+struct ibv_srq_init_attr {
+	void		       *srq_context;
+	struct ibv_srq_attr	attr;
+};
+
+enum ibv_srq_type {
+	IBV_SRQT_BASIC,
+	IBV_SRQT_XRC
+};
+
+enum ibv_srq_init_attr_mask {
+	IBV_SRQ_INIT_ATTR_TYPE		= 1 << 0,
+	IBV_SRQ_INIT_ATTR_PD		= 1 << 1,
+	IBV_SRQ_INIT_ATTR_XRCD		= 1 << 2,
+	IBV_SRQ_INIT_ATTR_CQ		= 1 << 3,
+	IBV_SRQ_INIT_ATTR_RESERVED	= 1 << 4
+};
+
+struct ibv_srq_init_attr_ex {
+	void		       *srq_context;
+	struct ibv_srq_attr	attr;
+
+	uint32_t		comp_mask;
+	enum ibv_srq_type	srq_type;
+	struct ibv_pd	       *pd;
+	struct ibv_xrcd	       *xrcd;
+	struct ibv_cq	       *cq;
+};
+
+/* Queue pair types. Values start at 2 (RC=2, UC=3, UD=4, XRC=5).
+ * IBV_QPT_RAW_PACKET and IBV_QPT_RAW_ETH are two names for the same value (8),
+ * so a switch over this enum cannot have a separate case for each. */
+enum ibv_qp_type {
+	IBV_QPT_RC = 2,
+	IBV_QPT_UC,
+	IBV_QPT_UD,
+	/* XRC compatible code */
+	IBV_QPT_XRC,
+	IBV_QPT_RAW_PACKET = 8,
+	IBV_QPT_RAW_ETH = 8,
+	IBV_QPT_XRC_SEND = 9,
+	IBV_QPT_XRC_RECV,
+
+	/* Leave a gap for future qp types before starting with
+	 * experimental qp types.
+	 */
+	IBV_EXP_QP_TYPE_START	= 32,
+	IBV_EXP_QPT_DC_INI	= IBV_EXP_QP_TYPE_START
+};
+
+struct ibv_qp_cap {
+	uint32_t		max_send_wr;
+	uint32_t		max_recv_wr;
+	uint32_t		max_send_sge;
+	uint32_t		max_recv_sge;
+	uint32_t		max_inline_data;
+};
+
+/* Creation-time attributes of a queue pair: the completion queues and optional
+ * SRQ it is attached to, its capacity limits (`cap`) and its type. */
+struct ibv_qp_init_attr {
+	void		       *qp_context;
+	struct ibv_cq	       *send_cq;
+	struct ibv_cq	       *recv_cq;
+	struct ibv_srq	       *srq;
+	struct ibv_qp_cap	cap;
+	enum ibv_qp_type	qp_type;
+	int			sq_sig_all;
+	/* The field below is needed for backward compatibility */
+	struct ibv_xrc_domain  *xrc_domain;
+};
+
+enum ibv_qp_init_attr_mask {
+	IBV_QP_INIT_ATTR_PD		= 1 << 0,
+	IBV_QP_INIT_ATTR_XRCD		= 1 << 1,
+	IBV_QP_INIT_ATTR_RESERVED	= 1 << 2
+};
+
+struct ibv_qp_init_attr_ex {
+	void		       *qp_context;
+	struct ibv_cq	       *send_cq;
+	struct ibv_cq	       *recv_cq;
+	struct ibv_srq	       *srq;
+	struct ibv_qp_cap	cap;
+	enum ibv_qp_type	qp_type;
+	int			sq_sig_all;
+
+	uint32_t		comp_mask;
+	struct ibv_pd	       *pd;
+	struct ibv_xrcd	       *xrcd;
+};
+
+enum ibv_qp_open_attr_mask {
+	IBV_QP_OPEN_ATTR_NUM		= 1 << 0,
+	IBV_QP_OPEN_ATTR_XRCD		= 1 << 1,
+	IBV_QP_OPEN_ATTR_CONTEXT	= 1 << 2,
+	IBV_QP_OPEN_ATTR_TYPE		= 1 << 3,
+	IBV_QP_OPEN_ATTR_RESERVED	= 1 << 4
+};
+
+struct ibv_qp_open_attr {
+	uint32_t		comp_mask;
+	uint32_t		qp_num;
+	struct ibv_xrcd        *xrcd;
+	void		       *qp_context;
+	enum ibv_qp_type	qp_type;
+};
+
+enum ibv_qp_attr_mask {
+	IBV_QP_STATE			= 1 << 	0,
+	IBV_QP_CUR_STATE		= 1 << 	1,
+	IBV_QP_EN_SQD_ASYNC_NOTIFY	= 1 << 	2,
+	IBV_QP_ACCESS_FLAGS		= 1 << 	3,
+	IBV_QP_PKEY_INDEX		= 1 << 	4,
+	IBV_QP_PORT			= 1 << 	5,
+	IBV_QP_QKEY			= 1 << 	6,
+	IBV_QP_AV			= 1 << 	7,
+	IBV_QP_PATH_MTU			= 1 << 	8,
+	IBV_QP_TIMEOUT			= 1 << 	9,
+	IBV_QP_RETRY_CNT		= 1 << 10,
+	IBV_QP_RNR_RETRY		= 1 << 11,
+	IBV_QP_RQ_PSN			= 1 << 12,
+	IBV_QP_MAX_QP_RD_ATOMIC		= 1 << 13,
+	IBV_QP_ALT_PATH			= 1 << 14,
+	IBV_QP_MIN_RNR_TIMER		= 1 << 15,
+	IBV_QP_SQ_PSN			= 1 << 16,
+	IBV_QP_MAX_DEST_RD_ATOMIC	= 1 << 17,
+	IBV_QP_PATH_MIG_STATE		= 1 << 18,
+	IBV_QP_CAP			= 1 << 19,
+	IBV_QP_DEST_QPN			= 1 << 20
+};
+
+enum ibv_qp_state {
+	IBV_QPS_RESET,
+	IBV_QPS_INIT,
+	IBV_QPS_RTR,
+	IBV_QPS_RTS,
+	IBV_QPS_SQD,
+	IBV_QPS_SQE,
+	IBV_QPS_ERR,
+	IBV_QPS_UNKNOWN
+};
+
+enum ibv_mig_state {
+	IBV_MIG_MIGRATED,
+	IBV_MIG_REARM,
+	IBV_MIG_ARMED
+};
+
+struct ibv_qp_attr {
+	enum ibv_qp_state	qp_state;
+	enum ibv_qp_state	cur_qp_state;
+	enum ibv_mtu		path_mtu;
+	enum ibv_mig_state	path_mig_state;
+	uint32_t		qkey;
+	uint32_t		rq_psn;
+	uint32_t		sq_psn;
+	uint32_t		dest_qp_num;
+	int			qp_access_flags;
+	struct ibv_qp_cap	cap;
+	struct ibv_ah_attr	ah_attr;
+	struct ibv_ah_attr	alt_ah_attr;
+	uint16_t		pkey_index;
+	uint16_t		alt_pkey_index;
+	uint8_t			en_sqd_async_notify;
+	uint8_t			sq_draining;
+	uint8_t			max_rd_atomic;
+	uint8_t			max_dest_rd_atomic;
+	uint8_t			min_rnr_timer;
+	uint8_t			port_num;
+	uint8_t			timeout;
+	uint8_t			retry_cnt;
+	uint8_t			rnr_retry;
+	uint8_t			alt_port_num;
+	uint8_t			alt_timeout;
+};
+
+enum ibv_wr_opcode {
+	IBV_WR_RDMA_WRITE,
+	IBV_WR_RDMA_WRITE_WITH_IMM,
+	IBV_WR_SEND,
+	IBV_WR_SEND_WITH_IMM,
+	IBV_WR_RDMA_READ,
+	IBV_WR_ATOMIC_CMP_AND_SWP,
+	IBV_WR_ATOMIC_FETCH_AND_ADD
+};
+
+enum ibv_send_flags {
+	IBV_SEND_FENCE		= 1 << 0,
+	IBV_SEND_SIGNALED	= 1 << 1,
+	IBV_SEND_SOLICITED	= 1 << 2,
+	IBV_SEND_INLINE		= 1 << 3
+};
+
+struct ibv_sge {
+	uint64_t		addr;
+	uint32_t		length;
+	uint32_t		lkey;
+};
+
+/* Send work request. Requests can be chained through `next`; `sg_list` points
+ * at `num_sge` scatter/gather entries.
+ * `wr` is a union of per-operation parameter sets (rdma / atomic / ud).
+ * NOTE(review): which member of `wr` applies presumably depends on `opcode`
+ * and the QP type - confirm against the verbs documentation.
+ * The trailing anonymous union overlays `qp_type.xrc.remote_srqn` and
+ * `xrc_remote_srq_num` on the same 32 bits. */
+struct ibv_send_wr {
+	uint64_t		wr_id;
+	struct ibv_send_wr     *next;
+	struct ibv_sge	       *sg_list;
+	int			num_sge;
+	enum ibv_wr_opcode	opcode;
+	int			send_flags;
+	uint32_t		imm_data;	/* in network byte order */
+	union {
+		struct {
+			uint64_t	remote_addr;
+			uint32_t	rkey;
+		} rdma;
+		struct {
+			uint64_t	remote_addr;
+			uint64_t	compare_add;
+			uint64_t	swap;
+			uint32_t	rkey;
+		} atomic;
+		struct {
+			struct ibv_ah  *ah;
+			uint32_t	remote_qpn;
+			uint32_t	remote_qkey;
+		} ud;
+	} wr;
+	union {
+		union {
+			struct {
+				uint32_t    remote_srqn;
+			} xrc;
+		} qp_type;
+
+		uint32_t		xrc_remote_srq_num;
+	};
+};
+
+struct ibv_recv_wr {
+	uint64_t		wr_id;
+	struct ibv_recv_wr     *next;
+	struct ibv_sge	       *sg_list;
+	int			num_sge;
+};
+
+struct ibv_mw_bind {
+	uint64_t		wr_id;
+	struct ibv_mr	       *mr;
+	void		       *addr;
+	size_t			length;
+	int			send_flags;
+	int			mw_access_flags;
+};
+
+/* Shared receive queue object. Besides the owning context / protection domain
+ * and handle, it carries a mutex, a condition variable and an
+ * `events_completed` counter, followed by padding and legacy XRC fields.
+ * NOTE(review): the padding exists to match another layout (ibv_srq_legacy per
+ * the comment below) - do not reorder or remove fields without confirming. */
+struct ibv_srq {
+	struct ibv_context     *context;
+	void		       *srq_context;
+	struct ibv_pd	       *pd;
+	uint32_t		handle;
+
+	pthread_mutex_t		mutex;
+	pthread_cond_t		cond;
+	uint32_t		events_completed;
+
+	/* The fields below are for source compatibility with legacy XRC;
+	*   padding is based on ibv_srq_legacy.
+	*/
+	uint32_t		xrc_srq_num_bin_compat_padding;
+	struct ibv_xrc_domain	*xrc_domain_bin_compat_padding;
+	struct ibv_cq	*xrc_cq_bin_compat_padding;
+	void		*ibv_srq_padding;
+
+	/* legacy fields */
+	uint32_t		xrc_srq_num;
+	struct ibv_xrc_domain	*xrc_domain;
+	struct ibv_cq		*xrc_cq;
+};
+
+/* Not in use in new API, needed for compilation as part of source compat layer */
+enum ibv_event_flags {
+	IBV_XRC_QP_EVENT_FLAG = 0x80000000,
+};
+
+
+
+struct ibv_qp {
+	struct ibv_context     *context;
+	void		       *qp_context;
+	struct ibv_pd	       *pd;
+	struct ibv_cq	       *send_cq;
+	struct ibv_cq	       *recv_cq;
+	struct ibv_srq	       *srq;
+	uint32_t		handle;
+	uint32_t		qp_num;
+	enum ibv_qp_state       state;
+	enum ibv_qp_type	qp_type;
+
+	pthread_mutex_t		mutex;
+	pthread_cond_t		cond;
+	uint32_t		events_completed;
+};
+
+struct ibv_comp_channel {
+	struct ibv_context     *context;
+	int			fd;
+	int			refcnt;
+};
+
+struct ibv_cq {
+	struct ibv_context     *context;
+	struct ibv_comp_channel *channel;
+	void		       *cq_context;
+	uint32_t		handle;
+	int			cqe;
+
+	pthread_mutex_t		mutex;
+	pthread_cond_t		cond;
+	uint32_t		comp_events_completed;
+	uint32_t		async_events_completed;
+};
+
+struct ibv_ah {
+	struct ibv_context     *context;
+	struct ibv_pd	       *pd;
+	uint32_t		handle;
+};
+
+enum ibv_flow_flags {
+	IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1,
+	IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1,
+};
+
+enum ibv_flow_attr_type {
+	/* steering according to rule specifications */
+	IBV_FLOW_ATTR_NORMAL		= 0x0,
+	/* default unicast and multicast rule -
+	 * receive all Eth traffic which isn't steered to any QP
+	 */
+	IBV_FLOW_ATTR_ALL_DEFAULT	= 0x1,
+	/* default multicast rule -
+	 * receive all Eth multicast traffic which isn't steered to any QP
+	 */
+	IBV_FLOW_ATTR_MC_DEFAULT	= 0x2,
+};
+
+enum ibv_flow_spec_type {
+	IBV_FLOW_SPEC_ETH	= 0x20,
+	IBV_FLOW_SPEC_IPV4	= 0x30,
+	IBV_FLOW_SPEC_TCP	= 0x40,
+	IBV_FLOW_SPEC_UDP	= 0x41,
+};
+
+struct ibv_flow_eth_filter {
+	uint8_t		dst_mac[6];
+	uint8_t		src_mac[6];
+	uint16_t	ether_type;
+	/*
+	 * same layout as 802.1q: prio 3, cfi 1, vlan id 12
+	 */
+	uint16_t	vlan_tag;
+};
+
+struct ibv_flow_spec_eth {
+	enum ibv_flow_spec_type  type;
+	uint16_t  size;
+	struct ibv_flow_eth_filter val;
+	struct ibv_flow_eth_filter mask;
+};
+
+struct ibv_flow_ipv4_filter {
+	uint32_t src_ip;
+	uint32_t dst_ip;
+};
+
+struct ibv_flow_spec_ipv4 {
+	enum ibv_flow_spec_type  type;
+	uint16_t  size;
+	struct ibv_flow_ipv4_filter val;
+	struct ibv_flow_ipv4_filter mask;
+};
+
+struct ibv_flow_tcp_udp_filter {
+	uint16_t dst_port;
+	uint16_t src_port;
+};
+
+struct ibv_flow_spec_tcp_udp {
+	enum ibv_flow_spec_type  type;
+	uint16_t  size;
+	struct ibv_flow_tcp_udp_filter val;
+	struct ibv_flow_tcp_udp_filter mask;
+};
+
+/* Tagged union over the flow specification variants. Every variant
+ * (ibv_flow_spec_eth / _ipv4 / _tcp_udp) begins with the same `type` and
+ * `size` members that `hdr` exposes, so `hdr` can be read before the concrete
+ * variant is known. */
+struct ibv_flow_spec {
+	union {
+		struct {
+			enum ibv_flow_spec_type	type;
+			uint16_t		size;
+		} hdr;
+		struct ibv_flow_spec_eth eth;
+		struct ibv_flow_spec_ipv4 ipv4;
+		struct ibv_flow_spec_tcp_udp tcp_udp;
+	};
+};
+
+struct ibv_flow_attr {
+	uint32_t comp_mask;
+	enum ibv_flow_attr_type type;
+	uint16_t size;
+	uint16_t priority;
+	uint8_t num_of_specs;
+	uint8_t port;
+	uint32_t flags;
+	/* Following are the optional layers according to user request
+	 * struct ibv_flow_spec_xxx [L2]
+	 * struct ibv_flow_spec_yyy [L3/L4]
+	 */
+};
+
+struct ibv_flow {
+	uint32_t	   comp_mask;
+	struct ibv_context *context;
+	uint32_t	   handle;
+};
+
+struct ibv_device;
+struct ibv_context;
+
+struct ibv_device_ops {
+	struct ibv_context *	(*alloc_context)(struct ibv_device *device, int cmd_fd);
+	void			(*free_context)(struct ibv_context *context);
+};
+
+enum {
+	IBV_SYSFS_NAME_MAX	= 64,
+	IBV_SYSFS_PATH_MAX	= 256
+};
+
+struct ibv_device {
+	struct ibv_device_ops	ops;
+	enum ibv_node_type	node_type;
+	enum ibv_transport_type	transport_type;
+	/* Name of underlying kernel IB device, eg "mthca0" */
+	char			name[IBV_SYSFS_NAME_MAX];
+	/* Name of uverbs device, eg "uverbs0" */
+	char			dev_name[IBV_SYSFS_NAME_MAX];
+	/* Path to infiniband_verbs class device in sysfs */
+	char			dev_path[IBV_SYSFS_PATH_MAX];
+	/* Path to infiniband class device in sysfs */
+	char			ibdev_path[IBV_SYSFS_PATH_MAX];
+};
+
+/* Extended device descriptor that embeds `struct ibv_device` as its first
+ * member and adds size information plus context init/uninit callbacks.
+ * NOTE(review): `device` presumably has to stay first so that an ibv_device
+ * pointer and its enclosing verbs_device share an address - confirm before
+ * changing the layout. */
+struct verbs_device {
+	struct ibv_device device; /* Must be first */
+	size_t	sz;
+	size_t	size_of_context;
+	int	(*init_context)(struct verbs_device *device,
+				struct ibv_context *ctx, int cmd_fd);
+	void	(*uninit_context)(struct verbs_device *device,
+				struct ibv_context *ctx);
+	/* future fields added here */
+};
+
+struct ibv_context_ops {	/* Table of per-context entry points; the inline wrappers later in this header call poll_cq, post_send and post_recv through it */
+	int			(*query_device)(struct ibv_context *context,
+					      struct ibv_device_attr *device_attr);
+	int			(*query_port)(struct ibv_context *context, uint8_t port_num,
+					      struct ibv_port_attr *port_attr);
+	struct ibv_pd *		(*alloc_pd)(struct ibv_context *context);
+	int			(*dealloc_pd)(struct ibv_pd *pd);
+	struct ibv_mr *		(*reg_mr)(struct ibv_pd *pd, void *addr, size_t length,
+					  int access);
+	struct ibv_mr *		(*rereg_mr)(struct ibv_mr *mr,
+					    int flags,
+					    struct ibv_pd *pd, void *addr,
+					    size_t length,
+					    int access);
+	int			(*dereg_mr)(struct ibv_mr *mr);
+	struct ibv_mw *		(*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type);
+	int			(*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw,
+					   struct ibv_mw_bind *mw_bind);
+	int			(*dealloc_mw)(struct ibv_mw *mw);
+	struct ibv_cq *		(*create_cq)(struct ibv_context *context, int cqe,
+					     struct ibv_comp_channel *channel,
+					     int comp_vector);
+	int			(*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc);	/* used by wrap_ibv_poll_cq(); a negative result is treated as failure there */
+	int			(*req_notify_cq)(struct ibv_cq *cq, int solicited_only);
+	void			(*cq_event)(struct ibv_cq *cq);
+	int			(*resize_cq)(struct ibv_cq *cq, int cqe);
+	int			(*destroy_cq)(struct ibv_cq *cq);
+	struct ibv_srq *	(*create_srq)(struct ibv_pd *pd,
+					      struct ibv_srq_init_attr *srq_init_attr);
+	int			(*modify_srq)(struct ibv_srq *srq,
+					      struct ibv_srq_attr *srq_attr,
+					      int srq_attr_mask);
+	int			(*query_srq)(struct ibv_srq *srq,
+					     struct ibv_srq_attr *srq_attr);
+	int			(*destroy_srq)(struct ibv_srq *srq);
+	int			(*post_srq_recv)(struct ibv_srq *srq,
+						 struct ibv_recv_wr *recv_wr,
+						 struct ibv_recv_wr **bad_recv_wr);
+	struct ibv_qp *		(*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr);
+	int			(*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+					    int attr_mask,
+					    struct ibv_qp_init_attr *init_attr);
+	int			(*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+					     int attr_mask);
+	int			(*destroy_qp)(struct ibv_qp *qp);
+	int			(*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr,
+					     struct ibv_send_wr **bad_wr);	/* used by ibv_post_send() and wrap_ibv_post_send() */
+	int			(*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr,
+					     struct ibv_recv_wr **bad_wr);	/* used by wrap_ibv_post_recv() */
+	struct ibv_ah *		(*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr);
+	int			(*destroy_ah)(struct ibv_ah *ah);
+	int			(*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid,
+						uint16_t lid);
+	int			(*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid,
+						uint16_t lid);
+	void			(*async_event)(struct ibv_async_event *event);
+};
+
+struct ibv_context {	/* State of one opened device: owning device, ops table, file descriptors and a mutex */
+	struct ibv_device      *device;
+	struct ibv_context_ops	ops;	/* reached as cq->context->ops / qp->context->ops by the inline wrappers in this header */
+	int			cmd_fd;
+	int			async_fd;
+	int			num_comp_vectors;
+	pthread_mutex_t		mutex;
+	void		       *abi_compat;	/* compared with __VERBS_ABI_IS_EXTENDED in the commented-out verbs_get_ctx() */
+};
+
+enum verbs_context_mask {	/* Single-bit flags; NOTE(review): presumably the bits kept in verbs_context.has_comp_mask - confirm */
+	VERBS_CONTEXT_XRCD         = (uint64_t)1 << 0,
+	VERBS_CONTEXT_SRQ          = (uint64_t)1 << 1,
+	VERBS_CONTEXT_QP           = (uint64_t)1 << 2,
+	VERBS_CONTEXT_RESERVED     = (uint64_t)1 << 3,
+	VERBS_CONTEXT_EXP	   = (uint64_t)1 << 62
+};
+
+struct verbs_context {	/* Extension block placed in front of an embedded struct ibv_context, which remains the last member */
+	/*  "grows up" - new fields go here, i.e. at the top, so existing members keep their distance from the end */
+	int (*_reserved_2) (void);
+	int (*destroy_flow) (struct ibv_flow *flow);
+	int (*_reserved_1) (void);
+	struct ibv_flow * (*create_flow) (struct ibv_qp *qp,
+					  struct ibv_flow_attr *flow_attr);
+	struct ibv_qp * (*open_qp)(struct ibv_context *context,
+			struct ibv_qp_open_attr *attr);
+	struct ibv_qp * (*create_qp_ex)(struct ibv_context *context,
+			struct ibv_qp_init_attr_ex *qp_init_attr_ex);
+	int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num);
+	struct ibv_srq * (*create_srq_ex)(struct ibv_context *context,
+			struct ibv_srq_init_attr_ex *srq_init_attr_ex);
+	struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context,
+			struct ibv_xrcd_init_attr *xrcd_init_attr);
+	int  (*close_xrcd)(struct ibv_xrcd *xrcd);
+	uint64_t has_comp_mask;
+	size_t   sz;	/* Must be immediately before struct ibv_context; verbs_set_ctx_op() tests it before writing an op */
+	struct ibv_context context;/* Must be last field in the struct */
+};
+
+/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/
+/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx)
+{
+	return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ?
+		NULL : container_of(ctx, struct verbs_context, context);
+}
+
+#define verbs_get_ctx_op(ctx, op) ({ \
+	struct verbs_context *_vctx = verbs_get_ctx(ctx); \
+	(!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \
+	!_vctx->op) ? NULL : _vctx; })*/
+
+#define verbs_set_ctx_op(_vctx, op, ptr) ({ /* store ptr into member `op` of _vctx, but only when the context can hold it */ \
+	struct verbs_context *vctx = _vctx; \
+	if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) /* skip NULL contexts and those whose sz does not reach back to `op` */ \
+		vctx->op = ptr; })
+
+static inline struct verbs_device *verbs_get_device(struct ibv_device *dev)
+{	/* A device that supplies ops.alloc_context yields NULL; any other is mapped to its enclosing struct verbs_device. */
+	if (dev->ops.alloc_context) return NULL;
+	return container_of(dev, struct verbs_device, device);
+}
+
+typedef enum ibv_return_enum
+{
+    IBV_SUCCESS = 0,                   //!< The operation was successful; wrap_ibv_post_send()/wrap_ibv_post_recv() compare the verb's return code with it
+} ibv_return_t;
+
+ncclResult_t wrap_ibv_symbols(void);
+ncclResult_t wrap_ibv_fork_init(void);
+ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices);
+ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list);
+const char *wrap_ibv_get_device_name(struct ibv_device *device);
+ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device);
+ncclResult_t wrap_ibv_close_device(struct ibv_context *context);
+ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event);
+ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event);
+ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr);
+ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
+ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
+ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
+ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context);
+ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
+ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
+struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access);
+ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
+ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
+ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
+ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
+ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq);
+static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) { /* polls cq via its context's ops table; *num_done is written only when the poll does not fail */
+  const int polled = cq->context->ops.poll_cq(cq, num_entries, wc); /* negative means failure, otherwise the number of wcs (possibly 0) */
+  if (polled >= 0) {
+    *num_done = polled;
+    return ncclSuccess;
+  }
+  WARN("Call to ibv_poll_cq() returned %d", polled);
+  return ncclSystemError;
+}
+ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
+ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
+ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
+static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { /* bare forwarder: hands back the post_send result unchanged, with no logging */
+  return qp->context->ops.post_send(qp, wr, bad_wr);
+}
+
+static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
+  /* Posts wr through the QP's context ops table and maps the verb status onto ncclResult_t. */
+  const int status = qp->context->ops.post_send(qp, wr, bad_wr); /* 0 on success, otherwise an errno value naming the failure reason */
+  if (status == IBV_SUCCESS) return ncclSuccess;
+  /* Non-zero status: log its textual form, then report a system error. */
+  WARN("ibv_post_send() failed with error %s", strerror(status));
+  return ncclSystemError;
+}
+
+static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) {
+  /* Posts the receive request through the QP's context ops table and maps the verb status onto ncclResult_t. */
+  const int status = qp->context->ops.post_recv(qp, wr, bad_wr); /* 0 on success, otherwise an errno value naming the failure reason */
+  if (status == IBV_SUCCESS) return ncclSuccess;
+  /* Non-zero status: log its textual form, then report a system error. */
+  WARN("ibv_post_recv() failed with error %s", strerror(status));
+  return ncclSystemError;
+}
+
+ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event);
+
+#endif //End include guard
diff --git a/src/3rd_party/nccl/src/include/nccl_net.h b/src/3rd_party/nccl/src/include/nccl_net.h
new file mode 100644
index 00000000..58c1335d
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/nccl_net.h
@@ -0,0 +1,64 @@
+/*************************************************************************
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NET_H_
+#define NCCL_NET_H_
+
+#include "nccl.h"
+
+#define NCCL_NET_MAJOR 1
+#define NCCL_NET_MINOR 0
+
+#define NCCL_NET_HANDLE_MAXSIZE 64
+
+#define NCCL_PTR_HOST 0x1
+#define NCCL_PTR_CUDA 0x2
+
+#define NCCL_MAX_SCORE 0x7
+
+typedef struct {  // Callback table implemented by a network transport; the active table is reached through the ncclNet pointer
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Return the number of network devices along with their scores relative to the
+  // current CUDA device. The per device score should be a value from 1-7 with a
+  // higher score representing a better choice for performance.
+  // This call should allocate the 'scores' array using malloc(3), and it
+  // will then be freed automatically by NCCL.
+  ncclResult_t (*devices)(int* ndev, int** scores);
+  // Return whether this device supports host pointers and/or CUDA pointers
+  // as data from the current GPU. Supported types should be composed with
+  // NCCL_PTR_HOST and NCCL_PTR_CUDA.
+  ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after the remote peer has called connect
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int type, void** request);
+  // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*irecv)(void* recvComm, void* data, int size, int type, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*flush)(void* recvComm, void* data, int size);
+  // Test whether a request is complete and return the size received (can be less than requested).
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_t;
+
+extern
+#ifdef __cplusplus
+"C"
+#endif
+ncclNet_t* ncclNet;
+
+#endif // end include guard
diff --git a/src/3rd_party/nccl/src/include/net.h b/src/3rd_party/nccl/src/include/net.h
new file mode 100644
index 00000000..d841419a
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/net.h
@@ -0,0 +1,40 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_INT_NET_H_
+#define NCCL_INT_NET_H_
+
+#include "nccl.h"
+#include "nccl_net.h"
+
+typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
+
+/* Socket Interface Selection type: two negative selector values */
+typedef enum { findSubnetIf   = -1,  // NOTE(review): presumably "choose the interface on the peer's subnet" - confirm against the socket transport
+    dontCareIf     = -2              // NOTE(review): presumably "any interface is acceptable" - confirm
+} ncclSocketIfSl_t;
+
+// Translation to external API: each helper forwards to the same-purpose callback of the ncclNet table; all but ncclNetName() wrap the call in NCCLCHECK
+static const char* ncclNetName() { return ncclNet->name; }
+static ncclResult_t ncclNetDevices(int* ndev, int** scores) { NCCLCHECK(ncclNet->devices(ndev, scores)); return ncclSuccess; }
+static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; }
+static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
+static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
+static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; }
+static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; }
+static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; }
+static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
+static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
+static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
+
+extern bool ncclIbSupport();
+extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
+extern ncclNet_t ncclNetIb;
+extern ncclNet_t ncclNetSocket;
+
+#endif
diff --git a/src/3rd_party/nccl/src/include/nvlink.h b/src/3rd_party/nccl/src/include/nvlink.h
new file mode 100644
index 00000000..7eb74c9f
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/nvlink.h
@@ -0,0 +1,155 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NVLINK_H_
+#define NCCL_NVLINK_H_
+
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "nvmlwrap.h"
+#include "topo.h"
+
+#define CONNECT_NVLINK 0x10
+#define CONNECT_NVSWITCH 0x100
+
+enum ncclNvLinkDeviceType {  // Kind of device found at a PCI bus id, as classified by ncclDeviceType() from the sysfs PCI class
+  ncclNvLinkDeviceGpu,       // PCI class 0x030200 or 0x030000
+  ncclNvLinkDeviceSwitch,    // PCI class 0x068000
+};
+
+static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
+  // Classifies the PCI device busId from its sysfs class file; ncclSystemError when the file is unreadable or the class is unknown.
+  char classPath[] =  "/sys/bus/pci/devices/0000:00:00.0/class";
+  memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);  // exactly 12 bytes of busId replace the placeholder id
+  char* rPath = realpath(classPath, NULL);
+  int fd = (rPath == NULL) ? -1 : open(rPath, O_RDONLY);
+  int savedErrno = errno;  // keep the realpath/open failure reason across free()
+  free(rPath);             // released on every path, including a failed open
+  if (fd == -1) { WARN("Call to open failed : %s", strerror(savedErrno)); return ncclSystemError; }
+  char pciClass[9] = "0x000000";  // pre-filled and NUL-terminated in case fewer than 8 bytes are read
+  ssize_t len = read(fd, pciClass, 8);
+  savedErrno = errno;
+  SYSCHECK(close(fd), "close");   // close before acting on the read result so the descriptor cannot leak
+  if (len == -1) { WARN("Call to read failed : %s", strerror(savedErrno)); return ncclSystemError; }
+  if (strcmp(pciClass, "0x068000") == 0) {
+    *type = ncclNvLinkDeviceSwitch;  // "Bridge / Other Bridge Device" (NVswitch)
+  } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
+      || strcmp(pciClass, "0x030000") == 0) {  // "VGA Controller" (GeForce)
+    *type = ncclNvLinkDeviceGpu;
+  } else {
+    return ncclSystemError;  // Ignore if we don't know what's on the other side.
+  }
+  return ncclSuccess;
+}
+
+/* Report the NVLink count ceiling for the current CUDA device, chosen from its major compute capability. */
+static ncclResult_t getMaxNvlinks(int* maxLinks) {
+  int device;
+  CUDACHECK(cudaGetDevice(&device));
+  int major;
+  CUDACHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
+  // Anything with a major version above 6 gets the larger limit.
+  if (major > 6) *maxLinks = 6;  // Volta
+  else *maxLinks = 4;            // Pascal
+  return ncclSuccess;
+}
+
+static int getNvlinkGpu(const char* busId1, const char* busId2) {
+  // Scores the NVLink connectivity of GPU busId1 towards busId2: CONNECT_NVSWITCH per link that ends on a switch if there
+  // is any, otherwise CONNECT_NVLINK per link that ends on busId2. Returns 0 when busId1 has no NVML handle.
+  int links = 0;
+  int nvswitch_links = 0;
+  int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
+  nvmlDevice_t nvmlDev;
+  ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId1, &nvmlDev);
+  if (res != ncclSuccess) return 0;
+  for(int l=0; l<maxNvLinks; ++l) {
+    // nvmlDeviceGetNvLinkCapability(NVML_NVLINK_CAP_P2P_SUPPORTED) would seem to
+    // report whether the NVLink connects to a peer GPU (versus a POWER CPU?). I
+    // don't know whether nvmlDeviceGetNvLinkRemotePciInfo() would succeed in
+    // the POWER CPU case, so it seems best to check this as well.
+    unsigned canP2P;
+    if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
+
+    // nvmlDeviceGetNvLinkRemotePciInfo() will return NVML_ERROR_NOT_SUPPORTED
+    // if the links don't exist, or are disabled. So checking for that return
+    // here would probably make the nvmlDeviceGetNvLinkCapability check above
+    // redundant. Presumably, we still need to check the P2P capability above,
+    // since even non-GPUs would possess PCI info.
+    nvmlPciInfo_t remoteProc;
+    if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
+
+    // Old versions of NVML return a lowercase PCI ID
+    char* p = remoteProc.busId;
+    for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+      if (p[c] == 0) break;
+      p[c] = toupper((unsigned char)p[c]);  // cast: ctype functions are undefined for negative char values
+    }
+
+    if (strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
+      links++;
+    } else {
+      // Make a lower case copy of the bus ID for calling ncclDeviceType
+      // PCI system path is in lower case
+      char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+      for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+        lowerId[c] = tolower((unsigned char)p[c]);
+        if (p[c] == 0) break;  // stop only after the terminator itself has been copied
+      }
+      lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE-1] = 0;  // still terminated if busId filled the whole buffer
+
+      // Determine if the remote side is NVswitch
+      enum ncclNvLinkDeviceType type;
+      if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
+        //TODO: we are making an assumption that all GPUs are connected to this switch
+        //This assumption may change for future architectures
+        nvswitch_links++;
+      }
+    }
+  }
+  return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
+}
+
+static int getNumNvlinks(const char* busId) {
+  // Counts the NVLinks of GPU busId that are P2P-capable and active. Returns CONNECT_NVSWITCH per link that ends on a
+  // switch if there is any, otherwise CONNECT_NVLINK per counted link; 0 when busId has no NVML handle.
+  nvmlDevice_t nvmlDev;
+  ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev);
+  if (res != ncclSuccess) return 0;
+
+  int nvlinks = 0, nvswitch_links = 0;
+  int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
+  for(int l=0; l<maxNvLinks; ++l) {
+    unsigned canP2P;
+    nvmlEnableState_t isActive;
+    if (wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess || !canP2P) continue;
+    if (wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess || isActive != NVML_FEATURE_ENABLED) continue;
+    nvlinks++;
+
+    nvmlPciInfo_t remoteProc;
+    if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
+
+    // Make a lower case copy of the bus ID for calling ncclDeviceType
+    // PCI system path is in lower case
+    char* p = remoteProc.busId;
+    char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+    for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+      lowerId[c] = tolower((unsigned char)p[c]);  // cast: ctype functions are undefined for negative char values
+      if (p[c] == 0) break;  // stop only after the terminator itself has been copied
+    }
+    lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE-1] = 0;  // still terminated if busId filled the whole buffer
+
+    // Determine if the remote side is NVswitch
+    enum ncclNvLinkDeviceType type;
+    if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
+      //TODO: we are making an assumption that all GPUs are connected to this switch
+      //This assumption may change for future architectures
+      nvswitch_links++;
+    }
+  }
+  return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*nvlinks;
+}
+#endif
diff --git a/src/3rd_party/nccl/src/include/nvmlwrap.h b/src/3rd_party/nccl/src/include/nvmlwrap.h
new file mode 100644
index 00000000..ddfd233d
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/nvmlwrap.h
@@ -0,0 +1,149 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NVMLWRAP_H_
+#define NCCL_NVMLWRAP_H_
+
+#include "core.h"
+
+//#define NVML_DIRECT 1
+#ifdef NVML_DIRECT
+#include "nvml.h"
+
+#define NVMLCHECK(cmd) do { /* evaluate cmd once; any status other than NVML_SUCCESS is logged and turned into ncclSystemError */ \
+    nvmlReturn_t e = cmd;                                \
+    if( e != NVML_SUCCESS ) {                            \
+      WARN("NVML failure '%s'", nvmlErrorString(e));     \
+      return ncclSystemError; /* returns from the function in which the macro is expanded */ \
+    }                                                    \
+} while(false)
+
+static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; } // NVML_DIRECT build: does nothing and always succeeds
+static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; }
+static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; }
+static ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { // this and the wrappers below forward to the same-named NVML call; NVMLCHECK maps a failure to ncclSystemError
+  NVMLCHECK(nvmlDeviceGetHandleByPciBusId(pciBusId, device));
+  return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
+  NVMLCHECK(nvmlDeviceGetIndex(device, index));
+  return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
+  NVMLCHECK(nvmlDeviceSetCpuAffinity(device));
+  return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
+  NVMLCHECK(nvmlDeviceClearCpuAffinity(device));
+  return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
+  NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
+  return ncclSuccess;
+}
+// Same name as the wrapper declared for the dynamically loaded build in the #else branch, so callers compile in both modes.
+static ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) { NVMLCHECK(nvmlDeviceGetPciInfo(device, pci)); return ncclSuccess; }
+// Historical misnomer kept for source compatibility: it fills in *pci from nvmlDeviceGetPciInfo and looks up no handle.
+static ncclResult_t wrapNvmlDeviceGetHandleByPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) { return wrapNvmlDeviceGetPciInfo(device, pci); }
+static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { // NVLink queries: direct forwards to NVML, with failures mapped to ncclSystemError by NVMLCHECK
+  NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
+  return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
+  NVMLCHECK(nvmlDeviceGetNvLinkRemotePciInfo(device, link, pci));
+  return ncclSuccess;
+}
+static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+                                                   nvmlNvLinkCapability_t capability, unsigned int *capResult) {
+  NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
+  return ncclSuccess;
+}
+#else
+// Dynamically handle dependencies on NVML
+
+/* Extracted from nvml.h */
+typedef struct nvmlDevice_st* nvmlDevice_t;
+#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE   16
+
+typedef enum nvmlEnableState_enum  // on/off state as reported by wrapNvmlDeviceGetNvLinkState(); getNumNvlinks() counts a link only when it is NVML_FEATURE_ENABLED
+{
+    NVML_FEATURE_DISABLED    = 0,     //!< Feature disabled
+    NVML_FEATURE_ENABLED     = 1      //!< Feature enabled
+} nvmlEnableState_t;
+
+typedef enum nvmlNvLinkCapability_enum  // capability selector passed to wrapNvmlDeviceGetNvLinkCapability(); the helpers in nvlink.h query NVML_NVLINK_CAP_P2P_SUPPORTED
+{
+    NVML_NVLINK_CAP_P2P_SUPPORTED = 0,     // P2P over NVLink is supported
+    NVML_NVLINK_CAP_SYSMEM_ACCESS = 1,     // Access to system memory is supported
+    NVML_NVLINK_CAP_P2P_ATOMICS   = 2,     // P2P atomics are supported
+    NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3,     // System memory atomics are supported
+    NVML_NVLINK_CAP_SLI_BRIDGE    = 4,     // SLI is supported over this link
+    NVML_NVLINK_CAP_VALID         = 5,     // Link is supported on this device
+    // should be last
+    NVML_NVLINK_CAP_COUNT
+} nvmlNvLinkCapability_t;
+
+typedef enum nvmlReturn_enum  // NVML status codes as extracted from nvml.h; NVML_SUCCESS is the only value that denotes success
+{
+    NVML_SUCCESS = 0,                   //!< The operation was successful
+    NVML_ERROR_UNINITIALIZED = 1,       //!< NVML was not first initialized with nvmlInit()
+    NVML_ERROR_INVALID_ARGUMENT = 2,    //!< A supplied argument is invalid
+    NVML_ERROR_NOT_SUPPORTED = 3,       //!< The requested operation is not available on target device
+    NVML_ERROR_NO_PERMISSION = 4,       //!< The current user does not have permission for operation
+    NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
+    NVML_ERROR_NOT_FOUND = 6,           //!< A query to find an object was unsuccessful
+    NVML_ERROR_INSUFFICIENT_SIZE = 7,   //!< An input argument is not large enough
+    NVML_ERROR_INSUFFICIENT_POWER = 8,  //!< A device's external power cables are not properly attached
+    NVML_ERROR_DRIVER_NOT_LOADED = 9,   //!< NVIDIA driver is not loaded
+    NVML_ERROR_TIMEOUT = 10,            //!< User provided timeout passed
+    NVML_ERROR_IRQ_ISSUE = 11,          //!< NVIDIA Kernel detected an interrupt issue with a GPU
+    NVML_ERROR_LIBRARY_NOT_FOUND = 12,  //!< NVML Shared Library couldn't be found or loaded
+    NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
+    NVML_ERROR_CORRUPTED_INFOROM = 14,  //!< infoROM is corrupted
+    NVML_ERROR_GPU_IS_LOST = 15,        //!< The GPU has fallen off the bus or has otherwise become inaccessible
+    NVML_ERROR_RESET_REQUIRED = 16,     //!< The GPU requires a reset before it can be used again
+    NVML_ERROR_OPERATING_SYSTEM = 17,   //!< The GPU control device has been blocked by the operating system/cgroups
+    NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18,   //!< RM detects a driver/library version mismatch
+    NVML_ERROR_IN_USE = 19,             //!< An operation cannot be performed because the GPU is currently in use
+    NVML_ERROR_UNKNOWN = 999            //!< An internal driver error occurred
+} nvmlReturn_t;
+
+typedef struct nvmlPciInfo_st  // PCI identification record filled in by the wrapNvmlDeviceGet*PciInfo() wrappers
+{
+    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator)
+    unsigned int domain;             //!< The PCI domain on which the device's bus resides, 0 to 0xffff
+    unsigned int bus;                //!< The bus on which the device resides, 0 to 0xff
+    unsigned int device;             //!< The device's id on the bus, 0 to 31
+    unsigned int pciDeviceId;        //!< The combined 16-bit device id and 16-bit vendor id
+
+    // Added in NVML 2.285 API
+    unsigned int pciSubSystemId;     //!< The 32-bit Sub System Device ID
+
+    // NVIDIA reserved for internal use only
+    unsigned int reserved0;
+    unsigned int reserved1;
+    unsigned int reserved2;
+    unsigned int reserved3;
+} nvmlPciInfo_t;
+/* End of nvml.h */
+
+ncclResult_t wrapNvmlSymbols(void);
+
+ncclResult_t wrapNvmlInit(void);
+ncclResult_t wrapNvmlShutdown(void);
+ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
+ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
+ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
+ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
+ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
+ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
+ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
+ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
+ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+                                                   nvmlNvLinkCapability_t capability, unsigned int *capResult);
+#endif // NVML_DIRECT
+
+#endif // End include guard
diff --git a/src/3rd_party/nccl/src/include/param.h b/src/3rd_party/nccl/src/include/param.h
new file mode 100644
index 00000000..44c1b9a5
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/param.h
@@ -0,0 +1,81 @@
+/*************************************************************************
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PARAM_H_
+#define NCCL_PARAM_H_
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <pwd.h>
+
+static const char* userHomeDir() {  // Home directory from the passwd entry of the calling uid, or NULL when no entry is found.
+  struct passwd *entry = getpwuid(getuid());
+  return entry ? entry->pw_dir : NULL;
+}
+
+static void setEnvFile(const char* fileName) {
+  // Applies each NAME=VALUE line of fileName with setenv(..., 0), so variables already set are kept; a missing file and lines without '=' are ignored.
+  FILE * file = fopen(fileName, "r");
+  if (file == NULL) return;
+  char *line = NULL;
+  char envVar[1024];
+  char envValue[1024];
+  size_t n = 0;
+  ssize_t read;
+  while ((read = getline(&line, &n, file)) != -1) {
+    if (line[read-1] == '\n') line[read-1] = '\0';
+    int s=0; // Env Var Size
+    while (line[s] != '\0' && line[s] != '=') s++;
+    if (line[s] == '\0') continue;
+    const int varLen = std::min((int)sizeof(envVar)-1, s);  // clamp so the terminator below stays inside envVar
+    strncpy(envVar, line, varLen);
+    envVar[varLen] = '\0';
+    strncpy(envValue, line+s+1, sizeof(envValue)-1);
+    envValue[sizeof(envValue)-1] = '\0';  // strncpy leaves the buffer unterminated when it truncates
+    setenv(envVar, envValue, 0);
+  }
+  free(line);  // getline's buffer; free(NULL) is a no-op
+  fclose(file);
+}
+
+static void initEnv() {
+  // Seeds the environment from ~/.nccl.conf, then /etc/nccl.conf; setEnvFile() never replaces an existing variable, so the earlier source wins.
+  char confFilePath[1024];
+  const char * userDir = userHomeDir();
+  if (userDir) {
+    int len = snprintf(confFilePath, sizeof(confFilePath), "%s/.nccl.conf", userDir);
+    if (len > 0 && len < (int)sizeof(confFilePath)) setEnvFile(confFilePath);  // skip the per-user file if its path was truncated
+  }
+  setEnvFile("/etc/nccl.conf");
+}
+
+
+#define NCCL_PARAM(name, env, default_value) /* defines ncclParam<name>(): the int64 setting taken from NCCL_<env> if set and parsable, else default_value */ \
+pthread_mutex_t ncclParamMutex##name = PTHREAD_MUTEX_INITIALIZER; \
+int64_t ncclParam##name() { \
+  static_assert(default_value != -1LL, "default value cannot be -1"); /* -1 is the "not resolved yet" sentinel below */ \
+  static int64_t value = -1LL; \
+  pthread_mutex_lock(&ncclParamMutex##name); \
+  if (value == -1LL) { /* resolve on first use; later calls return the stored value */ \
+    value = default_value; \
+    char* str = getenv("NCCL_" env); \
+    if (str && strlen(str) > 0) { \
+      errno = 0; \
+      int64_t v = strtoll(str, NULL, 0); /* base 0: decimal, 0x-prefixed hex or 0-prefixed octal */ \
+      if (errno) { \
+        INFO(ALL,"Invalid value %s for %s, using default %lld.", str, "NCCL_" env, (long long)value); \
+      } else { \
+        value = v; \
+        INFO(ALL,"%s set by environment to %lld.", "NCCL_" env, (long long)value);  \
+      } \
+    } \
+  } \
+  pthread_mutex_unlock(&ncclParamMutex##name); \
+  return value; \
+}
+
+#endif
diff --git a/src/3rd_party/nccl/src/include/ring.h b/src/3rd_party/nccl/src/include/ring.h
new file mode 100644
index 00000000..fa5e0995
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/ring.h
@@ -0,0 +1,14 @@
+/*************************************************************************
+ * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_RING_H_
+#define NCCL_RING_H_
+#include "core.h"
+
+ncclResult_t initRing(struct ncclComm* comm, int ringid);
+ncclResult_t freeRing(struct ncclRing* ring);
+
+#endif
diff --git a/src/3rd_party/nccl/src/include/rings.h b/src/3rd_party/nccl/src/include/rings.h
new file mode 100644
index 00000000..751846c6
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/rings.h
@@ -0,0 +1,17 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_RINGS_H_
+#define NCCL_RINGS_H_
+
+static int getDefaultThreads() {  // 128 when ncclCudaCompCap() reports 3 (Kepler, whose rings are doubled later), 256 otherwise.
+  if (ncclCudaCompCap() == 3) return 128;
+  return 256;
+}
+
+ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next);
+
+#endif
diff --git a/src/3rd_party/nccl/src/include/shm.h b/src/3rd_party/nccl/src/include/shm.h
new file mode 100644
index 00000000..ce46a16c
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/shm.h
@@ -0,0 +1,76 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_SHM_H_
+#define NCCL_SHM_H_
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+// Maps POSIX shared memory shmname (shmsize bytes) and registers it with CUDA as mapped host memory; create != 0 also sizes and zeroes it. On success *shmPtr is the host mapping and *devShmPtr its device pointer.
+static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPtr, void** devShmPtr, int create) {
+  *shmPtr = NULL;
+  int fd = shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
+  if (fd == -1) {
+    WARN("shm_open failed to open %s : %s", shmname, strerror(errno));
+    return ncclSystemError;
+  }
+
+  if (create) {
+    int res = posix_fallocate(fd, 0, shmsize);
+    if (res != 0) {
+      WARN("Unable to allocate shared memory (%d bytes) : %s", shmsize, strerror(res));
+      shm_unlink(shmname);
+      close(fd);
+      return ncclSystemError;
+    }
+  }
+
+  void *ptr = mmap(NULL, shmsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  int mmapErrno = errno;  // saved now because close() below may overwrite errno
+  close(fd);
+  if (ptr == MAP_FAILED) {
+    WARN("failure in mmap of %s (size %d) : %s", shmname, shmsize, strerror(mmapErrno));
+    if (create) shm_unlink(shmname);  // only the creating side removes the name, as in the CUDA failure paths
+    return ncclSystemError;
+  }
+  if (create) memset(ptr, 0, shmsize);
+
+  cudaError_t e;
+  if ((e=cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped)) != cudaSuccess) {
+    WARN("failed to register host buffer %p : %s", ptr, cudaGetErrorString(e));
+    if (create) shm_unlink(shmname);
+    munmap(ptr, shmsize);
+    return ncclUnhandledCudaError;
+  }
+  if ((e=cudaHostGetDevicePointer(devShmPtr, ptr, 0)) != cudaSuccess) {
+    WARN("failed to get device pointer for local shmem %p : %s", ptr, cudaGetErrorString(e));
+    if (create) shm_unlink(shmname);
+    cudaHostUnregister(ptr);  // undo the registration before the pages are unmapped
+    munmap(ptr, shmsize);
+    return ncclUnhandledCudaError;
+  }
+  *shmPtr = ptr;
+  return ncclSuccess;
+}
+
+static ncclResult_t shmUnlink(const char* shmname) {  // Remove the shared-memory name; a NULL name is a no-op.
+  if (shmname != NULL) SYSCHECK(shm_unlink(shmname), "shm_unlink");
+  return ncclSuccess;
+}
+
+static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) {
+  // Unregister the host mapping from CUDA first, then unmap it from this process.
+  // devShmPtr is not used by this function.
+  CUDACHECK(cudaHostUnregister(shmPtr));
+  if (munmap(shmPtr, shmsize) == 0) return ncclSuccess;
+  WARN("munmap of shared memory failed");
+  return ncclSystemError;
+}
+
+#endif
diff --git a/src/3rd_party/nccl/src/include/socket.h b/src/3rd_party/nccl/src/include/socket.h
new file mode 100644
index 00000000..533cacc9
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/socket.h
@@ -0,0 +1,409 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_SOCKET_H_
+#define NCCL_SOCKET_H_
+
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <netinet/tcp.h>
+#include <unistd.h>
+#include <netdb.h>
+#include <ifaddrs.h>
+#include <net/if.h>
+#include "utils.h"
+
+#define MAX_IFS 16
+#define MAX_IF_NAME_SIZE 16
+#define SLEEP_INT     1000  // sleep interval in usec
+#define RETRY_TIMES   2e4   // retry times before reporting a timeout (20 sec)
+
+/* Common socket address storage structure for IPv4/IPv6 */
+union socketAddress {
+  struct sockaddr sa;        // generic view: used to read sa_family and to pass the address to socket calls
+  struct sockaddr_in sin;    // IPv4 view
+  struct sockaddr_in6 sin6;  // IPv6 view
+};
+
+/* Format a string representation of a (struct sockaddr *) socket address using getnameinfo()
+ *
+ * Output: "IPv4/IPv6 address<port>"
+ */
+static inline const char *socketToString(struct sockaddr *saddr, char *buf) {
+  if (buf == NULL || saddr == NULL) return NULL;
+  if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
+  char host[NI_MAXHOST], service[NI_MAXSERV];
+  // A failed lookup must not leave uninitialized bytes to be formatted below; the result is then "<>"
+  if (getnameinfo(saddr, sizeof(union socketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV) != 0) host[0] = service[0] = '\0';
+  sprintf(buf, "%s<%s>", host, service);  // buf must be large enough for host, '<', service, '>' and the terminator
+  return buf;
+
+static inline uint16_t socketToPort(struct sockaddr *saddr) {  // port in host byte order; unsigned so that ports above 32767 are not reported as negative
+  return ntohs(saddr->sa_family == AF_INET ? ((struct sockaddr_in*)saddr)->sin_port : ((struct sockaddr_in6*)saddr)->sin6_port);
+}
+
+/* Allow the user to force the IPv4/IPv6 interface selection */
+static inline int envSocketFamily(void) {
+  // NCCL_SOCKET_FAMILY may be "AF_INET" (IPv4) or "AF_INET6" (IPv6).
+  // Unset or any other value yields -1: the family is not forced and the
+  // first one found will be used.
+  const char* requested = getenv("NCCL_SOCKET_FAMILY");
+  if (requested != NULL) {
+    if (strcmp(requested, "AF_INET") == 0) return AF_INET;
+    if (strcmp(requested, "AF_INET6") == 0) return AF_INET6;
+  }
+
+  return -1;
+}
+
+// Collects up to maxIfs IPv4/IPv6 interfaces accepted by prefixList (a leading '^' inverts the match) into names/addrs and returns how many were stored.
+static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
+  char line[1024];
+  struct netIf userIfs[MAX_IFS];
+  bool searchNot = prefixList && prefixList[0] == '^';
+  int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
+  int found = 0;
+  struct ifaddrs *interfaces, *interface;
+  if (getifaddrs(&interfaces) != 0) { WARN("Net : getifaddrs failed : %s", strerror(errno)); return 0; }  // no list to walk or free
+  for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
+    if (interface->ifa_addr == NULL) continue;
+
+    /* We only support IPv4 & IPv6 */
+    int family = interface->ifa_addr->sa_family;
+    if (family != AF_INET && family != AF_INET6)
+      continue;
+
+    TRACE(INIT|NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
+
+    /* Allow the caller to force the socket family type */
+    if (sock_family != -1 && family != sock_family)
+      continue;
+
+    /* We also need to skip IPv6 loopback interfaces */
+    if (family == AF_INET6) {
+      struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
+      if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
+    }
+
+    // check against user specified interfaces
+    if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs) ^ searchNot)) {
+      continue;
+    }
+
+    // Check that this interface has not already been saved
+    // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
+    bool duplicate = false;
+    for (int i = 0; i < found; i++) {
+      if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
+    }
+
+    if (!duplicate) {
+      // Store the interface name; strncpy() leaves a slot-filling name unterminated, so terminate it explicitly
+      strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
+      if (maxIfNameSize > 0) names[(found+1)*maxIfNameSize-1] = '\0';
+      int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);  // Store the IP address
+      memcpy(addrs+found, interface->ifa_addr, salen);
+      INFO(INIT|NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
+      found++;
+    }
+  }
+
+  freeifaddrs(interfaces);
+  return found;
+}
+
+// True when local_if's address and remote share a family and the same netmask-masked prefix (plus the scope id for IPv6).
+static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
+  /* Check family first */
+  int family = local_if.ifa_addr->sa_family;
+  if (family != remote.sa.sa_family) return false;
+  if (local_if.ifa_netmask == NULL) return false;  // getifaddrs() may leave the netmask unset; nothing to compare with
+
+  if (family == AF_INET) {
+    struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
+    struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
+    struct sockaddr_in& remote_addr = remote.sin;
+    struct in_addr local_subnet, remote_subnet;
+    local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
+    remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
+    return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
+  } else if (family == AF_INET6) {
+    struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
+    struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
+    struct sockaddr_in6& remote_addr = remote.sin6;
+    struct in6_addr& local_in6 = local_addr->sin6_addr;
+    struct in6_addr& mask_in6 = mask->sin6_addr;
+    struct in6_addr& remote_in6 = remote_addr.sin6_addr;
+    bool same = true;
+    int len = 16;  //IPv6 address is 16 unsigned char
+    for (int c = 0; c < len; c++) {  //Network byte order is big-endian
+      char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
+      char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
+      if (c1 ^ c2) {
+        same = false;
+        break;
+      }
+    }
+    // At last, we need to compare scope id
+    // Two Link-type addresses can have the same subnet address even though they are not in the same scope
+    // For Global type, this field is 0, so a comparison wouldn't matter
+    same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
+    return same;
+  } else {
+    WARN("Net : Unsupported address family type");
+    return false;
+  }
+}
+
+// Stores the first local IPv4/IPv6 interface that is in the same subnet as remoteAddr and returns 1, or returns 0 if there is none.
+static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) {
+  char line[1024], line_a[1024];
+  int found = 0;
+  struct ifaddrs *interfaces, *interface;
+  if (getifaddrs(&interfaces) != 0) { WARN("Net : getifaddrs failed : %s", strerror(errno)); return 0; }  // no list to walk or free
+  for (interface = interfaces; interface && !found; interface = interface->ifa_next) {
+    if (interface->ifa_addr == NULL) continue;
+    /* We only support IPv4 & IPv6 */
+    int family = interface->ifa_addr->sa_family;
+    if (family != AF_INET && family != AF_INET6)
+      continue;
+
+    // keep only interfaces whose subnet contains the remote address
+    if (!matchSubnet(*interface, remoteAddr)) {
+      continue;
+    }
+
+    // Store the local IP address
+    int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
+    memcpy(localAddrs+found, interface->ifa_addr, salen);
+
+    // Store the interface name; strncpy() leaves a slot-filling name unterminated, so terminate it explicitly
+    strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
+    if (ifNameMaxSize > 0) ifNames[(found+1)*ifNameMaxSize-1] = '\0';
+    INFO(INIT|NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
+    found++;
+    if (found == maxIfs) break;
+  }
+
+  if (found == 0) {
+    WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr.sa), line_a));
+  }
+  freeifaddrs(interfaces);
+  return found;
+}
+
+// Parses "<IPv4_or_hostname>:<port>" or "[<IPv6>%<interface>]:<port>" (the %<interface> part is optional) into *ua.
+static ncclResult_t GetSocketAddrFromString(union socketAddress* ua, const char* ip_port_pair) {
+  if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
+    WARN("Net : string is null");
+    return ncclInvalidArgument;
+  }
+
+  bool ipv6 = ip_port_pair[0] == '[';
+  /* Construct the sockaddress structure */
+  if (!ipv6) {
+    struct netIf ni;
+    // parse <ip_or_hostname>:<port> string, expect one pair
+    if (parseStringList(ip_port_pair, &ni, 1) != 1) {
+      WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
+      return ncclInvalidArgument;
+    }
+
+    struct addrinfo hints, *p;
+    int rv;
+    memset(&hints, 0, sizeof(hints));
+    hints.ai_family = AF_UNSPEC;
+    hints.ai_socktype = SOCK_STREAM;
+
+    if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
+      WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
+      return ncclInvalidArgument;
+    }
+
+    // use the first
+    if (p->ai_family == AF_INET) {
+      struct sockaddr_in& sin = ua->sin;
+      memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
+      sin.sin_family = AF_INET;                        // IPv4
+      sin.sin_port = htons(ni.port);                   // port
+    } else if (p->ai_family == AF_INET6) {
+      struct sockaddr_in6& sin6 = ua->sin6;
+      memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
+      sin6.sin6_family = AF_INET6;                     // IPv6
+      sin6.sin6_port = htons(ni.port);                 // port
+      sin6.sin6_flowinfo = 0;                          // needed by IPv6, but possibly obsolete
+      sin6.sin6_scope_id = 0;                          // should be global scope, set to 0
+    } else {
+      WARN("Net : unsupported IP family");
+      freeaddrinfo(p); // do not leak the lookup result on this error path
+      return ncclInvalidArgument;
+    }
+
+    freeaddrinfo(p); // all done with this structure
+
+  } else {
+    int i, j = -1, len = strlen(ip_port_pair);
+    for (i = 1; i < len; i++) {
+      if (ip_port_pair[i] == '%') j = i;
+      if (ip_port_pair[i] == ']') break;
+    }
+    bool global_scope = (j == -1 ? true : false);     // If no % found, global scope; otherwise, link scope
+    int ipLen = (global_scope ? i : j) - 1;            // characters between '[' and ']' (or '%')
+    int ifLen = global_scope ? 0 : i-j-1;              // characters between '%' and ']'
+    // Reject a missing ']' and any piece too long for its buffer (each buffer keeps room for the terminator)
+    if (i == len || ipLen >= NI_MAXHOST || ifLen >= IFNAMSIZ || len-i-2 >= NI_MAXSERV) {
+      WARN("Net : No valid [IPv6]:port pair found");
+      return ncclInvalidArgument;
+    }
+    char ip_str[NI_MAXHOST] = "", port_str[NI_MAXSERV] = "", if_name[IFNAMSIZ] = "";  // zero-filled
+    strncpy(ip_str, ip_port_pair+1, ipLen);
+    strncpy(port_str, ip_port_pair+i+2, len-i-1);
+    int port = atoi(port_str);
+    if (!global_scope) strncpy(if_name, ip_port_pair+j+1, ifLen); // If not global scope, we need the intf name
+
+    struct sockaddr_in6& sin6 = ua->sin6;
+    sin6.sin6_family = AF_INET6;                       // IPv6
+    if (inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)) != 1) { WARN("Net : No valid [IPv6]:port pair found"); return ncclInvalidArgument; }
+    sin6.sin6_port = htons(port);                      // port
+    sin6.sin6_flowinfo = 0;                            // needed by IPv6, but possibly obsolete
+    sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name);  // 0 if global scope; intf index if link scope
+  }
+  return ncclSuccess;
+}
+
+// Picks the interfaces to use: NCCL_SOCKET_IFNAME if set, otherwise a fixed search order. Returns the number stored in ifNames/ifAddrs.
+static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
+  int nIfs = 0;
+  // Allow user to force the INET socket family selection
+  int sock_family = envSocketFamily();
+  // User specified interface
+  char* env = getenv("NCCL_SOCKET_IFNAME");
+  if (env && strlen(env) > 1) {
+    // Specified by user : find or fail
+    nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+  } else {
+    // Try to automatically pick the right one, starting with IB
+    nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+    // else see if we can get some hint from COMM ID
+    if (nIfs == 0) {
+      char* commId = getenv("NCCL_COMM_ID");
+      if (commId && strlen(commId) > 1) {
+        // Try to find interface that is in the same subnet as the IP in comm id; skip if the id does not parse, as idAddr would be uninitialized
+        union socketAddress idAddr;
+        if (GetSocketAddrFromString(&idAddr, commId) == ncclSuccess)
+          nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, idAddr, ifNameMaxSize, maxIfs);
+      }
+    }
+    // Then look for anything else (but not docker or lo)
+    if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+    // Finally look for docker, then lo.
+    if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+    if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+  }
+  return nIfs;
+}
+
+// Creates a TCP socket bound to *localAddr, puts it in listen mode, stores it in *fd and writes the bound address back to *localAddr.
+static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) {
+  int family = localAddr->sa.sa_family;  /* IPv4/IPv6 support */
+  int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
+
+  /* Create socket and bind it to a port */
+  int sockfd = socket(family, SOCK_STREAM, 0);
+  if (sockfd == -1) {
+    WARN("Net : Socket creation failed : %s", strerror(errno));
+    return ncclSystemError;
+  }
+
+  if (socketToPort(&localAddr->sa)) {
+    // Port is forced by env. Make sure we get the port. Option names are not bit flags, so each option needs its own call.
+    int opt = 1;
+    SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
+    SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
+  }
+
+  // localAddr port should be 0 (Any port)
+  SYSCHECK(bind(sockfd, &localAddr->sa, salen), "bind");
+  /* Get the assigned Port */
+  socklen_t size = salen;
+  SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname");
+
+#ifdef ENABLE_TRACE
+  char line[1024];
+  TRACE(INIT|NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
+#endif
+
+  /* Put the socket in listen mode */
+  SYSCHECK(listen(sockfd, 128), "listen");
+  *fd = sockfd;
+  return ncclSuccess;
+}
+
+static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {  // Open a TCP socket in *fd and connect it to remoteAddr.
+  /* IPv4/IPv6 support */
+  int family = remoteAddr->sa.sa_family;
+  int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
+
+  /* Connect to a hostname / port */
+  *fd = socket(family, SOCK_STREAM, 0);
+  if (*fd == -1) {
+    WARN("Net : Socket creation failed : %s", strerror(errno));
+    return ncclSystemError;
+  }
+
+  const int one = 1;  // non-zero value enables TCP_NODELAY below
+  SYSCHECK(setsockopt(*fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");  // NOTE(review): if SYSCHECK returns on failure, *fd stays open — confirm against the macro
+
+  /*  const int bufsize = 128*1024;
+    SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
+    SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
+
+#ifdef ENABLE_TRACE
+  char line[1024];
+  TRACE(INIT|NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line));
+#endif
+
+  SYSCHECKNTIMES(connect(*fd, &remoteAddr->sa, salen), "connect", RETRY_TIMES, SLEEP_INT, ECONNREFUSED);  // presumably retried on ECONNREFUSED up to RETRY_TIMES times, SLEEP_INT usec apart — confirm against the macro
+  return ncclSuccess;
+}
+
+static ncclResult_t socketReceive(int fd, void* ptr, int size) {
+  // Keep calling recv() until size bytes have been stored at ptr.
+  char* cursor = (char*)ptr;
+  for (int remaining = size; remaining > 0; ) {
+    int recvsize;
+    SYSCHECKVAL(recv(fd, cursor, remaining, 0), "recv", recvsize);
+    if (recvsize == 0) {
+      WARN("Net : Connection closed by remote peer");
+      return ncclSystemError;
+    }
+    if (recvsize == -1) {
+      INFO(NET,"Recv : got retcode %d, retrying", errno);
+      continue;
+    }
+    cursor += recvsize;
+    remaining -= recvsize;
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t socketSend(int fd, void* ptr, int size) {
+  // Keep calling write() until size bytes from ptr have been handed to the socket.
+  char* cursor = (char*)ptr;
+  for (int remaining = size; remaining > 0; ) {
+    int sendsize;
+    SYSCHECKVAL(write(fd, cursor, remaining), "write", sendsize);
+    if (sendsize == -1) {
+      INFO(NET,"Send : got retcode %d, retrying", errno);
+      continue;
+    }
+    cursor += sendsize;
+    remaining -= sendsize;
+  }
+  return ncclSuccess;
+}
+
+#endif
diff --git a/src/3rd_party/nccl/src/include/topo.h b/src/3rd_party/nccl/src/include/topo.h
new file mode 100644
index 00000000..24d7e9d2
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/topo.h
@@ -0,0 +1,83 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TOPO_H_
+#define NCCL_TOPO_H_
+
+#include "nccl.h"
+#include <ctype.h>
+
+#define MAXPATHSIZE 1024
+
+// Resolves the sysfs path of CUDA device cudaDev; on success *path is allocated by realpath() and must be freed by the caller.
+static ncclResult_t getCudaPath(int cudaDev, char** path) {
+  char busId[16];
+  CUDACHECK(cudaDeviceGetPCIBusId(busId, 16, cudaDev));
+  for (int i=0; i<16 && busId[i] != '\0'; i++) busId[i] = tolower((unsigned char)busId[i]);  // stop at the terminator
+  char busPath[] = "/sys/class/pci_bus/0000:00/device";
+  memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, sizeof("0000:00")-1);
+  char* cudaRpath = realpath(busPath, NULL);
+  if (cudaRpath == NULL) { WARN("Could not find real path of %s", busPath); return ncclSystemError; }
+  char pathname[MAXPATHSIZE];
+  snprintf(pathname, MAXPATHSIZE, "%s/%s", cudaRpath, busId);  // always terminated, truncates instead of overflowing
+  free(cudaRpath);
+  *path = realpath(pathname, NULL);
+  if (*path == NULL) {
+    WARN("Could not find real path of %s", pathname);
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
+
+static ncclResult_t getMlxPath(char* ibName, char** path) {
+  // Resolve /sys/class/infiniband/<ibName>/device to its canonical path.
+  char sysfsEntry[MAXPATHSIZE];
+  snprintf(sysfsEntry, MAXPATHSIZE, "/sys/class/infiniband/%s/device", ibName);
+  char* resolved = realpath(sysfsEntry, NULL);
+  *path = resolved;
+  if (resolved != NULL) return ncclSuccess;
+  WARN("Could not find real path of %s", sysfsEntry);
+  return ncclSystemError;
+}
+
+static ncclResult_t getSockPath(char* ifName, char** path) {
+  // Resolve /sys/class/net/<ifName>/device to its canonical path.
+  char sysfsEntry[MAXPATHSIZE];
+  snprintf(sysfsEntry, MAXPATHSIZE, "/sys/class/net/%s/device", ifName);
+  char* resolved = realpath(sysfsEntry, NULL);
+  *path = resolved;
+  if (resolved != NULL) return ncclSuccess;
+  INFO(NET|INIT, "Could not find real path of %s", sysfsEntry);
+  return ncclSystemError;
+}
+
+enum ncclIbPathDist {
+  PATH_PIX = 0,  // pciDistance(): all but one of path1's '/' separators lie in the common prefix
+  PATH_PXB = 1,  // pciDistance(): fallback when no other case applies
+  PATH_PHB = 2,  // pciDistance(): exactly 4 '/' separators in the common prefix
+  PATH_SOC = 3   // pciDistance(): exactly 3 '/' separators in the common prefix
+};
+
+static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };  // printable names, indexed by ncclIbPathDist value
+
+static int pciDistance(char* path1, char* path2) {
+  int score = 0;  // '/' separators of path1 seen while both paths were still identical
+  int depth = 0;  // total '/' separators in path1
+  int same = 1;
+  for (int i=0; path1[i] != '\0'; i++) {
+    if (same && path1[i] != path2[i]) same = 0;  // path2 is not read past the first difference, so a shorter path2 is never overrun
+    if (path1[i] == '/') {
+      depth++;
+      if (same == 1) score++;
+    }
+  }
+  if (score == 3) return PATH_SOC;
+  if (score == 4) return PATH_PHB;
+  if (score == depth-1) return PATH_PIX;
+  return PATH_PXB;
+}
+
+#endif
diff --git a/src/3rd_party/nccl/src/include/transport.h b/src/3rd_party/nccl/src/include/transport.h
new file mode 100644
index 00000000..59f83c9a
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/transport.h
@@ -0,0 +1,113 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TRANSPORT_H_
+#define NCCL_TRANSPORT_H_
+
+#include "nccl.h"
+#include <stdint.h>
+
+#define NTRANSPORTS 3
+
+extern struct ncclTransport ncclTransports[];
+
+// Forward declarations
+struct ncclRing;
+struct ncclConnector;
+struct ncclComm;
+
+#define RANK_INFO_SIZE 64
+typedef char ncclTinfo_t[RANK_INFO_SIZE];
+
+struct ncclInfo {
+  ncclTinfo_t tinfo[NTRANSPORTS];
+};
+
+// Used to hold the transport connection values
+typedef int64_t ncclTvalue_t;
+
+#define CONNECT_SIZE 128
+struct ncclConnect {
+  char data[CONNECT_SIZE];
+};
+
+struct ncclProxyArgs {
+  struct ncclRing* ring;
+  int substeps;
+  int nsteps;
+  uint64_t opCount;
+  int llMode;
+  bool needProxy;
+  int active;   // add component before this line -- it is left out during initialization
+};
+
+struct ncclTransportComm {
+  ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*);
+  ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
+  ncclResult_t (*free)(void*);
+  ncclResult_t (*proxy)(struct ncclProxyArgs*);
+};
+
+struct ncclTransport {
+  const char name[4];  // fixed 4-byte name field
+  ncclResult_t (*fillInfo)(ncclTinfo_t*, int);  // fillInfo() in init.cu calls this with the transport's tinfo slot and the rank
+  ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*);  // selectTransport() in init.cu treats a value > 0 as connectable
+  ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
+  struct ncclTransportComm send;  // used by selectTransport<1>
+  struct ncclTransportComm recv;  // used by every other selectTransport instantiation
+};
+
+#include <pthread.h>
+
+typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
+
+#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS
+
+struct transportProxyInfo {
+  struct ncclComm* comm;
+  pthread_t thread;
+  threadFunc_t func;
+  volatile int proxyReady;
+  struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE];
+  volatile uint64_t argsFifoHead;
+  volatile uint64_t argsFifoTail;
+  pthread_cond_t cond;
+  pthread_mutex_t mutex;
+};
+
+ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm);
+ncclResult_t transportDestroyProxy(struct ncclConnector* connector);
+
+enum proxyMode {
+  proxyRing = 0,  // proxyPatternMode() result for pattern == 0
+  proxyFrom = 1,  // proxyPatternMode() result for pattern > 0
+  proxyTo = 2     // proxyPatternMode() result for pattern < 0
+};
+
+static int proxyPatternRing = proxyRing;  // the ring pattern value, 0
+static inline int proxyPatternFrom(int root) { return 1+root; }  // encodes root as a positive pattern
+static inline int proxyPatternTo(int root) { return -1-root; }  // encodes root as a negative pattern
+static inline enum proxyMode proxyPatternMode(int pattern) { return (pattern == 0) ? proxyRing : ((pattern > 0) ? proxyFrom : proxyTo); }  // the sign of pattern selects the mode
+static inline int proxyPatternRoot(int pattern) { return (pattern > 0) ? pattern-1 : -pattern-1; }  // inverse of proxyPatternFrom/To; yields -1 for pattern 0
+
+ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm);
+ncclResult_t transportStartProxies(struct ncclComm* comm);
+
+#include <unistd.h>
+
+// Spin wait until func evaluates to true
+template<typename FUNC>
+inline void transportProxyWait(const FUNC& func) {  // returns only once func() evaluates to true
+  while (!func()) {
+    sched_yield();  // give up the CPU between two polls of func
+  }
+}
+
+inline void transportProxyIdle(int idle) {  // yields the CPU once; the idle argument is not used
+  sched_yield();
+}
+
+#endif
diff --git a/src/3rd_party/nccl/src/include/utils.h b/src/3rd_party/nccl/src/include/utils.h
new file mode 100644
index 00000000..5a6a588c
--- /dev/null
+++ b/src/3rd_party/nccl/src/include/utils.h
@@ -0,0 +1,25 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_UTILS_H_
+#define NCCL_UTILS_H_
+
+#include "nccl.h"
+#include <stdint.h>
+
+ncclResult_t getHostName(char* hostname, int maxlen);
+uint64_t getHostHash();
+uint64_t getPidHash();
+
+struct netIf {
+  char prefix[64];  // name part of an entry; socket.h passes it to getaddrinfo() as the host
+  int port;         // port part of an entry; socket.h converts it with htons()
+};
+
+int parseStringList(const char* string, struct netIf* ifList, int maxList);
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize);
+
+#endif
diff --git a/src/3rd_party/nccl/src/init.cu b/src/3rd_party/nccl/src/init.cu
new file mode 100644
index 00000000..8ee21735
--- /dev/null
+++ b/src/3rd_party/nccl/src/init.cu
@@ -0,0 +1,861 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "core.h"
+#include "ring.h"
+#include "param.h"
+#include "nvmlwrap.h"
+#include "rings.h"
+#include "bootstrap.h"
+#include "transport.h"
+#include "common_coll.h"
+#include "group.h"
+#include "utils.h"
+#include "net.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sched.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <cuda_runtime.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+
+// Had to manually add this, why?
+#define NCCL_VERSION_CODE 2307
+
+DebugLevel ncclDebugLevel;
+uint64_t ncclDebugMask = INIT; // Default debug sub-system mask is INIT
+pthread_mutex_t ncclDebugOutputLock;
+FILE *ncclDebugFile = stdout;
+
+#ifdef ENABLE_TRACE
+std::chrono::high_resolution_clock::time_point ncclEpoch;
+#endif
+
+#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)
+#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
+#else
+#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
+#endif
+
+NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
+
+NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
+
+extern "C" __attribute__ ((visibility("default")))
+ncclNet_t* ncclNet = NULL;
+
+// We define this as weak to let tests redefine their own
+#pragma weak ncclCudaCompCap
+int ncclCudaCompCap() {
+  // Major compute capability of the current device, or 0 if any CUDA query fails.
+  int dev, major;
+  const bool ok = cudaGetDevice(&dev) == cudaSuccess &&
+                  cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev) == cudaSuccess;
+  return ok ? major : 0;
+}
+int ncclCudaFullCompCap() {
+  // Compute capability of the current device encoded as major*10+minor, or 0 if any CUDA query fails.
+  int dev, major, minor;
+  const bool ok = cudaGetDevice(&dev) == cudaSuccess &&
+                  cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev) == cudaSuccess &&
+                  cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev) == cudaSuccess;
+  return ok ? major*10+minor : 0;
+}
+
+void initNet() {  // Keep an externally supplied ncclNet; otherwise choose IB when supported, else sockets.
+  if (ncclNet == NULL) {
+    ncclNet = ncclIbSupport() ? &ncclNetIb : &ncclNetSocket;
+    INFO(INIT,"Using internal Network %s", ncclNetName());
+    return;
+  }
+  INFO(INIT,"Using external Network %s", ncclNetName());
+}
+
+NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
+NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2);
+
+int ncclThreadThreshold(int minCompCap, int multiNode) {
+  int threshold = ncclParamThreadThreshold();
+  // Anything other than the -2 default is returned untouched.
+  if (threshold != -2) return threshold;
+
+  // Default case: choose the constant from the minimum compute capability,
+  // then multiply by 2 if running on multiple nodes.
+  threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD;
+  if (multiNode) threshold *= 2;
+  return threshold;
+}
+
+pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
+static bool initialized = false;
+static ncclResult_t ncclInit() {
+  // One-time library setup. initialized is a plain bool, so it is only read and written while holding initLock.
+  pthread_mutex_lock(&initLock);
+  if (!initialized) {
+    initEnv();
+    initDebug();
+    initNet();
+    initialized = true;
+  }
+  pthread_mutex_unlock(&initLock);
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclGetVersion, int* version);
+ncclResult_t ncclGetVersion(int* version) {  // Writes NCCL_VERSION_CODE to *version.
+  if (version == NULL) return ncclInvalidArgument;  // nowhere to store the result
+  *version = NCCL_VERSION_CODE;
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
+ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {  // Fills *out through bootstrapGetUniqueId().
+  NCCLCHECK(ncclInit());  // run the one-time library setup first
+  NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
+  return bootstrapGetUniqueId(out);
+}
+
+static ncclResult_t commFree(ncclComm_t comm) {  // Releases the resources held by comm and then comm itself.
+  if (comm == NULL)
+    return ncclSuccess;  // a NULL comm is accepted and nothing is done
+
+  CUDACHECK(cudaFree(comm->devComm));  // NOTE(review): if CUDACHECK/NCCLCHECK return on failure, the frees below are skipped — confirm against the macros
+
+  for (int ring=0; ring<comm->nRings; ring++)
+    NCCLCHECK(freeRing(comm->rings+ring));
+
+  if (comm->doneEvent != NULL)
+    CUDACHECK(cudaEventDestroy(comm->doneEvent));
+
+  if (comm->launchMode == ncclComm::GROUP) {  // the group stream is destroyed only in GROUP launch mode
+    CUDACHECK(cudaStreamDestroy(comm->groupStream));
+  }
+
+  // Last rank frees shared resources between threads
+  int isLast;
+  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
+  if (isLast) {
+    free(comm->intraBarrier);
+    free(comm->intraParams);
+    free(comm->intraCudaDevs);
+    free(comm->intraCGMode);
+    free(comm->intraCC);
+  }
+
+  free(comm);
+  return ncclSuccess;
+}
+
+static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {  // Allocates a comm for rank out of ndev ranks and returns it in *comret.
+  if (ndev < 1) {
+    WARN("invalid device count (%d) requested", ndev);
+    return ncclInvalidArgument;
+  }
+  if (rank >= ndev || rank < 0) {  // negative ranks are rejected with the same message
+    WARN("rank %d exceeds ndev=%d", rank, ndev);
+    return ncclInvalidArgument;
+  }
+
+  // Try to create a CUDA object right away. If there is something wrong with
+  // the device we're on (failure cause #1) , better know it early.
+  cudaEvent_t doneEvent;
+  CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));
+
+  struct ncclComm* comm;
+  NCCLCHECK(ncclCalloc(&comm, 1));  // NOTE(review): if this returns on failure, doneEvent is never destroyed — confirm against NCCLCHECK
+
+  INFO(INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
+  comm->rank = rank;
+  comm->nRanks = ndev;
+  cudaGetDevice(&comm->cudaDev);  // the return code is not checked here
+  comm->doneEvent = doneEvent;
+  comm->llThreshold = ncclParamLlThreshold();
+  comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;  // only the value 1 enables pointer checks
+#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)
+  comm->groupCudaStream = ncclParamGroupCudaStream();
+#else
+  // Don't allow the user to overload the default setting in older CUDA builds
+  comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
+#endif
+
+  comm->argsptr = &comm->args;
+
+  *comret = comm;
+  return ncclSuccess;
+}
+
+static ncclResult_t devCommSetup(ncclComm_t comm) {  // Creates comm->devComm and copies the host comm and each ring's userRanks to the device.
+  // Fully duplicate the comm on the device
+  NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
+  // Copy the comm on the device
+  NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));  // presumably the last argument counts elements, not bytes — confirm against ncclCudaMemcpy
+  // Copy userRanks
+  for (int r=0; r<comm->nRings; r++) {
+    NCCLCHECK(ncclCudaMemcpy(comm->rings[r].devUserRanks, comm->rings[r].userRanks, comm->nRanks));
+  }
+  return ncclSuccess;
+}
+
+// The version banner is assembled by the preprocessor into a single string
+// literal, so that running "strings" on the library reveals the version.
+#define STR2(v) #v
+#define STR(v) STR2(v)
+#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
+// Print the version banner to stdout the first time it is called with the
+// debug level at VERSION or above; later calls do nothing.
+static void showVersion() {
+  static int shown = 0;
+  if (shown != 0 || ncclDebugLevel < VERSION) return;
+
+  printf("%s\n", VERSION_STRING);
+  fflush(stdout);
+  if (ncclDebugFile != stdout) {
+    // Debug output goes to a file: record the version there as well
+    INFO(ALL,"%s", VERSION_STRING);
+  }
+  shown = 1;
+}
+
+// Ask every transport to describe rank `rank`; transport number t writes its
+// description into info->tinfo[t]. Stops at the first transport that fails.
+static ncclResult_t fillInfo(struct ncclInfo* info, int rank) {
+  int idx = 0;
+  while (idx < NTRANSPORTS) {
+    struct ncclTransport* transport = ncclTransports + idx;
+    NCCLCHECK(transport->fillInfo(info->tinfo + idx, rank));
+    ++idx;
+  }
+  return ncclSuccess;
+}
+
+// Pick the first transport, in ncclTransports order, whose canConnect() reports
+// a value > 0 between the two ranks, run the setup of its send side (type == 1)
+// or recv side (any other type), and return it through transportRet.
+// Returns ncclInternalError, with *transportRet set to NULL, when no transport
+// accepts the pair.
+template <int type>
+static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) {
+  for (int t = 0; t < NTRANSPORTS; ++t) {
+    struct ncclTransport* candidate = ncclTransports + t;
+    ncclTvalue_t score = 0;
+    NCCLCHECK(candidate->canConnect(&score, myInfo->tinfo+t, peerInfo->tinfo+t));
+    if (!(score > 0)) continue;  // this transport cannot link the two ranks
+
+    struct ncclTransportComm* side = &candidate->recv;
+    if (type == 1) side = &candidate->send;
+    NCCLCHECK(side->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring));
+    *transportRet = candidate;
+    return ncclSuccess;
+  }
+
+  WARN("No transport found !");
+  *transportRet = NULL;
+  return ncclInternalError;
+}
+
+// Initialise ring `ringid` of `comm`: store the ring order rotated so that it
+// starts at `rank`, select the receive transport towards the previous rank and
+// the send transport towards the next rank, then create both proxies.
+// connect[0] receives the recv-side connection data, connect[1] the send side.
+static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) {
+  NCCLCHECK(initRing(comm, ringid));
+  struct ncclRing* ring = comm->rings+ringid;
+
+  // Locate our own rank in the ring order ...
+  int start = 0;
+  while (start < nranks && ringRanks[start] != rank) start++;
+  // ... and copy the order rotated so that userRanks[0] is this rank.
+  for (int pos = 0; pos < nranks; pos++) {
+    ring->userRanks[pos] = ringRanks[(pos+start)%nranks];
+  }
+
+  const int prevRank = ring->userRanks[nranks-1];
+  const int nextRank = ring->userRanks[1];
+  struct ncclInfo* myInfo = allInfo+rank;
+
+  NCCLCHECK(selectTransport<0>(myInfo, allInfo+prevRank, connect+0, &ring->recv.transport, ring));
+  NCCLCHECK(selectTransport<1>(myInfo, allInfo+nextRank, connect+1, &ring->send.transport, ring));
+  NCCLCHECK(transportCreateProxy(0, ring, comm));
+  NCCLCHECK(transportCreateProxy(1, ring, comm));
+  return ncclSuccess;
+}
+
+// For every peer r, record in connectTransport[r] the index of the first
+// transport whose canConnect() value between `rank` and r is > 0 (or -1 when
+// there is none), and leave in connectValue[r] the value written by the last
+// transport that was queried.
+static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
+  struct ncclInfo* mine = allInfo + rank;
+  for (int peer = 0; peer < nranks; peer++) {
+    ncclTvalue_t* value = connectValue + peer;
+    connectTransport[peer] = -1;
+    int t = 0;
+    while (t < NTRANSPORTS) {
+      NCCLCHECK(ncclTransports[t].canConnect(value, mine->tinfo+t, allInfo[peer].tinfo+t));
+      if (*value > 0) {
+        connectTransport[peer] = t;
+        break;
+      }
+      t++;
+    }
+  }
+  return ncclSuccess;
+}
+
+// Exchange the contents of two memory regions of `size` bytes each.
+// The regions must not partially overlap. A `size` of zero or less is a no-op.
+//
+// The exchange goes through a small fixed-size scratch buffer instead of a
+// variable-length array: VLAs are not part of standard C++ and would put a
+// caller-controlled amount of data on the stack.
+static void swap(void* mem1, void* mem2, int size) {
+  char* a = (char*)mem1;
+  char* b = (char*)mem2;
+  char tmp[256];
+  while (size > 0) {
+    const int chunk = size < (int)sizeof(tmp) ? size : (int)sizeof(tmp);
+    memcpy(tmp, a, chunk); memcpy(a, b, chunk); memcpy(b, tmp, chunk);
+    a += chunk;
+    b += chunk;
+    size -= chunk;
+  }
+}
+
+#define MAXWIDTH 20
+#define PREFIXLEN 15
+#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
+// Log an nranks x nranks integer matrix through INFO(INIT): a header row of
+// column indices followed by one line per row. At most MAXWIDTH columns are
+// printed and every cell is formatted as " %3d" on a 4-character grid.
+void dumpMatrix(int* connectMatrix, int nranks) {
+  const int cols = nranks < MAXWIDTH ? nranks : MAXWIDTH;
+  char line[STRLENGTH+1];
+  line[STRLENGTH] = '\0';
+
+  // Header row: column indices, shifted right by the 4-character row label
+  memset(line, ' ', STRLENGTH);
+  for (int col = 0; col < cols; col++) {
+    sprintf(line + 4 + 4*col, " %3d", col);
+  }
+  INFO(INIT,"%s", line);
+
+  for (int row = 0; row < nranks; row++) {
+    const int* rowValues = connectMatrix + row*nranks;
+    memset(line, ' ', STRLENGTH);
+    sprintf(line, "%3d ", row);
+    for (int col = 0; col < cols; col++) {
+      sprintf(line + 4 + 4*col, " %3d", rowValues[col]);
+    }
+    INFO(INIT,"%s", line);
+  }
+}
+
+// Same layout as dumpMatrix, for a matrix of ncclTvalue_t: cells sit on a
+// 5-character grid, the header shows column indices in decimal (" %4d") and
+// the values are printed in octal (" %4o") after conversion to int.
+void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) {
+  const int cols = nranks < MAXWIDTH ? nranks : MAXWIDTH;
+  char line[STRLENGTH+1];
+  line[STRLENGTH] = '\0';
+
+  // Header row
+  memset(line, ' ', STRLENGTH);
+  for (int col = 0; col < cols; col++) {
+    sprintf(line + 4 + 5*col, " %4d", col);
+  }
+  INFO(INIT,"%s", line);
+
+  for (int row = 0; row < nranks; row++) {
+    const ncclTvalue_t* rowValues = connectMatrix + row*nranks;
+    memset(line, ' ', STRLENGTH);
+    sprintf(line, "%3d ", row);
+    for (int col = 0; col < cols; col++) {
+      sprintf(line + 4 + 5*col, " %4o", (int)rowValues[col]);
+    }
+    INFO(INIT,"%s", line);
+  }
+}
+
+
+// Log `prefix` followed by up to MAXWIDTH entries of `values`, each formatted
+// as " %3d" on a 4-character grid, as a single INFO(INIT) line.
+//
+// Only the first PREFIXLEN characters of `prefix` are kept. The value columns
+// start right after the kept part of the prefix, so that an over-long prefix
+// can no longer push the formatted values past the end of the line buffer.
+void dumpLine(int* values, int nranks, const char* prefix) {
+  int prefixlen = strlen(prefix);
+  if (prefixlen > PREFIXLEN) prefixlen = PREFIXLEN;
+  char line[STRLENGTH+1];
+  line[STRLENGTH] = '\0';
+  memset(line, ' ', STRLENGTH);
+  strncpy(line, prefix, PREFIXLEN);
+  for (int i=0; i<nranks && i<MAXWIDTH; i++) {
+    const int offset = prefixlen+4*i;
+    // Bounded write: never more than what is left in the buffer
+    snprintf(line+offset, sizeof(line)-offset, " %3d", values[i]);
+  }
+  INFO(INIT,"%s", line);
+}
+
+// Turn the per-ring successor tables into explicit ring orders.
+//
+//   nrings  number of rings to build.
+//   rings   out: nrings*nranks entries; ring r is stored at rings+r*nranks and
+//           starts with `rank`.
+//   rank    rank every ring starts from; rank 0 also logs each ring.
+//   nranks  number of ranks in every ring.
+//   prev    predecessor tables (not read here).
+//   next    successor tables: next[r*nranks+i] is the rank following i in ring r.
+//
+// Returns ncclInternalError when a successor is outside [0, nranks), when a
+// ring does not loop back to `rank` after nranks steps, or when a rank is
+// missing from a ring.
+static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
+  for (int r=0; r<nrings; r++) {
+    char prefix[30];
+
+    int current = rank;
+    for (int i=0; i<nranks; i++) {
+      rings[r*nranks+i] = current;
+      // The successor tables were gathered from other ranks: validate each
+      // entry before it is used as an index into next[].
+      const int successor = next[r*nranks+current];
+      if (successor < 0 || successor >= nranks) {
+        WARN("Error : ring %d has invalid next rank %d after rank %d", r, successor, current);
+        return ncclInternalError;
+      }
+      current = successor;
+    }
+    snprintf(prefix, sizeof(prefix), "Ring %02d : ", r);
+    if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
+    if (current != rank) {
+      WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
+      return ncclInternalError;
+    }
+    // Check that all ranks are there
+    for (int i=0; i<nranks; i++) {
+      int found = 0;
+      for (int j=0; j<nranks; j++) {
+        if (rings[r*nranks+j] == i) {
+          found = 1;
+          break;
+        }
+      }
+      if (found == 0) {
+        WARN("Error : ring %d does not contain rank %d", r, i);
+        return ncclInternalError;
+      }
+    }
+  }
+  return ncclSuccess;
+}
+
+// Spin, yielding the CPU between polls, until the pointer stored at address
+// `p` is non-NULL, then return the stored pointer. `p` is treated as the
+// address of a pointer that some other thread writes.
+void* waitForNonNullPtr(void* p) {
+  volatile void** slot = (volatile void**) p;
+  for (;;) {
+    if (*slot != NULL) break;
+    sched_yield();
+  }
+  return (void*)*slot;
+}
+
+// Point comm->myParams at this rank's entry of the shared intraParams array
+// and reset that entry: no stream, no shared memory, a grid and block of
+// 0 x 1 x 1, and kernel arguments taken from comm->argsptr.
+ncclResult_t initParams(struct ncclComm* comm) {
+  struct cudaLaunchParams* slot = comm->intraParams + comm->intraRank;
+  comm->myParams = slot;
+
+  slot->args = &comm->argsptr;
+  slot->stream = NULL;
+  slot->sharedMem = 0;
+
+  slot->blockDim.z = 1;
+  slot->blockDim.y = 1;
+  slot->blockDim.x = 0;
+
+  slot->gridDim.z = 1;
+  slot->gridDim.y = 1;
+  slot->gridDim.x = 0;
+  return ncclSuccess;
+}
+
+// Allocate/Set Intra Process Structures and set CG options
+//
+// Records the position of `comm` among the communicators that share its
+// process (`rank` of `ranks`), attaches it to the structures shared by those
+// communicators and selects the kernel launch mode.
+//
+//   comm   communicator being set up.
+//   rank   intra-process rank of `comm`; rank 0 owns the shared allocations.
+//   ranks  number of communicators in the process.
+//   comm0  communicator of intra-process rank 0 (must be `comm` itself when
+//          rank == 0); the other ranks read the shared pointers from it.
+ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
+  comm->intraRank = rank;
+  comm->intraRanks = ranks;
+  comm->intraPhase = 0;
+
+  // Alloc shared structures
+  if (rank == 0) {
+    assert(comm == comm0);
+    // Two barrier counters, one per phase (indexed by intraPhase in enqueue.cu)
+    int* bar;
+    NCCLCHECK(ncclCalloc(&bar, 2));
+    bar[0] = bar[1] = 0;
+    comm->intraBarrier = bar;
+    // One launch-parameter slot and one CUDA device id per intra-process rank
+    NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
+    NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
+    // Shared launch flags, initially 0x11. Bit 0x01 selects the cooperative
+    // multi-device launch (ncclLaunchCooperativeKernelMultiDevice); bit 0x10
+    // is cleared by ncclBarrierEnqueueWait once the launch mode was logged.
+    int* CGMode;
+    NCCLCHECK(ncclCalloc(&CGMode, 1));
+    *CGMode = 0x11;
+    comm->intraCGMode = CGMode;
+    // Value of ncclCudaFullCompCap() on rank 0's device, compared below
+    // against every other rank's own value
+    int* CC;
+    NCCLCHECK(ncclCalloc(&CC, 1));
+    *CC = ncclCudaFullCompCap();
+    comm->intraCC = CC;
+  } else {
+    // Other ranks poll rank 0's communicator until each pointer is published
+    comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
+    comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
+    comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
+    comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
+    comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
+  }
+  comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
+  NCCLCHECK(initParams(comm));
+
+  // Stays 0 unless this device reports cooperative multi-device launch support
+  int cgMdLaunch = 0;
+
+  // Set CG Mode
+  // GROUP is the default; PARALLEL is used when this is the only communicator
+  // in the process or when NCCL_LAUNCH_MODE=PARALLEL is set.
+  comm->launchMode = ncclComm::GROUP;
+  char* str = getenv("NCCL_LAUNCH_MODE");
+  if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
+    comm->launchMode = ncclComm::PARALLEL;
+  }
+  if (comm->launchMode == ncclComm::GROUP) {
+    CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking));
+#if __CUDACC_VER_MAJOR__ >= 9
+    // Only query the attribute when this device's value matches rank 0's
+    if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
+      // Check whether the GPU supports Cooperative Group Multi Device Launch
+      (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
+    }
+#endif
+  }
+
+  // Disable cgMdLaunch if any rank does not support it
+  // (the flag word is shared, so a single rank clearing bit 0x01 disables the
+  // cooperative launch for every rank of the process)
+  if (cgMdLaunch == 0) {
+    *comm->intraCGMode = 0x10;
+  }
+  return ncclSuccess;
+}
+
+// Set up the rings and transports of one rank of a multi-process communicator.
+// All ranks run this function together and exchange data through the
+// bootstrap network identified by `commId`.
+//
+// Each bootstrapAllGather() below follows the same pattern: the rank fills only
+// its own slot of the buffer before the call and reads every slot afterwards.
+//
+// NOTE(review): the buffers allocated here are not released when one of the
+// NCCLCHECK calls returns early, and bootstrapClose() is only reached on the
+// success path.
+static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
+  int rank = comm->rank;
+  int nranks = comm->nRanks;
+  void* commState;
+  NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState));
+
+  // Describe this rank to every transport and gather all descriptions
+  struct ncclInfo* allInfo;
+  NCCLCHECK(ncclCalloc(&allInfo, nranks));
+  NCCLCHECK(fillInfo(allInfo+rank, rank));
+  NCCLCHECK(bootstrapAllGather(commState, allInfo, sizeof(struct ncclInfo)));
+
+  // nranks x nranks matrices; row `rank` is computed locally by fillConnect,
+  // the other rows come from the gathers
+  int* connectTransport;
+  ncclTvalue_t* connectValue;
+  NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
+  NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
+
+  NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
+  NCCLCHECK(bootstrapAllGather(commState, connectTransport, nranks*(sizeof(int))));
+  NCCLCHECK(bootstrapAllGather(commState, connectValue, nranks*(sizeof(ncclTvalue_t))));
+  //if (rank == 0) dumpMatrix(connectTransport, nranks);
+  //if (rank == 0) dumpMatrixTvalue(connectValue, nranks);
+
+  // Get my rings
+  int nrings;
+  int* prev, *next;
+  NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
+  NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
+  comm->nThreads = getDefaultThreads();
+  NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next));
+  free(connectTransport);
+  free(connectValue);
+
+  // Find max nThreads
+  // (allData is a variable-length array that is reused for the three
+  // one-int-per-rank exchanges below)
+  int allData[nranks];
+  allData[rank] = comm->nThreads;
+  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
+  for (int i=0; i<nranks; i++)
+    comm->nThreads = std::max(allData[i], comm->nThreads);
+  if (rank == 0) INFO(INIT,"Using %d threads", comm->nThreads);
+
+  // Determine the minimum CUDA Compute capability of all GPUs
+  int myCompCap = ncclCudaCompCap();
+  int minCompCap = myCompCap;
+  allData[rank] = myCompCap;
+  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
+  for (int i=0; i<nranks; i++)
+    minCompCap = std::min(allData[i], minCompCap);
+  if (rank == 0) INFO(INIT,"Min Comp Cap %d", minCompCap);
+
+  // Find min nrings across ranks
+  allData[rank] = nrings;
+  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
+  for (int i=0; i<nranks; i++)
+    nrings = std::min(allData[i], nrings);
+
+  // Exchange data with others to build complete rings
+  // (for each ring, one prev entry and one next entry per rank)
+  comm->nRings = nrings;
+  for (int r=0; r<nrings; r++) {
+    NCCLCHECK(bootstrapAllGather(commState, prev+r*nranks, sizeof(int)));
+    NCCLCHECK(bootstrapAllGather(commState, next+r*nranks, sizeof(int)));
+  }
+  int *rings;
+  NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
+  NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
+  free(prev);
+  free(next);
+
+  // Connect with prev/next for each ring
+  // connectData holds two entries per rank: entry 2*r is the one setupRing
+  // fills for rank r's recv side, entry 2*r+1 the one for its send side.
+  struct ncclConnect *connectData;
+  NCCLCHECK(ncclCalloc(&connectData, 2*nranks));
+  for (int r=0; r<nrings; r++) {
+    int* ringRanks = rings+r*nranks;
+    struct ncclRing *ring = comm->rings+r;
+    NCCLCHECK(setupRing(comm, r, rank, nranks, ringRanks, allInfo, connectData+2*rank));
+    // Our recv side connects with the send entry of the previous rank, and
+    // our send side with the recv entry of the next rank
+    int prev_offset = ring->userRanks[nranks-1]*2+1;
+    int next_offset = ring->userRanks[1]*2;
+    NCCLCHECK(bootstrapAllGather(commState, connectData, sizeof(struct ncclConnect)*2));
+    NCCLCHECK(ring->send.transport->send.connect(connectData+next_offset, &ring->send));
+    NCCLCHECK(ring->recv.transport->recv.connect(connectData+prev_offset, &ring->recv));
+  }
+  free(connectData);
+  free(rings);
+  free(allInfo);
+
+  // Intra-process barrier setup
+  // Every rank publishes its host hash, its process hash and the address of
+  // its communicator. The address is only dereferenced (in ncclCommSetIntra)
+  // for a rank whose two hashes match ours.
+  struct rankInfo {
+    uint64_t hostHash;
+    uint64_t pidHash;
+    struct ncclComm* comm;
+  } rankInfos[nranks];
+  rankInfos[rank].hostHash = getHostHash();
+  rankInfos[rank].pidHash = getPidHash();
+  rankInfos[rank].comm = comm;
+  NCCLCHECK(bootstrapAllGather(commState, rankInfos, sizeof(struct rankInfo)));
+
+  // Compute intra ranks
+  // intraRank0 : first global rank with the same host and process hashes
+  // intraRank  : position of this rank among the ranks with matching hashes
+  // intraRanks : number of ranks with matching hashes
+  // multiNode  : set when at least one rank reports a different host hash
+  int intraRank0 = -1, intraRank = -1, intraRanks = 0;
+  int multiNode = 0;
+  for (int r=0; r<nranks; r++) {
+    if ((rankInfos[r].hostHash == rankInfos[rank].hostHash) &&
+        (rankInfos[r].pidHash == rankInfos[rank].pidHash)) {
+      if (intraRanks == 0) intraRank0 = r;
+      if (r == rank) intraRank = intraRanks;
+      intraRanks++;
+    } else if (rankInfos[r].hostHash != rankInfos[rank].hostHash) {
+      multiNode = 1;
+    }
+  }
+  TRACE(INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+      rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
+  if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
+    WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+        rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
+    return ncclInternalError;
+  }
+  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, rankInfos[intraRank0].comm));
+
+  // Determine thread threshold across all GPUs
+  comm->threadThreshold = ncclThreadThreshold(minCompCap, multiNode);
+
+  // Barrier
+  bootstrapClose(commState);
+  return ncclSuccess;
+}
+
+// Set the CPU affinity of the calling thread for CUDA device `cudaDev` through
+// NVML. The NVML handle of the device is looked up from its PCI bus id and
+// stored in *nvmlDevice. Returns true on success and false as soon as one of
+// the steps fails; a warning is logged only when the final step fails.
+bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) {
+  char pciBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+
+  const bool haveBusId =
+      cudaDeviceGetPCIBusId(pciBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) == cudaSuccess;
+  if (!haveBusId) return false;
+
+  const bool haveHandle = wrapNvmlDeviceGetHandleByPciBusId(pciBusId, nvmlDevice) == ncclSuccess;
+  if (!haveHandle) return false;
+
+  if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) == ncclSuccess) return true;
+
+  WARN("Failed to set CPU affinity");
+  return false;
+}
+
+// Synchronous part of ncclCommInitRank: build the communicator of rank
+// `myrank` (out of `nranks`) on the current CUDA device and return it through
+// `newcomm`.
+//
+// The CPU affinity of the calling thread is temporarily changed to the one
+// NVML associates with the GPU, so that host allocations land close to it, and
+// is restored before returning. On failure *newcomm is set to NULL and the
+// first error is returned.
+//
+// Once NVML has been initialised, every exit path shuts it down again. A
+// failing NVML shutdown after a successful initialisation is only logged, as
+// ncclCommInitAll does, instead of discarding the finished communicator.
+ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
+  cpu_set_t affinitySave;
+  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+
+  NCCLCHECK(wrapNvmlSymbols());
+  NCCLCHECK(wrapNvmlInit());
+
+  // Make sure all host memory allocation are close to the GPU
+  int cudaDev;
+  nvmlDevice_t nvmlDevice;
+  ncclResult_t res;
+  CUDACHECKGOTO(cudaGetDevice(&cudaDev), res, cleanup);
+  // Best effort: initialisation continues when the affinity cannot be set
+  SetCpuAffinity(cudaDev, &nvmlDevice);
+
+  NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
+  NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
+  NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
+
+  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  if (wrapNvmlShutdown() != ncclSuccess)
+    INFO(INIT,"NCCL did not shutdown nvml properly");
+
+  INFO(INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
+
+  return ncclSuccess;
+cleanup:
+  *newcomm = NULL;
+  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  // Balance the wrapNvmlInit() above; the original error is what gets reported
+  (void) wrapNvmlShutdown();
+  return res;
+}
+
+NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
+// Public entry point creating the communicator of one rank.
+// Validates the arguments, then either runs ncclCommInitRankSync directly or,
+// inside a group call, registers it through ncclAsyncInit for the current
+// CUDA device.
+ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
+  const bool isRoot = (myrank == 0);
+
+  // When NCCL_COMM_ID is set, rank 0 creates the bootstrap root itself
+  const char* commIdEnv = getenv("NCCL_COMM_ID");
+  if (commIdEnv != NULL && isRoot) {
+    NCCLCHECK(bootstrapCreateRoot(&commId, true));
+  }
+
+  NCCLCHECK(ncclInit());
+  if (isRoot) showVersion();
+
+  INFO(INIT,"rank %d nranks %d", myrank, nranks);
+
+  // Make sure the CUDA runtime is initialized.
+  CUDACHECK(cudaFree(NULL));
+
+  NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
+  const bool rankIsValid = (nranks >= 1) && (myrank >= 0) && (myrank < nranks);
+  if (!rankIsValid) {
+    WARN("Invalid rank requested : %d/%d", myrank, nranks);
+    return ncclInvalidArgument;
+  }
+
+  if (!ncclAsyncMode()) {
+    return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
+  }
+
+  // Group mode: the initialisation is deferred and tied to the current device
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
+}
+
+// Single-process counterpart of initTransportsRank: set up the rings and
+// transports of all `nranks` communicators in `comms` from one thread,
+// switching to devs[rank] before doing per-rank work. No bootstrap network is
+// involved; data is exchanged through local arrays.
+//
+// NOTE(review): the buffers allocated here are not released when a
+// CUDACHECK/NCCLCHECK returns early, and the current CUDA device is left on
+// the last device that was selected (ncclCommInitAll restores it).
+static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
+  // Transport descriptions of every rank, each filled on its own device
+  struct ncclInfo* allInfo;
+  NCCLCHECK(ncclCalloc(&allInfo, nranks));
+  for (int rank=0; rank<nranks; rank++) {
+    CUDACHECK(cudaSetDevice(devs[rank]));
+    NCCLCHECK(fillInfo(allInfo+rank, rank));
+  }
+
+  // nranks x nranks connectivity matrices, one row per rank
+  int* connectTransport;
+  ncclTvalue_t* connectValue;
+  NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
+  NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
+  for (int rank=0; rank<nranks; rank++)
+    NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
+
+  // prev/next are handed to ncclGetRings for every rank; the entry belonging
+  // to that rank is then copied into prevFinal/nextFinal
+  int* prev, *prevFinal, *next, *nextFinal;
+  NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
+  NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXRINGS));
+  NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
+  NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXRINGS));
+  int nrings = MAXRINGS;
+  int nthreads=0;
+  // First sample is taken on whichever device is current at this point; the
+  // loop below samples again on every device
+  int myCompCap = ncclCudaCompCap();
+  int minCompCap = myCompCap;
+  for (int rank=0; rank<nranks; rank++) {
+    CUDACHECK(cudaSetDevice(devs[rank]));
+    int nringsRank;
+    int nthreadsRank = getDefaultThreads();
+    myCompCap = ncclCudaCompCap();
+    NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next));
+    // Keep the smallest ring count, the largest thread count and the lowest
+    // compute capability seen so far
+    nrings = std::min(nrings, nringsRank);
+    nthreads = std::max(nthreads, nthreadsRank);
+    minCompCap = std::min(minCompCap, myCompCap);
+    for (int ring=0; ring<nrings; ring++) {
+      int index = ring*nranks+rank;
+      prevFinal[index] = prev[index];
+      nextFinal[index] = next[index];
+    }
+  }
+  free(connectTransport);
+  free(connectValue);
+  free(prev);
+  free(next);
+
+  INFO(INIT,"Using %d threads", nthreads);
+  INFO(INIT,"Min Comp Cap %d", minCompCap);
+
+  // Ring orders are built once, starting from rank 0; setupRing rotates them
+  // for each rank
+  int* rings;
+  NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
+  NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
+  free(prevFinal);
+  free(nextFinal);
+
+  // Determine thread threshold across all GPUs
+  int threadThreshold = ncclThreadThreshold(minCompCap, 0);
+
+  for (int rank=0; rank<nranks; rank++) {
+    comms[rank]->nRings = nrings;
+    comms[rank]->nThreads = nthreads;
+    comms[rank]->threadThreshold = threadThreshold;
+  }
+
+  for (int r=0; r<nrings; r++) {
+    // Two entries per rank: connect[2*rank] is filled for the recv side and
+    // connect[2*rank+1] for the send side (see setupRing)
+    struct ncclConnect connect[2*nranks];
+    int* ringRanks = rings+r*nranks;
+    for (int rank=0; rank<nranks; rank++) {
+      CUDACHECK(cudaSetDevice(devs[rank]));
+      NCCLCHECK(setupRing(comms[rank], r, rank, nranks, ringRanks, allInfo, connect+2*rank));
+    }
+    // RingExchange connect information
+    for (int rank=0; rank<nranks; rank++) {
+      // Swap rank->prev and prevRank->next
+      struct ncclRing *ring = comms[rank]->rings+r;
+      int prevRank = ring->userRanks[nranks-1];
+      struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1;
+      struct ncclConnect* rankPrevConnect = connect+2*rank;
+      swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect));
+    }
+    // After the swap each rank finds at its own two entries the data produced
+    // by its neighbours, and connects with it
+    for (int rank=0; rank<nranks; rank++) {
+      CUDACHECK(cudaSetDevice(devs[rank]));
+      struct ncclRing *ring = comms[rank]->rings+r;
+      NCCLCHECK(ring->send.transport->send.connect(connect+2*rank+1, &ring->send));
+      NCCLCHECK(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv));
+    }
+  }
+  free(rings);
+  free(allInfo);
+  return ncclSuccess;
+}
+
+
+NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
+// Create `ndev` communicators in one process, one per CUDA device.
+//
+//   comms    out: array of ndev communicators. On failure every entry is NULL.
+//   ndev     number of devices / ranks; must be >= 1.
+//   devlist  CUDA device of each rank, or NULL to use devices 0..ndev-1.
+//
+// The current CUDA device and the CPU affinity of the calling thread are
+// restored before returning.
+ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
+  NCCLCHECK(ncclInit());
+
+  // Validate the arguments before NVML is initialised, so that an invalid
+  // call does not return with NVML left initialised.
+  NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
+  if (ndev < 1) {
+    WARN("Invalid device count requested : %d", ndev);
+    return ncclInvalidArgument;
+  }
+
+  NCCLCHECK(wrapNvmlSymbols());
+  NCCLCHECK(wrapNvmlInit());
+  showVersion();
+
+  INFO(INIT,"nranks %d", ndev);
+
+  ncclResult_t res;
+  // Stays negative when the current device cannot be queried, in which case
+  // nothing is restored at the end
+  int savedDevice = -1;
+  int rank, cudaDev;
+  ncclComm_t comm = NULL;
+  nvmlDevice_t nvmlDevice;
+  int ncclDevList[ndev];
+  for (int i=0; i<ndev; i++) {
+    ncclDevList[i] = devlist ? devlist[i] : i;
+  }
+
+  cudaGetDevice(&savedDevice);
+
+  for(rank=0; rank<ndev; ++rank)
+    comms[rank] = NULL;
+
+  cpu_set_t affinitySave;
+  sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+
+  // Allocate every communicator on its own device, with the CPU affinity
+  // NVML associates with that device
+  for (rank=0; rank<ndev; ++rank) {
+    cudaDev = ncclDevList[rank];
+    CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
+
+    SetCpuAffinity(cudaDev, &nvmlDevice);
+
+    NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
+    comms[rank] = comm;
+
+    NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup);
+  }
+
+  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+
+  NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup);
+
+  for(rank=0; rank<ndev; ++rank) {
+    cudaDev = ncclDevList[rank];
+    CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
+    NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
+  }
+
+  res = ncclSuccess;
+  goto final;
+
+cleanup:
+  for(rank=0; rank<ndev; ++rank) {
+    if(comms[rank] != NULL) {
+      // Free each communicator with its own device selected, as
+      // ncclCommDestroy does, and do not hand freed pointers back to the caller
+      (void) cudaSetDevice(comms[rank]->cudaDev);
+      commFree(comms[rank]);
+      comms[rank] = NULL;
+    }
+  }
+
+final:
+  if(wrapNvmlShutdown() != ncclSuccess)
+    INFO(INIT,"NCCL did not shutdown nvml properly");
+  if (savedDevice >= 0) cudaSetDevice(savedDevice);
+  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  return res;
+}
+
+NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
+// Destroy a communicator. A NULL communicator is accepted and is a no-op.
+// The communicator is freed with its own CUDA device selected; the device that
+// was current on entry is restored afterwards, including when freeing fails.
+// Returns the error of the free operation if there is one, otherwise the
+// result of restoring the device.
+ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+
+  if (comm == NULL)
+    return ncclSuccess;
+  int savedDevice;
+  CUDACHECK(cudaGetDevice(&savedDevice));
+  // Read before commFree: the communicator is gone afterwards
+  int commDevice = comm->cudaDev;
+  ncclResult_t res = ncclSuccess;
+
+  if (savedDevice != commDevice) {
+    CUDACHECK(cudaSetDevice(commDevice));
+  }
+
+  NCCLCHECKGOTO(commFree(comm), res, restore);
+
+restore:
+  if (savedDevice != commDevice) {
+    if (res != ncclSuccess) {
+      // Report the first failure, but still put the caller's device back
+      (void) cudaSetDevice(savedDevice);
+      return res;
+    }
+    CUDACHECK(cudaSetDevice(savedDevice));
+  }
+
+  return res;
+}
+
+NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
+// Map a result code to a constant, human-readable description. Codes without
+// a dedicated entry yield "unknown result code".
+const char* ncclGetErrorString(ncclResult_t code) {
+  const char* text = "unknown result code";
+  switch (code) {
+    case ncclSuccess:            text = "no error"; break;
+    case ncclUnhandledCudaError: text = "unhandled cuda error"; break;
+    case ncclSystemError:        text = "unhandled system error"; break;
+    case ncclInternalError:      text = "internal error"; break;
+    case ncclInvalidArgument:    text = "invalid argument"; break;
+    case ncclInvalidUsage:       text = "invalid usage"; break;
+    default:                     break;
+  }
+  return text;
+}
+
+NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
+// Store the number of ranks of `comm` in *count. Both pointers are validated
+// with PtrCheck before anything is written.
+ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
+  NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
+  NCCLCHECK(PtrCheck(count, "CommCount", "count"));
+  const int nranks = comm->nRanks;
+  *count = nranks;
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
+// Store the CUDA device recorded in `comm` in *devid. Both pointers are
+// validated with PtrCheck before anything is written.
+ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
+  NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
+  NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
+  const int device = comm->cudaDev;
+  *devid = device;
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
+// Store the rank of `comm` in *rank. Both pointers are validated with PtrCheck
+// before anything is written.
+ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
+  NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
+  NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
+  const int userRank = comm->rank;
+  *rank = userRank;
+  return ncclSuccess;
+}
diff --git a/src/3rd_party/nccl/src/misc/enqueue.cu b/src/3rd_party/nccl/src/misc/enqueue.cu
new file mode 100644
index 00000000..dc5d9cc4
--- /dev/null
+++ b/src/3rd_party/nccl/src/misc/enqueue.cu
@@ -0,0 +1,248 @@
+/*************************************************************************
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "common_coll.h"
+#include "param.h"
+
+#include "collectives/collectives.h"
+
+#include <sched.h>
+
+// The macros below build the table of kernel entry points, ncclKerns.
+// They nest as collective > reduction op > data type > {regular, LL variant},
+// and the order of the entries must match the index stored in
+// ncclColl::funcIndex, which setupLaunch uses to look the kernel up.
+
+// Two table entries for one (collective, op, type) combination: the kernel
+// itself followed by its "LL" variant.
+#define NCCL_FUNC4(coll, op, dtype) \
+  (void*)NCCL_KERN_NAME(coll, op, dtype), \
+  (void*)NCCL_KERN_NAME(coll##LL, op, dtype)
+
+// Must be consistent with ncclDataType_t
+// "A" variant: one distinct pair of kernels per data type (nine types).
+#define NCCL_FUNCS3A(coll, op) \
+  (void*)NCCL_FUNC4(coll, op,  i8), \
+  (void*)NCCL_FUNC4(coll, op,  u8), \
+  (void*)NCCL_FUNC4(coll, op, i32), \
+  (void*)NCCL_FUNC4(coll, op, u32), \
+  (void*)NCCL_FUNC4(coll, op, i64), \
+  (void*)NCCL_FUNC4(coll, op, u64), \
+  (void*)NCCL_FUNC4(coll, op, f16), \
+  (void*)NCCL_FUNC4(coll, op, f32), \
+  (void*)NCCL_FUNC4(coll, op, f64)
+// "B" variant: the i8 pair is repeated for all nine type slots, so the table
+// keeps the same layout while a single kernel serves every data type.
+#define NCCL_FUNCS3B(coll, op) \
+  (void*)NCCL_FUNC4(coll, op,  i8), \
+  (void*)NCCL_FUNC4(coll, op,  i8), \
+  (void*)NCCL_FUNC4(coll, op,  i8), \
+  (void*)NCCL_FUNC4(coll, op,  i8), \
+  (void*)NCCL_FUNC4(coll, op,  i8), \
+  (void*)NCCL_FUNC4(coll, op,  i8), \
+  (void*)NCCL_FUNC4(coll, op,  i8), \
+  (void*)NCCL_FUNC4(coll, op,  i8), \
+  (void*)NCCL_FUNC4(coll, op,  i8)
+
+// Must be consistent with ncclRedOp_t
+// "A" variant: one block of entries per reduction operation.
+#define NCCL_FUNCS2A(coll) \
+  NCCL_FUNCS3A(coll, sum ), \
+  NCCL_FUNCS3A(coll, prod), \
+  NCCL_FUNCS3A(coll, max ), \
+  NCCL_FUNCS3A(coll, min )
+// "B" variant: the "copy" kernels fill all four operation slots.
+#define NCCL_FUNCS2B(coll) \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy), \
+  NCCL_FUNCS3B(coll, copy)
+
+// Must be consistent with the ncclFuncSet enum
+// Broadcast and AllGather use the "B" (copy / i8) entries; Reduce,
+// ReduceScatter and AllReduce use one kernel per operation and data type.
+static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
+  NCCL_FUNCS2B(ncclBroadcast),
+  NCCL_FUNCS2A(ncclReduce),
+  NCCL_FUNCS2B(ncclAllGather),
+  NCCL_FUNCS2A(ncclReduceScatter),
+  NCCL_FUNCS2A(ncclAllReduce)
+};
+
+// Launch the kernels described by paramsList[0..numDevices-1], one per device.
+//
+//   paramsList  launch parameters, one entry per device.
+//   cudaDevs    CUDA device of each entry (used by the fallback path only).
+//   numDevices  number of entries.
+//   cgMode      when bit 0x01 is set and the build uses CUDA >= 9, all kernels
+//               are launched with one cudaLaunchCooperativeKernelMultiDevice
+//               call; otherwise they are launched one by one.
+//
+// The fallback path restores the device that was current on entry, including
+// when one of the launches fails.
+ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
+#if __CUDACC_VER_MAJOR__ >= 9
+  if (cgMode & 0x01) {
+    CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
+            // These flags are to reduce the latency of using this API
+            cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync));
+    return ncclSuccess;
+  }
+#endif
+  int savedDev;
+  ncclResult_t res = ncclSuccess;
+  CUDACHECK(cudaGetDevice(&savedDev));
+  for (int i = 0; i < numDevices; i++) {
+    struct cudaLaunchParams* params = paramsList+i;
+    CUDACHECKGOTO(cudaSetDevice(cudaDevs[i]), res, restore);
+    CUDACHECKGOTO(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream), res, restore);
+  }
+restore:
+  if (res != ncclSuccess) {
+    // Report the launch failure, but do not leave the caller on another device
+    (void) cudaSetDevice(savedDev);
+    return res;
+  }
+  CUDACHECK(cudaSetDevice(savedDev));
+  return ncclSuccess;
+}
+
+// Finalise the launch parameters of `comm` for the operations queued so far:
+// clamp the grid to the number of rings, flag the last queued operation of
+// every launched ring, and pass the first operation of ring 0 to the kernel
+// through comm->args. The kernel is chosen from that operation's funcIndex.
+ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
+  params->gridDim.x = std::min((int) params->gridDim.x, comm->nRings);
+
+  // Set active = 2 for the last operation
+  const unsigned int blocks = params->gridDim.x;
+  for (unsigned int r = 0; r < blocks; r++) {
+    struct ncclRing* ring = comm->rings+r;
+    ring->collectives[(ring->collStart+ring->collCount-1)%NCCL_MAX_OPS].active = 2;
+  }
+
+  // The first queued operation of ring 0 travels as the kernel argument and
+  // also decides which kernel is launched.
+  struct ncclRing* firstRing = comm->rings;
+  struct ncclColl* firstColl = firstRing->collectives + firstRing->collStart;
+  memcpy(&comm->args, firstColl, sizeof(struct ncclColl));
+  // It has been copied out, so its slot can be released right away.
+  firstColl->active = 0;
+
+  params->func = ncclKerns[firstColl->funcIndex];
+  return ncclSuccess;
+}
+
+// Enter the intra-process barrier of the current phase.
+//
+// The barrier counter is intraBarrier[intraPhase]. Every caller except the last
+// one increments it with a compare-and-swap and gets *isLast = 0. The caller
+// that finds the counter at intraRanks-1 does not increment it: it resets the
+// counter of the other phase, gets *isLast = 1 and is expected to complete the
+// barrier later with ncclCpuBarrierLast().
+//
+// Returns ncclInvalidUsage when the counter is already at intraRanks or above.
+ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
+  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+  int val = *ptr;
+  bool done = false;
+  while (done == false) {
+    if (val >= comm->intraRanks) {
+      WARN("Trying to launch too many collectives");
+      return ncclInvalidUsage;
+    }
+    if (val+1 == comm->intraRanks) {
+      // Reset the barrier.
+      // (the counter of the *other* phase, which the next barrier will use)
+      comm->intraBarrier[comm->intraPhase^1] = 0;
+      *isLast = 1;
+      return ncclSuccess;
+    }
+    done = __sync_bool_compare_and_swap(ptr, val, val+1);
+    // On failure the next attempt uses val+1 rather than re-reading the
+    // counter -- presumably on the assumption that the competing update was a
+    // single increment by another rank (TODO confirm)
+    val++;
+  }
+  *isLast = 0;
+  return ncclSuccess;
+}
+
+// Complete the barrier of the current phase: performs the one increment of
+// intraBarrier[intraPhase] that the last rank skipped in ncclCpuBarrierIn().
+// Returns ncclInternalError when the counter changed between the read and the
+// compare-and-swap.
+ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
+  volatile int* counter = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+  const int seen = *counter;
+  const bool bumped = __sync_bool_compare_and_swap(counter, seen, seen+1);
+  if (bumped) return ncclSuccess;
+
+  WARN("Trying to launch too many collectives");
+  return ncclInternalError;
+}
+
+// Leave the intra-process barrier: wait, yielding the CPU between polls, until
+// intraBarrier[intraPhase] has reached intraRanks, then switch this
+// communicator to the other phase.
+//
+// Uses the POSIX sched_yield(), like waitForNonNullPtr() in init.cu, instead
+// of the non-standard and deprecated pthread_yield().
+ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
+  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+  while (*ptr < comm->intraRanks) sched_yield();
+  comm->intraPhase ^= 1;
+  return ncclSuccess;
+}
+
+// First half of a launch: prepare this communicator's launch parameters, pick
+// the stream the kernel will run on, and enter the intra-process barrier.
+// In GROUP mode the last rank to arrive launches the kernels of all ranks.
+// Does nothing for a communicator with a single rank.
+ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
+  if (comm->nRanks == 1) return ncclSuccess;
+  struct cudaLaunchParams* params = comm->myParams;
+
+  NCCLCHECK(setupLaunch(comm, params));
+
+  // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
+  if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
+    // Enqueue event in user stream
+    CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream));
+    // Create dependency between user stream and internal NCCL stream
+    CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
+    params->stream = comm->groupStream;
+  } else {
+    // The kernel runs directly on the user stream. params->stream still holds
+    // the stream used by the previous launch at this point.
+    if (comm->userStream != params->stream) {
+      // Stream changed from last call, create dependency against last NCCL kernel launch
+      CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+    }
+    params->stream = comm->userStream;
+  }
+
+  int isLast = 0;
+  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
+
+  if (isLast) {
+    if (comm->launchMode == ncclComm::GROUP) {
+      // I'm the last. Launch all operations.
+      NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
+    }
+    // Complete the barrier so that the ranks waiting in ncclCpuBarrierOut()
+    // can proceed
+    NCCLCHECK(ncclCpuBarrierLast(comm));
+  }
+  return ncclSuccess;
+}
+
+// Second half of a launch: wait for the intra-process barrier to complete,
+// launch this rank's own kernel in PARALLEL mode, reset the per-ring operation
+// queues and the launch dimensions, and start the network proxies.
+// Does nothing for a communicator with a single rank.
+ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
+  if (comm->nRanks == 1) return ncclSuccess;
+  // We can't print the CG mode before the first barrier happened.
+  // Bit 0x10 of the shared flag word means "launch mode not logged yet": rank 0
+  // clears it and logs once. After clearing, the word is non-zero only when
+  // the cooperative multi-device launch bit (0x01) is still set.
+  if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
+    *comm->intraCGMode ^= 0x10;
+    INFO(INIT,"Launch mode %s%s%s",
+        comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
+        *comm->intraCGMode ? "/CGMD" : "",
+        (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
+  }
+
+  NCCLCHECK(ncclCpuBarrierOut(comm));
+
+  struct cudaLaunchParams *params = comm->myParams;
+  if (comm->launchMode == ncclComm::PARALLEL) {
+    CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
+  }
+  // Start the network proxies as soon as the kernel has been launched. No CUDA
+  // call may be made between the kernel launch and transportStartProxies():
+  // a cudaFree in between, for instance, could cause a deadlock.
+  // Also, starting the proxies after the CUDA launch seems to be better for
+  // performance (latency).
+  for (int r=0; r<params->gridDim.x; r++) {
+    struct ncclRing* ring = comm->rings+r;
+    // The queued operations now belong to the launched kernel: the next batch
+    // starts at the current FIFO tail
+    ring->collStart = ring->collFifoTail;
+    ring->collCount = 0;
+  }
+  params->gridDim.x = params->blockDim.x = 0;
+  NCCLCHECK(transportStartProxies(comm));
+  return ncclSuccess;
+}
+
+// Record comm->doneEvent behind the kernel that was just launched and, when
+// the kernel ran on the internal NCCL stream, make the user stream wait for
+// that event. Finally clears comm->userStreamSet.
+ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
+  struct cudaLaunchParams *launch = comm->myParams;
+
+  // Enqueue event after NCCL kernel
+  CUDACHECK(cudaEventRecord(comm->doneEvent, launch->stream));
+
+  // Same condition as in ncclBarrierEnqueue: the internal stream is used for
+  // CGMD/GROUP launches when requested or when the user stream is NULL
+  const bool onInternalStream =
+      comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL);
+  if (onInternalStream) {
+    // Create dependency between NCCL internal stream and user stream
+    CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+  }
+
+  comm->userStreamSet = false;
+  return ncclSuccess;
+}
+
+// Common entry point of the collectives: validate the arguments, then enqueue
+// the operation through `func`.
+//
+// Outside of a group call the operation is enqueued and launched immediately.
+// Inside a group call (ncclAsyncMode()) the communicator is registered with the
+// group and the operation is only enqueued; any error is also recorded with
+// ncclAsyncErrCheck(). When comm->checkPointers is set, the argument check
+// runs with the communicator's device selected, and the previous device is
+// restored on every exit path of the group branch.
+//
+// Returns ncclInvalidArgument for a NULL communicator, otherwise the first
+// error encountered.
+ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
+    void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
+    ncclComm_t comm, cudaStream_t stream) {
+  if (comm == NULL) return ncclInvalidArgument;
+  // Launch asynchronously if needed
+  if (ncclAsyncMode()) {
+    ncclResult_t ret = ncclSuccess;
+    int savedDev = -1;
+    if (comm->checkPointers) {
+      CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
+      CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, end);
+    }
+    // Check arguments
+    NCCLCHECKGOTO(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName), ret, end);
+    // Always register comm even in case of error to make sure ncclGroupEnd
+    // cleans it up.
+    // A failure here goes through `end` as well, so that the device is
+    // restored and the error is recorded for the group.
+    NCCLCHECKGOTO(ncclAsyncColl(comm), ret, end);
+    NCCLCHECKGOTO(func(sendbuff, recvbuff, count, type, op, root, comm, stream), ret, end);
+end:
+    if (savedDev != -1 && cudaSetDevice(savedDev) != cudaSuccess && ret == ncclSuccess) {
+      // Keep an earlier error if there is one; otherwise report this one
+      WARN("Failed to restore CUDA device %d", savedDev);
+      ret = ncclUnhandledCudaError;
+    }
+    ncclAsyncErrCheck(ret);
+    return ret;
+  } else {
+    NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName));
+    NCCLCHECK(func(sendbuff, recvbuff, count, type, op, root, comm, stream));
+    NCCLCHECK(ncclBarrierEnqueue(comm));
+    NCCLCHECK(ncclBarrierEnqueueWait(comm));
+    NCCLCHECK(ncclEnqueueEvents(comm));
+    return ncclSuccess;
+  }
+}
diff --git a/src/3rd_party/nccl/src/misc/group.cu b/src/3rd_party/nccl/src/misc/group.cu
new file mode 100644
index 00000000..c7b31cfe
--- /dev/null
+++ b/src/3rd_party/nccl/src/misc/group.cu
@@ -0,0 +1,198 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "group.h"
+#include "debug.h"
+#include "enqueue.h"
+
+#define MAX_ASYNC_OPS 128  // capacity of the per-thread group tables (ncclGroupThreads, ncclGroupArgs)
+thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];  // thread handles written by ncclAsyncInit, indexed like ncclGroupArgs
+thread_local int ncclGroupIndex = 0;  // number of ncclGroupArgs entries currently in use
+thread_local int ncclGroupMode = 0;  // nesting depth: incremented by ncclGroupStart, decremented by ncclGroupEnd
+thread_local ncclResult_t ncclGroupError = ncclSuccess;  // error recorded through ncclAsyncErrCheck; reset at the end of ncclGroupEnd
+
+bool ncclAsyncMode() {  // True while the calling thread has at least one open ncclGroupStart.
+  return !(ncclGroupMode <= 0);
+}
+
+ncclResult_t ncclAsyncErrCheck(ncclResult_t ret) {
+  if (ncclGroupError == ncclSuccess || ret != ncclSuccess) ncclGroupError = ret;
+  return ret;
+}
+
+struct ncclInitArgs {  // Arguments stored by ncclAsyncInit and read by ncclAsyncThreadMain.
+  ncclInitFunc_t func;  // invoked as func(newcomm, ndev, commId, myrank)
+  int cudaDev;  // device selected with ncclSetDevice before func runs
+  ncclComm_t* newcomm;
+  int ndev;
+  ncclUniqueId commId;
+  int myrank;
+};
+struct ncclCollArgs {  // Payload of a collective entry: the communicator registered by ncclAsyncColl.
+  ncclComm_t comm;
+};
+
+enum ncclAsyncFuncType {  // Tag saying which member of the ncclAsyncArgs union is in use.
+  ASYNC_FUNC_INVALID = 0,
+  ASYNC_FUNC_INIT = 1,  // entry written by ncclAsyncInit (uses .init)
+  ASYNC_FUNC_COLL = 2,  // entry written by ncclAsyncColl (uses .coll)
+};
+struct ncclAsyncArgs {  // One queued group operation.
+  ncclResult_t ret;  // result stored by the CHECK macro in the async init thread
+  enum ncclAsyncFuncType funcType;
+  union {
+    ncclCollArgs coll;
+    ncclInitArgs init;
+  };
+};
+
+thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];  // per-thread queue; the first ncclGroupIndex entries are live
+
+ncclResult_t ncclSetDevice(int cudaDev) {  // cudaSetDevice wrapped to return ncclResult_t, so it can be passed to CHECK()
+  CUDACHECK(cudaSetDevice(cudaDev));  // NOTE(review): CUDACHECK is defined elsewhere; presumably it returns an error result on failure
+  return ncclSuccess;
+}
+
+#define CHECK(a) do { /* stores the result of (a) in args->ret; on failure logs it and returns args from the enclosing thread function (needs a local `args`) */ \
+  if ((args->ret = (a)) != ncclSuccess) { \
+    INFO(INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
+    return args; \
+  } \
+} while(0)
+
+void* ncclAsyncThreadMain(void* args_) {  // pthread entry point started by ncclAsyncInit; args_ is the struct ncclAsyncArgs* of the queued init
+  struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
+  CHECK(ncclSetDevice(args->init.cudaDev));  // select the requested device before running the init function
+  CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank));
+  return args;  // the outcome is left in args->ret, which ncclGroupEnd reads after joining the thread
+}
+
+// Queues a communicator init in the current group: stores the arguments in the next ncclGroupArgs slot and starts a thread that runs func on cudaDev.
+ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
+  if (ncclGroupIndex >= MAX_ASYNC_OPS) {
+    WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
+    return ncclAsyncErrCheck(ncclInternalError);
+  }
+  struct ncclAsyncArgs* args = ncclGroupArgs+ncclGroupIndex;  // the slot is claimed below, only once its thread exists
+  args->funcType = ASYNC_FUNC_INIT;
+  args->init.func = func;
+  args->init.cudaDev = cudaDev;
+  args->init.newcomm = newcomm;
+  args->init.ndev = ndev;
+  memcpy(&args->init.commId, &commId, sizeof(commId));
+  args->init.myrank = myrank;
+  if (pthread_create(ncclGroupThreads+ncclGroupIndex, NULL, ncclAsyncThreadMain, args) != 0) return ncclAsyncErrCheck(ncclSystemError);  // nothing queued, so ncclGroupEnd never joins a thread that was not created
+  ncclGroupIndex++;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclAsyncColl(ncclComm_t comm) {  // Registers comm in the current group's queue, at most once per communicator.
+  for (int slot=0; slot<ncclGroupIndex; slot++) {
+    if (ncclGroupArgs[slot].coll.comm == comm) return ncclSuccess;  // already queued
+  }
+  if (ncclGroupIndex >= MAX_ASYNC_OPS) {
+    WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
+    return ncclAsyncErrCheck(ncclInternalError);
+  }
+  // Append a new collective entry at the end of the queue.
+  struct ncclAsyncArgs* entry = ncclGroupArgs + ncclGroupIndex;
+  ncclGroupIndex++;
+  entry->funcType = ASYNC_FUNC_COLL;
+  entry->coll.comm = comm;
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclGroupStart);
+ncclResult_t ncclGroupStart() {  // Opens one (possibly nested) group level on the calling thread; ncclGroupEnd closes it.
+  ncclGroupMode += 1;
+  return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclGroupEnd);
+ncclResult_t ncclGroupEnd() {  // Closes one group level; closing the outermost level launches the queued collectives and waits for the init threads.
+  ncclGroupMode--;
+  if (ncclGroupMode > 0) return ncclSuccess;  // still nested: nothing is launched yet
+  int savedDev;
+  CUDACHECK(cudaGetDevice(&savedDev));
+  int done = ncclGroupIndex;  // number of queued entries not completed yet
+  int doneArray[ncclGroupIndex];  // per-entry completion flag
+  for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
+
+  ncclResult_t ret = ncclGroupError;
+  if (ret != ncclSuccess) goto group_cleanup;  // a call made inside the group already failed
+
+  /* Collectives are done in three steps :
+   * 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
+   * 2. Barrier Wait. No CUDA call is permitted
+   * 3. Enqueue Events. CUDA event wait/enqueue.
+   * This is needed because step 2 cannot call any CUDA primitive, otherwise if
+   * cudaFree happens between 1 and 3, it could block that CUDA call and
+   * prevent some ranks from launching their network threads, which would
+   * prevent the NCCL call from completing, blocking the cudaFree call.
+   */
+  for (int i=0; i<ncclGroupIndex; i++) {
+    struct ncclAsyncArgs* args = ncclGroupArgs+i;
+    if (args->funcType == ASYNC_FUNC_COLL) {
+      if (args->coll.comm->userStream == NULL)
+        CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
+      NCCLCHECKGOTO(ncclBarrierEnqueue(args->coll.comm), ret, end);
+    }
+  }
+  for (int i=0; i<ncclGroupIndex; i++) {
+    struct ncclAsyncArgs* args = ncclGroupArgs+i;
+    if (args->funcType == ASYNC_FUNC_COLL) {
+      CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
+      NCCLCHECKGOTO(ncclBarrierEnqueueWait(args->coll.comm), ret, end);
+    }
+  }
+  for (int i=0; i<ncclGroupIndex; i++) {
+    struct ncclAsyncArgs* args = ncclGroupArgs+i;
+    if (args->funcType == ASYNC_FUNC_COLL) {
+      if (args->coll.comm->userStream == NULL)
+        CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
+      NCCLCHECKGOTO(ncclEnqueueEvents(args->coll.comm), ret, end);
+      doneArray[i] = 1;
+      done--;
+    }
+  }
+
+  /* For init, since we use threads, we just wait for threads to complete */
+  while (done) {
+    for (int i=0; i<ncclGroupIndex; i++) {
+      struct ncclAsyncArgs* args = ncclGroupArgs+i;
+      if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
+        int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
+        if (err == EBUSY) continue;
+        if (err != 0) { ret = ncclSystemError; goto end; }
+        if (args->ret != ncclSuccess) { ret = args->ret; goto end; }
+        doneArray[i] = 1;
+        done--;
+      }
+    }
+  }
+  goto end;
+group_cleanup:
+  // At least one call in the group failed. To keep the group atomic, cancel every queued collective.
+  for (int i=0; i<ncclGroupIndex; i++) {
+    if (ncclGroupArgs[i].funcType != ASYNC_FUNC_COLL) continue;  // init entries store ncclInitArgs in the union, not a communicator
+    struct ncclComm* comm = ncclGroupArgs[i].coll.comm;
+    for (int r=0; r<comm->nRings; r++) {
+      struct ncclRing* ring = comm->rings+r;
+      for (int c=0; c<ring->collCount; c++) {
+        ring->collectives[(ring->collStart + c)%NCCL_MAX_OPS].active = 0;
+      }
+      ring->collFifoTail = ring->collStart;
+      ring->collCount = 0;
+    }
+    comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
+    comm->userStreamSet = false;
+  }
+end:
+  ncclGroupError = ncclSuccess;
+  ncclGroupIndex = 0;
+  CUDACHECK(cudaSetDevice(savedDev)); // do other clean-ups first before calling cudaSetDevice, because this call can fail too
+  return ret;
+}
diff --git a/src/3rd_party/nccl/src/misc/ibvwrap.cu b/src/3rd_party/nccl/src/misc/ibvwrap.cu
new file mode 100644
index 00000000..7ac3431c
--- /dev/null
+++ b/src/3rd_party/nccl/src/misc/ibvwrap.cu
@@ -0,0 +1,290 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "ibvwrap.h"
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <dlfcn.h>
+#include "core.h"
+
+static enum { ibvUninitialized, ibvInitializing, ibvInitialized, ibvError } ibvState = ibvUninitialized;  // loading state, advanced by wrap_ibv_symbols()
+
+/* Pointers to the libibverbs entry points: filled in by wrap_ibv_symbols(), reset to NULL by its teardown path. */
+int (*ibv_internal_fork_init)(void);
+struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
+void (*ibv_internal_free_device_list)(struct ibv_device **list);
+const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
+struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
+int (*ibv_internal_close_device)(struct ibv_context *context);
+int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
+void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
+int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
+int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
+int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
+int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
+struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
+int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
+struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
+int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
+struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
+int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
+struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
+int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
+int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
+const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
+
+// IBVERBS Library versioning
+#define IBVERBS_VERSION "IBVERBS_1.1"  // symbol version handed to dlvsym() by LOAD_SYM
+
+ncclResult_t wrap_ibv_symbols(void) {  // dlopen()s libibverbs and fills every ibv_internal_* pointer; returns ncclSuccess or ncclSystemError
+  if (ibvState == ibvInitialized)
+    return ncclSuccess;
+  if (ibvState == ibvError)
+    return ncclSystemError;
+
+  if (__sync_bool_compare_and_swap(&ibvState, ibvUninitialized, ibvInitializing) == false) {  // only the thread that wins this swap does the loading
+    // Another thread raced in front of us. Wait for it to be done.
+    while (ibvState == ibvInitializing) pthread_yield();
+    return (ibvState == ibvInitialized) ? ncclSuccess : ncclSystemError;
+  }
+
+  static void* ibvhandle = NULL;
+  void* tmp;  // scratch variables used inside LOAD_SYM
+  void** cast;
+
+  ibvhandle=dlopen("libibverbs.so", RTLD_NOW);
+  if (!ibvhandle) {
+    ibvhandle=dlopen("libibverbs.so.1", RTLD_NOW);  // second attempt with the versioned file name
+    if (!ibvhandle) {
+      WARN("Failed to open libibverbs.so[.1]");
+      goto teardown;
+    }
+  }
+
+#define LOAD_SYM(handle, symbol, funcptr) do { /* resolve `symbol` at IBVERBS_VERSION into funcptr, or jump to teardown */ \
+    cast = (void**)&funcptr;                             \
+    tmp = dlvsym(handle, symbol, IBVERBS_VERSION);       \
+    if (tmp == NULL) {                                   \
+      WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), IBVERBS_VERSION);  \
+      goto teardown;                                     \
+    }                                                    \
+    *cast = tmp;                                         \
+  } while (0)
+
+  LOAD_SYM(ibvhandle, "ibv_get_device_list", ibv_internal_get_device_list);
+  LOAD_SYM(ibvhandle, "ibv_free_device_list", ibv_internal_free_device_list);
+  LOAD_SYM(ibvhandle, "ibv_get_device_name", ibv_internal_get_device_name);
+  LOAD_SYM(ibvhandle, "ibv_open_device", ibv_internal_open_device);
+  LOAD_SYM(ibvhandle, "ibv_close_device", ibv_internal_close_device);
+  LOAD_SYM(ibvhandle, "ibv_get_async_event", ibv_internal_get_async_event);
+  LOAD_SYM(ibvhandle, "ibv_ack_async_event", ibv_internal_ack_async_event);
+  LOAD_SYM(ibvhandle, "ibv_query_device", ibv_internal_query_device);
+  LOAD_SYM(ibvhandle, "ibv_query_port", ibv_internal_query_port);
+  LOAD_SYM(ibvhandle, "ibv_query_gid", ibv_internal_query_gid);
+  LOAD_SYM(ibvhandle, "ibv_query_qp", ibv_internal_query_qp);
+  LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibv_internal_alloc_pd);
+  LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibv_internal_dealloc_pd);
+  LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr);
+  LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr);
+  LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq);
+  LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq);
+  LOAD_SYM(ibvhandle, "ibv_create_qp", ibv_internal_create_qp);
+  LOAD_SYM(ibvhandle, "ibv_modify_qp", ibv_internal_modify_qp);
+  LOAD_SYM(ibvhandle, "ibv_destroy_qp", ibv_internal_destroy_qp);
+  LOAD_SYM(ibvhandle, "ibv_fork_init", ibv_internal_fork_init);
+  LOAD_SYM(ibvhandle, "ibv_event_type_str", ibv_internal_event_type_str);
+
+  ibvState = ibvInitialized;  // publishes success to threads spinning in the wait loop above
+  return ncclSuccess;
+
+teardown:  // reached when the library or any symbol could not be loaded: clear every pointer and mark the state as failed
+  ibv_internal_get_device_list = NULL;
+  ibv_internal_free_device_list = NULL;
+  ibv_internal_get_device_name = NULL;
+  ibv_internal_open_device = NULL;
+  ibv_internal_close_device = NULL;
+  ibv_internal_get_async_event = NULL;
+  ibv_internal_ack_async_event = NULL;
+  ibv_internal_query_device = NULL;
+  ibv_internal_query_port = NULL;
+  ibv_internal_query_gid = NULL;
+  ibv_internal_query_qp = NULL;
+  ibv_internal_alloc_pd = NULL;
+  ibv_internal_dealloc_pd = NULL;
+  ibv_internal_reg_mr = NULL;
+  ibv_internal_dereg_mr = NULL;
+  ibv_internal_create_cq = NULL;
+  ibv_internal_destroy_cq = NULL;
+  ibv_internal_create_qp = NULL;
+  ibv_internal_modify_qp = NULL;
+  ibv_internal_destroy_qp = NULL;
+  ibv_internal_fork_init = NULL;
+  ibv_internal_event_type_str = NULL;
+
+  if (ibvhandle != NULL) dlclose(ibvhandle);
+  ibvState = ibvError;  // later calls return ncclSystemError without retrying
+  return ncclSystemError;
+}
+
+#define IBV_PTR_CHECK_ERRNO(name_internal, call, retval, error_retval, name) /* wrapper body: ncclInternalError if name_internal is NULL; else retval = call, and ncclSystemError (logging strerror(errno)) when retval == error_retval */ \
+  if (name_internal == NULL) { \
+     WARN("lib wrapper not initialized."); \
+     return ncclInternalError; \
+  } \
+  retval = call; \
+  if (retval == error_retval) { \
+    WARN("Call to " name " failed with error %s", strerror(errno)); \
+    return ncclSystemError; \
+  } \
+  return ncclSuccess;
+
+#define IBV_PTR_CHECK(name_internal, call, retval, error_retval, name) /* wrapper body: ncclInternalError if name_internal is NULL; else retval = call, and ncclSystemError when retval == error_retval */ \
+  if (name_internal == NULL) { \
+     WARN("lib wrapper not initialized."); \
+     return ncclInternalError; \
+  } \
+  retval = call; \
+  if (retval == error_retval) { \
+    WARN("Call to " name " failed"); \
+    return ncclSystemError; \
+  } \
+  return ncclSuccess;
+
+#define IBV_INT_CHECK_RET_ERRNO(name_internal, call, success_retval, name) /* wrapper body: any result other than success_retval is logged through strerror(result) and becomes ncclSystemError */ \
+  if (name_internal == NULL) { \
+     WARN("lib wrapper not initialized."); \
+     return ncclInternalError; \
+  } \
+  int ret = call; \
+  if (ret != success_retval) { \
+    WARN("Call to " name " failed with error %s", strerror(ret)); \
+    return ncclSystemError; \
+  } \
+  return ncclSuccess;
+
+#define IBV_INT_CHECK(name_internal, call, error_retval, name) /* wrapper body: only a result equal to error_retval is treated as failure (ncclSystemError) */ \
+  if (name_internal == NULL) { \
+     WARN("lib wrapper not initialized."); \
+     return ncclInternalError; \
+  } \
+  int ret = call; \
+  if (ret == error_retval) { \
+    WARN("Call to " name " failed"); \
+    return ncclSystemError; \
+  } \
+  return ncclSuccess;
+
+#define IBV_PASSTHRU(name_internal, call) /* wrapper body: ncclInternalError if name_internal is NULL; otherwise evaluates call, ignores its result and returns ncclSuccess */ \
+  if (name_internal == NULL) { \
+     WARN("lib wrapper not initialized."); \
+     return ncclInternalError; \
+  } \
+  call; \
+  return ncclSuccess;
+
+ncclResult_t wrap_ibv_fork_init() {  // forwards to ibv_fork_init; a result of -1 is reported as ncclSystemError
+  IBV_INT_CHECK(ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init");
+}
+
+ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices) {  // stores the device array in *ret; *num_devices is forced to 0 when no array is returned
+  if (ibv_internal_get_device_list == NULL) { WARN("lib wrapper not initialized."); return ncclInternalError; }  // same guard as the macro-based wrappers
+  if ((*ret = ibv_internal_get_device_list(num_devices)) == NULL) *num_devices = 0;
+  return ncclSuccess;
+}
+
+ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list) {  // forwards to ibv_free_device_list; only an unloaded wrapper is reported as an error
+  IBV_PASSTHRU(ibv_internal_free_device_list, ibv_internal_free_device_list(list));
+}
+
+const char *wrap_ibv_get_device_name(struct ibv_device *device) {
+  // Returns whatever ibv_get_device_name reports for `device`.
+  if (ibv_internal_get_device_name != NULL) return ibv_internal_get_device_name(device);
+  // No ncclResult_t can be returned through a const char*, so an unloaded wrapper ends the process.
+  WARN("lib wrapper not initialized.");
+  exit(-1);
+}
+
+ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device) { /* *ret receives the context; a NULL context is reported as ncclSystemError */
+  IBV_PTR_CHECK(ibv_internal_open_device, ibv_internal_open_device(device), *ret, NULL, "ibv_open_device");
+}
+
+ncclResult_t wrap_ibv_close_device(struct ibv_context *context) { /* a result of -1 from ibv_close_device is reported as ncclSystemError */
+  IBV_INT_CHECK(ibv_internal_close_device, ibv_internal_close_device(context), -1, "ibv_close_device");
+}
+
+ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event) { /* a result of -1 from ibv_get_async_event is reported as ncclSystemError */
+  IBV_INT_CHECK(ibv_internal_get_async_event, ibv_internal_get_async_event(context, event), -1, "ibv_get_async_event");
+}
+
+ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event) {  // forwards to ibv_ack_async_event (void call); only an unloaded wrapper is an error
+  IBV_PASSTHRU(ibv_internal_ack_async_event, ibv_internal_ack_async_event(event));
+}
+
+ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr) { /* a non-zero result is treated as an errno value: logged via strerror and reported as ncclSystemError */
+  IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device");
+}
+
+ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { /* a non-zero result is treated as an errno value: logged via strerror and reported as ncclSystemError */
+  IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port");
+}
+
+ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) {  // a non-zero result is logged via strerror and reported as ncclSystemError
+  IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_gid, ibv_internal_query_gid(context, port_num, index, gid), 0, "ibv_query_gid");
+}
+
+ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) {  // a non-zero result is logged via strerror and reported as ncclSystemError
+  IBV_INT_CHECK_RET_ERRNO(ibv_internal_query_qp, ibv_internal_query_qp(qp, attr, attr_mask, init_attr), 0, "ibv_query_qp");
+}
+
+ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) {  // *ret receives the protection domain; a NULL result is reported as ncclSystemError
+  IBV_PTR_CHECK(ibv_internal_alloc_pd, ibv_internal_alloc_pd(context), *ret, NULL, "ibv_alloc_pd");
+}
+
+ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd) { /* a non-zero result is treated as an errno value: logged via strerror and reported as ncclSystemError */
+  IBV_INT_CHECK_RET_ERRNO(ibv_internal_dealloc_pd, ibv_internal_dealloc_pd(pd), 0, "ibv_dealloc_pd");
+}
+
+ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access) {  // *ret receives the memory region; a NULL result is reported as ncclSystemError
+  IBV_PTR_CHECK(ibv_internal_reg_mr, ibv_internal_reg_mr(pd, addr, length, access), *ret, NULL, "ibv_reg_mr");
+}
+
+struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) {
+  // Hands back the raw result of ibv_reg_mr instead of an ncclResult_t.
+  // NULL is also what the caller gets when the wrapper was never loaded.
+  if (ibv_internal_reg_mr != NULL) return ibv_internal_reg_mr(pd, addr, length, access);
+  WARN("lib wrapper not initialized.");
+  return NULL;
+}
+
+ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /* a non-zero result is treated as an errno value: logged via strerror and reported as ncclSystemError */
+  IBV_INT_CHECK_RET_ERRNO(ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr");
+}
+
+ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector) {  // *ret receives the completion queue; NULL is reported as ncclSystemError
+  IBV_PTR_CHECK(ibv_internal_create_cq, ibv_internal_create_cq(context, cqe, cq_context, channel, comp_vector), *ret, NULL, "ibv_create_cq");
+}
+
+ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq) {  // a non-zero result is logged via strerror and reported as ncclSystemError
+  IBV_INT_CHECK_RET_ERRNO(ibv_internal_destroy_cq, ibv_internal_destroy_cq(cq), 0, "ibv_destroy_cq");
+}
+
+ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp) {  // a non-zero result is logged via strerror and reported as ncclSystemError
+  IBV_INT_CHECK_RET_ERRNO(ibv_internal_destroy_qp, ibv_internal_destroy_qp(qp), 0, "ibv_destroy_qp");
+}
+
+ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) {  // *ret receives the queue pair; a NULL result is reported as ncclSystemError
+  IBV_PTR_CHECK(ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp");
+}
+
+ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /* a non-zero result is treated as an errno value: logged via strerror and reported as ncclSystemError */
+  IBV_INT_CHECK_RET_ERRNO(ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp");
+}
+
+ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event) {
+  // Stores the string returned by ibv_event_type_str in *ret; IBV_PASSTHRU supplies the "wrapper not loaded" guard and the ncclSuccess return.
+  IBV_PASSTHRU(ibv_internal_event_type_str, *ret = (char *) ibv_internal_event_type_str(event));
+}
diff --git a/src/3rd_party/nccl/src/misc/nvmlwrap.cu b/src/3rd_party/nccl/src/misc/nvmlwrap.cu
new file mode 100644
index 00000000..52d4d414
--- /dev/null
+++ b/src/3rd_party/nccl/src/misc/nvmlwrap.cu
@@ -0,0 +1,248 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nvmlwrap.h"
+
+#ifndef NVML_DIRECT
+#include <dlfcn.h>
+#include "core.h"
+
+static enum { nvmlUninitialized, nvmlInitializing, nvmlInitialized, nvmlError } nvmlState = nvmlUninitialized;  // loading state, advanced by wrapNvmlSymbols()
+
+static nvmlReturn_t (*nvmlInternalInit)(void);  // the pointers below are filled in by wrapNvmlSymbols() through dlsym
+static nvmlReturn_t (*nvmlInternalShutdown)(void);
+static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
+static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
+static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
+static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
+static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
+static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);  // optional: may stay NULL
+static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
+static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);  // optional: may stay NULL
+static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
+    nvmlNvLinkCapability_t capability, unsigned int *capResult);  // optional: may stay NULL
+
+// dlopen()s libnvidia-ml.so.1 and fills the nvmlInternal* pointers; returns ncclSuccess, or ncclSystemError if the library or a required symbol is missing.
+ncclResult_t wrapNvmlSymbols(void) {
+  if (nvmlState == nvmlInitialized) return ncclSuccess;
+  if (nvmlState == nvmlError) return ncclSystemError;
+
+  if (__sync_bool_compare_and_swap(&nvmlState, nvmlUninitialized, nvmlInitializing) == false) {
+    // Another thread raced in front of us. Wait for it to be done.
+    while (nvmlState == nvmlInitializing) pthread_yield();
+    return (nvmlState == nvmlInitialized) ? ncclSuccess : ncclSystemError;
+  }
+
+  static void* nvmlhandle = NULL;
+  void* tmp;  // scratch variables used inside the LOAD_SYM macros
+  void** cast;
+
+  nvmlhandle=dlopen("libnvidia-ml.so.1", RTLD_NOW);
+  if (!nvmlhandle) {
+    WARN("Failed to open libnvidia-ml.so.1");
+    goto teardown;
+  }
+
+#define LOAD_SYM(handle, symbol, funcptr) do { /* required symbol: a failed lookup aborts loading */ \
+    cast = (void**)&funcptr;                             \
+    tmp = dlsym(handle, symbol);                         \
+    if (tmp == NULL) {                                   \
+      WARN("dlsym failed on %s - %s", symbol, dlerror());\
+      goto teardown;                                     \
+    }                                                    \
+    *cast = tmp;                                         \
+  } while (0)
+
+#define LOAD_SYM_OPTIONAL(handle, symbol, funcptr) do { /* optional symbol: a failed lookup leaves funcptr NULL */ \
+    cast = (void**)&funcptr;                             \
+    tmp = dlsym(handle, symbol);                         \
+    if (tmp == NULL) {                                   \
+      INFO(INIT,"dlsym failed on %s, ignoring", symbol); \
+    }                                                    \
+    *cast = tmp;                                         \
+  } while (0)
+
+  LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit);
+  LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
+  LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
+  LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
+  LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity);
+  LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
+  LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
+  LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
+  LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
+  LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
+  LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
+
+  nvmlState = nvmlInitialized;
+  return ncclSuccess;
+
+teardown:  // clear every pointer before the library is closed, so none is left pointing into unloaded code
+  nvmlInternalInit = NULL;
+  nvmlInternalShutdown = NULL;
+  nvmlInternalDeviceGetHandleByPciBusId = NULL;
+  nvmlInternalDeviceGetIndex = NULL;
+  nvmlInternalDeviceSetCpuAffinity = NULL;
+  nvmlInternalDeviceClearCpuAffinity = NULL;
+  nvmlInternalErrorString = NULL;
+  nvmlInternalDeviceGetPciInfo = NULL;
+  nvmlInternalDeviceGetNvLinkState = NULL;
+  nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
+  nvmlInternalDeviceGetNvLinkCapability = NULL;
+
+  if (nvmlhandle != NULL) dlclose(nvmlhandle);
+  nvmlState = nvmlError;
+  return ncclSystemError;
+}
+
+
+// Runs nvmlInit through the pointer loaded by wrapNvmlSymbols.
+// Returns ncclInternalError if the pointer is unset and ncclSystemError if NVML reports a failure.
+ncclResult_t wrapNvmlInit(void) {
+  if (!nvmlInternalInit) {
+    WARN("lib wrapper not initialized.");
+    return ncclInternalError;
+  }
+  const nvmlReturn_t status = nvmlInternalInit();
+  if (status == NVML_SUCCESS) return ncclSuccess;
+  // Failure: log NVML's own description of the error code.
+  WARN("nvmlInit() failed: %s", nvmlInternalErrorString(status));
+  return ncclSystemError;
+}
+
+// Runs nvmlShutdown through the pointer loaded by wrapNvmlSymbols.
+// Returns ncclInternalError if the pointer is unset and ncclSystemError if NVML reports a failure.
+ncclResult_t wrapNvmlShutdown(void) {
+  if (!nvmlInternalShutdown) {
+    WARN("lib wrapper not initialized.");
+    return ncclInternalError;
+  }
+  const nvmlReturn_t status = nvmlInternalShutdown();
+  if (status == NVML_SUCCESS) return ncclSuccess;
+  // Failure: log NVML's own description of the error code.
+  WARN("nvmlShutdown() failed: %s ", nvmlInternalErrorString(status));
+  return ncclSystemError;
+}
+
+// Forwards (pciBusId, device) to nvmlDeviceGetHandleByPciBusId.
+// Returns ncclInternalError if the pointer is unset and ncclSystemError if NVML reports a failure.
+ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
+  if (!nvmlInternalDeviceGetHandleByPciBusId) {
+    WARN("lib wrapper not initialized.");
+    return ncclInternalError;
+  }
+  const nvmlReturn_t status = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device);
+  if (status == NVML_SUCCESS) return ncclSuccess;
+  // Failure: log NVML's own description of the error code.
+  WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ", nvmlInternalErrorString(status));
+  return ncclSystemError;
+}
+
+// Forwards (device, index) to nvmlDeviceGetIndex.
+// Returns ncclInternalError if the pointer is unset and ncclSystemError if NVML reports a failure.
+ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
+  if (!nvmlInternalDeviceGetIndex) {
+    WARN("lib wrapper not initialized.");
+    return ncclInternalError;
+  }
+  const nvmlReturn_t status = nvmlInternalDeviceGetIndex(device, index);
+  if (status == NVML_SUCCESS) return ncclSuccess;
+  // Failure: log NVML's own description of the error code.
+  WARN("nvmlDeviceGetIndex() failed: %s ", nvmlInternalErrorString(status));
+  return ncclSystemError;
+}
+
+// Forwards `device` to nvmlDeviceSetCpuAffinity, one caller at a time.
+// Returns ncclInternalError if the pointer is unset and ncclSystemError if NVML reports a failure.
+ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
+  if (!nvmlInternalDeviceSetCpuAffinity) {
+    WARN("lib wrapper not initialized.");
+    return ncclInternalError;
+  }
+  // Workaround : it seems SetCpuAffinity is not thread safe, so the call is made under a function-local mutex.
+  static pthread_mutex_t affinityLock = PTHREAD_MUTEX_INITIALIZER;
+  pthread_mutex_lock(&affinityLock);
+  const nvmlReturn_t status = nvmlInternalDeviceSetCpuAffinity(device);
+  pthread_mutex_unlock(&affinityLock);
+  if (status == NVML_SUCCESS) return ncclSuccess;
+  // Failure: log NVML's own description of the error code (outside the lock).
+  WARN("nvmlDeviceSetCpuAffinity() failed: %s ", nvmlInternalErrorString(status));
+  return ncclSystemError;
+}
+
+// Forwards `device` to nvmlDeviceClearCpuAffinity; ncclInternalError if that symbol is not loaded, ncclSystemError if NVML reports a failure.
+ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
+  if (nvmlInternalDeviceClearCpuAffinity == NULL) {  // guard the pointer that is actually called below
+    WARN("lib wrapper not initialized.");
+    return ncclInternalError;
+  }
+  nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device);
+  if (ret != NVML_SUCCESS) {
+    WARN("nvmlDeviceClearCpuAffinity() failed: %s ", nvmlInternalErrorString(ret));
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
+
+// Forwards (device, pci) to nvmlDeviceGetPciInfo.
+// Returns ncclInternalError if the pointer is unset and ncclSystemError if NVML reports a failure.
+ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
+  if (!nvmlInternalDeviceGetPciInfo) {
+    WARN("lib wrapper not initialized.");
+    return ncclInternalError;
+  }
+  const nvmlReturn_t status = nvmlInternalDeviceGetPciInfo(device, pci);
+  if (status == NVML_SUCCESS) return ncclSuccess;
+  // Failure: log NVML's own description of the error code.
+  WARN("nvmlDeviceGetPciInfo() failed: %s ", nvmlInternalErrorString(status));
+  return ncclSystemError;
+}
+
+// Forwards (device, link, isActive) to nvmlDeviceGetNvLinkState.
+// The symbol is optional: when it was not found the call returns ncclInternalError without a warning.
+ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
+  if (!nvmlInternalDeviceGetNvLinkState) {
+    /* Do not warn, this symbol is optional. */
+    return ncclInternalError;
+  }
+  const nvmlReturn_t status = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
+  if (status == NVML_SUCCESS) return ncclSuccess;
+  // Failures are only logged at INFO level.
+  INFO(INIT,"nvmlDeviceGetNvLinkState() failed: %s ", nvmlInternalErrorString(status));
+  return ncclSystemError;
+}
+
+// Forwards (device, link, pci) to nvmlDeviceGetNvLinkRemotePciInfo.
+// Optional symbol: ncclInternalError without a warning when absent; NVML_ERROR_NOT_SUPPORTED fails silently.
+ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
+  if (!nvmlInternalDeviceGetNvLinkRemotePciInfo) {
+    /* Do not warn, this symbol is optional. */
+    return ncclInternalError;
+  }
+  const nvmlReturn_t status = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci);
+  if (status == NVML_SUCCESS) return ncclSuccess;
+  if (status != NVML_ERROR_NOT_SUPPORTED) {
+    INFO(INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ", nvmlInternalErrorString(status));
+  }
+  return ncclSystemError;
+}
+
+// Forwards (device, link, capability, capResult) to nvmlDeviceGetNvLinkCapability.
+// Optional symbol: ncclInternalError without a warning when absent; NVML_ERROR_NOT_SUPPORTED fails silently.
+ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+    nvmlNvLinkCapability_t capability, unsigned int *capResult) {
+  if (!nvmlInternalDeviceGetNvLinkCapability) {
+    /* Do not warn, this symbol is optional. */
+    return ncclInternalError;
+  }
+  const nvmlReturn_t status = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult);
+  if (status == NVML_SUCCESS) return ncclSuccess;
+  if (status != NVML_ERROR_NOT_SUPPORTED) {
+    INFO(INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ", nvmlInternalErrorString(status));
+  }
+  return ncclSystemError;
+}
+#endif
diff --git a/src/3rd_party/nccl/src/misc/rings.cu b/src/3rd_party/nccl/src/misc/rings.cu
new file mode 100644
index 00000000..9ecda49c
--- /dev/null
+++ b/src/3rd_party/nccl/src/misc/rings.cu
@@ -0,0 +1,355 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "net.h"
+#include "param.h"
+
+/* Parse user defined rings. Format is like :
+ * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
+ * Rings with a non-matching number of ranks are ignored so we can provide
+ * rings for multiple cases.
+ *
+ * str       : NUL-terminated ring description; any non-digit character
+ *             separates numbers, '|' (or the end of the string) ends a ring.
+ * nringsRet : receives the number of rings that were accepted.
+ * nranks    : number of ranks a ring must list to be accepted.
+ * prev/next : output link tables; accepted ring k is stored in entries
+ *             [k*nranks, (k+1)*nranks).
+ *
+ * A ring is only written to prev/next once every entry has been validated
+ * (in range, no duplicates), and parsing stops after MAXRINGS accepted rings.
+ * NOTE(review): the MAXRINGS bound assumes prev/next hold nranks*MAXRINGS
+ * entries, which is what the other code paths in this file store -- confirm
+ * against the caller's allocation.
+ * Always returns ncclSuccess.
+ */
+#define MAX_ENV_RANKS 512
+static ncclResult_t parseRings(const char* str, int* nringsRet, int nranks, int* prev, int* next) {
+  int ranks[MAX_ENV_RANKS];
+  int nrings = 0;
+  int rank = 0;
+  int offset = 0;
+  int status = 0; // 0 : between numbers, 1 : inside number
+  do {
+    int digit = str[offset] - '0';
+    if (digit >= 0 && digit <= 9) {
+      if (status == 0) {
+        ranks[rank] = digit;
+        status = 1;
+      } else if (ranks[rank] < MAX_ENV_RANKS) {
+        // Saturate instead of overflowing : a ring can only match when it has
+        // fewer than MAX_ENV_RANKS entries, so any value >= MAX_ENV_RANKS is
+        // already out of bounds and will be rejected below.
+        ranks[rank] = ranks[rank]*10+digit;
+      }
+    } else {
+      if (status == 1) {
+        rank++;
+        if (rank == MAX_ENV_RANKS) goto end;
+      }
+      status = 0;
+      if (str[offset] == '|' || str[offset] == '\0') {
+        // Ignore rings if nranks doesn't match. An empty ring (rank == 0) is
+        // ignored as well, so ranks[rank-1] below is always a valid read.
+        if (rank == nranks && rank > 0) {
+          int valid = 1;
+          for (int r=0; r<nranks && valid; r++) {
+            // Ignore rings with ranks out of bounds
+            if (ranks[r] < 0 || ranks[r] >= nranks) valid = 0;
+            // Ignore rings with duplicate ranks
+            for (int i=0; i<r && valid; i++)
+              if (ranks[i] == ranks[r]) valid = 0;
+          }
+          if (valid) {
+            // Every entry is known to be in [0, nranks) : safe to use them as
+            // indexes, including the last entry which closes the ring.
+            int prevRank = ranks[rank-1];
+            for (int r=0; r<nranks; r++) {
+              int cur = ranks[r];
+              next[nrings*nranks+prevRank] = cur;
+              prev[nrings*nranks+cur] = prevRank;
+              prevRank = cur;
+            }
+            nrings++;
+            // Never store more rings than the rest of this file handles.
+            if (nrings == MAXRINGS) goto end;
+          }
+        }
+        rank = 0;
+      }
+    }
+  } while (str[offset++] != 0);
+end:
+  *nringsRet = nrings;
+  return ncclSuccess;
+}
+
+/*
+ * Ring creation algorithm
+ *
+ * First, we establish hierarchical coordinates depending on the way ranks can
+ * communicate. After fillCoords, we have for each rank a unique 3-int array
+ * {   node, pci_domain,   rank } corresponding to the three transports :
+ * { 2[NET],     1[SHM], 0[P2P] }.
+ * Also, we renumber ranks (to indexes) based on their growing coordinates.
+ *
+ * Then, we ask transports to connect groups together. We start with net, then
+ * shm, then p2p. We maintain two arrays, prev and next, where values are equal
+ * to -1 when ranks are not yet connected, and a rank otherwise. We never
+ * connect ranks outside our group, meaning that on 4 nodes of 2 sockets of 4
+ * ranks, if we are rank 13, we should see something like (provided we have a
+ * single net interface, hence a single ring) :
+ *
+ * Connecting all nodes                                <13>
+ * 2[NET] : prev 31 -1 -1 -1 -1 -1 -1 -1  7 -1 -1 -1 -1 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
+ *          next -1 -1 -1 -1 -1 -1 -1  8 -1 -1 -1 -1 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1  0
+ *
+ * Connecting P2P domains with shared memory           <13>
+ * 1[SHM] : prev 31 -1 -1 -1 -1 -1 -1 -1  7 -1 -1 -1 11 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
+ *          next -1 -1 -1 -1 -1 -1 -1  8 -1 -1 -1 12 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1  0
+ *
+ * Connecting ranks (only inside the P2P domain)       <13>
+ * 0[P2P] : prev 31 -1 -1 -1 -1 -1 -1 -1  7 -1 -1 -1 11 12 13 14 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
+ *          next -1 -1 -1 -1 -1 -1 -1  8 -1 -1 -1 12 13 14 15 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1  0
+ *
+ * Hence, when we ask a transport to connect groups, we provide it with a subview of the ranks (except for net
+ * which always sees the full world). That way, P2P can bruteforce all combinations inside the node without
+ * risking to explode in terms of combinations, and we scale better.
+ *
+ * Finally, we loop over Network scores to try to create rings with high scores (=locality) and decrease until
+ * we get at least one ring.
+ */
+
+/* Depth-first walk over the nranks x nranks `matrix` : marks `rank` in
+ * `connected`, then recurses into every not-yet-marked peer whose matrix entry
+ * on `rank`'s row equals `transport`. */
+static void recIsConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
+  connected[rank] = 1;
+  int peer = 0;
+  while (peer < nranks) {
+    const int candidate = peer++;
+    if (connected[candidate] != 0) continue;
+    if (matrix[rank*nranks+candidate] != transport) continue;
+    recIsConnected(candidate, connected, nranks, matrix, transport);
+  }
+}
+
+/* Clear `connected` (nranks entries), then mark every rank reachable from
+ * `rank` by following matrix entries equal to `transport`. */
+static void isConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
+  int i = 0;
+  while (i < nranks) connected[i++] = 0;
+  recIsConnected(rank, connected, nranks, matrix, transport);
+}
+
+// Assign the next free index to `rank` : records the rank<->index mapping in
+// both directions, copies the NTRANSPORTS values of current[] into the rank's
+// slot of coords[], then increments idx.
+// Relies on rankToIdx, idxToRank, coords, current and idx being in scope at
+// the expansion site.
+#define NEW_IDX(rank) do { \
+  rankToIdx[rank] = idx; \
+  idxToRank[idx] = rank; \
+  for (int t=0; t<NTRANSPORTS; t++) coords[rank*NTRANSPORTS+t] = current[t]; \
+  idx++; \
+} while (0)
+
+/* Return the lowest rank whose first coordinate is still -1 and whose matrix
+ * entry on `rank`'s row equals `transport`, or -1 when there is none. */
+int findConnected(int rank, int* matrix, int nranks, int transport, int* coords) {
+  int candidate = 0;
+  while (candidate < nranks) {
+    const int unassigned = (coords[candidate*NTRANSPORTS] == -1);
+    if (unassigned && matrix[rank*nranks+candidate] == transport) return candidate;
+    candidate++;
+  }
+  return -1;
+}
+
+/* Assign coordinates and a topological index to every rank.
+ *
+ * nranks    : number of ranks.
+ * matrix    : nranks x nranks table; entry [a*nranks+b] is compared against
+ *             transport numbers (0 is treated as P2P here, higher values are
+ *             tried in increasing order when looking for the next group).
+ * coords    : nranks x NTRANSPORTS output; the first entry of each row must be
+ *             -1 on entry, as findConnected uses it to detect unassigned ranks.
+ * rankToIdx / idxToRank : output rank<->index tables, filled by NEW_IDX.
+ *
+ * Returns ncclSuccess once all nranks ranks have an index, or
+ * ncclInternalError when some ranks are left but none of them is connected to
+ * the current rank through any transport.
+ */
+static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankToIdx, int* idxToRank) {
+  // Coordinate currently being handed out, one counter per transport level.
+  int current[NTRANSPORTS];
+  int* p2pConnected;
+  // NOTE(review): NCCLCHECK presumably returns from this function on failure.
+  NCCLCHECK(ncclCalloc(&p2pConnected, nranks));
+  for (int i=0; i<NTRANSPORTS; i++) current[i] = 0;
+  int curRank = 0, idx = 0;
+  while (1) {
+    // P2P is handled separately as there is no level below it and we need to
+    // cover the case of being connected to another GPU indirectly.
+    // So we detect all GPUs in the same P2P domain once and add them all at
+    // once.
+    isConnected(curRank, p2pConnected, nranks, matrix, 0);
+    for (int r=0; r<nranks; r++) {
+      if (p2pConnected[r]) {
+        NEW_IDX(r);
+        // Remember the last rank added : the search for the next group
+        // starts from it.
+        curRank = r;
+        current[0]++;
+      }
+    }
+    // The next P2P domain numbers its members from 0 again.
+    current[0] = 0;
+
+    if (idx == nranks) {
+      free(p2pConnected);
+      return ncclSuccess;
+    }
+
+    // Find next group, either connected through SHM or NET.
+    int rank;
+    int transport = 1;
+    while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) {
+      // Nothing left at this level : reset its counter and try the next one.
+      current[transport] = 0;
+      transport++;
+      if (transport == NTRANSPORTS) { free(p2pConnected); return ncclInternalError; }
+    }
+    curRank = rank;
+    current[transport]++;
+  }
+}
+
+// Lower/upper bounds on the number of rings, read in ncclGetRings through
+// ncclParamMinNrings()/ncclParamMaxNrings(). With the default of 0, the max
+// bound is not applied and the min bound falls back to a built-in default.
+// NOTE(review): presumably backed by the NCCL_MIN_NRINGS / NCCL_MAX_NRINGS
+// environment variables named in the warnings below -- confirm in param.h.
+NCCL_PARAM(MinNrings, "MIN_NRINGS", 0);
+NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0);
+
+/* Users can force the number of threads with an environment variable.
+ * *nthreads is overwritten only when the parameter differs from its default
+ * of -2; otherwise it is left untouched. Always returns ncclSuccess. */
+NCCL_PARAM(Nthreads, "NTHREADS", -2);
+ncclResult_t getEnvThreads(int* nthreads) {
+  const int64_t requested = ncclParamNthreads();
+  const bool userOverride = (requested != -2);
+  if (userOverride) {
+    *nthreads = requested;
+  }
+  return ncclSuccess;
+}
+
+/* Main ring creation function
+ *
+ * nrings     : out, number of rings described in prev/next (0 when nranks is 1).
+ * nthreads   : in/out, thread count; may be changed by the transports'
+ *              getRings callbacks and by getEnvThreads().
+ * rank       : rank of the caller.
+ * nranks     : total number of ranks.
+ * transports : nranks x nranks table of transport numbers between ranks.
+ * values     : nranks x nranks table of per-link values handed to the
+ *              transports' getRings callbacks.
+ * prev/next  : out, ring r uses entries [r*nranks, (r+1)*nranks).
+ *
+ * Rings from the NCCL_RINGS environment variable take precedence when at least
+ * one of them is valid. Otherwise rings are built transport by transport,
+ * starting from the highest score and lowering it until a ring is found.
+ * NOTE(review): NCCLCHECK presumably returns on failure, in which case the
+ * buffers allocated before the failing call are not freed -- confirm.
+ */
+ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next) {
+  *nrings = 0;
+
+  if (nranks == 1) return ncclSuccess;
+
+  // User-provided rings, if any, bypass the topology search entirely.
+  char* str = getenv("NCCL_RINGS");
+  if (str && strlen(str)>0) {
+    int ret = parseRings(str, nrings, nranks, prev, next);
+    if (ret == ncclSuccess && *nrings > 0) {
+      if (rank == 0) INFO(INIT,"%d ring(s) set by environment", *nrings);
+      NCCLCHECK(getEnvThreads(nthreads));
+      return ncclSuccess;
+    }
+    if (rank == 0) INFO(INIT,"No valid ring found in environment, ignoring");
+    *nrings = 0;
+  }
+
+  // Compute hierarchical topology groups, indexes, and rank<->index tables
+  int* coords, *globalIdxToRank, *globalRankToIdx;
+  NCCLCHECK(ncclCalloc(&coords, nranks*NTRANSPORTS));
+  // -1 marks a rank that has no coordinates yet (see fillCoords/findConnected).
+  for (int i=0; i<nranks*NTRANSPORTS; i++) coords[i] = -1;
+  NCCLCHECK(ncclCalloc(&globalIdxToRank, nranks));
+  NCCLCHECK(ncclCalloc(&globalRankToIdx, nranks));
+
+  NCCLCHECK(fillCoords(nranks, transports, coords, globalRankToIdx, globalIdxToRank));
+
+  // Start with a high score, then decrease until we find rings
+  int minScore = NCCL_MAX_SCORE;
+  int nringsTmp;
+  int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
+  NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXRINGS));
+  NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXRINGS));
+  NCCLCHECK(ncclCalloc(&idxToRank, nranks));
+  NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
+  NCCLCHECK(ncclCalloc(&groups, nranks));
+  NCCLCHECK(ncclCalloc(&subgroups, nranks));
+
+  int nThreads;
+  do {
+    // Each attempt starts from the caller's thread count and from empty rings.
+    nThreads = *nthreads;
+    for (int i=0; i<nranks*MAXRINGS; i++) prevTmp[i] = nextTmp[i] = -1;
+    nringsTmp = MAXRINGS;
+    // Loop over transports to connect groups
+    for (int t=NTRANSPORTS-1; t>=0; t--) {
+      for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;
+
+      int nidx = 0;
+      for (int i=0; i<nranks; i++) {
+        // Extract only ranks in the same local area as rank
+        // We need to extract them in the topological order, hence we iterate over indexes, not ranks
+        int r = globalIdxToRank[i];
+        int sameLocal = 1;
+        for (int tr = NTRANSPORTS-1; tr > t; tr--) if (coords[r*NTRANSPORTS+tr] != coords[rank*NTRANSPORTS+tr]) sameLocal = 0;
+        if (!sameLocal) continue;
+
+        groups[nidx] = coords[r*NTRANSPORTS+t];
+        // At the lowest level (t == 0) every rank is its own subgroup.
+        subgroups[nidx] = t ? coords[r*NTRANSPORTS+t-1] : nidx;
+        rankToIdx[r] = nidx;
+        idxToRank[nidx] = r;
+        nidx++;
+      }
+
+      int ngroups = groups[nidx-1] + 1; // Coords should be ordered
+
+      ncclTvalue_t* subvalues;
+      int *subprev, *subnext;
+      NCCLCHECK(ncclCalloc(&subvalues, nidx*nidx));
+      NCCLCHECK(ncclCalloc(&subprev, nidx*nringsTmp));
+      NCCLCHECK(ncclCalloc(&subnext, nidx*nringsTmp));
+      // With a single group there is nothing to connect at this level.
+      if (ngroups > 1) {
+        /* Extract subvalues */
+        for (int i=0; i<nidx; i++) {
+          for (int j=0; j<nidx; j++) {
+            if (transports[idxToRank[i]*nranks+idxToRank[j]] == t)
+              subvalues[i*nidx+j] = values[idxToRank[i]*nranks+idxToRank[j]];
+            else
+              subvalues[i*nidx+j] = 0;
+          }
+        }
+        /* Extract subprev/subnext */
+        for (int i=0; i<nidx*nringsTmp; i++) {
+          subprev[i] = subnext[i] = -1;
+        }
+        for (int r=0; r<nringsTmp; r++) {
+          // start : last local rank that already has a prev link in ring r,
+          // end : last local rank that already has a next link in ring r.
+          int start = -1, end = -1;
+          for (int i=0; i<nranks; i++) {
+            if (rankToIdx[i] == -1) continue;
+            if (prevTmp[r*nranks+i] != -1) start = i;
+            if (nextTmp[r*nranks+i] != -1) end = i;
+          }
+          if (start != -1 && end != -1) {
+            subprev[r*nidx+rankToIdx[start]] = rankToIdx[end];
+            subnext[r*nidx+rankToIdx[end]] = rankToIdx[start];
+          }
+        }
+        /* Get rings */
+        NCCLCHECK(ncclTransports[t].getRings(nidx, groups, subgroups, subvalues, &nringsTmp, subprev, subnext, minScore, &nThreads));
+        /* Merge subprev/subnext into prev/next */
+        for (int r=0; r<nringsTmp; r++) {
+          for (int i=0; i<nidx; i++) {
+            if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
+            if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
+          }
+        }
+        //for (int r=0; r<nringsTmp; r++) {
+        //printf("[%d] [%d] [%d] [%d] Prev ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", prevTmp[r*nranks+i]); printf("\n");
+        //printf("[%d] [%d] [%d] [%d] Next ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", nextTmp[r*nranks+i]); printf("\n");
+        //}
+      }
+      free(subvalues);
+      free(subprev);
+      free(subnext);
+      // No ring at this score : give up on the remaining transports.
+      if (nringsTmp == 0) break;
+    }
+    minScore--;
+    if (nringsTmp > *nrings) {
+      *nrings = nringsTmp;
+      for (int i=0; i<nranks*(*nrings); i++) {
+        prev[i] = prevTmp[i];
+        next[i] = nextTmp[i];
+      }
+    }
+  } while (nringsTmp == 0 && minScore);
+
+  free(coords);
+  free(globalRankToIdx);
+  free(globalIdxToRank);
+  free(prevTmp);
+  free(nextTmp);
+  free(idxToRank);
+  free(rankToIdx);
+  free(groups);
+  free(subgroups);
+
+  *nthreads = nThreads;
+
+  if (*nrings == 0) {
+    WARN("Could not create rings, falling back on simple ring");
+    *nrings = 1;
+    // Only the caller's own entries of ring 0 are written here.
+    prev[rank] = (rank-1+nranks) % nranks;
+    next[rank] = (rank+1)%nranks;
+  }
+
+  // Apply the user-requested bounds on the number of rings.
+  int maxNrings = ncclParamMaxNrings();
+  int minNrings = ncclParamMinNrings();
+  if (maxNrings > 0 && minNrings > maxNrings) {
+    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
+    minNrings = 0;
+  }
+  if (minNrings > MAXRINGS) {
+    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXRINGS, MAXRINGS);
+    minNrings = MAXRINGS;
+  }
+  if (maxNrings > 0 && maxNrings <= *nrings) {
+    if (rank == 0) INFO(INIT,"Limiting to %d rings per user request.", maxNrings);
+    *nrings = maxNrings;
+  } else {
+    // At least 2 rings when ncclCudaCompCap() reports 3, 1 otherwise.
+    int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
+    if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
+    if (minNrings > 0 && minNrings > *nrings) {
+      if (rank == 0 && minNrings > defaultMinNrings) INFO(INIT,"Duplicating rings to %d per user request.", minNrings);
+      // Reach the requested count by repeating the existing rings in order.
+      for (int r=*nrings; r<MAXRINGS && r <minNrings; r++) {
+        for (int i=0; i<nranks; i++) {
+          prev[r*nranks+i] = prev[(r-*nrings)*nranks+i];
+          next[r*nranks+i] = next[(r-*nrings)*nranks+i];
+        }
+      }
+      *nrings = minNrings;
+    }
+  }
+
+  NCCLCHECK(getEnvThreads(nthreads));
+  return ncclSuccess;
+}
diff --git a/src/3rd_party/nccl/src/misc/utils.cu b/src/3rd_party/nccl/src/misc/utils.cu
new file mode 100644
index 00000000..d517267f
--- /dev/null
+++ b/src/3rd_party/nccl/src/misc/utils.cu
@@ -0,0 +1,136 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "utils.h"
+#include "debug.h"
+#include <unistd.h>
+#include <string.h>
+
+/* Store the short host name (everything before the first '.') in `hostname`.
+ *
+ * hostname : destination buffer of at least `maxlen` bytes.
+ * maxlen   : size of the destination buffer.
+ *
+ * Returns ncclSuccess, or ncclSystemError when gethostname() fails; in that
+ * case the buffer holds "unknown", truncated if needed. On both paths the
+ * buffer is NUL-terminated whenever maxlen > 0.
+ */
+ncclResult_t getHostName(char* hostname, int maxlen) {
+  if (gethostname(hostname, maxlen) != 0) {
+    if (maxlen > 0) {
+      strncpy(hostname, "unknown", maxlen);
+      // strncpy() does not terminate the destination when it truncates.
+      hostname[maxlen-1] = '\0';
+    }
+    return ncclSystemError;
+  }
+  // Cut at the first '.'; the maxlen-1 bound also terminates a name that
+  // gethostname() truncated without writing a NUL.
+  int i = 0;
+  while ((hostname[i] != '.') && (hostname[i] != '\0') && (i < maxlen-1)) i++;
+  hostname[i] = '\0';
+  return ncclSuccess;
+}
+
+/* Hash a NUL-terminated string. Starts from 5381 and, for every character,
+ * computes hash = hash * 33 + char (DJB2) in 64-bit unsigned arithmetic. */
+uint64_t getHash(const char* string) {
+  uint64_t hash = 5381;
+  for (const char* p = string; *p != '\0'; p++) {
+    hash = hash * 33 + *p;
+  }
+  return hash;
+}
+
+/* Generate a hash of the unique identifying string for this host
+ * that will be unique for both bare-metal and container instances
+ * Equivalent of a hash of;
+ *
+ * $(hostname) $(readlink /proc/self/ns/uts) $(readlink /proc/self/ns/mnt)
+ */
+uint64_t getHostHash(void) {
+  char uname[1024];
+  // Namespace links appended after the host name, in this order.
+  const char* const nsLinks[2] = { "/proc/self/ns/uts", "/proc/self/ns/mnt" };
+
+  // Start off with the hostname
+  (void) getHostName(uname, sizeof(uname));
+  int used = strlen(uname);
+
+  for (int k = 0; k < 2; k++) {
+    // readlink() does not NUL-terminate; a failure (negative result)
+    // contributes nothing.
+    int got = readlink(nsLinks[k], uname+used, sizeof(uname)-1-used);
+    if (got > 0) used += got;
+  }
+
+  // Trailing '\0'
+  uname[used] = '\0';
+  TRACE(INIT,"unique hostname '%s'", uname);
+
+  return getHash(uname);
+}
+
+/* Generate a hash of the unique identifying string for this process
+ * that will be unique for both bare-metal and container instances
+ * Equivalent of a hash of;
+ *
+ * $$ $(readlink /proc/self/ns/pid)
+ */
+uint64_t getPidHash(void) {
+  char pname[1024];
+
+  // Start off with our pid ($$)
+  sprintf(pname, "%ld", (long) getpid());
+  const int pidLen = strlen(pname);
+
+  // Append $(readlink /proc/self/ns/pid); a failed readlink() appends nothing.
+  const int got = readlink("/proc/self/ns/pid", pname+pidLen, sizeof(pname)-1-pidLen);
+  const int nsLen = (got < 0) ? 0 : got;
+
+  // readlink() does not NUL-terminate.
+  pname[pidLen+nsLen] = '\0';
+  TRACE(INIT,"unique PID '%s'", pname);
+
+  return getHash(pname);
+}
+
+/* Split a comma-separated list of "prefix" or "prefix:port" items.
+ *
+ * string  : list to parse; may be NULL. A leading '^' is skipped.
+ * ifList  : output array with room for `maxList` entries; for each item the
+ *           prefix is copied and port is set to atoi() of the text after ':',
+ *           or to -1 when the item has no ':'.
+ * maxList : capacity of ifList.
+ *
+ * Returns the number of entries written. Returns 0 without touching ifList
+ * when string is NULL or maxList is not positive. Items with an empty prefix
+ * are skipped.
+ * NOTE(review): the length of a prefix is not checked against the size of
+ * netIf::prefix, which is declared outside this file -- confirm callers
+ * cannot pass overly long names.
+ */
+int parseStringList(const char* string, struct netIf* ifList, int maxList) {
+  if (!string) return 0;
+  // Without this guard the loop below would still write to ifList[0].
+  if (maxList <= 0) return 0;
+
+  const char* ptr = string;
+  // Ignore "^" prefix, will be detected outside of this function
+  if (ptr[0] == '^') ptr++;
+
+  int ifNum = 0;
+  int ifC = 0;
+  char c;
+  do {
+    c = *ptr;
+    if (c == ':') {
+      if (ifC > 0) {
+        ifList[ifNum].prefix[ifC] = '\0';
+        ifList[ifNum].port = atoi(ptr+1);
+        ifNum++; ifC = 0;
+      }
+      // Skip the port text up to the next separator or the end of the string.
+      while (c != ',' && c != '\0') c = *(++ptr);
+    } else if (c == ',' || c == '\0') {
+      if (ifC > 0) {
+        ifList[ifNum].prefix[ifC] = '\0';
+        ifList[ifNum].port = -1;
+        ifNum++; ifC = 0;
+      }
+    } else {
+      ifList[ifNum].prefix[ifC] = c;
+      ifC++;
+    }
+    ptr++;
+  } while (ifNum < maxList && c);
+  return ifNum;
+}
+
+/* True when `string` starts with `prefix` (an empty prefix always matches). */
+static bool matchPrefix(const char* string, const char* prefix) {
+  const size_t prefixLen = strlen(prefix);
+  const int diff = strncmp(string, prefix, prefixLen);
+  return diff == 0;
+}
+
+/* True when the two ports are equal or when either of them is -1, which
+ * matches any port. */
+static bool matchPort(const int port1, const int port2) {
+  const bool wildcard = (port1 == -1) || (port2 == -1);
+  return wildcard || (port1 == port2);
+}
+
+
+/* True when (string, port) matches at least one of the first `listSize`
+ * entries of ifList : the entry's prefix must start `string` and the ports
+ * must agree according to matchPort(). An empty list matches everything. */
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize) {
+  // Make an exception for the case where no user list is defined
+  if (listSize == 0) return true;
+
+  int idx = 0;
+  while (idx < listSize) {
+    const struct netIf* entry = &ifList[idx++];
+    if (!matchPrefix(string, entry->prefix)) continue;
+    if (matchPort(port, entry->port)) return true;
+  }
+  return false;
+}
diff --git a/src/3rd_party/nccl/src/nccl.h.in b/src/3rd_party/nccl/src/nccl.h.in
new file mode 100644
index 00000000..72276254
--- /dev/null
+++ b/src/3rd_party/nccl/src/nccl.h.in
@@ -0,0 +1,251 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_H_
+#define NCCL_H_
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#define NCCL_MAJOR ${nccl:Major}
+#define NCCL_MINOR ${nccl:Minor}
+#define NCCL_PATCH ${nccl:Patch}
+#define NCCL_SUFFIX "${nccl:Suffix}"
+
+#define NCCL_VERSION_CODE ${nccl:Version}
+#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Opaque handle to communicator */
+typedef struct ncclComm* ncclComm_t;
+
+#define NCCL_UNIQUE_ID_BYTES 128
+typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
+
+/* Error type */
+typedef enum { ncclSuccess                 =  0,
+               ncclUnhandledCudaError      =  1,
+               ncclSystemError             =  2,
+               ncclInternalError           =  3,
+               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
+               ncclNumResults              =  6 } ncclResult_t;
+
+/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
+ * This integer is coded with the MAJOR, MINOR and PATCH level of the
+ * NCCL library
+ */
+ncclResult_t ncclGetVersion(int *version);
+ncclResult_t pncclGetVersion(int *version);
+
+/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
+ * called once and the Id should be distributed to all ranks in the
+ * communicator before calling ncclCommInitRank. */
+ncclResult_t  ncclGetUniqueId(ncclUniqueId* uniqueId);
+ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
+
+/* Creates a new communicator (multi thread/process version).
+ * rank must be between 0 and nranks-1 and unique within a communicator clique.
+ * Each rank is associated to a CUDA device, which has to be set before calling
+ * ncclCommInitRank.
+ * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
+ * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
+ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+
+/* Creates a clique of communicators (single process version).
+ * This is a convenience function to create a single-process communicator clique.
+ * Returns an array of ndev newly initialized communicators in comm.
+ * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
+ * If devlist is NULL, the first ndev CUDA devices are used.
+ * Order of devlist defines user-order of processors within the communicator. */
+ncclResult_t  ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+
+/* Frees resources associated with communicator object. */
+ncclResult_t  ncclCommDestroy(ncclComm_t comm);
+ncclResult_t pncclCommDestroy(ncclComm_t comm);
+
+/* Returns a human-readable error message. */
+const char*  ncclGetErrorString(ncclResult_t result);
+const char* pncclGetErrorString(ncclResult_t result);
+
+/* Gets the number of ranks in the communicator clique. */
+ncclResult_t  ncclCommCount(const ncclComm_t comm, int* count);
+ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
+
+/* Returns the cuda device number associated with the communicator. */
+ncclResult_t  ncclCommCuDevice(const ncclComm_t comm, int* device);
+ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
+
+/* Returns the user-ordered "rank" associated with the communicator. */
+ncclResult_t  ncclCommUserRank(const ncclComm_t comm, int* rank);
+ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
+
+/* Reduction operation selector */
+typedef enum { ncclSum        = 0,
+               ncclProd       = 1,
+               ncclMax        = 2,
+               ncclMin        = 3,
+               ncclNumOps     = 4 } ncclRedOp_t;
+
+/* Data types */
+typedef enum { ncclInt8       = 0, ncclChar       = 0,
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+               ncclNumTypes   = 9 } ncclDataType_t;
+
+/*
+ * Collective communication operations
+ *
+ * Collective communication operations must be called separately for each
+ * communicator in a communicator clique.
+ *
+ * They return when operations have been enqueued on the CUDA stream.
+ *
+ * Since they may perform inter-CPU synchronization, each call has to be done
+ * from a different thread or process, or need to use Group Semantics (see
+ * below).
+ */
+
+/*
+ * Reduce
+ *
+ * Reduces data arrays of length count in sendbuff into recvbuff using op
+ * operation.
+ * recvbuff may be NULL on all calls except for root device.
+ * root is the rank (not the CUDA device) where data will reside after the
+ * operation is complete.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * (deprecated) Broadcast (in-place)
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * This operation is implicitly in place.
+ */
+ncclResult_t  ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Broadcast
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * All-Reduce
+ *
+ * Reduces data arrays of length count in sendbuff using op operation, and
+ * leaves identical copies of result on each recvbuff.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Reduce-Scatter
+ *
+ * Reduces data in sendbuff using op operation and leaves reduced result
+ * scattered over the devices so that recvbuff on rank i will contain the i-th
+ * block of the result.
+ * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
+ * should have a size of at least nranks*recvcount elements.
+ *
+ * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
+ */
+ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream);
+ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream);
+
+/*
+ * All-Gather
+ *
+ * Each device gathers sendcount values from other GPUs into recvbuff,
+ * receiving data from rank i at offset i*sendcount.
+ * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
+ * should have a size of at least nranks*sendcount elements.
+ *
+ * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
+ */
+ncclResult_t  ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Group semantics
+ *
+ * When managing multiple GPUs from a single thread, and since NCCL collective
+ * calls may perform inter-CPU synchronization, we need to "group" calls for
+ * different ranks/devices into a single call.
+ *
+ * Grouping NCCL calls as being part of the same collective operation is done
+ * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
+ * collective calls until the ncclGroupEnd call, which will wait for all calls
+ * to be complete. Note that for collective communication, ncclGroupEnd only
+ * guarantees that the operations are enqueued on the streams, not that
+ * the operation is effectively done.
+ *
+ * Both collective communication and ncclCommInitRank can be used in conjunction
+ * with ncclGroupStart/ncclGroupEnd.
+ */
+
+/*
+ * Group Start
+ *
+ * Start a group call. All subsequent calls to NCCL may not block due to
+ * inter-CPU synchronization.
+ */
+ncclResult_t ncclGroupStart();
+
+/*
+ * Group End
+ *
+ * End a group call. Wait for all calls since ncclGroupStart to complete
+ * before returning.
+ */
+ncclResult_t ncclGroupEnd();
+
+#ifdef __cplusplus
+} // end extern "C"
+#endif
+
+#endif // end include guard
diff --git a/src/3rd_party/nccl/src/ring.cu b/src/3rd_party/nccl/src/ring.cu
new file mode 100644
index 00000000..e9e95737
--- /dev/null
+++ b/src/3rd_party/nccl/src/ring.cu
@@ -0,0 +1,70 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "ring.h"
+#include "param.h"
+
+NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
+
+/* Initialize ring number `ringid` of communicator `comm` : allocates the
+ * send/receive memory, points the send/recv connections at it, and allocates
+ * the rank tables and the operation list.
+ * Returns ncclSuccess when every allocation succeeds.
+ * NOTE(review): NCCLCHECK presumably returns the failing call's error code,
+ * leaving earlier allocations in place -- confirm.
+ */
+ncclResult_t initRing(struct ncclComm* comm, int ringid) {
+  struct ncclRing* ring = comm->rings+ringid;
+  ring->id = ringid;
+
+  // Setup intermediate buffering
+  ring->buffSize = ncclParamBuffsize();
+
+  const int sendSize = ring->devMemSendSize = sizeof(struct ncclSendMem);
+  struct ncclSendMem* sendMem;
+  NCCLCHECK(ncclCudaCalloc((char**)&sendMem, sendSize));
+  ring->devMemSend = sendMem;
+
+  // The receive area is everything in ncclRecvMem up to `buff`, followed by
+  // buffSize bytes of buffer.
+  const int recvSize = ring->devMemRecvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+  struct ncclRecvMem* recvMem;
+  NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize));
+  ring->devMemRecv = recvMem;
+
+  TRACE(INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
+
+  // Pre-configure send/recv pointers. Those are the default, they may change later.
+  ring->recv.conn.buff = recvMem->buff;
+  ring->recv.conn.llBuff = recvMem->llBuff;
+  ring->recv.conn.tail = &recvMem->tail;
+  ring->recv.conn.opCount = &recvMem->opCount;
+  ring->recv.conn.direct = 0;
+  ring->send.conn.head = &sendMem->head;
+  ring->send.conn.llHead = &sendMem->llHead;
+  ring->send.conn.direct = 0;
+  ring->send.conn.llStep = 0;
+  ring->send.conn.llLastCleaning = 0;
+
+  // Ring index to user rank table.
+  NCCLCHECK(ncclCudaCalloc(&ring->devUserRanks, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&ring->userRanks, comm->nRanks));
+
+  // Per-ring operation list.
+  NCCLCHECK(ncclCudaHostAlloc((void**)&ring->collectives, (void**)&ring->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
+  return ncclSuccess;
+}
+
+/* Release the resources of `ring` : send/receive memory, rank tables,
+ * operation list, then the transport resources and proxies of both
+ * connections. Returns ncclSuccess when every step succeeds.
+ * NOTE(review): CUDACHECK/NCCLCHECK presumably return on the first failure,
+ * which would skip the remaining releases -- confirm.
+ */
+ncclResult_t freeRing(struct ncclRing* ring) {
+  // Intermediate buffering
+  CUDACHECK(cudaFree(ring->devMemSend));
+  CUDACHECK(cudaFree(ring->devMemRecv));
+
+  // Index to rank table
+  free(ring->userRanks);
+  CUDACHECK(cudaFree(ring->devUserRanks));
+
+  // Operation list
+  NCCLCHECK(ncclCudaHostFree(ring->collectives));
+
+  // Free transport proxy resources
+  // Transport resources are only released when they were set; the proxy
+  // destroy call is made unconditionally.
+  if (ring->send.transportResources) NCCLCHECK(ring->send.transport->send.free(ring->send.transportResources));
+  NCCLCHECK(transportDestroyProxy(&ring->send));
+  if (ring->recv.transportResources) NCCLCHECK(ring->recv.transport->recv.free(ring->recv.transportResources));
+  NCCLCHECK(transportDestroyProxy(&ring->recv));
+  return ncclSuccess;
+}
diff --git a/src/3rd_party/nccl/src/transport.cu b/src/3rd_party/nccl/src/transport.cu
new file mode 100644
index 00000000..f5f9d75e
--- /dev/null
+++ b/src/3rd_party/nccl/src/transport.cu
@@ -0,0 +1,187 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "common_coll.h"
+
+extern struct ncclTransport p2pTransport;  // defined in another translation unit
+extern struct ncclTransport shmTransport;  // defined in another translation unit
+extern struct ncclTransport netTransport;  // defined in transport/net.cu
+
+struct ncclTransport ncclTransports[NTRANSPORTS] = {  // transport table, filled by copying the three extern descriptors
+  p2pTransport,
+  shmTransport,
+  netTransport,
+};
+
+static void FifoPullArgs(struct transportProxyInfo* info, struct ncclProxyArgs *args) {  // Consumer side: wait for the slot at argsFifoHead to be filled, copy it into *args, then free the slot.
+  struct ncclProxyArgs *fifoArgs = info->argsFifo + (info->argsFifoHead % TRANSPORT_PROXY_FIFO_SIZE);  // slot selected by the head index
+  pthread_mutex_lock(&info->mutex);
+  while (fifoArgs->active == 0)  // 0 = empty slot; any other value (1, or the -1 written by StopProxy) ends the wait
+    pthread_cond_wait(&info->cond, &info->mutex);
+  __sync_synchronize();  // full barrier before the slot contents are read
+  memcpy(args, fifoArgs, sizeof(struct ncclProxyArgs));  // the copy keeps the original 'active' value for the caller to inspect
+  __sync_synchronize();  // full barrier: the copy is complete before the slot is marked free
+  fifoArgs->active = 0;
+  pthread_cond_signal(&info->cond);  // wakes a waiter on the shared condition, e.g. FifoGetNextArgs blocked on this slot
+  pthread_mutex_unlock(&info->mutex);
+  info->argsFifoHead++;  // advanced after the lock is released
+}
+
+static struct ncclProxyArgs* FifoGetNextArgs(struct transportProxyInfo* info) {  // Producer side: reserve the slot at argsFifoTail, blocking while it is still occupied; returns NULL when info is NULL.
+  if (info == NULL) return NULL;
+  struct ncclProxyArgs* fifoArgs = info->argsFifo + (info->argsFifoTail % TRANSPORT_PROXY_FIFO_SIZE);  // slot selected by the tail index
+  pthread_mutex_lock(&info->mutex);
+  while (fifoArgs->active == 1)  // slot still holds an entry that FifoPullArgs has not consumed yet
+    pthread_cond_wait(&info->cond, &info->mutex);
+  pthread_mutex_unlock(&info->mutex);
+  info->argsFifoTail++;  // the slot is handed out before it is filled; the caller writes the payload and 'active' itself
+  return fifoArgs;
+}
+
+static void FifoPushArgs(struct transportProxyInfo* info) {  // Signal the shared condition if the most recently reserved slot has been filled; no-op when info is NULL.
+  if (info == NULL) return;
+
+  struct ncclProxyArgs* fifoArgs = info->argsFifo + ((info->argsFifoTail-1) % TRANSPORT_PROXY_FIFO_SIZE);  // slot handed out by the latest FifoGetNextArgs call
+  if (fifoArgs->active == 0) return;  // nothing pending in that slot, so no wake-up is needed
+
+  pthread_mutex_lock(&info->mutex);
+  pthread_cond_signal(&info->cond);  // signalled under the mutex that FifoPullArgs holds while testing 'active'
+  pthread_mutex_unlock(&info->mutex);
+}
+
+// Block the caller until the proxy thread has raised info->proxyReady
+// (see SetProxyReady).
+static void WaitProxyReady(struct transportProxyInfo* info) {
+  pthread_mutex_t* lock = &info->mutex;
+  pthread_mutex_lock(lock);
+  // The flag is re-tested after every wake-up: the condition variable is
+  // shared with the args FIFO and may also wake spuriously.
+  while (!info->proxyReady) {
+    pthread_cond_wait(&info->cond, lock);
+  }
+  pthread_mutex_unlock(lock);
+}
+
+// Raise info->proxyReady and signal the condition so that a thread blocked in
+// WaitProxyReady can continue.
+static void SetProxyReady(struct transportProxyInfo* info) {
+  pthread_mutex_t* lock = &info->mutex;
+  pthread_mutex_lock(lock);
+  // The flag is written under the mutex so the waiter cannot miss the signal.
+  info->proxyReady = 1;
+  pthread_cond_signal(&info->cond);
+  pthread_mutex_unlock(lock);
+}
+
+// Ask the proxy thread to exit: reserve the next FIFO slot, tag it with the
+// stop marker (active == -1, tested in persistentThread) and wake the thread.
+static void StopProxy(struct transportProxyInfo* info) {
+  struct ncclProxyArgs* stopSlot = FifoGetNextArgs(info);
+  stopSlot->active = -1;
+  FifoPushArgs(info);
+}
+
+// Connector direction selectors used by the proxy helpers below.
+#define RECV 0
+#define SEND 1
+
+/* Tell whether the given direction (RECV or SEND) of this ring needs its proxy
+ * for a collective following `pattern`.
+ * Ring patterns always need it. In chain patterns exactly one rank has nothing
+ * to receive or nothing to send; it is found by comparing the pattern's root
+ * with one entry of the ring's userRanks table. */
+static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks) {
+  const enum proxyMode mode = proxyPatternMode(pattern);
+  if (mode == proxyRing) return true;
+
+  const int root = proxyPatternRoot(pattern);
+
+  // Position in the reorganized ring to compare the root against:
+  // 0 is this rank, 1 the next rank, nranks-1 the previous rank.
+  int slot;
+  if (mode == proxyFrom) {
+    // bcast: no recv if root is this rank, no send if root is the next rank.
+    slot = (type == RECV) ? 0 : 1;
+  } else {
+    // reduce: no recv if root is the previous rank, no send if root is this rank.
+    slot = (type == RECV) ? nranks-1 : 0;
+  }
+  return ring->userRanks[slot] != root;
+}
+
+static void SaveProxy(struct ncclConnector* connector, struct ncclProxyArgs* args, int needProxy) {  // Queue a copy of *args (tagged with needProxy) on the connector's proxy FIFO; no-op without a proxy.
+  struct transportProxyInfo* info = connector->proxyInfo;
+  if (info == NULL) return;
+  struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);  // may block until the slot is free
+  args->needProxy = needProxy;  // note: writes into the caller's struct before it is copied
+  __sync_synchronize();
+  memcpy(fifoArgs, args, sizeof(struct ncclProxyArgs));
+  __sync_synchronize();  // full barrier: payload is written before the slot is flagged as filled
+  fifoArgs->active = 1;  // set after the memcpy, which would otherwise overwrite it; the wake-up itself is sent later by FifoPushArgs
+}
+
+/* Queue the proxy work for one collective on every ring it uses.
+ * The number of network steps is derived from the message size and the ring
+ * buffer geometry; one ncclProxyArgs entry is then saved on the receive and on
+ * the send connector of each participating ring. The proxy threads are woken
+ * separately by transportStartProxies.
+ * Always returns ncclSuccess. */
+ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t nbytes, int pattern, struct ncclComm* comm) {
+  int llMode, nrings, nthreads;
+  ncclGetCollResource(comm, nbytes, &nrings, &nthreads, &llMode);
+  // Low-latency mode doubles the byte count and uses its own fixed geometry.
+  nbytes       = llMode ? nbytes * 2    : nbytes;
+  substeps     = llMode ? 1             : substeps;
+  subchunks    = llMode ? NCCL_LL_CHUNKS : subchunks;
+  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : comm->rings[0].buffSize;
+
+  int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow
+  int nsteps = nstepsPerRound * nrounds * substeps;
+  // The first value after opCount must be substeps (it used to print subchunks twice).
+  TRACE(NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, substeps, subchunks, nrounds, nsteps, comm);
+  TRACE(NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
+  for (int r=0; r<nrings; r++) {
+    // Rings are picked starting at offset gridDim.x, wrapping around comm->nRings.
+    struct ncclRing* ring = comm->rings+((comm->myParams->gridDim.x+r)%comm->nRings);
+    struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 };
+    SaveProxy(&ring->recv, &args, NeedProxy(RECV, pattern, ring, comm->nRanks));
+    SaveProxy(&ring->send, &args, NeedProxy(SEND, pattern, ring, comm->nRanks));
+  }
+  return ncclSuccess;
+}
+
+/* Wake the proxy threads of every ring of the communicator (send side first,
+ * then receive side) and give up the CPU once so they can start running.
+ * Always returns ncclSuccess. */
+ncclResult_t transportStartProxies(ncclComm* comm) {
+  for (int ringIdx = 0; ringIdx < comm->nRings; ++ringIdx) {
+    struct ncclRing* ring = &comm->rings[ringIdx];
+    FifoPushArgs(ring->send.proxyInfo);
+    FifoPushArgs(ring->recv.proxyInfo);
+  }
+  // Let other threads run
+  pthread_yield();
+  return ncclSuccess;
+}
+
+void* persistentThread(void *opaqueInfo) {  // Proxy thread entry point; opaqueInfo is the struct transportProxyInfo* given to pthread_create.
+  struct transportProxyInfo* info = (struct transportProxyInfo*)opaqueInfo;
+  // We need to initialize the context before launching any NCCL cuda kernel,
+  // otherwise we would create it during the first cudaMemcpyAsync inside the
+  // proxy function and that would cause a deadlock
+  cudaSetDevice(info->comm->cudaDev);  // return value is not checked
+  // Signal the main thread the context is created and it can proceed.
+  SetProxyReady(info);
+  while (1) {
+    struct ncclProxyArgs args;
+    FifoPullArgs(info, &args);  // blocks until an entry is queued
+    if (args.active == -1) {
+      // Main thread asked to stop (marker written by StopProxy)
+      return NULL;
+    }
+    ncclResult_t res = info->func(&args);  // transport proxy function stored by transportCreateProxy
+    if (res != ncclSuccess) {
+      WARN("%s:%d -> %d [Proxy thread error]", __FILE__, __LINE__, res);  // the failure is only logged; the loop keeps serving entries
+    }
+  }
+}
+
+/* Create the proxy thread for one direction (RECV or SEND) of a ring when the
+ * connector's transport provides a proxy function; otherwise do nothing.
+ * The call returns only after the new thread has selected its CUDA device and
+ * reported ready.
+ * Returns ncclSuccess, the ncclCalloc error, or ncclSystemError when the
+ * thread cannot be created (connector->proxyInfo is then left NULL). */
+ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm) {
+  struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send;
+  threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy);
+  if (proxyfunc) {
+    TRACE(NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
+    struct transportProxyInfo* info;
+    NCCLCHECK(ncclCalloc(&info, 1));
+    connector->proxyInfo = info;
+    info->comm = comm;
+    info->cond = PTHREAD_COND_INITIALIZER;
+    info->mutex = PTHREAD_MUTEX_INITIALIZER;
+    info->func = proxyfunc;
+    info->argsFifoHead = info->argsFifoTail = 0;
+    info->proxyReady = 0;
+    int err = pthread_create(&info->thread, NULL, persistentThread, info);
+    if (err != 0) {
+      // Without a thread nobody would ever set proxyReady, so waiting below
+      // would hang forever; undo the registration and report the failure.
+      WARN("Failed to create proxy thread (type %d, ring %p) : pthread_create returned %d", type, ring, err);
+      connector->proxyInfo = NULL;
+      free(info);
+      return ncclSystemError;
+    }
+    // Wait for thread to initialize its CUDA context.
+    WaitProxyReady(info);
+  }
+  return ncclSuccess;
+}
+
+/* Stop and join the connector's proxy thread, if any, and release its
+ * bookkeeping structure. Always returns ncclSuccess. */
+ncclResult_t transportDestroyProxy(struct ncclConnector* connector) {
+  struct transportProxyInfo* proxy = connector->proxyInfo;
+  // No proxy was ever created for this connector.
+  if (!proxy) return ncclSuccess;
+
+  // Queue the stop marker, then wait for the thread to exit before freeing.
+  StopProxy(proxy);
+  pthread_join(proxy->thread, NULL);
+  free(proxy);
+  connector->proxyInfo = NULL;
+  return ncclSuccess;
+}
diff --git a/src/3rd_party/nccl/src/transport/net.cu b/src/3rd_party/nccl/src/transport/net.cu
new file mode 100644
index 00000000..ed62a66c
--- /dev/null
+++ b/src/3rd_party/nccl/src/transport/net.cu
@@ -0,0 +1,521 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "transport.h"
+#include "nvmlwrap.h"
+#include "net.h"
+#include "param.h"
+#include "nvlink.h"
+#include <cuda_runtime.h>
+#include <assert.h>
+
+#define NET_MAX_IFS 16  // maximum number of network interfaces tracked per rank
+
+// We encode 3 bits of distance (the per-interface score) per interface into a ncclTvalue_t (64-bit)
+#define NET_BITS_PER_IF 3
+#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)  // lowest NET_BITS_PER_IF bits set, i.e. 0x7
+static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
+
+struct netInfo {  // per-rank network description; stored inside an opaque ncclTinfo_t by netFillInfo
+  int rank;  // rank value passed to netFillInfo
+  int ndev;  // device count from ncclNetDevices, capped at NET_MAX_IFS
+  short scores[NET_MAX_IFS];  // one score per device, copied from ncclNetDevices
+};
+
+struct netConnectInfo {  // connection data carried inside a struct ncclConnect
+  ncclNetHandle_t netHandle;  // filled by ncclNetListen in netRecvSetup, passed to ncclNetConnect in netSendConnect
+};
+
+struct netSendResources {  // send-side state kept in ring->send.transportResources
+  void* netSendComm;  // set by ncclNetConnect, closed by ncclNetCloseSend
+  struct ncclSendMem* hostSendMem;  // first pointer returned by ncclCudaHostAlloc; accessed by the proxy thread
+  struct ncclRecvMem* hostRecvMem;  // first pointer returned by ncclCudaHostAlloc; accessed by the proxy thread
+  struct ncclSendMem* devHostSendMem;  // second pointer of the hostSendMem allocation; published through send->conn
+  struct ncclRecvMem* devHostRecvMem;  // second pointer of the hostRecvMem allocation; published through send->conn
+  struct ncclSendMem* hostDevMem;  // never assigned in this file, so it keeps the NULL from ncclCalloc - TODO confirm it is reserved
+  int netDev;  // device index chosen by getDev
+  bool cudaSupport;  // true when the net device may read the CUDA buffer directly (see netSendSetup)
+  struct ncclRecvMem* devNetMem;  // ncclCudaCalloc buffer, only allocated when cudaSupport is true
+  uint64_t llStep;  // running step counter for low-latency mode
+  uint64_t llLastCleaning;  // llStep value at the last low-latency buffer cleaning
+};
+
+struct netRecvResources {  // receive-side state kept in ring->recv.transportResources
+  void* netListenComm;  // set by ncclNetListen, closed once the connection is accepted
+  void* netRecvComm;  // set by ncclNetAccept, closed by ncclNetCloseRecv
+  struct ncclSendMem* hostSendMem;  // first pointer returned by ncclCudaHostAlloc; accessed by the proxy thread
+  struct ncclRecvMem* hostRecvMem;  // first pointer returned by ncclCudaHostAlloc; accessed by the proxy thread
+  struct ncclSendMem* devHostSendMem;  // second pointer of the hostSendMem allocation; published through recv->conn
+  struct ncclRecvMem* devHostRecvMem;  // second pointer of the hostRecvMem allocation; published through recv->conn
+  struct ncclRecvMem* hostDevMem;  // never assigned in this file, so it keeps the NULL from ncclCalloc - TODO confirm it is reserved
+  int netDev;  // device index chosen by getDev
+  bool cudaSupport;  // true when ncclNetPtrSupport reports NCCL_PTR_CUDA for netDev
+  uint64_t llStep;  // running step counter for low-latency mode
+  uint64_t llLastCleaning;  // llStep value at the last low-latency cleaning point
+};
+
+/* Fill information necessary to exchange between ranks to choose whether or not
+ * to use this transport.
+ * Stores the rank, the number of network devices (capped at NET_MAX_IFS) and
+ * their scores into the opaque info block.
+ * Returns ncclSystemError when the network reports no device, or the error
+ * raised by ncclNetDevices. */
+ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
+  struct netInfo* info = (struct netInfo*)opaqueInfo;
+  static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large");
+  info->rank = rank;
+  // Start from NULL so the error path below can release whatever was returned.
+  int *scores = NULL;
+  NCCLCHECK(ncclNetDevices(&info->ndev, &scores));
+  if (info->ndev == 0) {
+    WARN("Error : Network returned 0 device");
+    free(scores);  // free(NULL) is a no-op if nothing was allocated
+    return ncclSystemError;
+  }
+  // Only the first NET_MAX_IFS devices fit in struct netInfo.
+  if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS;
+  for (int d=0; d<info->ndev; d++) info->scores[d] = scores[d];
+  free(scores);
+  return ncclSuccess;
+}
+
+/* Determine if we can communicate with the peer.
+ * The result packs, for every local device d, the low NET_BITS_PER_IF bits of
+ * its score at bit offset NET_BITS_PER_IF*d of ret[0]. The peer info is not
+ * consulted. Always returns ncclSuccess. */
+ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
+  ret[0] = 0;
+  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
+  for (int d=0; d<myInfo->ndev; d++) {
+    // Keep 3 bits of score info per dev. Widen to ncclTvalue_t BEFORE shifting:
+    // the offset reaches 45 bits for d == 15, which is undefined in plain int.
+    ncclTvalue_t bits = (ncclTvalue_t)(myInfo->scores[d] & NET_BITS_PER_IF_MASK);
+    ret[0] |= (bits<<(NET_BITS_PER_IF*d));
+  }
+  return ncclSuccess;
+}
+
+/* Among the ranks of `group`, return the one with the highest score for
+ * network card `card`, provided that score is non-zero and at least minScore;
+ * the lowest such rank wins ties. Returns -1 when no rank qualifies.
+ * A rank's score is read from the first non-zero entry of its row in the
+ * nranks x nranks `values` matrix. */
+static inline int groupBestStart(int nranks, int* groups, int group, ncclTvalue_t* values, int card, int minScore) {
+  int winner = -1;
+  int winnerScore = 0;
+  for (int r=0; r<nranks; r++) {
+    if (groups[r] == group) {
+      const ncclTvalue_t* row = values + r*nranks;
+      // Locate the first non-zero value; all other values should be the same.
+      int col = 0;
+      while (col < nranks && row[col] == 0) col++;
+      if (col < nranks) {
+        ncclTvalue_t score = (row[col]>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
+        if (score >= minScore && score > winnerScore) {
+          winnerScore = score;
+          winner = r;
+        }
+      }
+    }
+  }
+  return winner;
+}
+/* Scan the ranks of `group` from the highest rank downwards and return the
+ * first one whose score for `card` is at least minScore. Ranks equal to
+ * startRank, or in subgroup startSubGroup (unless that is -1), are skipped.
+ * Returns -1 when no rank qualifies. */
+static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroups, int startSubGroup, int startRank, ncclTvalue_t* values, int card, int minScore) {
+  // For the last rank, we don't need the absolute best score, just to be within minScore.
+  for (int r=nranks-1; r>=0; r--) {
+    if (groups[r] != group
+        || (startSubGroup != -1 && subgroups[r] == startSubGroup)
+        || r == startRank) {
+      continue;
+    }
+    const ncclTvalue_t* row = values + r*nranks;
+    // Locate the first non-zero value; all other values should be the same.
+    int col = 0;
+    while (col < nranks && row[col] == 0) col++;
+    if (col == nranks) continue;
+    ncclTvalue_t score = (row[col]>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
+    if (score >= minScore) return r;
+  }
+  return -1;
+}
+
+
+/* Build the inter-group links of up to *nringsRet rings.
+ * For every ring and every group, one not-yet-used network card is chosen
+ * together with a receiving rank (start) and a sending rank (end) whose score
+ * for that card is at least minScore. The end of each group is then linked to
+ * the start of the following group in the prev/next tables (nranks entries per
+ * ring). When a group has no usable card left, *nringsRet is lowered to the
+ * number of rings completed so far.
+ * groups[] is expected to be sorted so that the last rank carries the highest
+ * group index. Always returns ncclSuccess. nthreads is not used here. */
+ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
+  int nGroups = groups[nranks-1] + 1;
+  // cardUsed[group*NET_MAX_IFS+card] == 1 once a ring took that card in that group.
+  int cardUsed[NET_MAX_IFS*nGroups];
+  for (int c=0; c<NET_MAX_IFS*nGroups; c++) cardUsed[c] = 0;
+
+  for (int ring = 0; ring<*nringsRet; ring++) {
+    int starts[nGroups];
+    int ends[nGroups];
+    for (int group = 0; group<nGroups; group++) {
+      int nranksInGroup = 0;
+      int nsubGroups = 0;  // highest subgroup index seen in this group
+      for (int rank=0; rank<nranks; rank++) if (groups[rank] == group) {
+          nranksInGroup++;
+          nsubGroups = std::max(subgroups[rank], nsubGroups);
+        }
+      starts[group] = ends[group] = -1;
+      // Receive on the rank closest to the NIC
+      for (int card=0; card<NET_MAX_IFS; card++) {
+        if (cardUsed[group*NET_MAX_IFS+card] == 1) continue;
+        int start = groupBestStart(nranks, groups, group, values, card, minScore);
+        // No rank reaches minScore on this card. Skip it here: the card could
+        // not be selected anyway, and subgroups[start] below would otherwise
+        // be read at index -1.
+        if (start == -1) continue;
+        // Send from any rank, but best on a different subgroup and close to the NIC also.
+        int end = (nranksInGroup == 1) ? start
+            : groupBestEnd(nranks, groups, group, subgroups, nsubGroups ? subgroups[start] : -1, start, values, card, minScore);
+        //printf("Ring %d, Minscore %d, Card %d, group %d, start = %d, end = %d\n", ring, minScore, card, group, start, end);
+        if (end != -1) {
+          cardUsed[group*NET_MAX_IFS+card] = 1;
+          starts[group] = start;
+          ends[group] = end;
+          break;
+        }
+      }
+      if (starts[group] == -1 || ends[group] == -1) {
+        // This group cannot take part in another ring: keep the rings built so far.
+        *nringsRet = ring;
+        return ncclSuccess;
+      }
+    }
+    // Link groups together
+    for (int group = 0; group<nGroups; group++) {
+      int nextGroup = (group+1)%nGroups;
+      next[ring*nranks+ends[group]] = starts[nextGroup];
+      prev[ring*nranks+starts[nextGroup]] = ends[group];
+    }
+  }
+  return ncclSuccess;
+}
+
+/* Choose the network device used by ring `ringId`.
+ * Only the devices sharing the highest score are candidates; rings are spread
+ * over them round-robin, in device order (ring 0 gets the first best device,
+ * ring 1 the second, wrapping around).
+ * Returns 0 when there is no device (nDev <= 0) or ringId is negative; the
+ * previous implementation never terminated when nDev was 0 or when every
+ * score was negative. */
+int getDev(int ringId, int nDev, short* scores) {
+  if (nDev <= 0 || ringId < 0) return 0;
+
+  // Seed the maximum with a real score so that all-negative inputs still
+  // produce at least one candidate.
+  short maxScore = scores[0];
+  for (int d=1; d<nDev; d++) if (scores[d] > maxScore) maxScore = scores[d];
+
+  int nBest = 0;
+  for (int d=0; d<nDev; d++) if (scores[d] == maxScore) nBest++;
+
+  // nBest >= 1 here, so the modulo is well defined.
+  int skip = ringId % nBest;
+  for (int d=0; d<nDev; d++) {
+    if (scores[d] != maxScore) continue;
+    if (skip == 0) return d;
+    skip--;
+  }
+  return 0;
+}
+
+NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);  // read via ncclParamNetGdrRead() in netSendSetup: >0 forces GDR read, 0 disables it, the default -2 leaves the decision to the NVLink / compute-capability test
+
+/* Determine if we will use this transport for this peer and return connect
+ * information for this peer.
+ * Allocates the send-side resources of the ring: picks the network device,
+ * decides whether the device may read GPU memory directly (GDR read) and
+ * allocates the staging buffers. peerOpaqueInfo and connectInfo are not used
+ * on the send side.
+ * Returns ncclSuccess or the first error raised by a callee. */
+ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+  struct netSendResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  ring->send.transportResources = resources;
+
+  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
+  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->scores);
+  resources->cudaSupport = false;
+
+  // Get user's GDR READ setting
+  int gdrReadParam = ncclParamNetGdrRead();
+
+  // Determine whether the GPU has NVLink
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+  CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+  int nvlinks = getNumNvlinks(busId);
+
+  // Enable GDR read when:
+  // 1) user sets it, or
+  // 2) we are on a NVSwitch platform (i.e. no P2P traffic over PCI-E switch) AND the GPU is Volta
+  bool enableGdrRead = (gdrReadParam > 0) || (nvlinks >= CONNECT_NVSWITCH && ncclCudaCompCap() > 6 && gdrReadParam != 0);
+  if (enableGdrRead) {
+    // The device must also advertise support for CUDA pointers.
+    int flags;
+    NCCLCHECK(ncclNetPtrSupport(resources->netDev, &flags));
+    if (flags & NCCL_PTR_CUDA)
+      resources->cudaSupport = true;
+  }
+  if (resources->cudaSupport)
+    INFO(INIT|NET, "Net: enabling net device %d to read from rank %d", resources->netDev, myInfo->rank);
+
+  // Receive-layout buffers carry the data: header fields followed by buffSize bytes.
+  int recvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+  if (resources->cudaSupport) {
+    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), recvSize));
+  }
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
+
+  // The send-layout block only holds the struct itself (as in netRecvSetup);
+  // sizing it like the data buffer wasted buffSize bytes of pinned memory.
+  int sendSize = sizeof(struct ncclSendMem);
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
+
+  return ncclSuccess;
+}
+
+/* Allocate the receive-side resources of the ring for this transport and open
+ * a listening endpoint; its handle is written into connectInfo so that the
+ * sending peer can connect to it.
+ * Returns ncclSuccess or the first error raised by a callee. */
+ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+  struct netRecvResources* res;
+  NCCLCHECK(ncclCalloc(&res, 1));
+  ring->recv.transportResources = res;
+
+  // Device selection and its ability to write straight into CUDA memory.
+  struct netInfo* self = (struct netInfo*)myOpaqueInfo;
+  res->netDev = getDev(ring->id, self->ndev, self->scores);
+  int ptrFlags;
+  NCCLCHECK(ncclNetPtrSupport(res->netDev, &ptrFlags));
+  res->cudaSupport = (ptrFlags & NCCL_PTR_CUDA) != 0;
+
+  // Host-side control block and data buffer, each with its device-side alias.
+  int sendBytes = sizeof(struct ncclSendMem);
+  NCCLCHECK(ncclCudaHostAlloc((void**)&res->hostSendMem, (void**)&res->devHostSendMem, sendBytes));
+  int recvBytes = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+  NCCLCHECK(ncclCudaHostAlloc((void**)&res->hostRecvMem, (void**)&res->devHostRecvMem, recvBytes));
+
+  struct netInfo* peer = (struct netInfo*)peerOpaqueInfo;
+  const char* gdrTag = res->cudaSupport ? "/GDRDMA" : "";
+  const char* copyTag = (res->hostDevMem != NULL) ? "/GDCopy" : "";
+  INFO(INIT|NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peer->rank, self->rank, ncclNetName(), res->netDev, gdrTag, copyTag);
+
+  // Publish the listen handle through the connect info.
+  struct netConnectInfo* handleOut = (struct netConnectInfo*) connectInfo;
+  NCCLCHECK(ncclNetListen(res->netDev, &handleOut->netHandle, &res->netListenComm));
+  return ncclSuccess;
+}
+
+/* Publish the send-side device pointers in send->conn and connect to the
+ * remote peer using the handle found in connectInfo.
+ * Returns ncclSuccess or the error raised by ncclNetConnect. */
+ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+  // Setup device pointers
+  struct netSendResources* res = (struct netSendResources*)send->transportResources;
+  struct ncclRecvMem* hostMapped = res->devHostRecvMem;
+
+  // Data buffer: the CUDA buffer when the net device can read it directly,
+  // the host-mapped one otherwise.
+  if (res->cudaSupport) {
+    send->conn.buff = res->devNetMem->buff;
+  } else {
+    send->conn.buff = hostMapped->buff;
+  }
+  // We don't use devMem for llMode because the CPU has to read the data
+  send->conn.llBuff = hostMapped->llBuff;
+
+  send->conn.tail = &hostMapped->tail;
+  send->conn.opCount = &hostMapped->opCount;
+  send->conn.fifo = hostMapped->sizesFifo;
+  send->conn.llFifo = hostMapped->llSizesFifo;
+
+  if (res->hostDevMem == NULL) {
+    struct ncclSendMem* ackMem = res->devHostSendMem;
+    send->conn.head = &ackMem->head;
+    send->conn.llHead = &ackMem->llHead;
+  }
+
+  // Connect to remote peer
+  struct netConnectInfo* peerHandle = (struct netConnectInfo*)connectInfo;
+  NCCLCHECK(ncclNetConnect(res->netDev, peerHandle->netHandle, &res->netSendComm));
+  return ncclSuccess;
+}
+
+/* Connect to this peer: publish the receive-side device pointers in
+ * recv->conn, accept the incoming connection on the listening endpoint
+ * created by netRecvSetup, then close that endpoint.
+ * Returns ncclSuccess or the first error raised by the network layer. */
+ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+  // Setup device pointers
+  struct netRecvResources* res = (struct netRecvResources*)recv->transportResources;
+  struct ncclSendMem* ackMem = res->devHostSendMem;
+  struct ncclRecvMem* hostMapped = res->devHostRecvMem;
+
+  recv->conn.head = &ackMem->head;
+  recv->conn.llHead = &ackMem->llHead;
+
+  // With CUDA support the buffers already set on the connector are kept.
+  if (!res->cudaSupport) {
+    recv->conn.buff = hostMapped->buff;
+    recv->conn.llBuff = hostMapped->llBuff;
+  }
+
+  if (res->hostDevMem == NULL) {
+    recv->conn.tail = &hostMapped->tail;
+    recv->conn.opCount = &hostMapped->opCount;
+  }
+
+  // Finish connection establishment
+  NCCLCHECK(ncclNetAccept(res->netListenComm, &res->netRecvComm));
+  NCCLCHECK(ncclNetCloseListen(res->netListenComm));
+
+  return ncclSuccess;
+}
+
+/* Release everything allocated by netSendSetup / netSendConnect: both
+ * host-mapped blocks, the CUDA buffer when one was allocated, the network send
+ * connection and finally the resources structure itself.
+ * Returns ncclSuccess or the first error raised while releasing. */
+ncclResult_t netSendFree(void* transportResources) {
+  struct netSendResources* res = (struct netSendResources*)transportResources;
+  NCCLCHECK(ncclCudaHostFree(res->hostSendMem));
+  NCCLCHECK(ncclCudaHostFree(res->hostRecvMem));
+  // devNetMem only exists when GPU-direct reads were enabled.
+  if (res->cudaSupport) {
+    CUDACHECK(cudaFree(res->devNetMem));
+  }
+  NCCLCHECK(ncclNetCloseSend(res->netSendComm));
+  free(res);
+  return ncclSuccess;
+}
+
+/* Release everything allocated by netRecvSetup / netRecvConnect: both
+ * host-mapped blocks, the network receive connection and finally the resources
+ * structure itself.
+ * Returns ncclSuccess or the first error raised while releasing. */
+ncclResult_t netRecvFree(void* transportResources) {
+  struct netRecvResources* res = (struct netRecvResources*)transportResources;
+  NCCLCHECK(ncclCudaHostFree(res->hostSendMem));
+  NCCLCHECK(ncclCudaHostFree(res->hostRecvMem));
+  NCCLCHECK(ncclNetCloseRecv(res->netRecvComm));
+  free(res);
+  return ncclSuccess;
+}
+
+ncclResult_t netSendProxy(struct ncclProxyArgs* args) {  // Send-side proxy for one collective: posts each filled slot to the network and acknowledges it once the send completes.
+  struct ncclRing* ring = args->ring;
+  struct netSendResources* resources = (struct netSendResources*) (ring->send.transportResources);
+  const int llMode = args->llMode;
+
+  volatile uint64_t* prevTail = &resources->hostRecvMem->tail;  // polled in the non-LL path: slots below *prevTail are ready to send
+  struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem;
+  uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head;  // written with the count of completed sends (the ack)
+  struct ncclRecvMem* localMem = resources->cudaSupport ? resources->devNetMem : resources->hostRecvMem;
+  char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff;  // LL data always comes from the host-mapped buffer
+  int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
+  volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo;  // byte count per slot
+  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
+  int sliceSize = buffSize / args->substeps;  // bytes per slot
+
+  assert(args->substeps <= SIZES_FIFO_SIZE);
+
+  uint64_t head = llMode ? resources->llStep : 0ULL;  // next step whose send must complete
+  uint64_t tail = llMode ? resources->llStep : 0ULL;  // next step to post
+  uint64_t end = head + args->nsteps;
+
+  int idle = 0;  // consecutive loop iterations without progress
+  void* requests[args->substeps];  // one in-flight request handle per slot
+
+  if (!args->needProxy) goto nextColl;  // nothing to transfer: only the LL bookkeeping below runs
+
+  TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
+  TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
+
+  // Update in case we skipped some collectives
+  if (llMode == 0) resources->hostRecvMem->opCount = args->opCount;
+
+  while (head < end) {
+    idle++;
+    if (llMode) {
+      if (tail < end && tail < head + args->substeps) {  // at most 'substeps' sends in flight
+        int slot = tail%args->substeps;
+        int size = sizesFifo[slot];
+        if (size != 0) {  // 0 means the slot has not been filled yet
+          if (size == -1) size = 0;  // -1 encodes an empty message
+          uint32_t flag = tail + 1;  // flag value expected in every line of this step
+          int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+          size = nFifoLines * sizeof(union ncclLLFifoLine);  // round up to whole lines
+          union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+slot*sliceSize);
+          for (int i=0; i<nFifoLines; i++) {
+            volatile uint32_t *f1 = &lines[i].flag1;
+            volatile uint32_t *f2 = &lines[i].flag2;
+            while (f1[0] != flag || f2[0] != flag);  // busy-wait until both flags of the line carry this step's value
+          }
+          NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, ptrType, requests+slot));
+          sizesFifo[slot] = size;
+          tail++;
+          idle = 0;
+        }
+      }
+    } else while (tail < *prevTail) {
+        // Send through network
+        int slot = tail%args->substeps;
+        NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+slot*sliceSize, sizesFifo[slot], ptrType, requests+slot));
+        tail++;
+        idle = 0;
+      }
+    if (head < tail) {  // a send is outstanding: test the oldest one
+      int done;
+      int slot = head%args->substeps;
+      NCCLCHECK(ncclNetTest(requests[slot], &done, NULL));
+      if (done) {
+        if (llMode) {
+          sizesFifo[slot] = 0;
+          // Make sure size is reset to zero before we update the head.
+          __sync_synchronize();
+        }
+        head++;
+        *prevHead = head;  // acknowledge the completed step
+        idle = 0;
+      }
+    }
+    if (idle) transportProxyIdle(idle);  // no progress this iteration: hand the idle count to the back-off helper
+  }
+
+  // Reset
+  if (llMode == 0) *prevTail = 0;
+
+nextColl:
+  if (llMode) {
+    resources->llStep += args->nsteps;
+    // Don't forget to ack otherwise the GPU won't be able to push data.
+    *prevHead = resources->llStep;
+    if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {  // periodic cleaning of the LL buffer
+      memset(localBuff, 0, NCCL_LL_BUFF_SIZE);
+      resources->llStep += NCCL_LL_CHUNKS;  // extra steps accounted for the cleaning pass
+      *prevHead = resources->llStep;
+      resources->llLastCleaning = resources->llStep;
+    }
+  }
+  return ncclSuccess;
+}
+
+/* Receive-side proxy for one collective, run on the proxy thread.
+ * Posts network receives into the slots of the local buffer (at most
+ * args->substeps in flight), and on completion publishes the new tail so the
+ * consumer can read the data. In low-latency mode only the step counters are
+ * maintained here.
+ * Returns ncclSuccess or the first error reported by the network layer. */
+ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
+  struct ncclRing* ring = args->ring;
+  struct netRecvResources* resources = (struct netRecvResources*) (ring->recv.transportResources);
+  int llMode = args->llMode;
+
+  // Acknowledgement counter: slots below *nextHead + substeps may be reused.
+  volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head;
+  struct ncclRecvMem* localMem = resources->cudaSupport ? ring->devMemRecv : resources->hostRecvMem;
+  char* localBuff = llMode ? localMem->llBuff : localMem->buff;
+  char* nextBuff = (resources->cudaSupport == false && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
+  int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
+  uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail;
+
+  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
+  int sliceSize = buffSize / args->substeps;
+
+  // head: next step whose receive must complete; tail: next step to post.
+  uint64_t head = llMode ? resources->llStep : 0ULL;
+  uint64_t tail = llMode ? resources->llStep : 0ULL;
+  uint64_t end = head + args->nsteps;
+
+  int idle = 0;
+  void* requests[args->substeps];
+
+  // Nothing to receive for this collective: only do the LL bookkeeping.
+  if (!args->needProxy) goto nextColl;
+
+  TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
+  TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
+
+  if (llMode == 0) {
+    // Waiting for next opCount is only needed before writing nextTail.
+    uint64_t* nextOpCount = resources->hostDevMem ? &resources->hostDevMem->opCount : &resources->hostRecvMem->opCount;
+    transportProxyWait([=] { return *nextOpCount >= args->opCount; });
+  }
+
+  while (head < end) {
+    idle++;
+    if ((tail < head + args->substeps) && (tail < *nextHead + args->substeps) && (tail < end)) {
+      int slot = tail%args->substeps;
+      NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+slot*sliceSize, sliceSize, ptrType, requests+slot));
+      tail++;
+      idle = 0;
+    }
+    if (tail > head) {
+      int done;
+      int slot = head%args->substeps;
+      int size;
+      NCCLCHECK(ncclNetTest(requests[slot], &done, &size));
+      if (done) {
+        if (nextBuff) memcpy(nextBuff+slot*sliceSize, localBuff+slot*sliceSize, size);
+        head++;
+        if (llMode == 0) {
+          // Flush before publishing the tail, and propagate a failed flush
+          // like every other network call in this function.
+          if (ptrType == NCCL_PTR_CUDA) NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+slot*sliceSize, size));
+          *nextTail = head;
+        }
+        idle = 0;
+      }
+    }
+    if (idle) transportProxyIdle(idle);
+  }
+
+  // Wait for last ack and reset
+  if (llMode == 0) {
+    transportProxyWait([=] { return *nextHead == head; });
+    *nextHead = 0;
+  }
+
+nextColl:
+  if (llMode) {
+    resources->llStep += args->nsteps;
+    if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+      // Cleaning point: account for the extra steps and spin until the
+      // acknowledgement counter has caught up with them.
+      resources->llStep += NCCL_LL_CHUNKS;
+      while (*nextHead < resources->llStep);
+      resources->llLastCleaning = resources->llStep;
+    }
+  }
+  return ncclSuccess;
+}
+
+struct ncclTransport netTransport = {  // descriptor of the network transport; referenced as an extern from transport.cu
+  "NET",
+  netFillInfo,
+  netCanConnect,
+  netGetRings,
+  { netSendSetup, netSendConnect, netSendFree, netSendProxy },  // send side: setup, connect, free, proxy
+  { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy }  // receive side: setup, connect, free, proxy
+};
diff --git a/src/3rd_party/nccl/src/transport/net_ib.cu b/src/3rd_party/nccl/src/transport/net_ib.cu
new file mode 100644
index 00000000..7b1fc99f
--- /dev/null
+++ b/src/3rd_party/nccl/src/transport/net_ib.cu
@@ -0,0 +1,926 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "core.h"
+#include "socket.h"
+#include "net.h"
+#include "topo.h"
+#include "utils.h"
+#include "param.h"
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <poll.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "ibvwrap.h"
+
+// When non-zero, ncclIbIsend posts IBV_WR_RDMA_WRITE_WITH_IMM to the
+// address/rkey advertised by the receiver instead of a plain IBV_WR_SEND.
+#define USE_RDMA_WRITE 1
+// When non-zero, ncclIbAccept checks whether FIFO elements fit in the QP's
+// inline data and, if so, posts them with IBV_SEND_INLINE.
+#define USE_RDMA_SEND_INLINE 0
+// Size of the devName buffers below.
+#define MAXNAMESIZE 64
+// Name and address of the IP interface picked by initDevices(); the address
+// is what ncclIbListen publishes for the sideband socket connection.
+static char ncclIbIfName[MAX_IF_NAME_SIZE];
+static union socketAddress ncclIbIfAddr;
+// Number of valid entries in ncclIbDevs; -1 until initDevices() has run.
+static int ncclNIbDevs = -1;
+// One usable (device, port) pair found by initDevices().
+struct ncclIbDev {
+  int device;                 // index of the device in the ibv device list
+  uint8_t port;               // port number on that device
+  ibv_context* context;       // context opened on the device
+  char devName[MAXNAMESIZE];  // copy of the ibv device name
+};
+
+// NOTE(review): MAX_IB_PORT, struct userIbDev and userIbDevs are not
+// referenced anywhere in the code visible in this file section.
+#define MAX_IB_PORT 15
+struct userIbDev {
+  char devName[MAXNAMESIZE];
+  uint16_t port_en;
+};
+
+// Capacity of the device tables below.
+#define MAX_IB_DEVS 16
+struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
+struct userIbDev userIbDevs[MAX_IB_DEVS];
+// Serialises the one-time device discovery in initDevices().
+pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
+
+// Tunables read through ncclParam<Name>(). Presumably each is bound to the
+// environment variable NCCL_<string> with the last argument as default;
+// confirm against the NCCL_PARAM definition in param.h.
+NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0);   // GID index used for RoCE addressing
+NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14);     // qpAttr.timeout in ncclIbRtsQp
+NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);   // qpAttr.retry_cnt in ncclIbRtsQp
+NCCL_PARAM(IbSl, "IB_SL", 0);                // ah_attr.sl in ncclIbRtrQp
+NCCL_PARAM(IbTc, "IB_TC", 0);                // grh.traffic_class in ncclIbRtrQp
+
+// Allocate zero-initialised, page-aligned memory that may later be passed to
+// ibv_reg_mr. The allocation is rounded up to whole pages so it never shares
+// a page with unrelated data: registered pages will be marked DONTFORK, and a
+// shared page could then cause a crash in a child process.
+//
+// ptr  : receives the allocated block on success.
+// size : number of bytes requested (and zeroed).
+// Returns ncclSystemError if posix_memalign fails, ncclSuccess otherwise.
+static ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
+  size_t page_size = sysconf(_SC_PAGESIZE);
+  void* p;
+  // Keep the rounded size in size_t; storing it in an int would truncate
+  // large requests before they reach posix_memalign.
+  size_t size_aligned = ROUNDUP(size, page_size);
+  int ret = posix_memalign(&p, page_size, size_aligned);
+  if (ret != 0) return ncclSystemError;
+  memset(p, 0, size);
+  *ptr = p;
+  return ncclSuccess;
+}
+
+pthread_t ncclIbAsyncThread;
+// Thread body started by initDevices(): blocks on the asynchronous event
+// queue of the ibv_context passed in `args`, logs every event except
+// IBV_EVENT_COMM_EST and acknowledges it. The loop ends as soon as any of the
+// verbs wrappers reports a failure.
+static void* ncclIbAsyncThreadMain(void* args) {
+  struct ibv_context* ctx = (struct ibv_context*)args;
+  for (;;) {
+    struct ibv_async_event ev;
+    char* evName;
+    if (wrap_ibv_get_async_event(ctx, &ev) != ncclSuccess) break;
+    if (wrap_ibv_event_type_str(&evName, ev.event_type) != ncclSuccess) break;
+    if (ev.event_type != IBV_EVENT_COMM_EST) {
+      WARN("NET/IB : Got async event : %s", evName);
+    }
+    if (wrap_ibv_ack_async_event(&ev) != ncclSuccess) break;
+  }
+  return NULL;
+}
+
+// Enumerate the usable IB/RoCE ports and record them in ncclIbDevs, bumping
+// ncclNIbDevs for each one. Must be called with ncclIbLock held and
+// ncclNIbDevs already reset to 0. Every exit path returns normally so that
+// the caller can always release the lock.
+static void ncclIbProbeDevices() {
+  if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
+    WARN("NET/IB : No IP interface found.");
+    return;
+  }
+  INFO(INIT|NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
+
+  // Detect IB cards
+  int nIbDevs;
+  struct ibv_device** devices;
+
+  // Check if user defined which IB device:port to use
+  char* userIbEnv = getenv("NCCL_IB_HCA");
+  struct netIf userIfs[MAX_IB_DEVS];
+  bool searchNot = userIbEnv && userIbEnv[0] == '^';
+  int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
+
+  if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return;
+
+  bool stop = false;
+  for (int d=0; d<nIbDevs && !stop; d++) {
+    struct ibv_context * context;
+    if (ncclSuccess != wrap_ibv_open_device(&context, devices[d])) {
+      WARN("NET/IB : Unable to open device %s", devices[d]->name);
+      continue;
+    }
+    if (context == NULL) continue;
+
+    struct ibv_device_attr devAttr;
+    if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
+      WARN("NET/IB : Unable to query device %s", devices[d]->name);
+      // The device was opened but is unusable: close it instead of leaking it.
+      if (ncclSuccess != wrap_ibv_close_device(context)) stop = true;
+      continue;
+    }
+
+    int found = 0;
+    for (int port = 1; port <= devAttr.phys_port_cnt; port++) {
+      struct ibv_port_attr portAttr;
+      if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) {
+        WARN("NET/IB : Unable to query port %d", port);
+        continue;
+      }
+      if (portAttr.state != IBV_PORT_ACTIVE) continue;
+      if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
+          && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
+
+      // check against user specified HCAs/ports
+      if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
+        continue;
+      }
+      // Never write past the end of the fixed-size device table.
+      if (ncclNIbDevs >= MAX_IB_DEVS) {
+        WARN("NET/IB : More than %d usable ports found, ignoring the remaining ones", MAX_IB_DEVS);
+        stop = true;
+        break;
+      }
+      INFO(INIT|NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
+          portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+      ncclIbDevs[ncclNIbDevs].device = d;
+      ncclIbDevs[ncclNIbDevs].port = port;
+      ncclIbDevs[ncclNIbDevs].context = context;
+      // strncpy does not terminate when the source fills the buffer.
+      strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE-1);
+      ncclIbDevs[ncclNIbDevs].devName[MAXNAMESIZE-1] = '\0';
+      ncclNIbDevs++;
+      found++;
+      pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
+    }
+
+    // No port of this device is used: release the context. A failure to
+    // close aborts the scan, but the device list is still freed below.
+    if (found == 0 && ncclSuccess != wrap_ibv_close_device(context)) stop = true;
+  }
+  if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return; }
+}
+
+// One-time discovery of the IB devices. Does nothing if the ibverbs symbols
+// cannot be loaded (ncclNIbDevs then stays at -1) or if discovery already
+// ran. The lock is released on every path, including discovery failures, so
+// that a later caller can never block forever on ncclIbLock.
+static void initDevices() {
+  if(wrap_ibv_symbols() != ncclSuccess) { return; }
+  if (ncclNIbDevs == -1) {
+    pthread_mutex_lock(&ncclIbLock);
+    wrap_ibv_fork_init();
+    if (ncclNIbDevs == -1) {
+      ncclNIbDevs = 0;
+      ncclIbProbeDevices();
+    }
+    pthread_mutex_unlock(&ncclIbLock);
+  }
+}
+
+// Report the discovered IB ports and a per-port score for the current CUDA
+// device.
+//
+// ndev   : receives ncclNIbDevs.
+// scores : receives a newly allocated array with one entry per port, equal to
+//          1+PATH_SOC-distance, where distance is the PCI distance between the
+//          port and the GPU (PATH_SOC when either path cannot be resolved).
+// Returns the ncclCalloc error if the score array cannot be allocated,
+// ncclSuccess otherwise.
+ncclResult_t ncclIbDevices(int* ndev, int** scores) {
+  initDevices();
+  *ndev = ncclNIbDevs;
+  int cudaDev;
+  cudaGetDevice(&cudaDev);
+  // Allocate first: if this fails there is nothing else to release yet.
+  int* sc;
+  NCCLCHECK(ncclCalloc(&sc, ncclNIbDevs));
+  char* cudaPath;
+  ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath);
+  char line[1024];
+  snprintf(line, sizeof(line), "CUDA Dev %d, IB Ports : ", cudaDev);
+  for (int d=0; d<ncclNIbDevs; d++) {
+    char* mlxPath;
+    ncclResult_t err2 = getMlxPath(ncclIbDevs[d].devName, &mlxPath);
+    int distance = (err1 != ncclSuccess || err2 != ncclSuccess || mlxPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(mlxPath, cudaPath);
+    // Bounded append: with many ports and long names the summary can exceed
+    // the buffer, in which case it is truncated instead of overflowing.
+    size_t used = strlen(line);
+    snprintf(line+used, sizeof(line)-used, "%s/%d(%s) ", ncclIbDevs[d].devName, ncclIbDevs[d].port, pathDists[distance]);
+    sc[d] = 1+PATH_SOC-distance;
+    if (err2 == ncclSuccess) free(mlxPath);
+  }
+  INFO(INIT|NET,"%s", line);
+  if (err1 == ncclSuccess) free(cudaPath);
+  *scores = sc;
+  return ncclSuccess;
+}
+
+// Probe whether GPU Direct RDMA works between NIC `ibDev` and the current
+// CUDA device by trying to register a small piece of GPU memory.
+// Returns :
+// ncclSuccess : the registration worked, GDR is usable
+// ncclSystemError : the nv_mem peer-memory entry is missing in sysfs, or the
+//                   GPU allocation / PD allocation / registration failed
+ncclResult_t ncclIbGdrSupport(int ibDev) {
+  // The sysfs probe is done once and cached for subsequent calls.
+  static int moduleLoaded = -1;
+  if (moduleLoaded == -1) {
+    moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
+  }
+  if (moduleLoaded == 0) return ncclSystemError;
+
+  void* gpuPtr;
+  if (cudaMalloc(&gpuPtr, sizeof(int)) != cudaSuccess) return ncclSystemError;
+
+  ncclResult_t result = ncclSystemError;
+  struct ibv_pd* pd;
+  if (wrap_ibv_alloc_pd(&pd, ncclIbDevs[ibDev].context) == ncclSuccess) {
+    struct ibv_mr* mr = wrap_direct_ibv_reg_mr(pd, gpuPtr, sizeof(int), IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ);
+    if (mr != NULL) {
+      result = ncclSuccess;
+      wrap_ibv_dereg_mr(mr);
+    }
+    wrap_ibv_dealloc_pd(pd);
+  }
+  cudaFree(gpuPtr);
+  return result;
+}
+
+NCCL_PARAM(IbGdrLevel, "IB_GDR_LEVEL", -2);
+NCCL_PARAM(IbCudaSupport, "IB_CUDA_SUPPORT", -2);
+
+// Report which pointer types NIC `dev` can send from / receive into.
+//
+// Host pointers are always supported. NCCL_PTR_CUDA is added only when GPU
+// Direct RDMA works on this NIC and the PCI distance between NIC and GPU is
+// below the allowed level. That level defaults to PATH_PHB and can be
+// overridden by the IbCudaSupport / IbGdrLevel parameters (-2 means unset).
+// Always returns ncclSuccess; failures only reduce the supported types.
+ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
+  initDevices();
+  *supportedTypes = NCCL_PTR_HOST;
+
+  int cudaDev;
+  if (cudaGetDevice(&cudaDev) != cudaSuccess) return ncclSuccess;
+
+  int ibGdrLevel = PATH_PHB;
+  if (ncclParamIbCudaSupport() != -2) ibGdrLevel = ncclParamIbCudaSupport() ? PATH_SOC + 1 : 0;
+  if (ncclParamIbGdrLevel() != -2) ibGdrLevel = ncclParamIbGdrLevel();
+  if (ibGdrLevel > 0) {
+    // ncclIbGdrSupport returns an ncclResult_t (ncclSuccess or
+    // ncclSystemError), not a reason code, so the two failure causes cannot
+    // be told apart here and the message names both.
+    if (ncclIbGdrSupport(dev) != ncclSuccess) {
+      INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (no module or not supported by GPU)", cudaDev, ncclIbDevs[dev].devName);
+      ibGdrLevel = 0;
+    }
+  }
+
+  if (ibGdrLevel <= 0) return ncclSuccess;
+
+  char* cudaPath;
+  if (getCudaPath(cudaDev, &cudaPath) != ncclSuccess) return ncclSuccess;
+  char* mlxPath;
+  if (getMlxPath(ncclIbDevs[dev].devName, &mlxPath) != ncclSuccess) { free(cudaPath); return ncclSuccess; }
+  int distance = (mlxPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(mlxPath, cudaPath);
+  free(mlxPath); free(cudaPath);
+  if (distance < ibGdrLevel) {
+    *supportedTypes |= NCCL_PTR_CUDA;
+  } else {
+    INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (distance %d >= %d)", cudaDev, ncclIbDevs[dev].devName, distance, ibGdrLevel);
+  }
+  return ncclSuccess;
+}
+
+// Copy the sideband interface address into *addr, running device discovery
+// first if it has not happened yet. Always returns ncclSuccess.
+static ncclResult_t GetSocketAddr(union socketAddress* addr) {
+  if (ncclNIbDevs == -1) {
+    initDevices();
+  }
+  memcpy(addr, &ncclIbIfAddr, sizeof(union socketAddress));
+  return ncclSuccess;
+}
+
+// Size of the request tables, MR cache, FIFOs and completion/work queues.
+#define MAX_REQUESTS 128
+
+// Queue-pair description exchanged over the sideband socket between
+// ncclIbConnect and ncclIbAccept, and consumed by ncclIbRtrQp.
+struct ncclIbQpInfo {
+  uint32_t lid;      // port LID; 0 makes ncclIbRtrQp use the GID fields below
+  uint8_t ib_port;
+  uint32_t qpn;      // queue pair number
+
+  // For RoCE
+  uint64_t spn;      // GID subnet prefix
+  uint64_t iid;      // GID interface id
+  enum ibv_mtu mtu;
+
+  // FIFO RDMA info
+  uint32_t fifoRkey; // rkey of the sender's registered fifo
+  uint64_t fifoAddr; // address of the sender's fifo
+};
+
+// Connection handle filled by ncclIbListen and used by ncclIbConnect.
+struct ncclIbHandle {
+  union socketAddress connectAddr;
+};
+
+// Entry of the memory-registration cache.
+struct ncclIbMr {
+  struct ibv_mr* mr;
+  int refcnt;        // incremented by ncclIbGetMr, decremented on completion in ncclIbTest
+};
+
+// Verbs resources owned by one send or receive comm.
+struct ncclIbVerbs {
+  struct ibv_pd* pd;
+  struct ibv_cq* cq;
+  struct ncclIbMr mrPool[MAX_REQUESTS]; // MR cache searched by ncclIbGetMr
+  int mrRotation;    // position where ncclIbGetMr starts looking for a free slot
+};
+
+// State of one outstanding operation; slots are handed out by ncclIbGetRequest.
+struct ncclIbRequest {
+  int used;          // 1 while the slot is allocated
+  int type;          // `type` argument given to ncclIbIsend / ncclIbIrecv
+  struct ncclIbVerbs* verbs;
+  struct ncclIbMr * ibMr; // MR held by this request, or NULL
+  int done;          // set to 1 by ncclIbTest when the completion is seen
+  int size;
+  int free;          // 1 = internal request, slot released as soon as it completes
+};
+
+// State kept between ncclIbListen and ncclIbAccept.
+struct ncclIbListenComm {
+  int dev;
+  int fd;            // listening socket
+};
+
+// One element of the FIFO through which the receiver advertises a posted
+// receive (written by ncclIbPostFifo, consumed by ncclIbIsend).
+struct ncclIbSendFifo {
+  uint64_t addr;
+  int      size;
+  uint32_t seq;
+  uint32_t rkey;
+  uint32_t ready;    // non-zero once the element has been filled
+};
+
+// Sender-side connection state, allocated by ncclIbConnect.
+struct ncclIbSendComm {
+  struct ncclIbSendFifo fifo[MAX_REQUESTS]; // registered in ncclIbConnect; its address/rkey are sent to the receiver
+  struct ncclIbRequest reqs[MAX_REQUESTS];
+  uint32_t fifoHead;       // sequence number of the next fifo element to consume
+  int fd;                  // sideband socket
+  int ready;               // set by ncclSendCheck once the QP handshake is done
+  struct ncclIbVerbs verbs;
+  struct ibv_qp* qp;
+  struct ibv_mr* fifoMr;   // registration of `fifo`
+};
+
+// Resources for the loopback RDMA read issued by ncclIbFlush.
+struct ncclIbGpuFlush {
+  int enabled;
+  int hostMem;             // host word used as the local buffer of the read
+  struct ibv_mr* hostMr;   // registration of hostMem
+  struct ibv_sge sge;
+  struct ibv_qp* qp;       // QP connected to itself in ncclIbAccept
+};
+
+// Receiver-side view of the sender's fifo.
+struct ncclIbRemFifo {
+  struct ncclIbSendFifo elems[MAX_REQUESTS]; // local staging copies, registered as `mr`
+  uint64_t addr;           // remote fifo address
+  uint32_t rkey;           // remote fifo rkey
+  uint32_t tail;           // sequence number of the next element to post
+  uint32_t flags;          // extra send flags (IBV_SEND_INLINE when enabled)
+  struct ibv_mr* mr;
+  struct ibv_sge sge;
+};
+
+// Receiver-side connection state, allocated by ncclIbAccept.
+struct ncclIbRecvComm {
+  struct ncclIbRemFifo remFifo;
+  struct ncclIbRequest reqs[MAX_REQUESTS];
+  int fd;                  // sideband socket
+  int ready;               // set by ncclRecvCheck once the sender's "go" arrived
+  struct ncclIbVerbs verbs;
+  struct ibv_qp* qp;
+  struct ncclIbGpuFlush gpuFlush;
+};
+
+// Create the protection domain and the completion queue (MAX_REQUESTS
+// entries) of `verbs` on context `ctx`.
+// Returns the failing wrapper's error code, or ncclSuccess. If the CQ cannot
+// be created the PD allocated just before is released again so that a failed
+// init does not leak it.
+ncclResult_t ncclIbInitVerbs(ibv_context* ctx, struct ncclIbVerbs* verbs) {
+  NCCLCHECK(wrap_ibv_alloc_pd(&verbs->pd, ctx));
+  ncclResult_t res = wrap_ibv_create_cq(&verbs->cq, ctx, MAX_REQUESTS, NULL, NULL, 0);
+  if (res != ncclSuccess) {
+    wrap_ibv_dealloc_pd(verbs->pd);
+    verbs->pd = NULL;
+    return res;
+  }
+  return ncclSuccess;
+}
+
+// Destroy the completion queue and then the protection domain of `verbs`.
+// Returns early with the wrapper's error if the CQ cannot be destroyed, in
+// which case the PD is left allocated.
+ncclResult_t ncclIbDestroyVerbs(struct ncclIbVerbs* verbs) {
+  NCCLCHECK(wrap_ibv_destroy_cq(verbs->cq));
+  NCCLCHECK(wrap_ibv_dealloc_pd(verbs->pd));
+  return ncclSuccess;
+}
+
+// Create a reliable-connected QP on `verbs` (its CQ serves both send and
+// receive completions) and move it to the INIT state on port `ib_port` with
+// the given remote access flags. The new QP is returned through *qp.
+ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int access_flags, struct ibv_qp** qp) {
+  // Creation attributes: one SGE per work request, no inline data.
+  struct ibv_qp_init_attr createAttr;
+  memset(&createAttr, 0, sizeof(createAttr));
+  createAttr.qp_type = IBV_QPT_RC;
+  createAttr.send_cq = verbs->cq;
+  createAttr.recv_cq = verbs->cq;
+  createAttr.cap.max_send_wr = MAX_REQUESTS;
+  createAttr.cap.max_send_sge = 1;
+  createAttr.cap.max_recv_wr = MAX_REQUESTS;
+  createAttr.cap.max_recv_sge = 1;
+  createAttr.cap.max_inline_data = 0;
+  NCCLCHECK(wrap_ibv_create_qp(qp, verbs->pd, &createAttr));
+
+  // RESET -> INIT transition.
+  struct ibv_qp_attr initAttr;
+  memset(&initAttr, 0, sizeof(initAttr));
+  initAttr.qp_state = IBV_QPS_INIT;
+  initAttr.pkey_index = 0;
+  initAttr.port_num = ib_port;
+  initAttr.qp_access_flags = access_flags;
+  const int initMask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS;
+  NCCLCHECK(wrap_ibv_modify_qp(*qp, &initAttr, initMask));
+  return ncclSuccess;
+}
+
+// Move `qp` to the ready-to-receive state using the peer description in
+// `info`. A non-zero LID selects LID addressing; a zero LID selects global
+// (GID) addressing built from info->spn / info->iid and the GID-index and
+// traffic-class parameters.
+ncclResult_t ncclIbRtrQp(ibv_qp* qp, struct ncclIbQpInfo* info) {
+  struct ibv_qp_attr attr;
+  memset(&attr, 0, sizeof(attr));
+  attr.qp_state = IBV_QPS_RTR;
+  attr.path_mtu = info->mtu;
+  attr.dest_qp_num = info->qpn;
+  attr.rq_psn = 0;
+  attr.max_dest_rd_atomic = 1;
+  attr.min_rnr_timer = 12;
+
+  // Address handle
+  if (info->lid != 0) {
+    attr.ah_attr.is_global = 0;
+    attr.ah_attr.dlid = info->lid;
+  } else {
+    attr.ah_attr.is_global = 1;
+    attr.ah_attr.grh.dgid.global.subnet_prefix = info->spn;
+    attr.ah_attr.grh.dgid.global.interface_id = info->iid;
+    attr.ah_attr.grh.flow_label = 0;
+    attr.ah_attr.grh.sgid_index = ncclParamIbGidIndex();
+    attr.ah_attr.grh.hop_limit = 255;
+    attr.ah_attr.grh.traffic_class = ncclParamIbTc();
+  }
+  attr.ah_attr.sl = ncclParamIbSl();
+  attr.ah_attr.src_path_bits = 0;
+  attr.ah_attr.port_num = info->ib_port;
+
+  const int rtrMask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER;
+  NCCLCHECK(wrap_ibv_modify_qp(qp, &attr, rtrMask));
+  return ncclSuccess;
+}
+
+// Move `qp` to the ready-to-send state. Timeout and retry count come from the
+// IbTimeout / IbRetryCnt parameters; the RNR retry count is fixed at 7.
+ncclResult_t ncclIbRtsQp(ibv_qp* qp) {
+  struct ibv_qp_attr attr;
+  memset(&attr, 0, sizeof(attr));
+  attr.qp_state = IBV_QPS_RTS;
+  attr.sq_psn = 0;
+  attr.max_rd_atomic = 1;
+  attr.rnr_retry = 7;
+  attr.retry_cnt = ncclParamIbRetryCnt();
+  attr.timeout = ncclParamIbTimeout();
+  const int rtsMask = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC;
+  NCCLCHECK(wrap_ibv_modify_qp(qp, &attr, rtsMask));
+  return ncclSuccess;
+}
+
+
+// Open a listening sideband socket for device `dev`.
+//
+// opaqueHandle : caller-provided buffer (at most NCCL_NET_HANDLE_MAXSIZE
+//                bytes) that receives the address peers must connect to.
+// listenComm   : receives the newly allocated listen comm on success.
+// Returns the first error encountered, or ncclSuccess. The comm is freed on
+// failure so that an unsuccessful listen does not leak it.
+ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
+  struct ncclIbListenComm* comm;
+  NCCLCHECK(ncclCalloc(&comm, 1));
+  struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
+  static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large");
+  comm->dev = dev;
+  ncclResult_t res = GetSocketAddr(&(handle->connectAddr));
+  if (res == ncclSuccess) res = createListenSocket(&comm->fd, &handle->connectAddr);
+  if (res != ncclSuccess) {
+    free(comm);
+    return res;
+  }
+  *listenComm = comm;
+  return ncclSuccess;
+}
+
+// Sender side of connection establishment: connect the sideband socket to the
+// address in `opaqueHandle`, create the verbs resources and QP on device
+// `dev`, register the local fifo and send the QP description to the receiver.
+// The QP is only moved to RTR/RTS later, in ncclSendCheck, once the
+// receiver's description has arrived.
+//
+// sendComm : receives the newly allocated send comm.
+// Returns the first error encountered, or ncclSuccess.
+ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
+  struct ncclIbSendComm* comm;
+  NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
+
+  struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
+  NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));
+  *sendComm = comm;
+
+  // IB Setup
+  initDevices(); /*NOTE: We need to do this for ncclNet unit test that bypasses nccl initialization*/
+  ibv_context* ctx = ncclIbDevs[dev].context;
+  NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs));
+  uint8_t ib_port = ncclIbDevs[dev].port;
+  NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, &comm->qp));
+
+  // Send my QP Info to receiver through the socket. Hope this won't block.
+  struct ibv_port_attr portAttr;
+  NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr));
+  struct ncclIbQpInfo qpInfo;
+  // The whole struct is sent over the socket: clear it first so that fields
+  // not set on this path (spn/iid for IB) and padding bytes do not carry
+  // uninitialised stack contents to the peer.
+  memset(&qpInfo, 0, sizeof(qpInfo));
+  qpInfo.ib_port = ib_port;
+  qpInfo.qpn = comm->qp->qp_num;
+  qpInfo.mtu = portAttr.active_mtu;
+
+  // Prepare my fifo
+  NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+  qpInfo.fifoRkey = comm->fifoMr->rkey;
+  qpInfo.fifoAddr = (uint64_t)comm->fifo;
+
+  // RoCE support
+  qpInfo.lid = portAttr.lid;
+  if (qpInfo.lid) { // IB
+    INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
+  } else { // RoCE
+    union ibv_gid gid;
+    NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
+    qpInfo.spn = gid.global.subnet_prefix;
+    qpInfo.iid = gid.global.interface_id;
+    INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
+  }
+
+  NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo)));
+  return ncclSuccess;
+}
+
+// When non-zero, the GPU Direct RDMA flush QP is never set up.
+NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
+
+// Receiver side of connection establishment: accept the sideband connection,
+// read the sender's QP description, create and fully connect the local QP
+// (RTR then RTS), register the local staging copy of the sender's fifo,
+// optionally set up the loopback QP used by ncclIbFlush, and finally send the
+// local QP description back to the sender.
+//
+// listenComm : comm returned by ncclIbListen.
+// recvComm   : receives the newly allocated receive comm on success.
+// Returns the first error encountered, or ncclSuccess.
+// NOTE(review): rComm is not released on the error paths.
+ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
+  struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
+  struct ncclIbRecvComm* rComm;
+  NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
+
+  struct sockaddr_in sockaddr;
+  socklen_t socklen = sizeof(struct sockaddr_in);
+  SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
+  struct ncclIbQpInfo remQpInfo;
+  NCCLCHECK(socketReceive(rComm->fd, &remQpInfo, sizeof(remQpInfo)));
+
+  // IB setup
+  ibv_context* ctx = ncclIbDevs[lComm->dev].context;
+  uint8_t ib_port = ncclIbDevs[lComm->dev].port;
+  struct ibv_port_attr portAttr;
+  NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr));
+  union ibv_gid gid;
+  NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
+
+  // QP Creation
+  NCCLCHECK(ncclIbInitVerbs(ctx, &rComm->verbs));
+  NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_REMOTE_WRITE, &rComm->qp));
+
+  // Adjust the MTU
+  // Use the smaller of the two ports' active MTUs; the result is also what
+  // gets reported back to the sender below.
+  remQpInfo.mtu = (enum ibv_mtu)std::min(remQpInfo.mtu, portAttr.active_mtu);
+
+  // Setup QP
+  struct ibv_qp* qp = rComm->qp;
+  NCCLCHECK(ncclIbRtrQp(qp, &remQpInfo));
+  NCCLCHECK(ncclIbRtsQp(qp));
+
+  // Retain remote fifo info and prepare my RDMA ops
+  rComm->remFifo.rkey = remQpInfo.fifoRkey;
+  rComm->remFifo.addr = remQpInfo.fifoAddr;
+  NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
+  // sge.addr is filled per element in ncclIbPostFifo.
+  rComm->remFifo.sge.length = sizeof(struct ncclIbSendFifo);
+  rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey;
+
+#if USE_RDMA_SEND_INLINE
+  // Determine whether the remFifo element data can be sent INLINE
+  struct ibv_qp_attr attr;
+  struct ibv_qp_init_attr init_attr;
+  NCCLCHECK(wrap_ibv_query_qp(qp, &attr, IBV_QP_CAP, &init_attr));
+  if (init_attr.cap.max_inline_data >= rComm->remFifo.sge.length) rComm->remFifo.flags = IBV_SEND_INLINE;
+#endif
+
+  // Allocate Flush dummy buffer for GPU Direct RDMA
+  // The flush path is enabled only when GDR works on this NIC and it has not
+  // been disabled through the IbGdrFlushDisable parameter.
+  rComm->gpuFlush.enabled = (ncclIbGdrSupport(lComm->dev) == 0) && (ncclParamIbGdrFlushDisable() == 0) ? 1 : 0;
+  if (rComm->gpuFlush.enabled) {
+    NCCLCHECK(wrap_ibv_reg_mr(&rComm->gpuFlush.hostMr, rComm->verbs.pd, &rComm->gpuFlush.hostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE));
+    rComm->gpuFlush.sge.addr = (uint64_t)&rComm->gpuFlush.hostMem;
+    rComm->gpuFlush.sge.length = 1;
+    rComm->gpuFlush.sge.lkey = rComm->gpuFlush.hostMr->lkey;
+    NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->gpuFlush.qp));
+    // The flush QP is connected to itself: the "remote" description passed
+    // to ncclIbRtrQp is this QP's own port, LID/GID and QP number.
+    struct ncclIbQpInfo localQpInfo = {
+      .lid=portAttr.lid,
+      .ib_port=ib_port,
+      .qpn=rComm->gpuFlush.qp->qp_num,
+      .spn=gid.global.subnet_prefix,
+      .iid=gid.global.interface_id,
+      .mtu=portAttr.active_mtu
+    };
+    NCCLCHECK(ncclIbRtrQp(rComm->gpuFlush.qp, &localQpInfo));
+    NCCLCHECK(ncclIbRtsQp(rComm->gpuFlush.qp));
+  }
+
+  // Fill Handle
+  // Description of the data QP sent back to the sender, which consumes it in
+  // ncclSendCheck.
+  struct ncclIbQpInfo qpInfo = {
+    .lid=portAttr.lid,
+    .ib_port=ib_port,
+    .qpn=qp->qp_num,
+    .spn=gid.global.subnet_prefix,
+    .iid=gid.global.interface_id,
+    .mtu=remQpInfo.mtu
+  };
+
+  NCCLCHECK(socketSend(rComm->fd, &qpInfo, sizeof(qpInfo)));
+  *recvComm = rComm;
+  return ncclSuccess;
+}
+
+// Hand out the first unused slot of the request table `reqs` (MAX_REQUESTS
+// entries), reset to its initial state and marked as used.
+// On success *req points to the slot and ncclSuccess is returned; when every
+// slot is busy *req is set to NULL and ncclInternalError is returned.
+ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest** req) {
+  struct ncclIbRequest* const end = reqs + MAX_REQUESTS;
+  for (struct ncclIbRequest* slot = reqs; slot != end; slot++) {
+    if (slot->used != 0) continue;
+    slot->used = 1;
+    slot->type = 0;
+    slot->verbs = NULL;
+    slot->ibMr = NULL;
+    slot->done = 0;
+    slot->size = -1;
+    slot->free = 0;
+    *req = slot;
+    return ncclSuccess;
+  }
+  WARN("NET/IB : unable to allocate requests");
+  *req = NULL;
+  return ncclInternalError;
+}
+
+// Finish the sender's half of the handshake on first use: read the
+// receiver's QP description from the socket, move the QP to RTR then RTS,
+// and send an integer "go" back. Later calls return immediately.
+ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
+  if (comm->ready != 0) return ncclSuccess;
+
+  struct ncclIbQpInfo peerInfo;
+  NCCLCHECK(socketReceive(comm->fd, &peerInfo, sizeof(peerInfo)));
+  NCCLCHECK(ncclIbRtrQp(comm->qp, &peerInfo));
+  NCCLCHECK(ncclIbRtsQp(comm->qp));
+  int go = 1;
+  NCCLCHECK(socketSend(comm->fd, &go, sizeof(go)));
+  comm->ready = 1;
+  return ncclSuccess;
+}
+
+// Finish the receiver's half of the handshake on first use by reading the
+// sender's integer "go" from the socket. Later calls return immediately.
+ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) {
+  if (comm->ready != 0) return ncclSuccess;
+
+  int go;
+  NCCLCHECK(socketReceive(comm->fd, &go, sizeof(go)));
+  comm->ready = 1;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclIbTest(void* request, int* done, int* size);
+
+// Registrations are made on REG_ALIGN-aligned, REG_ALIGN-multiple ranges.
+#define REG_ALIGN (4096)
+
+// Return a memory registration covering [data, data+size), taking a
+// reference on it. Previous MRs are cached to avoid registering and
+// unregistering for each Isend/Irecv: an existing registration that covers
+// the range is reused; otherwise an unreferenced cache slot is recycled
+// (its old MR deregistered) and the aligned range is registered there.
+//
+// verbs : verbs state owning the MR cache.
+// size  : must be > 0.
+// mrRet : receives the cache entry; its refcnt has been incremented.
+// Returns ncclInternalError when every cache slot is still referenced, the
+// wrapper's error if (de)registration fails, ncclSuccess otherwise.
+ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct ncclIbMr** mrRet) {
+  uint64_t addr = (uint64_t)data;
+  assert(size > 0);
+
+  // Look for an already existing MR
+  for (int i=0; i<MAX_REQUESTS; i++) {
+    if (verbs->mrPool[i].mr == NULL) continue;
+    uint64_t regAddr = (uint64_t)verbs->mrPool[i].mr->addr;
+    uint64_t regSize = (uint64_t)verbs->mrPool[i].mr->length;
+    if (regAddr <= addr && addr+size <= regAddr+regSize) {
+      *mrRet = verbs->mrPool+i;
+      verbs->mrPool[i].refcnt++;
+      return ncclSuccess;
+    }
+  }
+
+  // Find an unused element, starting from a rotating position. The rotation
+  // counter is kept within [0, MAX_REQUESTS) and every candidate index is
+  // reduced modulo MAX_REQUESTS before use, so the pool is never indexed out
+  // of bounds, even when all slots are busy.
+  int start = verbs->mrRotation % MAX_REQUESTS;
+  verbs->mrRotation = (start + 1) % MAX_REQUESTS;
+  int elem = -1;
+  for (int i=0; i<MAX_REQUESTS; i++) {
+    int candidate = (start + i) % MAX_REQUESTS;
+    if (verbs->mrPool[candidate].refcnt > 0) continue;
+    elem = candidate;
+    break;
+  }
+  if (elem == -1) {
+    WARN("NET/IB : memory register : no MR available");
+    return ncclInternalError;
+  }
+
+  assert(elem < MAX_REQUESTS);
+  assert(verbs->mrPool[elem].refcnt == 0);
+
+  // Deregister / register
+  uint64_t regAddr = addr & (~(REG_ALIGN-1));
+  uint64_t regSize = addr+size - regAddr;
+  regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN;
+  if (verbs->mrPool[elem].mr) {
+    NCCLCHECK(wrap_ibv_dereg_mr(verbs->mrPool[elem].mr));
+    // The old MR is gone: do not keep a stale pointer in the cache in case
+    // the registration below fails without updating it.
+    verbs->mrPool[elem].mr = NULL;
+  }
+  NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+  *mrRet = verbs->mrPool+elem;
+  verbs->mrPool[elem].refcnt++;
+  TRACE(INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
+  return ncclSuccess;
+}
+
+// Post a send of `size` bytes from `data` on `sendComm`.
+//
+// The call completes the QP handshake on first use, allocates a request,
+// takes a reference on an MR covering the buffer (unless size is 0), then
+// waits until the receiver has advertised a matching receive in the local
+// fifo before posting the work request. With USE_RDMA_WRITE the data is
+// written straight to the advertised address/rkey and the size travels as
+// immediate data.
+//
+// request : receives the request handle to pass to ncclIbTest.
+// Returns ncclInternalError on a size/sequence mismatch with the advertised
+// receive, the first wrapper error otherwise, or ncclSuccess.
+// NOTE(review): the request slot and the MR reference are not released on
+// the error returns below.
+ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** request) {
+  struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
+  NCCLCHECK(ncclSendCheck(comm));
+
+  struct ncclIbRequest* req;
+  NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
+  req->type = type;
+  req->verbs = &comm->verbs;
+  req->size = size;
+
+  struct ibv_send_wr wr;
+  memset(&wr, 0, sizeof(wr));
+  // The request pointer comes back in the completion as wc.wr_id.
+  wr.wr_id = (uint64_t)req;
+
+  struct ibv_sge sge;
+  if (size == 0) {
+    wr.sg_list = NULL;
+    wr.num_sge = 0;
+  } else {
+    NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
+    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+    wr.sg_list = &sge;
+    wr.num_sge = 1;
+  }
+  wr.opcode = IBV_WR_SEND;
+  wr.send_flags = IBV_SEND_SIGNALED;
+
+  // Wait for receiver to have posted the recv
+  // The fifo element is written remotely by the receiver, hence the volatile
+  // accesses; this loop blocks (yielding the CPU) until `ready` is set.
+  volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS);
+  volatile uint32_t * readyPtr = &slot->ready;
+  while (*readyPtr == 0) sched_yield();
+#if USE_RDMA_WRITE
+  __sync_synchronize(); // order the readyPtr load against rkey load below
+  // Sanity checks to catch user collective call count/size mismatches
+  // plus any potential programming errors
+  if (size > slot->size || slot->size <= 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) {
+    WARN("NET/IB : collective mismatch error local size %d remote %d addr %lx rkey %x seq %x/%x",
+        size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead);
+    return ncclInternalError;
+  }
+  wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+  wr.wr.rdma.remote_addr = slot->addr;
+  wr.wr.rdma.rkey = slot->rkey;
+  wr.imm_data = size; // Send the message size via imm_data
+  // Make sure the fifo fields have been read before they are cleared below.
+  __sync_synchronize();
+#endif
+  // We must clear slot->ready, but reset other fields to aid
+  // debugging and sanity checks
+  slot->ready = 0;
+  slot->addr = 0ULL;
+  slot->rkey = slot->size = slot->seq = 0;
+  comm->fifoHead++;
+
+  struct ibv_send_wr* bad_wr;
+  NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
+  *request = req;
+  return ncclSuccess;
+}
+
+// Advertise a posted receive to the sender: fill the next local fifo element
+// with the buffer's rkey/address/size and RDMA-write it into the matching
+// slot of the sender's fifo. The request used for the write is internal and
+// is released as soon as its completion is seen.
+ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size) {
+  struct ncclIbRequest* req;
+  NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
+  req->verbs = &comm->verbs;
+  req->free = 1; // Not a user req ; free as soon as it is complete.
+
+  // Stage the element locally.
+  const uint32_t tail = comm->remFifo.tail;
+  const uint32_t slotIdx = tail % MAX_REQUESTS;
+  struct ncclIbSendFifo* localElem = comm->remFifo.elems + slotIdx;
+  localElem->addr = addr;
+  localElem->rkey = rkey;
+  localElem->ready = 1;
+  localElem->size = size; // Sanity/Debugging
+  localElem->seq = tail; // Sanity/Debugging
+  comm->remFifo.sge.addr = (uint64_t)localElem;
+
+  // Write it to the same slot index on the sender side.
+  struct ibv_send_wr wr;
+  memset(&wr, 0, sizeof(wr));
+  wr.wr_id = (uint64_t)req;
+  wr.opcode = IBV_WR_RDMA_WRITE;
+  wr.send_flags = IBV_SEND_SIGNALED | comm->remFifo.flags; // IBV_SEND_INLINE
+  wr.sg_list = &comm->remFifo.sge;
+  wr.num_sge = 1;
+  wr.wr.rdma.remote_addr = comm->remFifo.addr + slotIdx * sizeof(struct ncclIbSendFifo);
+  wr.wr.rdma.rkey = comm->remFifo.rkey;
+
+  struct ibv_send_wr* bad_wr;
+  NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
+  comm->remFifo.tail++;
+
+  return ncclSuccess;
+}
+
+// Post a receive of up to `size` bytes into `data` on `recvComm` and
+// advertise it to the sender through the fifo.
+//
+// The call completes the handshake on first use, allocates a request, takes
+// a reference on an MR covering the buffer (unless size is 0), posts the
+// receive work request and then posts the fifo element.
+//
+// request : receives the request handle to pass to ncclIbTest.
+// Returns the first error encountered, or ncclSuccess.
+ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void** request) {
+  struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
+  NCCLCHECK(ncclRecvCheck(comm));
+
+  struct ncclIbRequest* req;
+  NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
+  req->type = type;
+  req->verbs = &comm->verbs;
+  req->size = size;
+
+  struct ibv_recv_wr wr;
+  memset(&wr, 0, sizeof(wr));
+  // The request pointer comes back in the completion as wc.wr_id.
+  wr.wr_id = (uint64_t)req;
+
+  struct ibv_sge sge;
+  if (size == 0) {
+    wr.sg_list = NULL;
+    wr.num_sge = 0;
+    req->ibMr = NULL;
+  } else {
+    NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
+    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+    wr.sg_list = &sge;
+    wr.num_sge = 1;
+  }
+
+  struct ibv_recv_wr* bad_wr;
+  NCCLCHECK(wrap_ibv_post_recv(comm->qp, &wr, &bad_wr));
+  *request = req;
+
+  // Post to FIFO to notify sender. A zero-size receive holds no MR, so there
+  // is no rkey to advertise: pass 0 rather than dereferencing a NULL ibMr.
+  uint32_t rkey = (req->ibMr != NULL) ? req->ibMr->mr->rkey : 0;
+  NCCLCHECK(ncclIbPostFifo(comm, rkey, (uint64_t)data, size));
+  return ncclSuccess;
+}
+
+// Flush data received into `data`: post a one-byte RDMA read of the buffer on
+// the loopback flush QP and wait for it to complete. Does nothing when the
+// flush path is disabled or `size` is 0.
+ncclResult_t ncclIbFlush(void* recvComm, void* data, int size) {
+  struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
+  if (size == 0 || comm->gpuFlush.enabled == 0) return ncclSuccess;
+
+  struct ncclIbRequest* flushReq;
+  NCCLCHECK(ncclIbGetRequest(comm->reqs, &flushReq));
+  flushReq->verbs = &comm->verbs;
+  NCCLCHECK(ncclIbGetMr(&comm->verbs, data, 1, &flushReq->ibMr));
+
+  struct ibv_send_wr readWr;
+  memset(&readWr, 0, sizeof(readWr));
+  readWr.wr_id = (uint64_t)flushReq;
+  readWr.opcode = IBV_WR_RDMA_READ;
+  readWr.send_flags = IBV_SEND_SIGNALED;
+  readWr.sg_list = &comm->gpuFlush.sge;
+  readWr.num_sge = 1;
+  readWr.wr.rdma.remote_addr = (uint64_t)data;
+  readWr.wr.rdma.rkey = flushReq->ibMr->mr->rkey;
+
+  struct ibv_send_wr* bad_wr;
+  NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &readWr, &bad_wr));
+
+  // Poll until the read has completed.
+  int done = 0;
+  do {
+    NCCLCHECK((ncclResult_t)ncclIbTest(flushReq, &done, NULL));
+  } while (done == 0);
+
+  return ncclSuccess;
+}
+
+// Non-blocking completion test for `request`.
+//
+// done : set to 1 if the request has completed, 0 otherwise.
+// size : if non-NULL and the request completed, receives its size.
+//
+// The function drains the completion queue one entry at a time. Each
+// completion identifies its own request through wr_id, which may be a
+// different request sharing the same CQ; that request is marked done and the
+// loop continues until either `request` is done or the CQ is empty.
+// When `request` is reported done its slot is released for reuse.
+// Returns ncclSystemError if a completion carries an error status, the
+// wrapper's error if polling fails, ncclSuccess otherwise.
+ncclResult_t ncclIbTest(void* request, int* done, int* size) {
+  struct ncclIbRequest *r = (struct ncclIbRequest*)request;
+  *done = 0;
+
+  while (1) {
+    if (r->done == 1) {
+      *done = 1;
+      if (size) *size = r->size;
+      r->used = 0;
+      return ncclSuccess;
+    }
+
+    int wrDone = 0;
+    struct ibv_wc wc;
+    NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 1, &wc, &wrDone));
+    // Nothing more to process: report "not done yet".
+    if (wrDone == 0) return ncclSuccess;
+
+    if (wc.status != IBV_WC_SUCCESS) {
+      WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc.status, wc.opcode, wc.byte_len, wc.vendor_err);
+      return ncclSystemError;
+    }
+
+    struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc.wr_id;
+    if (doneReq) {
+      // For receives, record the number of bytes actually received: taken
+      // from the completion length, or from the immediate data when the
+      // sender used RDMA write with immediate.
+      if (wc.opcode == IBV_WC_RECV) {
+        doneReq->size = wc.byte_len;
+#if USE_RDMA_WRITE
+      } else if (wc.opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+        doneReq->size = wc.imm_data;
+#endif
+      }
+      // Drop the reference taken by ncclIbGetMr when the request was posted.
+      if (doneReq->ibMr != NULL) {
+        doneReq->ibMr->refcnt--;
+        if (doneReq->ibMr->refcnt < 0) WARN("NET/IB : doneReq %p MR %p refcount now %d", doneReq, doneReq->ibMr, doneReq->ibMr->refcnt);
+      }
+      doneReq->done = 1;
+      if (doneReq->free == 1) {
+        // This is an internal (FIFO post) req. Free it immediately.
+        doneReq->used = 0;
+      }
+    }
+  }
+}
+
+// Tear down a send comm: close the socket, destroy the QP, deregister the
+// fifo and every cached MR (warning about any that is still referenced),
+// destroy the verbs resources and free the comm. A NULL comm is a no-op.
+// Returns early with the wrapper's error if any verbs call fails.
+ncclResult_t ncclIbCloseSend(void* sendComm) {
+  struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
+  if (comm == NULL) return ncclSuccess;
+
+  close(comm->fd);
+  if (comm->qp != NULL) { NCCLCHECK(wrap_ibv_destroy_qp(comm->qp)); }
+  if (comm->fifoMr != NULL) { NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr)); }
+  for (int i=0; i<MAX_REQUESTS; i++) {
+    struct ncclIbMr* entry = &comm->verbs.mrPool[i];
+    if (entry->mr == NULL) continue;
+    if (entry->refcnt != 0) { WARN("NET/IB : TX MR #%d has non-zero (%d) refcnt", i, entry->refcnt); }
+    NCCLCHECK(wrap_ibv_dereg_mr(entry->mr));
+  }
+  NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
+  free(comm);
+  return ncclSuccess;
+}
+
+// Tear down a receive comm: close the socket, destroy the data QP, release
+// the flush QP and its host MR when the flush path was enabled, deregister
+// the fifo staging area and every cached MR (warning about any that is still
+// referenced), destroy the verbs resources and free the comm. A NULL comm is
+// a no-op. Returns early with the wrapper's error if any verbs call fails.
+ncclResult_t ncclIbCloseRecv(void* recvComm) {
+  struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
+  if (comm == NULL) return ncclSuccess;
+
+  close(comm->fd);
+  if (comm->qp != NULL) { NCCLCHECK(wrap_ibv_destroy_qp(comm->qp)); }
+  if (comm->gpuFlush.enabled) {
+    struct ncclIbGpuFlush* flush = &comm->gpuFlush;
+    if (flush->qp != NULL) { NCCLCHECK(wrap_ibv_destroy_qp(flush->qp)); }
+    if (flush->hostMr != NULL) { NCCLCHECK(wrap_ibv_dereg_mr(flush->hostMr)); }
+  }
+  if (comm->remFifo.mr != NULL) { NCCLCHECK(wrap_ibv_dereg_mr(comm->remFifo.mr)); }
+  for (int i=0; i<MAX_REQUESTS; i++) {
+    struct ncclIbMr* entry = &comm->verbs.mrPool[i];
+    if (entry->mr == NULL) continue;
+    if (entry->refcnt != 0) { WARN("NET/IB : RX MR #%d has non-zero (%d) refcnt", i, entry->refcnt); }
+    NCCLCHECK(wrap_ibv_dereg_mr(entry->mr));
+  }
+  NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
+  free(comm);
+  return ncclSuccess;
+}
+
+// Close the listening socket and free the listen comm. A NULL comm is a
+// no-op. Always returns ncclSuccess.
+ncclResult_t ncclIbCloseListen(void* listenComm) {
+  struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
+  if (lComm == NULL) return ncclSuccess;
+  close(lComm->fd);
+  free(lComm);
+  return ncclSuccess;
+}
+
+// Network plugin table for the IB/RoCE transport, named "IB". The entries
+// are positional; presumably their order must match the field order of
+// ncclNet_t (confirm against net.h).
+ncclNet_t ncclNetIb = {
+  "IB",
+  ncclIbDevices,
+  ncclIbPtrSupport,
+  ncclIbListen,
+  ncclIbConnect,
+  ncclIbAccept,
+  ncclIbIsend,
+  ncclIbIrecv,
+  ncclIbFlush,
+  ncclIbTest,
+  ncclIbCloseSend,
+  ncclIbCloseRecv,
+  ncclIbCloseListen
+};
+
+NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
+
+// Tell whether the IB transport can be used: false when disabled through the
+// IbDisable parameter, otherwise true iff device discovery found at least one
+// usable port.
+bool ncclIbSupport() {
+  if (ncclParamIbDisable() != 0) return false;
+  initDevices();
+  return ncclNIbDevs > 0;
+}
diff --git a/src/3rd_party/nccl/src/transport/net_socket.cu b/src/3rd_party/nccl/src/transport/net_socket.cu
new file mode 100644
index 00000000..cff1973a
--- /dev/null
+++ b/src/3rd_party/nccl/src/transport/net_socket.cu
@@ -0,0 +1,238 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "core.h"
+#include "socket.h"
+#include "net.h"
+#include "topo.h"
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <poll.h>
+
+/* Init functions */
+
+ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {  // dev is not consulted: the answer is the same for every interface
+  *supportedTypes = NCCL_PTR_HOST;  // host pointers only; ncclSocketIsend/Irecv reject any other type
+  return ncclSuccess;
+}
+
+static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];  // interface names, one MAX_IF_NAME_SIZE-byte slot per interface
+static union socketAddress ncclNetIfAddrs[MAX_IFS];  // interface addresses, indexed by device number (see GetSocketAddr)
+static int ncclNetIfs = -1;  // value returned by findInterfaces(); -1 until initDevices() has run
+pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;  // held while initDevices() fills the three variables above
+
+static void initDevices() {
+  // One-time interface discovery; ncclNetIfs == -1 means "not probed yet".
+  if (ncclNetIfs != -1) return;
+  pthread_mutex_lock(&ncclSocketLock);
+  if (ncclNetIfs == -1) {  // re-check under the lock: another thread may have probed in the meantime
+    ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
+    INFO(INIT|NET,"NET/Socket : %d interfaces found", ncclNetIfs);
+    if (ncclNetIfs <= 0) {
+      WARN("NET/Socket : no interface found");
+    }
+  }
+  pthread_mutex_unlock(&ncclSocketLock);
+}
+
+ncclResult_t ncclSocketDevices(int* ndev, int** scores) {  // returns the interface count and a freshly allocated score per interface
+  initDevices();
+  *ndev = ncclNetIfs;
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));  // checked: cudaDev would otherwise be read uninitialized below
+  int* sc;
+  NCCLCHECK(ncclCalloc(&sc, ncclNetIfs));  // allocated before getCudaPath so a failure here cannot leak cudaPath
+  char* cudaPath;
+  ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath);
+  char line[1024];
+  snprintf(line, sizeof(line), "CUDA Dev %d, IP Interfaces : ", cudaDev);
+  for (int i=0; i<ncclNetIfs; i++) {
+    char* sockPath;
+    ncclResult_t err2 = getSockPath(ncclNetIfNames+i*MAX_IF_NAME_SIZE, &sockPath);
+    int distance = (err1 != ncclSuccess || err2 != ncclSuccess || sockPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(sockPath, cudaPath);  // fall back to PATH_SOC when either path is unknown
+    snprintf(line+strlen(line), sizeof(line)-strlen(line), "%s(%s) ", ncclNetIfNames+i*MAX_IF_NAME_SIZE, pathDists[distance]);  // bounded: a long list truncates the log line instead of overflowing it
+    sc[i] = 1+PATH_SOC-distance;  // smaller distance => larger score
+    if (err2 == ncclSuccess) free(sockPath);
+  }
+  INFO(INIT|NET,"%s", line);
+  if (err1 == ncclSuccess) free(cudaPath);
+  *scores = sc;  // ownership of the array passes to the caller
+  return ncclSuccess;
+}
+
+static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {  // copy the address of interface #dev into *addr
+  if (ncclNetIfs == -1) initDevices();
+  if (dev < 0 || dev >= ncclNetIfs) return ncclInternalError;  // negative indices would read before ncclNetIfAddrs
+  memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
+  return ncclSuccess;
+}
+
+/* Communication functions */
+
+struct ncclSocketHandle {  // overlays the opaque handle buffer; must fit in NCCL_NET_HANDLE_MAXSIZE (asserted in ncclSocketListen)
+  union socketAddress connectAddr;  // address to listen on / connect to
+};
+
+struct ncclSocketRequest {
+  int used;  // 1 while handed out by ncclSocketGetRequest, reset to 0 by ncclSocketTest
+  int size;  // -1 when handed out; ncclSocketIrecv stores the received byte count
+};
+
+struct ncclSocketReqs {
+  struct ncclSocketRequest* requests;  // pool of MAX_REQUESTS entries, allocated on first use
+};
+
+struct ncclSocketComm {  // one struct serves listen, send and recv comms
+  int fd;  // socket descriptor; -1 right after ncclSocketNewComm
+  struct ncclSocketReqs reqs;
+};
+
+ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {  // allocate one comm and mark it as having no socket yet
+  NCCLCHECK(ncclCalloc(comm, 1));  // presumably zero-filled, which ncclSocketGetRequest relies on (requests == NULL) - confirm in ncclCalloc
+  (*comm)->fd = -1;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str) {  // fill the handle's address from its textual form
+  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+  NCCLCHECK(GetSocketAddrFromString(&(handle->connectAddr), str));  // accepted string format is defined by GetSocketAddrFromString (not shown here)
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
+  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+  static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
+  // Open a listening socket and return its comm. If dev >= 0, listen on that interface's address
+  if (dev >= 0) {
+    NCCLCHECK(GetSocketAddr(dev, &(handle->connectAddr)));
+  } else if (dev == findSubnetIf) {
+    // handle stores a remote address: find a local addr that is in the same network as the remote addr
+    union socketAddress localAddr;
+    char ifName[MAX_IF_NAME_SIZE];
+    if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+      WARN("No usable listening interface found");
+      return ncclSystemError;
+    }
+    memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr));  // pass the local address back
+  } // Otherwise, handle stores a local address
+  struct ncclSocketComm* comm;
+  NCCLCHECK(ncclSocketNewComm(&comm));
+  ncclResult_t listenRes = createListenSocket(&comm->fd, &handle->connectAddr);
+  if (listenRes != ncclSuccess) free(comm);  // the comm is not handed to the caller on failure, so release it here
+  NCCLCHECK(listenRes);
+  *listenComm = comm;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {  // connect to the address stored in the handle; dev is not used
+  struct ncclSocketComm* comm;
+  NCCLCHECK(ncclSocketNewComm(&comm));
+  struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+  NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));  // NOTE(review): comm is not freed when this fails
+  *sendComm = comm;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {  // accept one connection on the listen comm and wrap it in a new comm
+  struct ncclSocketComm* lComm = (struct ncclSocketComm*)listenComm;
+  struct ncclSocketComm* rComm;
+  NCCLCHECK(ncclSocketNewComm(&rComm));
+  struct sockaddr_in sockaddr;  // peer address; it is not read after accept() returns
+  socklen_t socklen = sizeof(struct sockaddr_in);
+  SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);  // NOTE(review): rComm looks leaked if the macro returns on failure - confirm
+  *recvComm = rComm;
+  return ncclSuccess;
+}
+
+#define MAX_REQUESTS 128
+
+ncclResult_t ncclSocketGetRequest(struct ncclSocketReqs* reqs, struct ncclSocketRequest** req) {
+  // Hand out the first free slot of the request pool, creating the pool on first use.
+  if (reqs->requests == NULL) {
+    NCCLCHECK(ncclCalloc(&reqs->requests, MAX_REQUESTS));
+  }
+  struct ncclSocketRequest* const poolEnd = reqs->requests+MAX_REQUESTS;
+  for (struct ncclSocketRequest* slot = reqs->requests; slot != poolEnd; slot++) {
+    if (slot->used != 0) continue;
+    slot->used = 1;
+    slot->size = -1;
+    *req = slot;
+    return ncclSuccess;
+  }
+  WARN("Socket : unable to allocate requests");
+  return ncclInternalError;
+}
+
+ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int type, void** request) {  // send a length-prefixed message
+  if (type != NCCL_PTR_HOST) return ncclInternalError;  // only host memory can be handed to the socket
+  struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
+  *request = NULL;  // no request object for sends; ncclSocketTest accepts NULL and reports done
+  NCCLCHECK(socketSend(comm->fd, &size, sizeof(int)));  // wire format: int byte count, then the payload
+  NCCLCHECK(socketSend(comm->fd, data, size));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, int type, void** request) {  // receive one length-prefixed message into data
+  if (type != NCCL_PTR_HOST) return ncclInternalError;
+  struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
+  int recvSize;
+  NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int)));
+  if (recvSize < 0 || recvSize > size) {  // the length comes from the peer: also refuse negative values before using it as a byte count
+    WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
+    return ncclInternalError;
+  }
+  NCCLCHECK(socketReceive(comm->fd, data, recvSize));  // recvSize is within [0, size] here
+  struct ncclSocketRequest* recvReq = NULL;
+  NCCLCHECK(ncclSocketGetRequest(&comm->reqs, &recvReq));
+  recvReq->size = recvSize;  // reported to the caller through ncclSocketTest
+  *request = recvReq;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size) {  // always fails: all arguments are ignored
+  // We don't support CUDA pointers, so we don't need a flush operation
+  return ncclInternalError;
+}
+
+ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
+  // Always reports completion; for a non-NULL request also returns its size and recycles the slot.
+  struct ncclSocketRequest *pending = (struct ncclSocketRequest*)request;
+  *done = 1;
+  if (pending == NULL) return ncclSuccess;  // e.g. sends, which carry no request object
+  if (size != NULL) *size = pending->size;
+  pending->used = 0;  // slot goes back to the pool
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketClose(void* opaqueComm) {
+  // Common close routine for send, recv and listen comms; a NULL comm is ignored.
+  struct ncclSocketComm* sock = (struct ncclSocketComm*)opaqueComm;
+  if (sock == NULL) return ncclSuccess;
+  free(sock->reqs.requests);
+  close(sock->fd);
+  free(sock);
+  return ncclSuccess;
+}
+
+ncclNet_t ncclNetSocket = {  // socket network entry points; positional initializer, so the order must match ncclNet_t's members
+  "Socket",  // transport name
+  ncclSocketDevices,
+  ncclSocketPtrSupport,
+  ncclSocketListen,
+  ncclSocketConnect,
+  ncclSocketAccept,
+  ncclSocketIsend,
+  ncclSocketIrecv,
+  ncclSocketFlush,
+  ncclSocketTest,
+  ncclSocketClose,  // close send comm
+  ncclSocketClose,  // close recv comm
+  ncclSocketClose   // close listen comm
+};
diff --git a/src/3rd_party/nccl/src/transport/p2p.cu b/src/3rd_party/nccl/src/transport/p2p.cu
new file mode 100644
index 00000000..35aebb4d
--- /dev/null
+++ b/src/3rd_party/nccl/src/transport/p2p.cu
@@ -0,0 +1,611 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "utils.h"
+#include "topo.h"
+#include "transport.h"
+#include "param.h"
+#include <unistd.h>
+#include <cuda_runtime.h>
+#include "nvmlwrap.h"
+#include <ctype.h>
+#include "nvlink.h"
+
+struct p2pInfo {  // per-rank information filled by p2pFillInfo; must fit in ncclTinfo_t
+  int rank;
+  int cudaDev;  // CUDA runtime device number of the rank
+  uint64_t hostHash;  // from getHostHash(); ranks with different values are never connected through P2P
+  uint64_t pidHash;  // from getPidHash(); equal values select the direct-pointer path in the setup functions
+  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];  // PCI bus id as reported by NVML
+};
+
+struct p2pConnectInfo {  // what a rank sends to its peer; must fit in struct ncclConnect
+  int direct;  // 1: directPtr is valid, 0: devIpc is valid
+  union {
+    void* directPtr;  // device pointer usable as-is by the peer
+    cudaIpcMemHandle_t devIpc;  // IPC handle the peer has to open first
+  };
+};
+
+#include <sys/types.h>
+
+/* Fill information necessary to exchange between ranks to choose whether or not
+ * to use this transport */
+ncclResult_t p2pFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
+  struct p2pInfo* info = (struct p2pInfo*)opaqueInfo;
+  static_assert(sizeof(struct p2pInfo) <= sizeof(ncclTinfo_t), "p2p Info too large");
+  info->rank = rank;
+  CUDACHECK(cudaGetDevice(&info->cudaDev));  // the calling thread's current device
+  info->hostHash=getHostHash();
+  info->pidHash=getPidHash();
+
+  // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
+  // cudaDev is a CUDA runtime dev number which could be different from the
+  // NVML device number. Then we get the busID from NVML to be sure it is
+  // consistent with NVML remote PCI bus Ids.
+  CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
+  nvmlDevice_t nvmlDevice;
+  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
+  nvmlPciInfo_t pciInfo;
+  NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
+  strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);  // replace the CUDA string with NVML's version of it
+  return ncclSuccess;
+}
+
+NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
+NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
+
+/* Determine if we can communicate with the peer through p2p */
+ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
+  // Do not use P2P across root complexes by default (provided CUDA permits it)
+  int p2pLevel = PATH_SOC;
+  if (ncclParamP2pDisable() == 1) p2pLevel = 0;
+  if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();  // an explicit P2P_LEVEL wins over P2P_DISABLE; -2 is the "unset" default
+
+  *ret = 0;  // default answer, kept by every early "return ncclSuccess" below
+
+  if (p2pLevel == 0) return ncclSuccess;
+
+  struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
+  struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+
+  // Rule out different nodes
+  if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess;
+
+  // Do not detect topology if we're on the same GPU. Note this is not really supported.
+  if (myInfo->cudaDev == peerInfo->cudaDev) {
+    *ret = 1 + PATH_SOC;  // same value the PCI formula below gives for distance 0
+    return ncclSuccess;
+  }
+
+  // See if CUDA can do P2P
+  int p2p;
+  if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) {
+    INFO(INIT|P2P,"peer query failed between dev %d and dev %d",
+        myInfo->cudaDev, peerInfo->cudaDev);
+    return ncclSuccess;  // a failed query is treated as "no P2P", not as an error
+  }
+  if (p2p == 0) return ncclSuccess;
+
+  // Check for NVLink/NVswitch
+  int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
+  if (nvlinkp2p > 0) {
+    *ret = nvlinkp2p;
+    return ncclSuccess;
+  }
+
+  // Finally compute the PCI distance and compare with the p2pLevel.
+  char* myPath;
+  char* peerPath;
+  ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath);
+  ncclResult_t err2 = getCudaPath(peerInfo->cudaDev, &peerPath);
+  if (err1 == ncclSuccess && err2 == ncclSuccess) {
+    int distance = pciDistance(myPath, peerPath);
+    if (distance < p2pLevel) {
+      *ret = 1 + PATH_SOC - distance;  // smaller distance => larger value
+    }
+  }
+  if (err1 == ncclSuccess) free(myPath);  // each path is freed only if its lookup succeeded
+  if (err2 == ncclSuccess) free(peerPath);
+  return ncclSuccess;
+}
+
+static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {  // backtracking search; returns how many rings were completed from currentRing on
+  int nrings = 0;
+  ncclTvalue_t* line = matrix+current*n;  // row of 'current': remaining link budget towards every other rank
+  inTheRing[current] = 1;
+  int currentStep = (currentRing+1)*n-remaining;  // index in rings[] of the slot that follows 'current'
+  rings[currentStep-1] = current;
+  if (remaining == 0) {
+    int looprank = rings[currentRing*n];  // first rank of this ring: the ring must close back onto it
+    if (line[looprank] > 0) {
+      if (currentRing+1 == nRingsMax) {
+        nrings = 1;
+      } else {
+        line[looprank]--;  // consume the closing link while searching for further rings
+        for (int i=0; i<n; i++) inTheRing[i] = 0;
+        if (connect) {
+          // First two slots are already set and we need to respect those constraints
+          inTheRing[rings[currentStep]] = 1;
+          nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, rings[currentStep+1], n-2, connect);
+        } else {
+          rings[(currentRing+1)*n] = 0;
+          nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, 0, n-1, connect);
+        }
+        line[looprank]++;  // undo the budget change and mark the current ring as full again
+        for (int i=0; i<n; i++) inTheRing[i] = 1;
+      }
+    }
+  } else {
+    int ringsSave[nRingsMax*n];  // best continuation found so far (variable-length array on the stack)
+    int maxStep = 0;
+    for (int i=0; i<n; i++) {
+      if (inTheRing[i] == 0 && line[i] > 0) {
+        line[i]--;  // tentatively use one link current -> i
+        int nr = computeRingsRec(matrix, n, rings, currentRing, nRingsMax, inTheRing, i, remaining-1, connect);
+        if (nr > nrings) {
+          nrings = nr;
+          maxStep = (nr+currentRing)*n;
+          ringsSave[currentStep] = i;
+          // Save the rest of the rings
+          for (int r=currentStep+1; r<maxStep; r++) {
+            ringsSave[r] = rings[r];
+          }
+          if (nrings + currentRing == nRingsMax) {
+            // We found an optimal solution. Let's stop there.
+            break;  // note: this exit skips the line[i]++ restore below
+          }
+        }
+        line[i]++;
+      }
+    }
+    for (int r=currentStep; r<maxStep; r++) {  // write the best continuation back into rings[]
+      rings[r] = ringsSave[r];
+    }
+  }
+  inTheRing[current] = 0;
+  return nrings;
+}
+
+static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) {
+  if (nrings == 0) return 0;
+  // Extend the ring list to newNrings entries (at most MAXRINGS) by repeating the existing rings in order
+  const int target = (newNrings > MAXRINGS) ? MAXRINGS : newNrings;
+  for (int dst=nrings; dst<target; dst++) {
+    const int* from = rings+(dst%nrings)*nranks;
+    int* to = rings+dst*nranks;
+    for (int i=0; i<nranks; i++) to[i] = from[i];
+  }
+  return target;
+}
+
+int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nringsMax, int connect) {  // run the recursive search on a link-count matrix; returns the ring count (0 on allocation failure)
+  int* inTheRing = (int*)malloc(sizeof(int)*nranks);
+  if (inTheRing == NULL) { WARN("malloc of %zu bytes failed", sizeof(int)*nranks); return 0; }  // %zu: the argument is a size_t
+  for (int i=0; i<nranks; i++) inTheRing[i] = 0;
+  int nrings;
+  if (connect) {
+    inTheRing[rings[0]] = 1;  // slots 0 and 1 of the first ring were preset by the caller
+    nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, rings[1], nranks-2, connect);
+    nrings = copyRings(nranks, rings, nrings, nringsMax);
+  } else {
+    rings[0] = 0;  // unconstrained: every ring starts at rank 0
+    nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, 0, nranks-1, connect);
+  }
+  free(inTheRing);
+  return nrings;
+}
+
+static inline int findConnect(int nranks, int* ranks) {
+  // Index of the first entry that is already assigned (anything other than -1), or -1 if there is none.
+  int idx = 0;
+  while (idx < nranks && ranks[idx] == -1) idx++;
+  return (idx < nranks) ? idx : -1;
+}
+
+int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) {  // build NVLink rings from connection values; returns the ring count
+  if (nrings == 0) return 0;
+  if (nrings > MAXRINGS) {
+    WARN("Max rings reached, limiting to %d", MAXRINGS);
+    nrings = MAXRINGS;
+  }
+  // Find existing constraints / connections
+  int connect = 0;
+  for (int r=0; r<nrings; r++) {
+    int start = findConnect(nranks, prev+r*nranks);
+    int end = findConnect(nranks, next+r*nranks);
+    if (start != -1 && end != -1) {
+      rings[r*nranks] = end;
+      rings[r*nranks+1] = start;
+      connect = 1;
+    }
+  }
+
+  // Compute rings
+  ncclTvalue_t* matrix = (ncclTvalue_t*)malloc(sizeof(ncclTvalue_t)*nranks*nranks);
+  if (matrix == NULL) { WARN("malloc of %zu bytes failed", sizeof(ncclTvalue_t)*nranks*nranks); return 0; }  // %zu: the argument is a size_t
+  for (int i=0; i<nranks; i++) for (int j=0; j<nranks; j++)
+      matrix[i*nranks+j] = oversubscribe ? values[i*nranks+j]/CONNECT_NVLINK*2 : values[i*nranks+j]/CONNECT_NVLINK ;  // link count per pair, doubled when oversubscribing
+
+  int compNrings = p2pComputeRingsNvLink(matrix, nranks, rings, nrings, connect);
+
+  free(matrix);
+
+  if (oversubscribe || connect) return compNrings;
+
+  if (compNrings && compNrings < nrings && nranks <= 4) {
+    // Try to oversubscribe to get a better result
+    int *rings2 = (int *)malloc(sizeof(int)*MAXRINGS*nranks);
+    if (rings2 == NULL) { WARN("malloc of %zu bytes failed", sizeof(int)*MAXRINGS*nranks); return 0; }
+    for (int i=0; i<MAXRINGS*nranks; i++) rings2[i] = -1;
+    int nThreads = *nthreads;  // scratch copy, so the trial run cannot change the caller's value
+    int compNrings2 = p2pComputeRingsNvLink(values, nranks, rings2, nrings, prev, next, 1, &nThreads);
+    if (compNrings2 > compNrings*2) {  // keep the trial only if it beats the duplication done below
+      // Oversubscription worked.
+      for (int i=0; i<compNrings2*nranks; i++) rings[i] = rings2[i];
+      compNrings = compNrings2;
+    }
+    free(rings2);
+  }
+
+  // Duplicate the rings for direct NVLink
+  compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
+
+  if (ncclCudaCompCap() == 6) *nthreads /= 2;  // halve the thread count when ncclCudaCompCap() reports 6
+  return compNrings;
+}
+
+int p2pComputeRingsSeqConnect(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
+  // Complete rings whose two end points are already fixed in prev/next by walking the
+  // other ranks in increasing order (wrapping around) and skipping the two end points.
+  int connected = 0;
+  for (int r=0; r<nringsStart; r++) {
+    int* ring = rings+r*nranks;
+    const int start = findConnect(nranks, prev+r*nranks);
+    const int end = findConnect(nranks, next+r*nranks);
+    if (start == -1 || end == -1) {
+      // Unconstrained ring: fail unless earlier rings were connected, in which case keep those.
+      if (connected != 1 || r <= 0) return 0;
+      WARN("Connecting rings but did not find start/end for ring %d. Disabling other rings.", r);
+      return r;
+    }
+    ring[0] = end;
+    ring[1] = start;
+    int cur = start;
+    for (int i=2; i<nranks; i++) {
+      int peer = (cur+1) % nranks;
+      while (peer == end || peer == start) peer = (peer+1) % nranks;
+      if (values[cur*nranks+peer] < minScore) {
+        // One hop scores below minScore: no ring is returned at all.
+        return 0;
+      }
+      ring[i] = peer;
+      cur = peer;
+    }
+    connected = 1;
+  }
+  return nringsStart;
+}
+
+int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
+  // Every ring visits the ranks in increasing order: 0, 1, ..., nranks-1.
+  int* out = rings;
+  for (int r=0; r<nringsStart; r++) {
+    for (int rank=0; rank<nranks; rank++) *out++ = rank;
+  }
+  return nringsStart;
+}
+
+static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
+  // Pick the next rank for a ring: look for peers at the highest score first and
+  // lower the requirement one step at a time, never going below minScore.
+  for (int score = PATH_SOC+1; score >= minScore; score--) {
+    int pick = -1;
+    int lowestEndScore = PATH_SOC+2; // among equally close peers, prefer the one with the lowest score to end
+    for (int peer = 0; peer < nranks; peer++) {
+      if (inRing[peer]) continue;
+      if (values[rank*nranks+peer] != score) continue;
+      if (end == -1) return peer;
+      if (values[end*nranks+peer] >= lowestEndScore) continue;
+      pick = peer;
+      lowestEndScore = values[end*nranks+peer];
+    }
+    if (pick != -1) return pick;
+  }
+  return -1;
+}
+
+int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int minScore) {  // greedy ring construction by score; returns the number of usable rings
+  int connect = 0;  // becomes 1 once a ring with preset start/end has been seen
+  for (int r=0; r<nrings; r++) {
+    int start = findConnect(nranks, prev+r*nranks);
+    int end = findConnect(nranks, next+r*nranks);
+
+    int inRing[nranks];  // variable-length array: 1 for ranks already placed in ring r
+    for (int i=0; i<nranks; i++) inRing[i] = 0;
+
+    if (start == -1 && end == -1) {
+      if (connect == 1 && r > 0) {
+        WARN("Connecting ring %d : did not find start/end. Disabling other rings.", r);
+        return r;
+      }
+      end = 0;  // no constraint: the ring's slot 0 is rank 0 and slot 1 is its closest peer
+      inRing[end] = 1;
+      start = findClosestPci(values, inRing, end, -1, nranks, minScore);
+      if (start == -1) return r;
+    } else if (start == -1 || end == -1) {
+      WARN("Connecting ring %d : inconsistent start/end. Disabling other rings.", r);
+      return r;
+    } else {
+      connect = 1;
+    }
+    rings[r*nranks] = end;
+    rings[r*nranks+1] = start;
+    inRing[start] = inRing[end] = 1;
+    int cur = start;
+    for (int i=2; i<nranks; i++) {
+      int next = findClosestPci(values, inRing, cur, end, nranks, minScore);  // local 'next' shadows the parameter of the same name
+      if (next == -1) return r;
+
+      inRing[next] = 1;
+      rings[r*nranks+i] = next;
+      cur = next;
+    }
+    // Check the loop is closing
+    inRing[end] = 0;  // make 'end' eligible again so the search can select it
+    if (findClosestPci(values, inRing, cur, end, nranks, minScore) != end) return r;
+
+    if (connect == 0) return 1;  // without preset constraints only a single ring is produced
+  }
+  return nrings;
+}
+
+ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {  // pick a ring strategy (NVSwitch, NVLink or PCI) and fill the unset prev/next entries
+  if (*nringsRet == 0) return ncclSuccess;
+  int *rings;
+  NCCLCHECK(ncclCalloc(&rings, MAXRINGS*nranks));
+  for (int i=0; i<MAXRINGS*nranks; i++) rings[i] = -1;
+  int nrings = *nringsRet;
+
+  // NVswitch
+  int nvswitchLinks = 0;
+  int directLinks = 0;
+  for (int rank=0; rank<nranks; rank++) {
+    for (int j=1; j<nranks; j++) {
+      int i = (rank + j) % nranks;
+      ncclTvalue_t links = values[rank*nranks+i]/CONNECT_NVSWITCH;
+      if (j>1 && links != nvswitchLinks) {
+        WARN("Internal error : NVswitch links mismatch");
+        free(rings);  // this early return used to leak the scratch array
+        return ncclInternalError;
+      }
+      nvswitchLinks = links;
+    }
+  }
+  if (nvswitchLinks) {
+    // NVSwitch : Connect existing rings
+    int nringsConnected = p2pComputeRingsSeqConnect(values, nranks, rings, nrings, prev, next, minScore, nthreads);
+    if (nringsConnected > 0) {
+      nrings = nringsConnected;
+    } else {
+      nrings = std::min(nrings, nvswitchLinks); // NVSwitch: Limit rings to number of NVLinks
+      // Or create new ones
+      nrings = p2pComputeRingsSeqNew(values, nranks, rings, nrings, prev, next, minScore, nthreads);
+      // And duplicate them
+      nrings = copyRings(nranks, rings, nrings, nrings*2);
+    }
+    goto end;
+  }
+
+  // point-to-point NVLink
+  for (int rank=0; rank<nranks; rank++) {
+    int links = 0;
+    for (int i=0; i<nranks; i++) {
+      ncclTvalue_t val = values[rank*nranks+i];
+      if (val >= CONNECT_NVSWITCH) continue;
+      links += val/CONNECT_NVLINK;
+    }
+    directLinks = (rank == 0) ? links : std::min(directLinks, links);  // minimum link count over all ranks
+  }
+  if (directLinks > 0) {
+    // NVLink : Connect rings or create new ones
+    nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
+    goto end;
+  }
+
+  // PCIe or QPI : Connect rings or create new ones
+  nrings = p2pComputeRingsPci(values, nranks, rings, *nringsRet, prev, next, minScore);
+
+end:
+  *nringsRet = nrings;
+  for (int ring = 0; ring<nrings; ring++) {
+    for (int index=0; index<nranks; index++) {
+      int prevIndex = (index - 1 + nranks) % nranks;
+      int nextIndex = (index + 1) % nranks;
+      int curRank = rings[ring*nranks+index];
+      int prevRank = rings[ring*nranks+prevIndex];
+      int nextRank = rings[ring*nranks+nextIndex];
+      if (prev[ring*nranks+curRank] == -1) prev[ring*nranks+curRank] = prevRank;  // entries that are already set are left untouched
+      if (next[ring*nranks+curRank] == -1) next[ring*nranks+curRank] = nextRank;
+    }
+  }
+
+  free(rings);
+  return ncclSuccess;
+}
+
+#define TRACE_DUMP_IPC(DEVIPC)                                                             \
+  do {                                                                                     \
+    unsigned long *devIpc = (unsigned long *) (DEVIPC);                                    \
+    TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[0], devIpc[1], devIpc[2], devIpc[3]); \
+    TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
+  } while (0)  // debug helper: traces the first 8 unsigned longs at DEVIPC; every call site in this file is commented out
+
+/* Send: Create and return connect structures for this peer to connect to me */
+ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+  struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
+  struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+  struct p2pConnectInfo info;  // only 'direct' and one union member are set before the memcpy below
+  if (myInfo->pidHash == peerInfo->pidHash) {  // equal pidHash: hand over the raw device pointer
+    info.direct = 1;
+    info.directPtr = ring->devMemSend;
+    if (myInfo->cudaDev == peerInfo->cudaDev) {
+      INFO(INIT|P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
+    } else {
+      // Enable P2P access
+      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
+      if (err == cudaErrorPeerAccessAlreadyEnabled) {
+        cudaGetLastError();  // "already enabled" is fine: read and discard the error
+      } else if (err != cudaSuccess) {
+        WARN("failed to peer with device %d: %d %s",
+            peerInfo->cudaDev, err, cudaGetErrorString(err));
+        return ncclInternalError;
+      }
+      INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
+          ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+    }
+  } else {  // different pidHash: export an IPC handle instead
+    info.direct = 0;
+    // Map IPC and enable P2P access
+    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemSend);
+    if (err != cudaSuccess) {
+      WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
+          myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+      return ncclInternalError;
+    }
+    INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
+        ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+    //TRACE_DUMP_IPC(&info.devIpc);
+  }
+  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
+  memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));  // connectInfo is an opaque buffer: copy the bytes in
+  return ncclSuccess;
+}
+
+/* Recv: Create and return connect structures for this peer to connect to me */
+ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+  struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
+  struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+  struct p2pConnectInfo info;  // only 'direct' and one union member are set before the memcpy below
+  if (myInfo->pidHash == peerInfo->pidHash) {  // equal pidHash: hand over the raw device pointer
+    info.direct = 1;
+    info.directPtr = ring->devMemRecv;
+    if (myInfo->cudaDev == peerInfo->cudaDev) {
+      TRACE(INIT|P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
+    } else {
+      // Enable P2P access
+      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
+      if (err == cudaErrorPeerAccessAlreadyEnabled) {
+        cudaGetLastError();  // "already enabled" is fine: read and discard the error
+      } else if (err != cudaSuccess) {
+        WARN("failed to peer with device %d: %d %s",
+            peerInfo->cudaDev, err, cudaGetErrorString(err));
+        return ncclInternalError;
+      }
+      TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+    }
+  } else {  // different pidHash: export an IPC handle instead
+    info.direct = 0;
+    // Map IPC and enable P2P access
+    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemRecv);
+    if (err != cudaSuccess) {
+      WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
+          myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+      return ncclInternalError;
+    }
+    TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+    //TRACE_DUMP_IPC(&info.devIpc);
+  }
+  static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
+  memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));  // connectInfo is an opaque buffer: copy the bytes in
+  return ncclSuccess;
+}
+
+/* Connect/Send to this peer */
+static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+  void** resources = &send->transportResources;
+  struct ncclRecvMem* remDevMem;
+  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+  *resources = NULL;  // nothing for p2pFree to release unless an IPC mapping is opened below
+  if (info->direct) {
+    remDevMem = (struct ncclRecvMem*)(info->directPtr);
+    send->conn.direct = 1;
+  } else {
+    void* remPtr = NULL;
+    //TRACE_DUMP_IPC(&info->devIpc);
+    cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+    if (err != cudaSuccess) {  // checked before anything is stored, so a failed open leaves no resources behind
+      WARN("failed to open CUDA IPC handle : %d %s",
+          err, cudaGetErrorString(err));
+      return ncclUnhandledCudaError;
+    }
+    void** ipcPtrSave;  // remembers the mapping so p2pFree can close it
+    NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
+    *resources = ipcPtrSave;
+    *ipcPtrSave = remPtr;
+    remDevMem = (struct ncclRecvMem*)remPtr;
+  }
+
+  send->conn.buff = remDevMem->buff;
+  send->conn.llBuff = remDevMem->llBuff;
+  send->conn.tail = &remDevMem->tail;
+  send->conn.opCount = &remDevMem->opCount;
+  // send->conn->head should have been set to devMemSend already
+  return ncclSuccess;
+}
+
+/* Connect/Recv from this peer */
+ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+  void** resources = &recv->transportResources;
+  struct ncclSendMem* remDevMem;
+  struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+  *resources = NULL;  // nothing for p2pFree to release unless an IPC mapping is opened below
+  if (info->direct) {
+    remDevMem = (struct ncclSendMem*)(info->directPtr);
+    recv->conn.direct = 1;
+    recv->conn.ptrExchange = &remDevMem->ptrExchange;
+  } else {
+    void* remPtr = NULL;
+    //TRACE_DUMP_IPC(&info->devIpc);
+    cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+    if (err != cudaSuccess) {  // checked before anything is stored, so a failed open leaves no resources behind
+      WARN("failed to open CUDA IPC handle : %d %s",
+          err, cudaGetErrorString(err));
+      return ncclUnhandledCudaError;
+    }
+    void** ipcPtrSave;  // remembers the mapping so p2pFree can close it
+    NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
+    *resources = ipcPtrSave;
+    *ipcPtrSave = remPtr;
+    remDevMem = (struct ncclSendMem*)remPtr;
+  }
+
+  // recv->conn->buff should have been set to devMemRecv already
+  // recv->conn->tail should have been set to devMemRecv already
+  // recv->conn->opCount should have been set to devMemRecv already
+  recv->conn.head = &remDevMem->head;
+  recv->conn.llHead = &remDevMem->llHead;
+  return ncclSuccess;
+}
+
+ncclResult_t p2pFree(void* resources) {  // release what p2pSendConnect/p2pRecvConnect stored; NULL (direct mode) is a no-op
+  if (resources == NULL) return ncclSuccess;
+  // Close the IPC mapping, then free the slot unconditionally so that a failing
+  // close is still reported but no longer leaks the allocation.
+  cudaError_t closeErr = cudaIpcCloseMemHandle(*(void**)resources);
+  free(resources);
+  CUDACHECK(closeErr);
+  return ncclSuccess;
+}
+
+struct ncclTransport p2pTransport = {  // positional initializer: the order must match struct ncclTransport's members
+  "P2P",
+  p2pFillInfo,
+  p2pCanConnect,
+  p2pGetRings,
+  { p2pSendSetup, p2pSendConnect, p2pFree, NULL },  // send side: setup, connect, free; NOTE(review): meaning of the last (NULL) slot is not visible here
+  { p2pRecvSetup, p2pRecvConnect, p2pFree, NULL }   // recv side, same layout
+};
diff --git a/src/3rd_party/nccl/src/transport/shm.cu b/src/3rd_party/nccl/src/transport/shm.cu
new file mode 100644
index 00000000..38b6c6a0
--- /dev/null
+++ b/src/3rd_party/nccl/src/transport/shm.cu
@@ -0,0 +1,270 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "utils.h"
+#include "transport.h"
+#include "param.h"
+#include "shm.h"
+#include <unistd.h>
+#include <cuda_runtime.h>
+
+struct shmInfo {  // per-rank record written by shmFillInfo and compared in shmCanConnect
+  int rank;           // rank argument passed to shmFillInfo
+  int cudaDev;        // device returned by cudaGetDevice()
+  uint64_t hostHash;  // getHostHash(); shmCanConnect rejects peers whose value differs
+  uint64_t pidHash;   // getPidHash(); embedded in the shared-memory segment names
+};
+
+struct shmSendConnectInfo {  // filled by shmRecvSetup, consumed by shmSendConnect
+  uint64_t pidHash;  // pidHash of the rank that created the "nccl-shm-recv-*" segment
+  int id;            // ring->id of that rank's ring
+  int rank;          // rank that created the segment
+  int shmSize;       // size passed to shmOpen for that segment
+};
+
+struct shmRecvConnectInfo {  // filled by shmSendSetup, consumed by shmRecvConnect
+  uint64_t pidHash;  // pidHash of the rank that created the "nccl-shm-send-*" segment
+  int id;            // ring->id of that rank's ring
+  int rank;          // rank that created the segment
+  int shmSize;       // size passed to shmOpen for that segment (sizeof(struct ncclSendMem))
+};
+
+struct shmSendResources {  // send-side state kept in ring->send.transportResources, released by shmSendFree
+  int remShmSize;                     // size of the remote segment, copied from the connect info in shmSendConnect
+  struct ncclRecvMem* remHostMem;     // remote "nccl-shm-recv-*" segment: first pointer returned by shmOpen
+  struct ncclRecvMem* devRemHostMem;  // same segment: second pointer returned by shmOpen (presumably the device-visible alias — confirm in shm.h)
+  int shmSize;                        // size of the local segment, sizeof(struct ncclSendMem)
+  struct ncclSendMem* hostMem;        // local "nccl-shm-send-*" segment opened in shmSendSetup: first shmOpen pointer
+  struct ncclSendMem* devHostMem;     // local segment: second shmOpen pointer; shmSendConnect takes head/llHead from it
+};
+
+struct shmRecvResources {  // recv-side state kept in ring->recv.transportResources, released by shmRecvFree
+  int remShmSize;                     // size of the remote segment, copied from the connect info in shmRecvConnect
+  struct ncclSendMem* remHostMem;     // remote "nccl-shm-send-*" segment: first pointer returned by shmOpen
+  struct ncclSendMem* devRemHostMem;  // same segment: second shmOpen pointer; shmRecvConnect takes head/llHead from it
+  int shmSize;                        // size of the local segment: offsetof(struct ncclRecvMem, buff) + ring->buffSize
+  struct ncclRecvMem* hostMem;        // local "nccl-shm-recv-*" segment opened in shmRecvSetup: first shmOpen pointer
+  struct ncclRecvMem* devHostMem;     // local segment: second shmOpen pointer; shmRecvConnect takes buff/llBuff/tail/opCount from it
+};
+
+/* Fill information necessary to exchange between ranks to choose whether or not
+ * to use this transport: the opaque buffer is overlaid with a struct shmInfo */
+ncclResult_t shmFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
+  struct shmInfo* info = (struct shmInfo*)opaqueInfo;
+  static_assert(sizeof(struct shmInfo) <= sizeof(ncclTinfo_t), "shm Info too large");  // the overlay must fit inside the opaque buffer
+  info->rank = rank;
+  CUDACHECK(cudaGetDevice(&info->cudaDev));  // records the calling thread's current CUDA device
+  info->hostHash=getHostHash();  // compared between peers in shmCanConnect
+  info->pidHash=getPidHash();    // later embedded in shared-memory segment names
+  return ncclSuccess;
+}
+
+NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
+
+/* Set *ret to 1 when SHM may be used with the peer (ShmDisable param is not 1 and both ranks report the same hostHash), else 0 */
+ncclResult_t shmCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
+  struct shmInfo* mine = (struct shmInfo*)myOpaqueInfo;
+  struct shmInfo* theirs = (struct shmInfo*)peerOpaqueInfo;
+  *ret = ((ncclParamShmDisable() != 1) && (mine->hostHash == theirs->hostHash)) ? 1 : 0;
+  return ncclSuccess;
+}
+
+static inline int groupFirst(int nranks, int* groups, int group, int rankToAvoid) {
+  // Lowest rank whose groups[] entry equals `group`, skipping rankToAvoid; -1 when there is none.
+  int r = 0;
+  while (r < nranks && (r == rankToAvoid || groups[r] != group)) r++;
+  return (r < nranks) ? r : -1;
+}
+
+static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid) {
+  // Highest rank whose groups[] entry equals `group`, skipping rankToAvoid; -1 when there is none.
+  int r = nranks - 1;
+  while (r >= 0 && (r == rankToAvoid || groups[r] != group)) r--;
+  return (r >= 0) ? r : -1;
+}
+
+ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {  // stitches the groups of ranks into rings via prev/next; subgroups, values, minScore and nthreads are not read here
+  if (*nringsRet == MAXRINGS) *nringsRet = 1;  // a request for MAXRINGS is reduced to a single ring
+  int nGroups = groups[nranks-1] + 1;  // assumes the last rank carries the highest group id — TODO confirm against the caller
+  int starts[nGroups];  // per group: rank through which the ring enters the group
+  int ends[nGroups];    // per group: rank through which the ring leaves the group
+  for (int ring = 0; ring<*nringsRet; ring++) {
+    int startGroup = -1, endGroup = -1;  // last group seen with a rank that already has a prev / next link
+    for (int group = 0; group<nGroups; group++) {
+      int start = -1;
+      int end = -1;
+      int nranksInGroup = 0;
+      for (int rank=0; rank<nranks; rank++) {
+        if (groups[rank] != group) continue;
+        nranksInGroup++;
+        if (prev[ring*nranks+rank] != -1) {  // rank already has a predecessor: it becomes the group's start
+          if (start != -1) {
+            WARN("Multiple starts found in group");
+          }
+          start = rank;
+          startGroup = group;
+        }
+        if (next[ring*nranks+rank] != -1) {  // rank already has a successor: it becomes the group's end
+          if (end != -1) {
+            WARN("Multiple ends found in group");
+          }
+          end = rank;
+          endGroup = group;
+        }
+      }
+      if (nranksInGroup == 1) {
+        start = end = groupFirst(nranks, groups, group, -1);  // a single-rank group enters and leaves through that rank
+      } else {
+        if (start == -1)
+          start = groupFirst(nranks, groups, group, end);  // lowest rank of the group that is not the end
+        if (end == -1)
+          end = groupLast(nranks, groups, group, start);  // highest rank of the group that is not the start
+      }
+      if (start == -1 || end == -1) {
+        *nringsRet = ring;  // keep only the rings completed so far
+        return ncclSuccess;
+      }
+      starts[group] = start;
+      ends[group] = end;
+    }
+    if (endGroup == -1 || startGroup == -1) {  // no usable pre-existing links: run from group 0 to the last group
+      startGroup = 0;
+      endGroup = nGroups-1;
+      // Close the loop
+      next[ring*nranks+ends[endGroup]] = starts[startGroup];
+      prev[ring*nranks+starts[startGroup]] = ends[endGroup];
+    }
+    int group = startGroup;
+    for (int i=0; i<nGroups-2; i++) {  // chain the groups that lie between startGroup and endGroup
+      int nextGroup = (group+1)%nGroups;
+      if (nextGroup == endGroup) nextGroup = (nextGroup+1)%nGroups;  // endGroup is skipped here and linked last
+      next[ring*nranks+ends[group]] = starts[nextGroup];
+      prev[ring*nranks+starts[nextGroup]] = ends[group];
+      group = nextGroup;
+    }
+    // Connect with the last
+    next[ring*nranks+ends[group]] = starts[endGroup];
+    prev[ring*nranks+starts[endGroup]] = ends[group];
+  }
+  return ncclSuccess;
+}
+
+#define MAX_SHM_NAME_LEN 1024
+
+/* Create the sender's shared-memory segment and return, in connectInfo, what the peer needs to open it */
+ncclResult_t shmSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+  struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
+  struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo;  // only used for the INFO log line below
+
+  struct shmSendResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  ring->send.transportResources = resources;  // stored before shmOpen; shmSendConnect and shmSendFree read it back
+
+  struct shmRecvConnectInfo info;  // "Recv" because shmRecvConnect is the reader of this record
+  char shmName[MAX_SHM_NAME_LEN];
+  sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);  // shmRecvConnect rebuilds the same name from the fields copied below
+  info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
+  TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
+  NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));  // last argument is 1 here and 0 in the connect functions (presumably "create" — confirm in shm.h)
+
+  INFO(INIT|SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+  info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
+  static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");  // the record must fit in the opaque connect buffer
+  memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
+  return ncclSuccess;
+}
+
+ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {  // creates the receiver's segment and returns, in connectInfo, what the peer needs to open it
+  struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
+  struct shmRecvResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  ring->recv.transportResources = resources;  // stored before shmOpen; shmRecvConnect and shmRecvFree read it back
+
+  struct shmSendConnectInfo info;  // "Send" because shmSendConnect is the reader of this record
+
+  char shmName[MAX_SHM_NAME_LEN];
+  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);  // shmSendConnect rebuilds the same name from the fields copied below
+  info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;  // ncclRecvMem header plus the ring's data buffer
+  TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
+  NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+
+  info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
+  static_assert(sizeof(struct shmSendConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");  // check the struct that is actually copied below
+  memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo));
+  return ncclSuccess;
+}
+
+/* Connect to this peer: open the segment named by connectInfo and point send->conn at both segments */
+ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+  // Setup device pointers
+  struct shmSendConnectInfo* info = (struct shmSendConnectInfo*)connectInfo;
+  struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;  // allocated earlier by shmSendSetup
+
+  char shmName[MAX_SHM_NAME_LEN];
+  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank);  // same name pattern shmRecvSetup used when creating the segment
+  resources->remShmSize = info->shmSize;
+  TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
+  NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+  // Remove the file to ensure proper clean-up
+  NCCLCHECK(shmUnlink(shmName));
+
+  send->transportResources = resources;  // writes back the pointer read above (no change in value)
+  send->conn.buff = resources->devRemHostMem->buff;  // data buffers and tail/opCount come from the remote (recv) segment
+  send->conn.llBuff = resources->devRemHostMem->llBuff;
+  send->conn.tail = &resources->devRemHostMem->tail;
+  send->conn.opCount = &resources->devRemHostMem->opCount;
+
+  send->conn.head = &resources->devHostMem->head;  // head/llHead come from the local (send) segment
+  send->conn.llHead = &resources->devHostMem->llHead;
+  return ncclSuccess;
+}
+
+ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {  // recv-side counterpart of shmSendConnect
+  // Setup device pointers
+  struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;  // allocated earlier by shmRecvSetup
+  struct shmRecvConnectInfo* info = (struct shmRecvConnectInfo*)connectInfo;
+
+  char shmName[MAX_SHM_NAME_LEN];
+  sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank);  // same name pattern shmSendSetup used when creating the segment
+  resources->remShmSize = info->shmSize;
+  TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
+  NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+  NCCLCHECK(shmUnlink(shmName));  // unlink the name right after opening, as shmSendConnect does
+  recv->conn.head = &resources->devRemHostMem->head;  // head/llHead come from the remote (send) segment
+  recv->conn.llHead = &resources->devRemHostMem->llHead;
+
+  recv->conn.buff = resources->devHostMem->buff;  // data buffers and tail/opCount come from the local (recv) segment
+  recv->conn.llBuff = resources->devHostMem->llBuff;
+  recv->conn.tail = &resources->devHostMem->tail;
+  recv->conn.opCount = &resources->devHostMem->opCount;
+  return ncclSuccess;
+}
+
+ncclResult_t shmSendFree(void* transportResources) {  // closes both segments of a send connection, then frees the bookkeeping struct
+  struct shmSendResources* resources = (struct shmSendResources*)transportResources;
+  NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));  // local segment opened in shmSendSetup
+  NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));  // remote segment opened in shmSendConnect
+  free(resources);  // NOTE(review): looks skipped when a close above fails (NCCLCHECK presumably returns early) — confirm whether that leak is acceptable
+  return ncclSuccess;
+}
+
+ncclResult_t shmRecvFree(void* transportResources) {  // closes both segments of a recv connection, then frees the bookkeeping struct
+  struct shmRecvResources* resources = (struct shmRecvResources*)transportResources;
+  NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));  // local segment opened in shmRecvSetup
+  NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));  // remote segment opened in shmRecvConnect
+  free(resources);  // NOTE(review): looks skipped when a close above fails (NCCLCHECK presumably returns early) — confirm whether that leak is acceptable
+  return ncclSuccess;
+}
+
+struct ncclTransport shmTransport = {  // entry-point table for the shared-memory transport
+  "SHM",
+  shmFillInfo,
+  shmCanConnect,
+  shmGetRings,
+  { shmSendSetup, shmSendConnect, shmSendFree, NULL },  // send side: setup, connect, free; last slot left NULL
+  { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL }   // recv side: same layout
+};
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 81e60bf4..09864161 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -168,7 +168,6 @@ if(USE_STATIC_LIBS)
                 "marian-conv"
     DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv)
   add_custom_target(marian_tgz DEPENDS "${CMAKE_BINARY_DIR}/marian.tgz")
-
   add_custom_target(philly DEPENDS marian_tgz marian_zip)
 endif()
 
@@ -182,7 +181,7 @@ endif(COMPILE_SERVER)
 foreach(exec ${EXECUTABLES})
   target_link_libraries(${exec} marian ${EXT_LIBS} ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})
   if(CUDA_FOUND)
-    target_link_libraries(${exec} marian marian_cuda ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})
+    target_link_libraries(${exec} marian marian_cuda $<$<BOOL:${USE_NCCL}>:nccl> ${EXT_LIBS} ${CMAKE_THREAD_LIBS_INIT})  # the imported nccl target only exists when USE_NCCL is on; a bare name would become -lnccl
     cuda_add_cublas_to_target(${exec})
   endif(CUDA_FOUND)
   set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
diff --git a/src/examples/CMakeLists.txt b/src/examples/CMakeLists.txt
index bcce6083..58f7dc7a 100644
--- a/src/examples/CMakeLists.txt
+++ b/src/examples/CMakeLists.txt
@@ -4,7 +4,7 @@ add_executable(mnist_example mnist/mnist_ffnn.cpp)
 foreach(exec iris_example mnist_example)
   target_link_libraries(${exec} marian ${EXT_LIBS})
   if(CUDA_FOUND)
-    target_link_libraries(${exec} marian marian_cuda ${EXT_LIBS})
+    target_link_libraries(${exec} marian marian_cuda $<$<BOOL:${USE_NCCL}>:nccl> ${EXT_LIBS})  # the imported nccl target only exists when USE_NCCL is on; a bare name would become -lnccl
     cuda_add_cublas_to_target(${exec})
   endif(CUDA_FOUND)
   set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index 2ca5c75a..1c845e26 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -43,7 +43,7 @@ foreach(exec
     )
     target_link_libraries(${exec} marian ${EXT_LIBS} Catch)
     if(CUDA_FOUND)
-      target_link_libraries(${exec} marian marian_cuda ${EXT_LIBS} Catch)
+      target_link_libraries(${exec} marian marian_cuda $<$<BOOL:${USE_NCCL}>:nccl> ${EXT_LIBS} Catch)  # the imported nccl target only exists when USE_NCCL is on; a bare name would become -lnccl
       cuda_add_cublas_to_target(${exec})
     endif(CUDA_FOUND)
 
diff --git a/src/training/communicator_nccl.h b/src/training/communicator_nccl.h
index 84c5189f..ccb473a0 100755
--- a/src/training/communicator_nccl.h
+++ b/src/training/communicator_nccl.h
@@ -5,7 +5,7 @@
 #include "tensors/gpu/cuda_helpers.h"
 
 #include "cuda_runtime.h"
-#include "nccl.h"
+#include "nccl/build/include/nccl.h"
 #if (NCCL_MAJOR<3 || NCCL_MINOR<2)
 #define ncclGetVersion(pv) (*(pv) = (NCCL_MAJOR * 1000 + NCCL_MINOR * 100 + NCCL_PATCH))
 #endif
author	Marcin Junczys-Dowmunt <marcinjd@microsoft.com>	2018-12-07 00:22:00 +0300
committer	Marcin Junczys-Dowmunt <marcinjd@microsoft.com>	2018-12-07 00:22:00 +0300
commit	d43ffce6080170e970befe5c565bd9d2c05c4e03 (patch)
tree	346097ecf1890efedf73ca78779d959b19f72f86
parent	3efcfdf7f471f80fa73757615116dcc72e0aec05 (diff)
parent	1b2968c8b9465ce2225f304f5deea7f642f3e533 (diff)