github.com/marian-nmt/nccl.git
author    Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com>  2020-01-07 02:48:51 +0300
committer Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com>  2020-01-07 02:49:13 +0300
commit    7bed5ac058c3ca3c4617d133f3240b6981f95428 (patch)
tree      49e7c6b2e08790b5d7b9eda124ce6cd130c87f1f
parent    8e3a3f7c5b520babff49cec54a866fa3eda3a3b6 (diff)
parent    3899f6e0f219b0cbf58537f791b0dd104b377750 (diff)
merge with newest master
-rw-r--r--  LICENSE.txt | 2
-rw-r--r--  Makefile | 2
-rw-r--r--  README.md | 4
-rw-r--r--  ext-net/dummy/Makefile | 2
-rw-r--r--  ext-net/dummy/plugin.c | 2
-rw-r--r--  makefiles/common.mk | 18
-rw-r--r--  makefiles/formatting.mk | 2
-rw-r--r--  makefiles/version.mk | 6
-rw-r--r--  pkg/Makefile | 2
-rw-r--r--  pkg/debian/Makefile | 11
-rw-r--r--  pkg/redhat/Makefile | 2
-rw-r--r--  pkg/redhat/nccl.spec.in | 7
-rw-r--r--  pkg/srctxz/Makefile | 2
-rw-r--r--  pkg/srctxz/create_srctxz.sh.in | 2
-rw-r--r--  pkg/txz/Makefile | 2
-rw-r--r--  pkg/txz/create_txz.sh.in | 2
-rw-r--r--  src/Makefile | 56
-rw-r--r--  src/bootstrap.cc (renamed from src/bootstrap.cu) | 224
-rw-r--r--  src/channel.cc (renamed from src/channel.cu) | 7
-rw-r--r--  src/collectives/all_gather.cc (renamed from src/collectives/all_gather.cu) | 2
-rw-r--r--  src/collectives/all_reduce.cc (renamed from src/collectives/all_reduce.cu) | 3
-rw-r--r--  src/collectives/broadcast.cc (renamed from src/collectives/broadcast.cu) | 2
-rw-r--r--  src/collectives/device/Makefile | 4
-rw-r--r--  src/collectives/device/all_gather.cu | 2
-rw-r--r--  src/collectives/device/all_gather.h | 80
-rw-r--r--  src/collectives/device/all_reduce.cu | 2
-rw-r--r--  src/collectives/device/all_reduce.h | 195
-rw-r--r--  src/collectives/device/broadcast.cu | 2
-rw-r--r--  src/collectives/device/broadcast.h | 60
-rw-r--r--  src/collectives/device/common.h | 37
-rw-r--r--  src/collectives/device/common_kernel.h | 6
-rw-r--r--  src/collectives/device/functions.cu | 19
-rwxr-xr-x  src/collectives/device/gen_rules.sh | 2
-rw-r--r--  src/collectives/device/op128.h | 36
-rw-r--r--  src/collectives/device/primitives.h | 501
-rw-r--r--  src/collectives/device/prims_ll.h | 259
-rw-r--r--  src/collectives/device/prims_ll128.h | 410
-rw-r--r--  src/collectives/device/reduce.cu | 2
-rw-r--r--  src/collectives/device/reduce.h | 57
-rw-r--r--  src/collectives/device/reduce_scatter.cu | 2
-rw-r--r--  src/collectives/device/reduce_scatter.h | 75
-rw-r--r--  src/collectives/reduce.cc (renamed from src/collectives/reduce.cu) | 2
-rw-r--r--  src/collectives/reduce_scatter.cc (renamed from src/collectives/reduce_scatter.cu) | 2
-rw-r--r--  src/debug.cc | 169
-rw-r--r--  src/enqueue.cc (renamed from src/enqueue.cu) | 176
-rw-r--r--  src/graph/connect.cc | 268
-rw-r--r--  src/graph/paths.cc | 363
-rw-r--r--  src/graph/rings.cc | 57
-rw-r--r--  src/graph/rings.h | 7
-rw-r--r--  src/graph/search.cc | 594
-rw-r--r--  src/graph/topo.cc | 641
-rw-r--r--  src/graph/topo.h | 138
-rw-r--r--  src/graph/trees.cc (renamed from src/misc/trees.cu) | 6
-rw-r--r--  src/graph/tuning.cc | 212
-rw-r--r--  src/group.cc (renamed from src/misc/group.cu) | 83
-rw-r--r--  src/include/alloc.h | 51
-rw-r--r--  src/include/argcheck.h | 16
-rw-r--r--  src/include/bootstrap.h | 4
-rw-r--r--  src/include/channel.h | 4
-rw-r--r--  src/include/checks.h | 71
-rw-r--r--  src/include/collectives.h (renamed from src/collectives/collectives.h) | 10
-rw-r--r--  src/include/comm.h | 141
-rw-r--r--  src/include/core.h | 435
-rw-r--r--  src/include/cpuset.h | 2
-rw-r--r--  src/include/debug.h | 117
-rw-r--r--  src/include/devcomm.h | 214
-rw-r--r--  src/include/enqueue.h | 5
-rw-r--r--  src/include/graph.h | 94
-rw-r--r--  src/include/group.h | 6
-rw-r--r--  src/include/ibvwrap.h | 2
-rw-r--r--  src/include/info.h | 50
-rw-r--r--  src/include/nccl_net.h | 9
-rw-r--r--  src/include/net.h | 40
-rw-r--r--  src/include/nvlink.h | 123
-rw-r--r--  src/include/nvmlwrap.h | 28
-rw-r--r--  src/include/param.h | 3
-rw-r--r--  src/include/rings.h | 17
-rw-r--r--  src/include/shm.h | 2
-rw-r--r--  src/include/socket.h | 65
-rw-r--r--  src/include/topo.h | 59
-rw-r--r--  src/include/transport.h | 29
-rw-r--r--  src/include/trees.h | 2
-rw-r--r--  src/include/utils.h | 21
-rw-r--r--  src/init.cc | 934
-rw-r--r--  src/init.cu | 1240
-rw-r--r--  src/misc/argcheck.cc (renamed from src/misc/checks.cu) | 5
-rw-r--r--  src/misc/ibvwrap.cc (renamed from src/misc/ibvwrap.cu) | 2
-rw-r--r--  src/misc/nvmlwrap.cc (renamed from src/misc/nvmlwrap.cu) | 60
-rw-r--r--  src/misc/rings.cu | 385
-rw-r--r--  src/misc/utils.cc | 194
-rw-r--r--  src/misc/utils.cu | 203
-rw-r--r--  src/nccl.h.in | 8
-rwxr-xr-x  src/nccl.pc.in | 10
-rw-r--r--  src/transport.cc (renamed from src/transport.cu) | 11
-rw-r--r--  src/transport/net.cc (renamed from src/transport/net.cu) | 337
-rw-r--r--  src/transport/net_ib.cc (renamed from src/transport/net_ib.cu) | 40
-rw-r--r--  src/transport/net_socket.cc | 503
-rw-r--r--  src/transport/net_socket.cu | 267
-rw-r--r--  src/transport/p2p.cc | 312
-rw-r--r--  src/transport/p2p.cu | 620
-rw-r--r--  src/transport/shm.cc | 175
-rw-r--r--  src/transport/shm.cu | 245
102 files changed, 7275 insertions, 4727 deletions
diff --git a/LICENSE.txt b/LICENSE.txt
index 3593a7a..e318c66 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,5 +1,5 @@
- Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
diff --git a/Makefile b/Makefile
index 605e3bf..caed3d4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/README.md b/README.md
index fa51453..7f0a72f 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ To install NCCL on the system, create a package then install it as root.
Debian/Ubuntu :
```shell
$ # Install tools to create debian packages
-$ sudo apt install build-essential devscripts debhelper
+$ sudo apt install build-essential devscripts debhelper fakeroot
$ # Build NCCL deb package
$ make pkg.debian.build
$ ls build/pkg/deb/
@@ -89,4 +89,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
## Copyright
-All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
diff --git a/ext-net/dummy/Makefile b/ext-net/dummy/Makefile
index d1eb4c5..efa841c 100644
--- a/ext-net/dummy/Makefile
+++ b/ext-net/dummy/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/ext-net/dummy/plugin.c b/ext-net/dummy/plugin.c
index f11b365..67d7d88 100644
--- a/ext-net/dummy/plugin.c
+++ b/ext-net/dummy/plugin.c
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/makefiles/common.mk b/makefiles/common.mk
index a81a1b5..37e81be 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -15,6 +15,7 @@ PROFAPI ?= 0
NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
@@ -24,8 +25,7 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
# Better define NVCC_GENCODE in your environment to the minimal set
# of archs to reduce compile time.
-CUDA8_GENCODE = -gencode=arch=compute_30,code=sm_30 \
- -gencode=arch=compute_35,code=sm_35 \
+CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61
@@ -42,9 +42,13 @@ else
endif
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
-CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
-CXXFLAGS += -Wall -Wno-sign-compare -Wno-unused-function
-NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
+CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
+CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
+CXXFLAGS += -I $(CUDA_INC)
+# Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors)
+# 512 : 120, 640 : 96, 768 : 80, 1024 : 60
+# We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions.
+NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
# Use addprefix so that we can specify more than one path
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
@@ -67,7 +71,7 @@ CXXFLAGS += -O0 -g -ggdb3
endif
ifneq ($(VERBOSE), 0)
-NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
+NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
CXXFLAGS += -Wall -Wextra
else
.SILENT:
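Aside on the `-maxrregcount=96` / `NCCL_MAX_NTHREADS` comment added in the hunk above: capping registers per thread is what keeps a block of up to 640 threads launchable, since the per-SM register file (64K 32-bit registers on recent GPUs) divided by the thread count bounds how many registers each thread may use. A minimal, hypothetical sketch of the `__launch_bounds__` alternative the comment mentions (kernel name and thread count are illustrative, not from this repository):

```cuda
// Hypothetical sketch: per-kernel register cap via __launch_bounds__ instead of a
// file-wide -maxrregcount. With roughly 64K registers per SM and 640 threads per
// block, the compiler keeps register usage low enough for the block to launch,
// which is why the comment above pairs 640 threads with -maxrregcount=96.
#define EXAMPLE_NTHREADS 640  // illustrative stand-in for NCCL_MAX_NTHREADS

__global__ void __launch_bounds__(EXAMPLE_NTHREADS)
exampleKernel(float* buf, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) buf[i] *= 2.0f;  // trivial body; the launch-bounds attribute is the point
}
```

As the comment notes, `__launch_bounds__` only applies to `__global__` kernels, so register pressure inside `__device__` functions still has to be handled by the global `-maxrregcount` flag.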
diff --git a/makefiles/formatting.mk b/makefiles/formatting.mk
index 4a4ab88..a543131 100644
--- a/makefiles/formatting.mk
+++ b/makefiles/formatting.mk
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/makefiles/version.mk b/makefiles/version.mk
index a8c6e3a..80f6221 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
-NCCL_MINOR := 4
-NCCL_PATCH := 2
+NCCL_MINOR := 5
+NCCL_PATCH := 6
NCCL_SUFFIX :=
-PKG_REVISION := 1
+PKG_REVISION := 2
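For reference, the bumped version (2.5.6) is what later gets packed into the single integer emitted into the generated nccl.h; the packing rule is quoted in the src/Makefile hunk further down (`NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))`). A small illustrative check, not part of the diff:

```cuda
// Illustrative only: reproduces the version packing rule quoted from src/Makefile.
#include <cstdio>

#define NCCL_VERSION_EXAMPLE(X, Y, Z) ((X) * 1000 + (Y) * 100 + (Z))

int main() {
  // 2.5.6 from makefiles/version.mk above -> 2506
  printf("%d\n", NCCL_VERSION_EXAMPLE(2, 5, 6));
  return 0;
}
```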
diff --git a/pkg/Makefile b/pkg/Makefile
index 04b23da..ab6487b 100644
--- a/pkg/Makefile
+++ b/pkg/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/pkg/debian/Makefile b/pkg/debian/Makefile
index 439635f..0494f3e 100644
--- a/pkg/debian/Makefile
+++ b/pkg/debian/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -16,13 +16,8 @@ DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN)
DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))
PKG_TIMESTAMP := $(shell date -R)
-ARCH := $(shell uname -m)
-PKG_ARCH ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g")
-PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch)
-ifeq ($(PKG_MULTIARCH),)
-# Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it
-PKG_MULTIARCH := $(ARCH)-linux-gnu
-endif
+PKG_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
+PKG_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH)
prep : $(DEBTARGETS)
$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
diff --git a/pkg/redhat/Makefile b/pkg/redhat/Makefile
index ffcc973..0808478 100644
--- a/pkg/redhat/Makefile
+++ b/pkg/redhat/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in
index f9d83a3..5fad346 100644
--- a/pkg/redhat/nccl.spec.in
+++ b/pkg/redhat/nccl.spec.in
@@ -7,6 +7,7 @@ Group: Development/Libraries
License: BSD
URL: http://developer.nvidia.com/nccl
Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz
+Prereq: /sbin/ldconfig
%description
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
@@ -50,6 +51,12 @@ ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
# static
install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir}
+%post -p /sbin/ldconfig
+%postun -p /sbin/ldconfig
+
+%post devel -p /sbin/ldconfig
+%postun devel -p /sbin/ldconfig
+
%clean
rm -rf $RPM_BUILD_ROOT
diff --git a/pkg/srctxz/Makefile b/pkg/srctxz/Makefile
index ed677fe..01cab95 100644
--- a/pkg/srctxz/Makefile
+++ b/pkg/srctxz/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/pkg/srctxz/create_srctxz.sh.in b/pkg/srctxz/create_srctxz.sh.in
index ae7d01f..11bdd52 100644
--- a/pkg/srctxz/create_srctxz.sh.in
+++ b/pkg/srctxz/create_srctxz.sh.in
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/pkg/txz/Makefile b/pkg/txz/Makefile
index fa587ef..b7d9aa5 100644
--- a/pkg/txz/Makefile
+++ b/pkg/txz/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/pkg/txz/create_txz.sh.in b/pkg/txz/create_txz.sh.in
index 73922e0..deae854 100644
--- a/pkg/txz/create_txz.sh.in
+++ b/pkg/txz/create_txz.sh.in
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/src/Makefile b/src/Makefile
index fe60b11..b11de5e 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -9,34 +9,40 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
-LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \
- misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \
- transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
- collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
+ misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
+ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
+ collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
+ graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc
##### lib files
LIBNAME := libnccl.so
STATICLIBNAME := libnccl_static.a
+##### pkgconfig files
+PKGCONFIGFILE := nccl.pc
##### dirs
BUILDDIR ?= $(abspath ../build)
INCDIR := $(BUILDDIR)/include
LIBDIR := $(BUILDDIR)/lib
OBJDIR := $(BUILDDIR)/obj
+PKGDIR := $(BUILDDIR)/lib/pkgconfig
##### target files
+CUDARTLIB ?= cudart_static
INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
STATICLIBTARGET := $(STATICLIBNAME)
-LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
+PKGTARGET := $(PKGCONFIGFILE)
+LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
DEPFILES := $(LIBOBJ:%.o=%.d)
-LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
+LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl
DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a
##### rules
build : lib staticlib
-lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
+lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET)
staticlib : $(LIBDIR)/$(STATICLIBTARGET)
@@ -53,7 +59,7 @@ $(INCDIR)/nccl.h : nccl.h.in
# NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))
@$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH)))
mkdir -p $(INCDIR)
- printf "Generating %-35s > %s\n" $< $@
+ @printf "Generating %-35s > %s\n" $< $@
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
@@ -77,37 +83,53 @@ $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
ar cr $@ $(LIBOBJ) $(TMP)/*.o
rm -Rf $(TMP)
+$(PKGDIR)/nccl.pc : nccl.pc.in
+ mkdir -p $(PKGDIR)
+ @printf "Generating %-35s > %s\n" $< $@
+ sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \
+ -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
+ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
+ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
+ $< > $@
+
$(INCDIR)/%.h : %.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)
- cp -f $< $@
+ install -m 644 $< $@
$(INCDIR)/nccl_%.h : include/nccl_%.h
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(INCDIR)
- cp -f $< $@
+ install -m 644 $< $@
+
+$(PKGDIR)/%.pc : %.pc
+ @printf "Grabbing %-35s > %s\n" $< $@
+ mkdir -p $(PKGDIR)
+ install -m 644 $< $@
-$(OBJDIR)/%.o : %.cu
+$(OBJDIR)/%.o : %.cc
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
- $(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
- @$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
+ $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
+ @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
@rm -f $(@:%.o=%.d.tmp)
clean :
- rm -rf ${INCDIR} ${LIBDIR} ${OBJDIR}
$(MAKE) -C collectives/device clean
+ rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR}
install : lib
mkdir -p $(PREFIX)/lib
+ mkdir -p $(PREFIX)/lib/pkgconfig
mkdir -p $(PREFIX)/include
- cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
+ cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/
+ cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
-FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
+FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
# Note that formatting.mk defines a new target so in order to not overwrite the default target,
# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
# as the BUILDDIR variable.
diff --git a/src/bootstrap.cu b/src/bootstrap.cc
index 6b1d573..11ffc35 100644
--- a/src/bootstrap.cu
+++ b/src/bootstrap.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,48 +9,142 @@
#include "utils.h"
#include "bootstrap.h"
#include "net.h"
+#include "socket.h"
#include <unistd.h>
#include <sys/types.h>
-// Always use sockets for bootstrap
-ncclNet_t* ncclBootstrapNet = &ncclNetSocket;
+struct bootstrapNetComm {
+ int fd;
+};
-static ncclResult_t bootstrapNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
-static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
+/* Init functions */
+static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
+static union socketAddress bootstrapNetIfAddrs[MAX_IFS];
+static int bootstrapNetIfs = -1;
+pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
+
+ncclResult_t bootstrapNetInit() {
+ if (bootstrapNetIfs == -1) {
+ pthread_mutex_lock(&bootstrapNetLock);
+ if (bootstrapNetIfs == -1) {
+ bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
+ if (bootstrapNetIfs <= 0) {
+ WARN("Bootstrap : no socket interface found");
+ return ncclInternalError;
+ } else {
+ char line[1024];
+ char addrline[1024];
+ line[0] = '\0';
+ for (int i=0; i<bootstrapNetIfs; i++) {
+ snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, bootstrapNetIfNames+i*MAX_IF_NAME_SIZE,
+ socketToString(&bootstrapNetIfAddrs[i].sa, addrline));
+ }
+ line[1023] = '\0';
+ INFO(NCCL_INIT, "Bootstrap : Using%s", line);
+ }
+ }
+ pthread_mutex_unlock(&bootstrapNetLock);
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetNewComm(struct bootstrapNetComm** comm) {
+ NCCLCHECK(ncclCalloc(comm, 1));
+ (*comm)->fd = -1;
+ return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) {
+ if (dev >= bootstrapNetIfs) return ncclInternalError;
+ memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr));
+ return ncclSuccess;
+}
+
+/* Socket Interface Selection type */
+enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
+
+static ncclResult_t bootstrapNetListen(int dev, ncclNetHandle_t* netHandle, void** listenComm) {
+ union socketAddress* connectAddr = (union socketAddress*) netHandle;
+ static_assert(sizeof(union socketAddress) < NCCL_NET_HANDLE_MAXSIZE, "union socketAddress size is too large");
+ // if dev >= 0, listen based on dev
+ if (dev >= 0) {
+ NCCLCHECK(bootstrapNetGetSocketAddr(dev, connectAddr));
+ } else if (dev == findSubnetIf) {
+ // handle stores a remote address
+ // need to find a local addr that is in the same network as the remote addr
+ union socketAddress localAddr;
+ char ifName[MAX_IF_NAME_SIZE];
+ if (findInterfaceMatchSubnet(ifName, &localAddr, connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+ WARN("NET/Socket : No usable listening interface found");
+ return ncclSystemError;
+ }
+ // pass the local address back
+ memcpy(connectAddr, &localAddr, sizeof(localAddr));
+ } // Otherwise, handle stores a local address
+ struct bootstrapNetComm* comm;
+ NCCLCHECK(bootstrapNetNewComm(&comm));
+ NCCLCHECK(createListenSocket(&comm->fd, connectAddr));
+ *listenComm = comm;
+ return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetConnect(int dev, ncclNetHandle_t* netHandle, void** sendComm) {
+ union socketAddress* connectAddr = (union socketAddress*) netHandle;
+ struct bootstrapNetComm* comm;
+ NCCLCHECK(bootstrapNetNewComm(&comm));
+ NCCLCHECK(connectAddress(&comm->fd, connectAddr));
+ *sendComm = comm;
+ return ncclSuccess;
+}
-// Additional sync functions based on async + test for bootstrap, using host ptrs.
+static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) {
+ struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm;
+ struct bootstrapNetComm* rComm;
+ NCCLCHECK(bootstrapNetNewComm(&rComm));
+ struct sockaddr_in sockaddr;
+ socklen_t socklen = sizeof(struct sockaddr_in);
+ SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
+ *recvComm = rComm;
+ return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetClose(void* opaqueComm) {
+ struct bootstrapNetComm* comm = (struct bootstrapNetComm*)opaqueComm;
+ if (comm) {
+ close(comm->fd);
+ free(comm);
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(bootstrapNetClose(sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(bootstrapNetClose(recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(bootstrapNetClose(listenComm)); return ncclSuccess; }
+
+// Additional sync functions
static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
- void* request, *mhandle;
- NCCLCHECK(ncclBootstrapNet->regMr(sendComm, data, size, NCCL_PTR_HOST, &mhandle));
- NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, mhandle, &request));
- NCCLCHECK(ncclBootstrapNet->deregMr(sendComm, mhandle));
- int done = 0;
- while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL));
+ struct bootstrapNetComm* comm = (struct bootstrapNetComm*)sendComm;
+ NCCLCHECK(socketSend(comm->fd, &size, sizeof(int)));
+ NCCLCHECK(socketSend(comm->fd, data, size));
return ncclSuccess;
}
static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
- void* request, *mhandle;
- NCCLCHECK(ncclBootstrapNet->regMr(recvComm, data, size, NCCL_PTR_HOST, &mhandle));
- NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, mhandle, &request));
- NCCLCHECK(ncclBootstrapNet->deregMr(recvComm, mhandle));
- int done = 0;
- while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL));
+ struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm;
+ int recvSize;
+ NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int)));
+ if (recvSize > size) {
+ WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
+ return ncclInternalError;
+ }
+ NCCLCHECK(socketReceive(comm->fd, data, std::min(recvSize, size)));
return ncclSuccess;
}
-struct extId {
- ncclNetHandle_t extHandleRoot;
- void* extListenComm;
- uint64_t hostHash;
- pid_t pid;
- int fd;
- pthread_t boostrapThread;
-};
+ncclResult_t bootstrapNetCreateHandle(ncclNetHandle_t* netHandle, const char* str) {
+ union socketAddress* connectAddr = (union socketAddress*) netHandle;
+ NCCLCHECK(GetSocketAddrFromString(connectAddr, str));
+ return ncclSuccess;
+}
struct extInfo {
int rank;
@@ -69,9 +163,8 @@ static ncclResult_t setFilesLimit() {
return ncclSuccess;
}
-static void *bootstrapRoot(void* commId) {
+static void *bootstrapRoot(void* listenComm) {
struct extInfo info;
- struct extId* id = (struct extId*)commId;
ncclNetHandle_t *rankHandles = NULL;
ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
ncclNetHandle_t zero = { 0 }; // for sanity checking
@@ -83,7 +176,7 @@ static void *bootstrapRoot(void* commId) {
/* Receive addresses from all ranks */
int nranks = 0, c = 0;
do {
- NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out);
+ NCCLCHECKGOTO(bootstrapNetAccept(listenComm, &tmpComm), res, out);
NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
@@ -108,22 +201,22 @@ static void *bootstrapRoot(void* commId) {
memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
++c;
+ TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks);
} while (c < nranks);
- TRACE(NCCL_INIT, "COLLECTED HANDLES");
+ TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks);
// Send the connect handle for the next rank in the AllGather ring
for (int r=0; r<nranks; ++r) {
int next = (r+1) % nranks;
void *tmpSendComm;
- NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out);
+ NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out);
NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
}
- TRACE(NCCL_INIT, "SENT OUT HANDLES");
+ TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
out:
- bootstrapNetCloseListen(id->extListenComm);
- free(commId);
+ bootstrapNetCloseListen(listenComm);
if (rankHandles) free(rankHandles);
if (rankHandlesRoot) free(rankHandlesRoot);
@@ -131,31 +224,28 @@ out:
return NULL;
}
-ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
- struct extId* id = (struct extId*)commId;
- id->hostHash = getHostHash();
- NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
- ncclUniqueId* threadIdCopy;
- NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
- memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
- pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy);
+ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
+ ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
+ void* listenComm;
+ NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm));
+ pthread_t thread;
+ pthread_create(&thread, NULL, bootstrapRoot, listenComm);
return ncclSuccess;
}
-ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
- static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
- extId* id = (extId*)out;
+ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
+ static_assert(sizeof(ncclNetHandle_t) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
+ memset(id, 0, sizeof(ncclUniqueId));
+ ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
char* env = getenv("NCCL_COMM_ID");
if (env) {
- if (ncclSocketCreateHandle(&id->extHandleRoot, env) != 0) {
+ if (bootstrapNetCreateHandle(netHandle, env) != 0) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
- id->pid = -1;
} else {
- id->pid = getpid();
- NCCLCHECK(bootstrapCreateRoot(out, false));
+ NCCLCHECK(bootstrapCreateRoot(id, false));
}
return ncclSuccess;
@@ -178,9 +268,9 @@ struct extState {
int dev;
};
-ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) {
- struct extId* id = (struct extId*)commId;
- bool idFromEnv = id->pid < 0;
+ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
+ ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
+ bool idFromEnv = getenv("NCCL_COMM_ID") != NULL;
struct extState* state;
NCCLCHECK(ncclCalloc(&state, 1));
state->rank = rank;
@@ -195,8 +285,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
void *tmpSendComm, *tmpRecvComm;
// Pass the remote address to listen via info
if (idFromEnv) {
- memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t));
- memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+ memcpy(&info.extHandleListen, netHandle, sizeof(ncclNetHandle_t));
+ memcpy(&info.extHandleListenRoot, netHandle, sizeof(ncclNetHandle_t));
}
// listen will return the local address via info (specify interface type 'findSubnetIf')
state->dev = idFromEnv ? findSubnetIf : 0;
@@ -215,7 +305,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
}
// send info on my listening socket to root
- NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm));
+ NCCLCHECK(bootstrapNetConnect(state->dev, netHandle, &tmpSendComm));
NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
@@ -226,7 +316,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
- NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
+ NCCLCHECK(bootstrapNetConnect(state->dev, &extHandleNext, &state->extBstrapRingSendComm));
// Accept the connect request from the previous rank in the AllGather ring
NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
@@ -269,7 +359,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
struct extState* state = (struct extState*)commState;
void* tmpSendComm;
- NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm));
+ NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles+peer, &tmpSendComm));
NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
@@ -357,3 +447,13 @@ ncclResult_t bootstrapClose(void* commState) {
return ncclSuccess;
}
+
+ncclResult_t bootstrapAbort(void* commState) {
+ struct extState* state = (struct extState*)commState;
+ bootstrapNetCloseListen(state->extBstrapListenComm);
+ bootstrapNetCloseSend(state->extBstrapRingSendComm);
+ bootstrapNetCloseRecv(state->extBstrapRingRecvComm);
+ free(state->peerBstrapHandles);
+ free(state);
+ return ncclSuccess;
+}
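The rewritten bootstrap above drops the pluggable-net path and talks to plain sockets, framing every message as an integer length prefix followed by the payload (see bootstrapNetSend/bootstrapNetRecv in the hunk), which is how the receiver can detect truncation. Below is a self-contained sketch of that framing with hypothetical helper names; sendAll/recvAll merely stand in for NCCL's socketSend/socketReceive and are not the library's API.

```cuda
// Hypothetical sketch of length-prefixed socket framing (POSIX sockets assumed).
#include <sys/socket.h>
#include <algorithm>
#include <cstdio>

static int sendAll(int fd, const void* data, int size) {
  const char* p = (const char*)data;
  while (size > 0) {
    int n = (int)send(fd, p, size, 0);
    if (n <= 0) return -1;           // error or peer closed
    p += n; size -= n;
  }
  return 0;
}

static int recvAll(int fd, void* data, int size) {
  char* p = (char*)data;
  while (size > 0) {
    int n = (int)recv(fd, p, size, 0);
    if (n <= 0) return -1;
    p += n; size -= n;
  }
  return 0;
}

// Mirrors bootstrapNetSend: send the size first, then the payload.
static int framedSend(int fd, const void* data, int size) {
  if (sendAll(fd, &size, sizeof(int)) != 0) return -1;
  return sendAll(fd, data, size);
}

// Mirrors bootstrapNetRecv: read the size, reject truncation, then read the payload.
static int framedRecv(int fd, void* data, int size) {
  int recvSize;
  if (recvAll(fd, &recvSize, sizeof(int)) != 0) return -1;
  if (recvSize > size) {
    fprintf(stderr, "Message truncated: received %d bytes instead of %d\n", recvSize, size);
    return -1;
  }
  return recvAll(fd, data, std::min(recvSize, size));
}
```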
diff --git a/src/channel.cu b/src/channel.cc
index 937e84e..b053e5b 100644
--- a/src/channel.cu
+++ b/src/channel.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -47,5 +47,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
}
+
+ // Free the peer structures.
+ CUDACHECK(cudaFree(channel->devPeers));
+ free(channel->peers);
+
return ncclSuccess;
}
diff --git a/src/collectives/all_gather.cu b/src/collectives/all_gather.cc
index db21dee..348c176 100644
--- a/src/collectives/all_gather.cu
+++ b/src/collectives/all_gather.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/all_reduce.cu b/src/collectives/all_reduce.cc
index 1492c90..7796d5b 100644
--- a/src/collectives/all_reduce.cu
+++ b/src/collectives/all_reduce.cc
@@ -1,11 +1,10 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
-#include "collectives.h"
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
diff --git a/src/collectives/broadcast.cu b/src/collectives/broadcast.cc
index 6a3d0a8..042301b 100644
--- a/src/collectives/broadcast.cu
+++ b/src/collectives/broadcast.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
index 8e92596..001059c 100644
--- a/src/collectives/device/Makefile
+++ b/src/collectives/device/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -68,4 +68,4 @@ $(DEVOBJ) : $(LIBOBJ)
$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
clean:
- rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(STATICLIB) test
+ rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB)
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu
index 530bf14..109c341 100644
--- a/src/collectives/device/all_gather.cu
+++ b/src/collectives/device/all_gather.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index 36809c9..0ad5ba9 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -1,33 +1,33 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = blockDim.x - 1;
+ const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
- const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+ const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, FUNC>
+ prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
@@ -129,3 +129,67 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int bid = args->bid;
+ const int nthreads = args->nThreads;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+ const ssize_t size = args->N;
+ //const int rank = comm->rank;
+ const int nranks = comm->nRanks;
+ ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+ // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+
+ const ssize_t loopSize = args->nChannels*chunkSize;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
+
+ ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+ /////////////// begin AllGather steps ///////////////
+ ssize_t offset;
+ int nelem = min(chunkSize, size-chunkOffset);
+ int rankDest;
+
+ // step 0: push data to next GPU
+ rankDest = ring->devUserRanks[0];
+ offset = chunkOffset + rankDest * size;
+
+ if (thisInput + chunkOffset == thisOutput + offset) { // In place
+ LLprims.send(thisInput+chunkOffset, nelem);
+ } else {
+ LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
+ }
+
+ // k-2 steps: copy to next GPU
+ for (int j=1; j<nranks-1; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
+
+ // step k-1: final store
+ rankDest = ring->devUserRanks[1];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.recv(thisOutput+offset, nelem);
+ }
+}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
index aaa96b4..85d007e 100644
--- a/src/collectives/device/all_reduce.cu
+++ b/src/collectives/device/all_reduce.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index ea89a71..2449c2b 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -1,19 +1,19 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = blockDim.x - 1;
+ const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@@ -27,7 +27,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
T * __restrict__ thisOutput = (T*)args->ThisOutput;
ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
+ prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
@@ -85,23 +85,28 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = blockDim.x - 1;
+ const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclTree* tree = &channel->tree;
const ssize_t size = args->N;
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
- const int chunkSize = args->lastChunkSize;
+ int chunkSize = args->lastChunkSize;
+ const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nChannels*chunkSize;
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ }
+
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
do {
+ struct ncclTree* tree = &channel->treeUp;
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
- ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
@@ -117,8 +122,9 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
} while(0);
do {
+ struct ncclTree* tree = &channel->treeDn;
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
- ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
@@ -139,7 +145,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
@@ -149,6 +155,8 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
+
const ssize_t loopSize = args->nChannels*nranks*chunkSize;
// Compute pointers
@@ -156,10 +164,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
T * __restrict__ thisOutput = (T*)args->ThisOutput;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- if (size-gridOffset < loopSize) {
- chunkSize = args->lastChunkSize;
- }
- ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
+ chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
@@ -168,7 +173,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// step 0: push data to next GPU
slice = ring->devUserRanks[nranks-1];
- offset = chunkOffset + slice * chunkSize;
+ offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.send(thisInput+offset, nelem);
@@ -176,7 +181,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring->devUserRanks[nranks-j];
- offset = chunkOffset + slice * chunkSize;
+ offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceSend(thisInput+offset, nelem);
@@ -185,7 +190,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring->devUserRanks[0];
- offset = chunkOffset + slice * chunkSize;
+ offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
@@ -193,7 +198,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
slice = ring->devUserRanks[nranks-j];
- offset = chunkOffset + slice * chunkSize;
+ offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvCopySend(thisOutput+offset, nelem);
@@ -201,7 +206,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// Make final copy from buffer to dest.
slice = ring->devUserRanks[1];
- offset = chunkOffset + slice * chunkSize;
+ offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
@@ -214,18 +219,23 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclTree* tree = &channel->tree;
const ssize_t size = args->N;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nChannels*chunkSize;
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ }
+
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
do {
+ struct ncclTree* tree = &channel->treeUp;
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
@@ -243,6 +253,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
} while(0);
do {
+ struct ncclTree* tree = &channel->treeDn;
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
@@ -259,3 +270,141 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
}
} while(0);
}
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int bid = args->bid;
+ const int nthreads = args->nThreads;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+ const ssize_t size = args->N;
+ //const int rank = comm->rank;
+ const int nranks = comm->nRanks;
+ ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+ // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+
+ const ssize_t loopSize = args->nChannels*nranks*chunkSize;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
+
+ /////////////// begin AllReduce steps ///////////////
+ ssize_t offset;
+ int nelem;
+ int slice;
+
+ // step 0: push data to next GPU
+ slice = ring->devUserRanks[nranks-1];
+ offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ nelem = min(chunkSize, size-offset);
+
+ LLprims.send(thisInput+offset, nelem);
+
+ // k-2 steps: reduce and copy to next GPU
+ for (int j=2; j<nranks; ++j) {
+ slice = ring->devUserRanks[nranks-j];
+ offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ nelem = min(chunkSize, size-offset);
+
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+
+ // step k-1: reduce this buffer and data, which will produce the final
+ // result that we store in this data and push to the next GPU
+ slice = ring->devUserRanks[0];
+ offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ nelem = min(chunkSize, size-offset);
+
+ LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
+
+ // k-2 steps: copy to next GPU
+ for (int j=1; j<nranks-1; ++j) {
+ slice = ring->devUserRanks[nranks-j];
+ offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ nelem = min(chunkSize, size-offset);
+
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
+
+ // Make final copy from buffer to dest.
+ slice = ring->devUserRanks[1];
+ offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ nelem = min(chunkSize, size-offset);
+
+ // Here we need to copy from buffer to this output.
+ LLprims.recv(thisOutput+offset, nelem);
+ }
+}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->bid;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclTree* treeUp = &channel->treeUp;
+ struct ncclTree* treeDn = &channel->treeDn;
+ const ssize_t size = args->N;
+ ssize_t chunkSize = args->lastChunkSize;
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
+ const ssize_t loopSize = args->nChannels*chunkSize;
+ int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
+
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ }
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ if (treeUp->up == -1) {
+ // ReduceAndBroadcast : max number of recv is 3, max number of send is 3
+ ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
+ }
+ } else {
+ if (tid < nthreadsSplit) {
+ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+ ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (treeUp->down[0] == -1) {
+ LLprims.send(thisInput+offset, nelem);
+ } else {
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ } else {
+ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+ ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (treeDn->down[0] == -1) {
+ LLprims.recv(thisOutput+offset, nelem);
+ } else {
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
+ }
+ }
+}
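
The ring kernels above (both LL and LL128) share one per-chunk call sequence: one send, k-2 recvReduceSend steps, one recvReduceCopySend, k-2 recvCopySend steps, and a final recv, i.e. 2*(nranks-1) transfers per chunk. As a reading aid only, here is a minimal host-side sketch of that sequence; it assumes nothing beyond the calls visible above and does not move any data.

#include <cstdio>

// Host-side sketch of the per-chunk call order in the ring all-reduce kernels
// above (2*(nranks-1) transfers per chunk). Control flow only; the real
// kernels move data through the LL/LL128 primitives.
void ringAllReduceSchedule(int nranks) {
  std::printf("send(slice[%d])\n", nranks - 1);                // step 0: push own slice
  for (int j = 2; j < nranks; ++j)                             // k-2 reduce-and-forward steps
    std::printf("recvReduceSend(slice[%d])\n", nranks - j);
  std::printf("recvReduceCopySend(slice[0])\n");               // final reduction, start of broadcast phase
  for (int j = 1; j < nranks - 1; ++j)                         // k-2 forwarding steps
    std::printf("recvCopySend(slice[%d])\n", nranks - j);
  std::printf("recv(slice[1])\n");                             // last slice lands in the output
}
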
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu
index b83ee70..8c8dbb6 100644
--- a/src/collectives/device/broadcast.cu
+++ b/src/collectives/device/broadcast.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index fb18312..de8b989 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -1,19 +1,19 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = blockDim.x - 1;
+ const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@@ -29,7 +29,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
T * __restrict__ thisOutput = (T*)args->ThisOutput;
ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+ prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
@@ -100,3 +100,51 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int bid = args->bid;
+ const int nthreads = args->nThreads;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+ const ssize_t size = args->N;
+ const int rank = ring->devUserRanks[0];
+ const int nextRank = ring->devUserRanks[1];
+ const int root = args->root;
+
+ ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+
+ const ssize_t loopSize = args->nChannels*chunkSize;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
+ ssize_t offset = gridOffset + bid*chunkSize;
+
+ int nelem = min(chunkSize, size-offset);
+ if (rank == root) {
+ if (thisInput == thisOutput) {
+ LLprims.send(thisInput+offset, nelem);
+ } else {
+ LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+ }
+ } else if (nextRank == root) {
+ LLprims.recv(thisOutput + offset, nelem);
+ } else {
+ LLprims.recvCopySend(thisOutput + offset, nelem);
+ }
+ }
+}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
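
The per-iteration clamp in the ring LL128 kernel above, chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize), shrinks chunks near the end of the buffer so the tail is spread evenly over channels while every chunk stays a multiple of minChunkSize. A small standalone sketch of that computation, using hypothetical parameter names:

#include <algorithm>
#include <cstdint>

// Sketch of the chunk clamping used by the ring LL/LL128 kernels: keep chunks
// a multiple of minChunk, but shrink them near the end of the buffer so the
// remaining elements spread evenly over all channels. Names are illustrative.
#define DIVUP(x, y) (((x) + (y) - 1) / (y))

int64_t clampedChunk(int64_t size, int64_t gridOffset, int nChannels,
                     int64_t minChunk, int64_t chunkSize) {
  return std::min(DIVUP(size - gridOffset, nChannels * minChunk) * minChunk, chunkSize);
}
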
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index e4aecbd..46eb9f5 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,9 +7,8 @@
#ifndef NCCL_DEVICE_COMMON_H_
#define NCCL_DEVICE_COMMON_H_
-#include "../collectives.h"
-#include "core.h"
-#include "nccl.h"
+#include "collectives.h"
+#include "devcomm.h"
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
// Each thread sets a predicate to true if abort == 1
@@ -31,17 +30,19 @@ extern __device__ ncclKern_t ncclFuncs[];
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
int* d = (int*)dst;
int* s = (int*)src;
- // When aggregation is effective, if some threads have aborted inside the LL kernel,
- // make sure the rest of the threads abort as well
- exitIfAbortBarrier(0);
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
- __syncthreads();
}
-static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) {
+static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm) {
+ // Check whether the last operation was aborted and make sure all threads exit
+ int abort = tid == 0 ? *(comm->abortFlag) : 0;
+ exitIfAbortBarrier(abort);
load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
+ __syncthreads();
if (tid == 0) hostColl->active = 0;
}
+extern __device__ volatile uint64_t* ncclShmem;
+
/* Functions for aggregation case */
#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
@@ -51,13 +52,14 @@ __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
#if NCCL_OP == 0
/* Kernels with the first operation inlined */
#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
-__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
int tid = threadIdx.x; \
int bid = blockIdx.x; \
+ __shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \
+ ncclShmem = shmem; \
__shared__ struct ncclColl localColl; \
\
- struct ncclComm* comm = firstColl.args.comm; \
+ struct ncclDevComm* comm = firstColl.args.comm; \
struct ncclChannel* channel = comm->channels+bid; \
struct ncclColl* c; \
if (bid == 0) { \
@@ -65,7 +67,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
c = &firstColl; \
} else { \
c = &localColl; \
- load_coll(c, channel->devCollectives+channel->collFifoHead, tid); \
+ load_coll(c, channel->devCollectives+channel->collFifoHead, tid, comm); \
} \
while (1) { \
if (tid < c->args.nThreads) { \
@@ -84,7 +86,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
\
/* Load next collective operation*/ \
c = &localColl; /* for bid 0 */ \
- load_coll(c, channel->devCollectives+nextIndex, tid); \
+ load_coll(c, channel->devCollectives+nextIndex, tid, comm); \
} \
}
#else
@@ -93,13 +95,14 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
// Only generate inline kernels for LL
#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
- IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
- IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \
+ IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) \
+ IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
+ IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, al, NCCL_PROTO_LL)) \
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
- IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \
- IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1)
+ IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
+ IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING)
#if NCCL_TYPE == 0
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index e1fb096..aa1e936 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,7 +7,7 @@
#ifndef NCCL_COMMON_KERNEL_H_
#define NCCL_COMMON_KERNEL_H_
-#include "core.h"
+#include "devcomm.h"
#include <cstdio>
#include <cstdint>
@@ -263,8 +263,6 @@ __device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthread
}
}
-#define WARP_SIZE 32
-
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index ea06b68..034fe96 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -1,20 +1,23 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "collectives.h"
#include "common.h"
+__device__ volatile uint64_t* ncclShmem;
+
#define NCCL_FUNC5(coll, op, dtype) \
- NCCL_COLL_NAME(coll, op, dtype), \
- NCCL_COLL_NAME(coll##LL, op, dtype)
+ NCCL_COLL_NAME(coll##LL, op, dtype), \
+ NCCL_COLL_NAME(coll##LL128, op, dtype), \
+ NCCL_COLL_NAME(coll, op, dtype)
#define NCCL_FUNC4(coll, op, dtype) \
- NCCL_FUNC5(coll##Ring, op, dtype), \
- NCCL_FUNC5(coll##Tree, op, dtype)
+ NCCL_FUNC5(coll##Tree, op, dtype), \
+ NCCL_FUNC5(coll##Ring, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
@@ -50,7 +53,7 @@
NCCL_FUNCS3B(coll, copy), \
NCCL_FUNCS3B(coll, copy)
-// Must be consistent with ncclColl_t
+// Must be consistent with ncclFunc_t
#define NCCL_FUNCS() { \
NCCL_FUNCS2B(ncclBroadcast), \
NCCL_FUNCS2A(ncclReduce), \
@@ -59,7 +62,7 @@
NCCL_FUNCS2A(ncclAllReduce) }
// Must be consistent with the ncclFuncSet enum
-__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
+__device__ ncclKern_t ncclFuncs[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
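
The ncclFuncs table above is now indexed over five dimensions — function, reduction op, data type, algorithm, protocol — so the NCCL_FUNC4/NCCL_FUNC5 expansion order must match the host-side FUNC_INDEX computation. The exact macro lives in collectives.h and is not shown here; the following helper is only an assumption about the flat layout implied by the nesting above (protocol innermost, then algorithm):

// Hypothetical flat-index helper mirroring the table layout implied by the
// macro nesting: (func, op, type) outer, then algorithm, then protocol
// innermost. The authoritative FUNC_INDEX macro is defined in collectives.h.
inline int funcIndexSketch(int func, int op, int type, int algo, int proto,
                           int numOps, int numTypes, int numAlgos, int numProtos) {
  return (((func * numOps + op) * numTypes + type) * numAlgos + algo) * numProtos + proto;
}
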
diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh
index 3942c8c..4413213 100755
--- a/src/collectives/device/gen_rules.sh
+++ b/src/collectives/device/gen_rules.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/src/collectives/device/op128.h b/src/collectives/device/op128.h
new file mode 100644
index 0000000..9405dc2
--- /dev/null
+++ b/src/collectives/device/op128.h
@@ -0,0 +1,36 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef OP128_H_
+#define OP128_H_
+
+inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
+ asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
+ : "=l"(v0), "=l"(v1) : "l"(ptr));
+}
+
+inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
+ asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};"
+ :: "l"(v0), "l"(v1), "l"(ptr));
+}
+
+inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
+ uint64_t* shmemAsmPtr;
+ asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr));
+ return shmemAsmPtr;
+}
+
+inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
+ asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];"
+ : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr));
+}
+
+inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
+ asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};"
+ :: "l"(v0), "l"(v1), "l"(shmemAsmPtr));
+}
+
+#endif
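
op128.h adds thin wrappers around volatile 128-bit vector loads and stores; the shmem variants first convert a generic shared-memory pointer with cvta.to.shared so the shared-state-space ld/st forms can be used. A hedged usage sketch follows: a toy kernel that copies 16-byte-aligned uint64_t pairs through shared memory with these helpers. It assumes blockDim.x <= 256, an even count, 16-byte-aligned src/dst, and that op128.h is on the include path; it is not part of the patch.

#include <cstdint>
#include "op128.h"

// Illustrative only: stage 16-byte-aligned uint64_t pairs through shared
// memory using the op128.h helpers.
__global__ void copy128(const uint64_t* src, uint64_t* dst, int count) {
  __shared__ uint64_t buf[2 * 256];                 // two uint64 per thread, blockDim.x <= 256 assumed
  uint64_t* bufAsm = shmemCvtPtr(buf);              // generic -> shared-state-space pointer for the asm helpers
  for (int i = 2 * (blockIdx.x * blockDim.x + threadIdx.x); i < count;
       i += 2 * gridDim.x * blockDim.x) {
    uint64_t v0, v1;
    load128(src + i, v0, v1);                       // volatile 128-bit global load
    storeShmem128(bufAsm + 2 * threadIdx.x, v0, v1);
    loadShmem128(bufAsm + 2 * threadIdx.x, v0, v1);
    store128(dst + i, v0, v1);                      // volatile 128-bit global store
  }
}
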
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index c5aaf54..aa3d20d 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -37,20 +37,32 @@ class ncclPrimitives {
private:
const int tid;
const int nthreads;
+ const int wid;
+ const int stepSize;
int nrecv = 0;
int nsend = 0;
- const int stepSize;
- struct ncclConnInfo* recvConn[NRECV];
- struct ncclConnInfo* sendConn[NSEND];
- volatile uint64_t* waitPtr;
+ struct ncclConnInfo* recvConn = NULL;
+ volatile uint64_t* recvConnHeadPtr = NULL;
+ uint64_t recvConnHead;
+ volatile uint64_t* recvConnTailPtr = NULL;
+ uint64_t recvConnTail;
+ uint64_t recvConnTailCache; // Cache last seen value
+
+ struct ncclConnInfo* sendConn = NULL;
+ volatile int* sendConnFifoPtr = NULL;
+ volatile uint64_t* sendConnTailPtr = NULL;
+ uint64_t sendConnTail;
+ volatile uint64_t* sendConnHeadPtr = NULL;
+ uint64_t sendConnHead;
+ uint64_t sendConnHeadCache; // Cache last seen value
+
uint64_t recvStep[NRECV];
uint64_t sendStep[NSEND];
- uint64_t sendConnHead[NSEND];
const T* recvDirectBuff[NRECV];
T* sendDirectBuff[NSEND];
const T* recvBuff[NRECV];
T* sendBuff[NSEND];
- struct ncclComm* comm;
+ struct ncclDevComm* comm;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
@@ -60,15 +72,18 @@ class ncclPrimitives {
inline __device__ void barrier() {
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
}
+ inline __device__ void subBarrier() {
+ asm volatile ("bar.sync 2, %0;" :: "r"(nthreads-WARP_SIZE));
+ }
uint32_t mismatch = 0;
const uint64_t opCount;
- inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
+ inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
if (mismatch) {
// In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch
*(comm->fatalDevError) = ncclDevAssertedMismatch;
- } else if (remoteOpCount && *remoteOpCount > opCount) {
+ } else if (conn && *conn->opCountRem > opCount) {
mismatch += 1;
}
}
@@ -76,49 +91,55 @@ class ncclPrimitives {
uint32_t spins = 0;
uint32_t abort = 0;
- inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
+ inline __device__ int checkAbort(int i, int send) {
spins++;
- if (spins == SPINS_BEFORE_CHECK_ABORT) {
+ if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
abort = *(comm->abortFlag);
- checkMismatch(remoteOpCount);
+ if (wid == i) checkMismatch(send ? sendConn : recvConn);
spins = 0;
}
return abort;
}
- inline __device__ void waitRecv(int i) {
+ inline __device__ void waitSend(int nbytes) {
spins = 0;
mismatch = 0;
- recvStep[i] += SLICESTEPS;
- if (tid == i) {
- while (*(waitPtr) < recvStep[i]) {
- if (checkAbort(recvConn[i]->opCountRem)) break;
+ if (sendConnHeadPtr) {
+ while (sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS) {
+ sendConnHeadCache = *sendConnHeadPtr;
+ if (checkAbort(wid, 1)) break;
+ }
+ if (sendConnFifoPtr) {
+ sendConnFifoPtr[sendConnHead%NCCL_STEPS] = nbytes;
}
+ sendConnHead += SLICESTEPS;
}
}
- inline __device__ void waitSend(int i) {
+ inline __device__ void waitRecv() {
spins = 0;
mismatch = 0;
- sendStep[i] += SLICESTEPS;
- if (tid == WARP_SIZE+i) {
- while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) {
- sendConnHead[i] = *waitPtr;
- if (checkAbort(sendConn[i]->opCountRem)) break;
+ if (recvConnTailPtr) {
+ while (recvConnTailCache < recvConnTail + SLICESTEPS) {
+ recvConnTailCache = *recvConnTailPtr;
+ if (checkAbort(wid, 0)) break;
}
+ recvConnTail += SLICESTEPS;
}
}
- inline __device__ void postRecv(int i) {
- *(recvConn[i]->head) = recvStep[i] += SLICESTEPS;
+ inline __device__ void incRecv(int i) {
+ recvStep[i] += SLICESTEPS;
}
-
- inline __device__ void postSend(int i) {
- *(sendConn[i]->tail) = sendStep[i] += SLICESTEPS;
+ inline __device__ void postRecv() {
+ if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += SLICESTEPS;
}
- inline __device__ void postSendSize(int i, int size) {
- if (sendConn[i]->fifo) sendConn[i]->fifo[sendStep[i]%NCCL_STEPS] = size;
+ inline __device__ void incSend(int i) {
+ sendStep[i] += SLICESTEPS;
+ }
+ inline __device__ void postSend() {
+ if (sendConnTailPtr) *sendConnTailPtr = sendConnTail += SLICESTEPS;
}
template <int DIRECTRECV>
@@ -131,11 +152,22 @@ class ncclPrimitives {
return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
}
+ template <int DIRECTRECV>
+ inline __device__ int directRecvInc(int i, int directInc, int sliceInc) {
+ return DIRECTRECV && recvDirectBuff[i] ? directInc : sliceInc;
+ }
+
+ template <int DIRECTSEND>
+ inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
+ return DIRECTSEND && sendDirectBuff[i] ? directInc : sliceInc;
+ }
+
template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
inline __device__ void
GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
int offset = 0;
- int sliceSize = stepSize * SLICESTEPS;
+ int sliceSize = stepSize*SLICESTEPS;
+ int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32);
const T* srcs[RECV*NRECV+SRC];
srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
@@ -151,101 +183,126 @@ class ncclPrimitives {
for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
}
- #pragma unroll 1
+ bool syncThread = tid >= nthreads-WARP_SIZE;
+
+ #pragma unroll
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
- int realSize = max(0, min(sliceSize, nelem-offset));
- if (tid < nthreads) {
- FOR_SEND(waitSend);
- FOR_RECV(waitRecv);
+ int realSize = max(0, min(dataSize, nelem-offset));
+ if (!syncThread) {
+ if (SEND) waitSend(realSize*sizeof(T));
+ if (RECV) waitRecv();
if (realSize > 0) {
- barrier();
+ subBarrier();
if (DIRECTRECV && recvDirectBuff[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (SEND) {
- ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
+ ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads-WARP_SIZE, 1, srcs, nsend, dsts+1, realSize);
}
} else {
- ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
+ ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads-WARP_SIZE, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
}
}
- exitIfAbortBarrier(abort);
- } else {
- exitIfAbortBarrier(abort);
- FOR_SEND(postSendSize, realSize*sizeof(T));
- if (SEND) __threadfence_system();
- FOR_SEND(postSend);
- FOR_RECV(postRecv);
}
- for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
- for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
- offset += sliceSize;
+ barrier();
+ FOR_SEND(incSend);
+ FOR_RECV(incRecv);
+ if (syncThread) {
+ if (SEND) {
+ if (realSize > 0 && wid == 0) __threadfence_system();
+ __syncwarp();
+ postSend();
+ }
+ if (RECV) postRecv();
+ }
+ srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize);
+ for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize;
+ dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize);
+ for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize);
+ offset += realSize;
}
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
- recvConn[i] = conn;
- recvBuff[i] = (const T*)recvConn[i]->buff;
- recvStep[i] = recvConn[i]->step;
+ recvBuff[i] = (const T*)conn->buff;
+ recvStep[i] = conn->step;
recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
- // Return credits in case we rounded up.
- if (tid == nthreads) *recvConn[i]->head = recvStep[i];
- if (tid == i) {
- waitPtr = recvConn[i]->tail;
- *(recvConn[i]->opCountLoc) = opCount;
- }
recvDirectBuff[i] = NULL;
- if (directBuff && recvConn[i]->direct) {
+ if (directBuff && conn->direct) {
recvDirectBuff[i] = directBuff;
- if (tid == 0) *recvConn[i]->ptrExchange = directBuff;
+ if (tid == 0) *conn->ptrExchange = directBuff;
}
+ if (wid == i) recvConn = conn;
+ if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up
nrecv++;
}
+ __device__ __forceinline__ void loadRecvSync() {
+ if (tid >= WARP_SIZE && tid < 2*WARP_SIZE && wid<nrecv) {
+ recvConnTailPtr = recvConn->tail;
+ recvConnTailCache = *recvConnTailPtr;
+ }
+ if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+ recvConnHeadPtr = recvConn->head;
+ // Return credits in case we rounded up.
+ *recvConnHeadPtr = recvConnHead;
+ // Update opCount in case we skipped some operations
+ *(recvConn->opCountLoc) = opCount;
+ }
+ }
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
- sendConn[i] = conn;
- sendBuff[i] = (T*)sendConn[i]->buff;
- sendStep[i] = sendConn[i]->step;
+ sendBuff[i] = (T*)conn->buff;
+ sendStep[i] = conn->step;
sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
- if (tid == WARP_SIZE+i) {
- waitPtr = sendConn[i]->head;
- sendConnHead[i] = *waitPtr;
- *(sendConn[i]->opCountLoc) = opCount;
- }
sendDirectBuff[i] = NULL;
- if (directBuff && sendConn[i]->direct) {
- void* volatile* ptr = sendConn[i]->ptrExchange;
+ if (directBuff && conn->direct) {
+ void* volatile* ptr = conn->ptrExchange;
while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
- __syncthreads();
+ barrier();
if (tid == 0) *ptr = NULL;
}
+ if (wid == i) sendConn = conn;
+ if (wid == i) sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up
nsend++;
}
+ __device__ __forceinline__ void loadSendSync() {
+ if (tid < nsend) {
+ sendConnHeadPtr = sendConn->head;
+ sendConnHeadCache = *sendConnHeadPtr;
+ sendConnFifoPtr = sendConn->fifo;
+ *(sendConn->opCountLoc) = opCount;
+ }
+ if (tid >= nthreads-WARP_SIZE && wid<nsend) {
+ sendConnTailPtr = sendConn->tail;
+ }
+ }
- __device__ __forceinline__ void saveRecvConn(int i) {
- if (tid == i) {
- recvConn[i]->step = recvStep[i];
+ __device__ __forceinline__ void saveRecvSync() {
+ if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+ recvConn->step = recvConnHead;
+ *(recvConn->opCountLoc) = opCount+1;
__threadfence_system();
- *(recvConn[i]->opCountLoc) += 1;
}
}
- __device__ __forceinline__ void saveSendConn(int i) {
- if (tid == WARP_SIZE+i) {
- sendConn[i]->step = sendStep[i];
+ __device__ __forceinline__ void saveSendSync() {
+ if (tid < nsend) {
+ sendConn->step = sendConnHead;
+ *(sendConn->opCountLoc) = opCount+1;
__threadfence_system();
- *(sendConn[i]->opCountLoc) += 1;
}
}
public:
__device__ __forceinline__
- ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
- : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
- // Make sure step is updated before we read it
- __syncthreads();
+ ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
+ : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize), opCount(opCount) {
+ // Make sure step is updated before we read it.
+ barrier();
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff);
for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff);
+ loadRecvSync();
+ loadSendSync();
}
__device__ __forceinline__ void
@@ -305,283 +362,13 @@ class ncclPrimitives {
}
__device__ __forceinline__ ~ncclPrimitives() {
- // Save steps for next collective. Have thread 0 do it to be compatible
- // with the way LL works.
- for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
- for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
+ // Save steps for the next operation
+ saveRecvSync();
+ saveSendSync();
}
};
-template <typename T, class FUNC, int NRECV, int NSEND>
-class ncclLLPrimitives {
- private:
- const int tid;
- const int nthreads;
- int nrecv = 0;
- int nsend = 0;
- struct ncclConnInfo* recvConn[NRECV];
- struct ncclConnInfo* sendConn[NSEND];
- volatile uint64_t* waitPtr;
- volatile uint64_t* postPtr;
- volatile int* fifoPtr;
- uint64_t recvStep[NRECV];
- uint64_t sendStep[NSEND];
- uint64_t sendConnHead;
- union ncclLLFifoLine* recvBuff[NRECV];
- union ncclLLFifoLine* sendBuff[NSEND];
- struct ncclComm* comm;
-
- inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
- inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
- inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
- inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
- inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; }
- inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; }
-
- // Exit If Abort Barrier : make sure all threads exit consistently
- // Each thread sets a predicate to true if val == 1
- // all CTA's threads enter the barrier and do a popc on their predicates being True
- // If any of the thread's predicate was True, all the threads call exit()
- inline __device__ void exitIfAbortLocalBarrier() {
- uint32_t popc;
- asm ("{");
- asm volatile (" .reg .pred barr_pred;");
- asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
- asm volatile (" bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads));
- asm ("}");
- if (popc) {
- // Make sure threads not participating in the operation get the abort and all threads exit
- exitIfAbortBarrier(1);
- }
- }
-
- inline __device__ void barrier() {
- asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
- }
-
- uint32_t mismatch = 0;
- const uint64_t opCount;
-
- inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
- if (mismatch > 20) {
- // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
- // Note that we are not using _threadfence_system in LL so the error cannot be asserted
- *(comm->fatalDevError) = ncclDevSuspectedMismatch;
- } else if (remoteOpCount && *remoteOpCount > opCount) {
- mismatch += 1;
- }
- }
-
- uint32_t spins = 0;
- uint32_t abort = 0;
-
- inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
- spins++;
- if (spins == SPINS_BEFORE_CHECK_ABORT) {
- abort = *(comm->abortFlag);
- checkMismatch(remoteOpCount);
- spins = 0;
- }
- return abort;
- }
-
- inline __device__ void waitSend(int i, int nbytes) {
- spins = 0;
- mismatch = 0;
- if (tid == WARP_SIZE+i) {
- while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) {
- sendConnHead = *waitPtr;
- if (checkAbort(sendConn[i]->opCountRem)) break;
- }
- if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes;
- }
- }
+#include "prims_ll.h"
+//#include "prims_ll128.h"
- inline __device__ void postRecv(int i) {
- recvStep[i]++;
- if (tid == i) *postPtr = recvStep[i];
- }
-
- inline __device__ void postSend(int i) {
- sendStep[i]++;
- }
-
- __device__ uint64_t readLL(int i, int offset) {
- union ncclLLFifoLine* src = recvPtr(i) + offset;
- uint32_t flag = recvFlag(i);
- uint32_t data1, flag1, data2, flag2;
- spins = 0;
- mismatch = 0;
- do {
- asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
- if (checkAbort(recvConn[i]->opCountRem)) break;
- } while ((flag1 != flag) || (flag2 != flag));
- uint64_t val64 = data1 + (((uint64_t)data2) << 32);
- return val64;
- }
-
- __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
- asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
- }
-
- // Using memcpy handles misaligned pointers.
- __device__ uint64_t readAL(uint64_t* src) {
- uint64_t val;
- memcpy((char*)&val, (char*)src, sizeof(uint64_t));
- return val;
- }
-
- __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
- memcpy((char*)dst, (char*)&val, nbytes);
- }
-
- template <int RECV, int SEND, int SRC, int DST>
- __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
- uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
- FOR_SEND(waitSend, nbytes*2);
- barrier();
- uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
- uint64_t* srcPack = (uint64_t*)srcPtr;
- uint64_t* dstPack = (uint64_t*)dstPtr;
- // Do multiples of 64 bits
- #pragma unroll 2
- for (int offset=tid; offset<npack; offset+=nthreads) {
- // Recv : local, then intra-node, then inter-node
- uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
- if (RECV) {
- if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
- for (int i=1; i<NRECV && i<nrecv; i++) {
- val = MULTI<FUNC, T>()(readLL(i, offset), val);
- }
- }
-
- // Send : inter-node, then intra-node, then local
- if (SEND) {
- for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
- storeLL(sendPtr(0)+offset, val, sendFlag(0));
- }
- if (DST) {
- if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
- // Last incomplete word
- storeAL(dstPack+offset, val, nbytes & 0x7);
- } else {
- storeAL(dstPack+offset, val, sizeof(uint64_t));
- }
- }
- }
- exitIfAbortLocalBarrier();
- FOR_RECV(postRecv);
- FOR_SEND(postSend);
- }
-
- __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
- recvConn[i] = conn;
- recvBuff[i] = recvConn[i]->llBuff;
- recvStep[i] = recvConn[i]->step;
- if (tid == i) {
- postPtr = recvConn[i]->head;
- *(recvConn[i]->opCountLoc) = opCount;
- }
- nrecv++;
- }
-
- __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
- sendConn[i] = conn;
- sendBuff[i] = sendConn[i]->llBuff;
- sendStep[i] = sendConn[i]->step;
- if (tid == WARP_SIZE+i) {
- waitPtr = sendConn[i]->head;
- fifoPtr = sendConn[i]->fifo;
- sendConnHead = *waitPtr;
- *(sendConn[i]->opCountLoc) = opCount;
- }
- nsend++;
- }
-
- __device__ __forceinline__ void saveRecvConn(int i) {
- if (tid == i) {
- recvConn[i]->step = recvStep[i];
- *(recvConn[i]->opCountLoc) += 1;
- __threadfence_block();
- }
- }
-
- __device__ __forceinline__ void saveSendConn(int i) {
- if (tid == WARP_SIZE+i) {
- sendConn[i]->step = sendStep[i];
- *(sendConn[i]->opCountLoc) += 1;
- __threadfence_block();
- }
- }
-
- __device__ __forceinline__ void llSendCleaning(int i) {
- if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- /* Reset all flags */
- static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS");
- static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS");
- for (int s=0; s<NCCL_STEPS; s++) {
- waitSend(i, 0);
- for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) {
- const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) };
- sendPtr(i)[o].i4 = resetLine.i4;
- }
- }
- if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i];
- }
- }
-
- __device__ __forceinline__ void llRecvCleaning(int i) {
- if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- recvStep[i] += NCCL_STEPS;
- if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i];
- }
- }
-
- public:
- __device__ __forceinline__
- ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
- : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
- // Make sure step is updated before we read it.
- barrier();
-
- for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
- for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
- }
-
- __device__ void send(const T* src, int nelem) {
- return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
- }
-
- __device__ void recv(T* dst, int nelem) {
- return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
- }
-
- __device__ void recvReduceSend(const T* src, int nelem) {
- return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
- }
-
- __device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
- return LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
- }
-
- __device__ void copySend(const T* src, T* dst, int nelem) {
- return LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
- }
-
- __device__ void recvCopySend(T* dst, int nelem) {
- return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
- }
-
- __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
- return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
- }
-
- __device__ __forceinline__ ~ncclLLPrimitives() {
- for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i);
- for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i);
- // Save steps for the next operation
- for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
- for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
- }
-};
#endif
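
The rewritten ncclPrimitives splits responsibilities by warp: threads below nthreads-WARP_SIZE wait for credits and run ReduceOrCopyMulti, while the last warp only fences and posts the head/tail pointers after the shared barrier, and one thread per connection caches the remote head/tail value to avoid re-reading volatile memory on every spin. A condensed, self-contained sketch of that role split (control flow only, names simplified; the real bookkeeping is in the class above):

#define SKETCH_WARP_SIZE 32

// Condensed sketch of the role split in the rewritten GenericOp(): the last
// warp only synchronizes and posts steps; the other threads move data.
__device__ void genericOpRoleSketch(int tid, int nthreads, int slices) {
  bool syncThread = tid >= nthreads - SKETCH_WARP_SIZE;
  for (int slice = 0; slice < slices; ++slice) {
    if (!syncThread) {
      // data threads: waitSend()/waitRecv(), then ReduceOrCopyMulti over the slice
    }
    __syncthreads();  // stands in for the named bar.sync barrier()
    if (syncThread) {
      // sync warp: one __threadfence_system(), then post tail (send) and head (recv)
    }
  }
}
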
diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h
new file mode 100644
index 0000000..f919493
--- /dev/null
+++ b/src/collectives/device/prims_ll.h
@@ -0,0 +1,259 @@
+template <typename T, class FUNC, int NRECV, int NSEND>
+class ncclLLPrimitives {
+ private:
+ const int tid;
+ const int nthreads;
+ const int wid;
+ int nrecv = 0;
+ int nsend = 0;
+ struct ncclConnInfo* recvConn = NULL;
+ volatile uint64_t* recvConnHeadPtr = NULL;
+ uint64_t recvConnHead;
+
+ struct ncclConnInfo* sendConn = NULL;
+ volatile int* sendConnFifoPtr = NULL;
+ volatile uint64_t* sendConnHeadPtr = NULL;
+ uint64_t sendConnHead;
+ uint64_t sendConnHeadCache; // Cache last seen value
+
+ uint64_t recvStep[NRECV];
+ uint64_t sendStep[NSEND];
+ union ncclLLFifoLine* recvBuff[NRECV];
+ union ncclLLFifoLine* sendBuff[NSEND];
+ struct ncclDevComm* comm;
+
+ inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+ inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+ inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
+ inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
+ inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
+ inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
+
+ inline __device__ void barrier() {
+ asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+ }
+
+ uint32_t mismatch = 0;
+ const uint64_t opCount;
+
+ inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
+ if (mismatch > 20) {
+      // The peer has advanced its opCount many times while we are still waiting for credit for the current op, so this is most likely a size mismatch
+      // Note that LL does not use __threadfence_system, so the error cannot be asserted with certainty
+ *(comm->fatalDevError) = ncclDevSuspectedMismatch;
+ } else if (conn && *conn->opCountRem > opCount) {
+ mismatch += 1;
+ }
+ }
+
+ uint32_t spins = 0;
+ uint32_t abort = 0;
+
+ inline __device__ int checkAbort(int i, int send) {
+ spins++;
+ if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
+ abort = *(comm->abortFlag);
+ if (wid == i) checkMismatch(send ? sendConn : recvConn);
+ spins = 0;
+ }
+ return abort;
+ }
+
+ inline __device__ void waitSend(int nbytes) {
+ spins = 0;
+ mismatch = 0;
+ if (sendConnHeadPtr) {
+ while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
+ sendConnHeadCache = *sendConnHeadPtr;
+ if (checkAbort(wid, 1)) break;
+ }
+ if (sendConnFifoPtr) {
+ int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
+ sendConnFifoPtr[sendConnHead%NCCL_STEPS] = size;
+ }
+ sendConnHead += 1;
+ }
+ barrier();
+ }
+
+ inline __device__ void incRecv(int i) {
+ recvStep[i] += 1;
+ }
+ inline __device__ void postRecv() {
+ barrier();
+ if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1;
+ }
+
+ inline __device__ void incSend(int i, int offset) {
+    // LL cleanup: rewrite all flags in the slice to make sure we don't get
+    // data corruption when the flag value wraps around.
+ if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
+ for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
+ }
+ sendStep[i]++;
+ }
+
+ __device__ uint64_t readLL(int i, int offset) {
+ union ncclLLFifoLine* src = recvPtr(i) + offset;
+ uint32_t flag = recvFlag(i);
+ uint32_t data1, flag1, data2, flag2;
+ spins = 0;
+ mismatch = 0;
+ do {
+ asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
+ if (checkAbort(i, 0)) break;
+ } while ((flag1 != flag) || (flag2 != flag));
+ uint64_t val64 = data1 + (((uint64_t)data2) << 32);
+ return val64;
+ }
+
+ __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
+ asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
+ }
+
+ // Using memcpy handles misaligned pointers.
+ __device__ uint64_t readAL(uint64_t* src) {
+ uint64_t val;
+ memcpy((char*)&val, (char*)src, sizeof(uint64_t));
+ return val;
+ }
+
+ __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
+ memcpy((char*)dst, (char*)&val, nbytes);
+ }
+
+ template <int RECV, int SEND, int SRC, int DST>
+ __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
+ uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
+ uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
+ uint64_t* srcPack = (uint64_t*)srcPtr;
+ uint64_t* dstPack = (uint64_t*)dstPtr;
+ int offset = tid;
+
+ // Always waitSend in case of cleanup
+ if (SEND) waitSend(npack*sizeof(union ncclLLFifoLine));
+
+ // Do multiples of 64 bits
+ #pragma unroll 2
+ for (; offset<npack; offset+=nthreads) {
+ // Recv : local, then intra-node, then inter-node
+ uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
+ if (RECV) {
+ if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
+ for (int i=1; i<NRECV && i<nrecv; i++) {
+ val = MULTI<FUNC, T>()(readLL(i, offset), val);
+ }
+ }
+
+ // Send : inter-node, then intra-node, then local
+ if (SEND) {
+ for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
+ storeLL(sendPtr(0)+offset, val, sendFlag(0));
+ }
+ if (DST) {
+ if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
+ // Last incomplete word
+ storeAL(dstPack+offset, val, nbytes & 0x7);
+ } else {
+ storeAL(dstPack+offset, val, sizeof(uint64_t));
+ }
+ }
+ }
+ FOR_RECV(incRecv); if (RECV) postRecv();
+ FOR_SEND(incSend, offset);
+ }
+
+ __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
+ recvBuff[i] = conn->llBuff;
+ recvStep[i] = conn->step;
+ if (wid == i) recvConn = conn;
+ nrecv++;
+ }
+ __device__ __forceinline__ void loadRecvSync() {
+ if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+ recvConnHeadPtr = recvConn->head;
+ recvConnHead = recvConn->step;
+ // Update opCount in case we skipped some operations
+ *(recvConn->opCountLoc) = opCount;
+ }
+ }
+
+ __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
+ sendBuff[i] = conn->llBuff;
+ sendStep[i] = conn->step;
+ if (wid == i) sendConn = conn;
+ nsend++;
+ }
+ __device__ __forceinline__ void loadSendSync() {
+ if (tid < nsend) {
+ sendConnHeadPtr = sendConn->head;
+ sendConnHeadCache = *sendConnHeadPtr;
+ sendConnHead = sendConn->step;
+ sendConnFifoPtr = sendConn->fifo;
+ *(sendConn->opCountLoc) = opCount;
+ }
+ }
+
+ __device__ __forceinline__ void saveRecvSync() {
+ if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+ recvConn->step = recvConnHead;
+ *(recvConn->opCountLoc) = opCount+1;
+ __threadfence_block();
+ }
+ }
+
+ __device__ __forceinline__ void saveSendSync() {
+ if (tid < nsend) {
+ sendConn->step = sendConnHead;
+ *(sendConn->opCountLoc) = opCount+1;
+ __threadfence_block();
+ }
+ }
+
+ public:
+ __device__ __forceinline__
+ ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
+ : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), opCount(opCount) {
+ // Make sure step is updated before we read it.
+ barrier();
+
+ for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
+ for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
+ loadRecvSync();
+ loadSendSync();
+ }
+
+ __device__ void send(const T* src, int nelem) {
+ return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
+ }
+
+ __device__ void recv(T* dst, int nelem) {
+ return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
+ }
+
+ __device__ void recvReduceSend(const T* src, int nelem) {
+ return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
+ }
+
+ __device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
+ return LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
+ }
+
+ __device__ void copySend(const T* src, T* dst, int nelem) {
+ return LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
+ }
+
+ __device__ void recvCopySend(T* dst, int nelem) {
+ return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
+ }
+
+ __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
+ return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
+ }
+
+ __device__ __forceinline__ ~ncclLLPrimitives() {
+ // Save steps for the next operation
+ saveRecvSync();
+ saveSendSync();
+ }
+};
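
In the LL protocol each 16-byte FIFO line interleaves two 4-byte data words with two 4-byte flags, so the receiver can detect completion of a line without any extra fence: readLL spins until both flags equal the expected step value. The sketch below restates that idea with a hypothetical struct; note the real code reads the whole line with a single volatile 128-bit load so data and flags are observed atomically.

#include <cstdint>

// Illustrative LL line layout and polling loop; the real union ncclLLFifoLine
// and the volatile v4.u32 load live in the NCCL headers. Matching flags imply
// the two payload words are valid.
struct LLLineSketch {
  uint32_t data1, flag1, data2, flag2;   // 16 bytes: 8 bytes payload + 2 flags
};

__device__ uint64_t readLLSketch(volatile LLLineSketch* line, uint32_t expectedFlag) {
  uint32_t d1, f1, d2, f2;
  do {                                   // spin until both flags carry the expected step
    d1 = line->data1; f1 = line->flag1;  // (the real code uses one 128-bit volatile load)
    d2 = line->data2; f2 = line->flag2;
  } while (f1 != expectedFlag || f2 != expectedFlag);
  return uint64_t(d1) | (uint64_t(d2) << 32);
}
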
diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h
new file mode 100644
index 0000000..40a8cff
--- /dev/null
+++ b/src/collectives/device/prims_ll128.h
@@ -0,0 +1,410 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "op128.h"
+
+#define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)
+
+template <typename T, class FUNC, int NRECV, int NSEND>
+class ncclLL128Primitives {
+ private:
+ const int tid;
+ const int nthreads;
+ const int wid;
+ const int warp;
+ const bool flagThread;
+ int nrecv = 0;
+ int nsend = 0;
+ struct ncclConnInfo* recvConn = NULL;
+ volatile uint64_t* recvConnHeadPtr = NULL;
+ uint64_t recvConnHead;
+
+ struct ncclConnInfo* sendConn = NULL;
+ volatile int* sendConnFifoPtr = NULL;
+ volatile uint64_t* sendConnTailPtr = NULL;
+ uint64_t sendConnTail;
+ volatile uint64_t* sendConnHeadPtr = NULL;
+ uint64_t sendConnHead;
+ uint64_t sendConnHeadCache; // Cache last seen value
+
+ uint64_t recvStep[NRECV];
+ uint64_t sendStep[NSEND];
+ uint64_t* recvBuff[NRECV];
+ uint64_t* sendBuff[NSEND];
+ struct ncclDevComm* comm;
+
+ volatile uint64_t* shmem;
+
+ inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; }
+ inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; }
+ inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
+ inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
+ inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; }
+ inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; }
+
+ inline __device__ void barrier() {
+ if (NSEND>NRECV) {
+ asm volatile ("bar.sync 2, %0;" :: "r"(nthreads));
+ } else {
+ asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
+ }
+ }
+
+ uint32_t mismatch = 0;
+ const uint64_t opCount;
+
+ inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
+ if (mismatch > 20) {
+      // The peer has advanced its opCount many times while we are still waiting for credit for the current op, so this is most likely a size mismatch
+      // Note that LL does not use __threadfence_system, so the error cannot be asserted with certainty
+ *(comm->fatalDevError) = ncclDevSuspectedMismatch;
+ } else if (conn && *conn->opCountRem > opCount) {
+ mismatch += 1;
+ }
+ }
+
+ uint32_t spins = 0;
+ uint32_t abort = 0;
+
+ inline __device__ int checkAbort(int i, int send) {
+ spins++;
+ if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
+ abort = *(comm->abortFlag);
+ if (wid == i) checkMismatch(send ? sendConn : recvConn);
+ spins = 0;
+ }
+ return abort;
+ }
+
+ inline __device__ void waitSend(int nbytes) {
+ spins = 0;
+ mismatch = 0;
+ if (sendConnHeadPtr) {
+ while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
+ sendConnHeadCache = *sendConnHeadPtr;
+ if (checkAbort(wid, 1)) break;
+ }
+ if (sendConnFifoPtr) {
+ sendConnFifoPtr[sendStep[wid]%NCCL_STEPS] = nbytes;
+ }
+ sendConnHead += 1;
+ }
+ }
+
+ inline __device__ void incRecv(int i) {
+ recvStep[i] += 1;
+ }
+ inline __device__ void postRecv() {
+ if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1;
+ }
+
+ inline __device__ void incSend(int i) {
+ sendStep[i] += 1;
+ }
+ inline __device__ void postSend() {
+ if (sendConnTailPtr) { __threadfence(); *sendConnTailPtr = sendConnTail += 1; }
+ }
+
+ template <int ELEMS_PER_THREAD>
+ inline __device__ void loadSrcToShmem128(int maxOffset, const uint64_t* src64Ptr) {
+#if 0
+ uint64_t v[ELEMS_PER_THREAD];
+ #pragma unroll
+ for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+ if (u*WARP_SIZE < maxOffset) load128(src64Ptr+u*WARP_SIZE, v[u], v[u+1]);
+ }
+ uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
+ #pragma unroll
+ for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+ storeShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
+ }
+#else
+ uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
+ #pragma unroll
+ for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+ if (u*WARP_SIZE < maxOffset) {
+ uint64_t v0, v1;
+ load128(src64Ptr+u*WARP_SIZE, v0, v1);
+ storeShmem128(shmemAsmPtr+u*WARP_SIZE, v0, v1);
+ }
+ }
+#endif
+ }
+
+ inline __device__ void loadSrcToShmem(int start, int end, const T* srcPtr) {
+ T* shmemPtr = (T*)(shmem-2*wid);
+ for (int offset = start+wid; offset < end; offset += WARP_SIZE) {
+ shmemPtr[offset] = srcPtr[offset];
+ }
+ }
+
+ template <int ELEMS_PER_THREAD>
+ inline __device__ void storeShmemToDst128(int maxOffset, uint64_t* dst64Ptr) {
+ uint64_t v[ELEMS_PER_THREAD];
+ uint64_t* shmemAsmPtr = shmemCvtPtr(shmem);
+ #pragma unroll
+ for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+ loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]);
+ }
+ #pragma unroll
+ for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+ if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]);
+ }
+ }
+
+ inline __device__ void storeShmemToDst(int start, int end, T* dstPtr) {
+ T* shmemPtr = (T*)(shmem-2*wid);
+ for (int offset = start+wid; offset < end; offset += WARP_SIZE) {
+ dstPtr[offset] = shmemPtr[offset];
+ }
+ }
+
+ #define WARP_MASK 0xffffffff
+
+ template <int ELEMS_PER_THREAD, int RECV, int SEND, int SRC, int DST>
+ __device__ __forceinline__ void recvReduceSendCopy(int ll128Offset) {
+ uint64_t v[ELEMS_PER_THREAD];
+
+ /************* Data Loading : SHMEM -> REG **************/
+ if (SRC) {
+ volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS;
+ #pragma unroll
+ for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+ v[u] = shmem64Ptr[u*(WARP_SIZE-2)];
+ if (!flagThread) v[u+1] = shmem64Ptr[u*(WARP_SIZE-2)+1];
+ }
+ }
+ /*********** End Data Loading : SHMEM -> REG ************/
+
+ /************************ Recv **************************/
+ if (RECV) {
+ uint64_t flag = recvFlag(0);
+ uint64_t* ptr = recvPtr(0)+ll128Offset;
+ bool needReload;
+ uint64_t v0, v1;
+ do {
+ needReload = false;
+ #pragma unroll
+ for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+ load128(ptr+u*WARP_SIZE, v0, v1);
+ needReload |= flagThread && (v1 != flag);
+ }
+ } while (__any_sync(WARP_MASK, needReload) && checkAbort(0, 0) == 0);
+ #pragma unroll
+ for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+ load128(ptr+u*WARP_SIZE, v0, v1);
+ v[u] = SRC ? MULTI<FUNC, T>()(v0, v[u]) : v0;
+ v[u+1] = SRC ? MULTI<FUNC, T>()(v1, v[u+1]) : v1;
+ }
+
+ for (int i=1; i<NRECV && i<nrecv; i++) {
+ uint64_t flag = recvFlag(i);
+ uint64_t* ptr = recvPtr(i)+ll128Offset;
+ uint64_t v0, v1;
+ do {
+ needReload = false;
+ #pragma unroll
+ for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+ load128(ptr+u*WARP_SIZE, v0, v1);
+ needReload |= flagThread && (v1 != flag);
+ }
+ } while (__any_sync(WARP_MASK, needReload) && checkAbort(i, 0) == 0);
+ #pragma unroll
+ for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+ load128(ptr+u*WARP_SIZE, v0, v1);
+ v[u] = MULTI<FUNC, T>()(v0, v[u]);
+ v[u+1] = MULTI<FUNC, T>()(v1, v[u+1]);
+ }
+ }
+ }
+ /********************** End Recv ************************/
+
+ /************************ Send **************************/
+ if (SEND) {
+ for (int i=1; i<NSEND && i<nsend; i++) {
+ int flag = sendFlag(i);
+ uint64_t* ptr = sendPtr(i)+ll128Offset;
+ #pragma unroll
+ for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+ store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
+ }
+ }
+ int flag = sendFlag(0);
+ uint64_t* ptr = sendPtr(0)+ll128Offset;
+ #pragma unroll
+ for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+ store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
+ }
+ }
+ /********************** End Send ************************/
+
+ /************* Data Storing : REG -> SHMEM **************/
+ if (DST) {
+ volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS;
+ #pragma unroll
+ for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
+ shmem64Ptr[u*(WARP_SIZE-2)] = v[u];
+ if (!flagThread) shmem64Ptr[u*(WARP_SIZE-2)+1] = v[u+1];
+ }
+ }
+ /*********** End data Storing : REG -> SHMEM ************/
+ }
+
+ #define LL128INC (WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD)
+ #define ELEMINC (LL128INC-(LL128INC/NCCL_LL128_LINEELEMS))
+
+ template <int RECV, int SEND, int SRC, int DST>
+ __device__ void GenericOp(const T* srcPtr, T* dstPtr, int nelem) {
+ if (nelem <= 0) {
+ // Don't move any data but still increase steps and sync with prev/next
+ if (SEND) waitSend(0);
+ FOR_SEND(incSend); if (SEND) postSend();
+ FOR_RECV(incRecv); if (RECV) postRecv();
+ return;
+ }
+ const int nelem64 = ((nelem*sizeof(T))/(2*sizeof(uint64_t)))*2;
+ const uint64_t* src64Ptr = ((uint64_t*)srcPtr);
+ uint64_t* dst64Ptr = ((uint64_t*)dstPtr);
+
+ int ll128Offset = LL128INC*warp+2*wid;
+ int elemOffset = ELEMINC*warp;
+ const int nwarps = nthreads/WARP_SIZE;
+
+ if (SEND) waitSend(DIVUP(nelem*sizeof(T), ELEMINC*sizeof(uint64_t))*LL128INC*sizeof(uint64_t));
+ barrier();
+
+ while (elemOffset*(sizeof(uint64_t)/sizeof(T)) < nelem) {
+ const int maxOffset128 = min(nelem64-elemOffset, (int)ELEMINC);
+ const int maxOffset = min(nelem-(elemOffset*((int)(sizeof(uint64_t)/sizeof(T)))), (int)(ELEMINC*(sizeof(uint64_t)/sizeof(T))));
+ if (SRC) {
+ int done = 0;
+ if ((((uint64_t)srcPtr)&0xf) == 0) {
+ loadSrcToShmem128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, src64Ptr+elemOffset+2*wid);
+ done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
+ }
+ loadSrcToShmem(done, maxOffset, (T*)(src64Ptr+elemOffset));
+ }
+ __syncwarp();
+ recvReduceSendCopy<NCCL_LL128_SHMEM_ELEMS_PER_THREAD, RECV, SEND, SRC, DST>(ll128Offset);
+ __syncwarp();
+ if (DST) {
+ int done = 0;
+ if ((((uint64_t)dstPtr)&0xf) == 0) {
+ storeShmemToDst128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, dst64Ptr+elemOffset+2*wid);
+ done = maxOffset128*(sizeof(uint64_t)/sizeof(T));
+ }
+ storeShmemToDst(done, maxOffset, (T*)(dst64Ptr+elemOffset));
+ }
+ __syncwarp();
+ ll128Offset += LL128INC*nwarps;
+ elemOffset += ELEMINC*nwarps;
+ }
+
+ barrier();
+ FOR_SEND(incSend); if (SEND) postSend();
+ FOR_RECV(incRecv); if (RECV) postRecv();
+ }
+
+ __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
+ recvBuff[i] = conn->ll128Buff;
+ recvStep[i] = conn->step;
+ if (wid == i) recvConn = conn;
+ nrecv++;
+ }
+ __device__ __forceinline__ void loadRecvSync() {
+ if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+ recvConnHeadPtr = recvConn->head;
+ recvConnHead = recvConn->step;
+ // Update opCount in case we skipped some operations
+ *(recvConn->opCountLoc) = opCount;
+ }
+ }
+
+ __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
+ sendBuff[i] = conn->ll128Buff;
+ sendStep[i] = conn->step;
+ if (wid == i) sendConn = conn;
+ nsend++;
+ }
+ __device__ __forceinline__ void loadSendSync() {
+ if (tid < nsend) {
+ sendConnHeadPtr = sendConn->head;
+ sendConnHeadCache = *sendConnHeadPtr;
+ sendConnHead = sendConn->step;
+ sendConnFifoPtr = sendConn->fifo;
+ *(sendConn->opCountLoc) = opCount;
+ }
+ if (tid >= nthreads-WARP_SIZE && wid<nsend) {
+ if (sendConn->fifo) {
+ sendConnTailPtr = sendConn->tail;
+ sendConnTail = sendConn->step;
+ }
+ }
+ }
+
+ __device__ __forceinline__ void saveRecvSync() {
+ if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+ recvConn->step = recvConnHead;
+ *(recvConn->opCountLoc) = opCount+1;
+ __threadfence_block();
+ }
+ }
+
+ __device__ __forceinline__ void saveSendSync() {
+ if (tid < nsend) {
+ sendConn->step = sendConnHead;
+ *(sendConn->opCountLoc) = opCount+1;
+ __threadfence_block();
+ }
+ }
+
+ public:
+ __device__ __forceinline__
+ ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
+ : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), opCount(opCount), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
+ // Make sure step is updated before we read it.
+ barrier();
+
+ for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
+ for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
+ loadRecvSync();
+ loadSendSync();
+ }
+
+ __device__ void send(const T* src, int nelem) {
+ return GenericOp<0, 1, 1, 0>(src, NULL, nelem);
+ }
+
+ __device__ void recv(T* dst, int nelem) {
+ return GenericOp<1, 0, 0, 1>(NULL, dst, nelem);
+ }
+
+ __device__ void recvReduceSend(const T* src, int nelem) {
+ return GenericOp<1, 1, 1, 0>(src, NULL, nelem);
+ }
+
+ __device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
+ return GenericOp<1, 0, 1, 1>(src, dst, nelem);
+ }
+
+ __device__ void copySend(const T* src, T* dst, int nelem) {
+ return GenericOp<0, 1, 1, 1>(src, dst, nelem);
+ }
+
+ __device__ void recvCopySend(T* dst, int nelem) {
+ return GenericOp<1, 1, 0, 1>(NULL, dst, nelem);
+ }
+
+ __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
+ return GenericOp<1, 1, 1, 1>(src, dst, nelem);
+ }
+
+ __device__ __forceinline__ ~ncclLL128Primitives() {
+ // Save steps for the next operation
+ saveRecvSync();
+ saveSendSync();
+ }
+};
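For context on why the LL128 path exists at all: each line carries NCCL_LL128_DATAELEMS payload words plus flag words used for synchronization, so its payload efficiency is DATAELEMS/LINEELEMS, much better than the 50% of the plain LL protocol (4 bytes of data per 4 bytes of flag). A minimal standalone sketch of that arithmetic follows; the line-geometry constants are assumptions standing in for the real values defined in the NCCL headers, which are not part of this diff.

// Back-of-the-envelope payload efficiency of LL vs LL128. Not NCCL code;
// lineElems/dataElems are assumed stand-ins for the real header constants.
#include <cstdio>
#include <cstdlib>

int main(int argc, char** argv) {
  int lineElems = argc > 1 ? atoi(argv[1]) : 16;  // assumed: 64-bit words per line
  int dataElems = lineElems - 1;                  // assumed: one flag word per line
  printf("LL    payload efficiency: 50.00%% (4B data per 4B flag)\n");
  printf("LL128 payload efficiency: %.2f%% (%d/%d)\n",
         100.0 * dataElems / lineElems, dataElems, lineElems);
  return 0;
}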
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu
index 1ef66d4..a2caac5 100644
--- a/src/collectives/device/reduce.cu
+++ b/src/collectives/device/reduce.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index 302d053..0680abe 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -1,19 +1,19 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = blockDim.x - 1;
+ const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@@ -30,7 +30,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
T * __restrict__ thisOutput = (T*)args->ThisOutput;
ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+ prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -55,7 +55,7 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
@@ -93,3 +93,48 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int bid = args->bid;
+ const int nthreads = args->nThreads;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+ const ssize_t size = args->N;
+ const int rank = comm->rank;
+ const int nranks = comm->nRanks;
+ const int prevRank = ring->devUserRanks[nranks-1];
+ const int root = args->root;
+
+ ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+
+ const ssize_t loopSize = args->nChannels*chunkSize;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
+ ssize_t offset = gridOffset + bid*chunkSize;
+
+ int nelem = min(chunkSize, size-offset);
+ if (prevRank == root) {
+ LLprims.send(thisInput+offset, nelem);
+ } else if (rank == root) {
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else {
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }
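In the LL128 ring reduce kernel above, a rank's role follows purely from its position relative to the root: the rank just after the root starts the chain with send(), intermediate ranks recvReduceSend(), and the root finishes with recvReduceCopy(). A standalone sketch of that dispatch, assuming a hypothetical 4-rank ring 0 -> 1 -> 2 -> 3 -> 0 with root = 2 (illustration only, not NCCL code):

// Which LLprims call each rank issues in the ring reduce above.
#include <cstdio>

int main() {
  const int nranks = 4, root = 2;
  for (int rank = 0; rank < nranks; rank++) {
    int prevRank = (rank - 1 + nranks) % nranks;   // rank right before us on the ring
    if (prevRank == root)  printf("rank %d: send()            (first hop after the root)\n", rank);
    else if (rank == root) printf("rank %d: recvReduceCopy()  (root, writes the final result)\n", rank);
    else                   printf("rank %d: recvReduceSend()  (intermediate hop)\n", rank);
  }
  return 0;
}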
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu
index 10857ed..8b45299 100644
--- a/src/collectives/device/reduce_scatter.cu
+++ b/src/collectives/device/reduce_scatter.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index c70c845..1985148 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -1,25 +1,25 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = blockDim.x - 1;
+ const int nthreads = args->nThreads-WARP_SIZE;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
- const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+ const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
// Compute pointers
@@ -27,7 +27,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
T * __restrict__ thisOutput = (T*)args->ThisOutput;
ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+ prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
@@ -69,7 +69,7 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
@@ -121,3 +121,64 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
+
+#include "prims_ll128.h"
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int bid = args->bid;
+ const int nthreads = args->nThreads;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+ const ssize_t size = args->N;
+ //const int rank = comm->rank;
+ const int nranks = comm->nRanks;
+ ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+ // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+
+ const ssize_t loopSize = args->nChannels*chunkSize;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
+
+ ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+ /////////////// begin ReduceScatter steps ///////////////
+ ssize_t offset;
+ int nelem = min(chunkSize, size-chunkOffset);
+ int rankDest;
+
+ // step 0: push data to next GPU
+ rankDest = ring->devUserRanks[nranks-1];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.send(thisInput+offset, nelem);
+
+ // k-2 steps: reduce and copy to next GPU
+ for (int j=2; j<nranks; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+
+    // step k-1: reduce this buffer and data, producing the final result
+    // that we store in this rank's output chunk
+ rankDest = ring->devUserRanks[0];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
+ }
+}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { }
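The chunk schedule above is the classic ring reduce-scatter: on step 0 a rank injects the chunk owned by its predecessor, each following step reduces and forwards the chunk owned by a rank one position further back, and the last step keeps the rank's own chunk. A standalone sketch of the rankDest sequence, assuming devUserRanks[] lists the ring order starting at this rank (illustration only, not NCCL code):

// rankDest per step of the LL128 ring reduce-scatter above, seen from one rank.
// devUserRanks[] is assumed to be [rank, next, ..., prev].
#include <cstdio>

int main() {
  const int nranks = 4, rank = 1;                       // hypothetical 4-GPU ring, viewpoint of rank 1
  int devUserRanks[nranks];
  for (int i = 0; i < nranks; i++) devUserRanks[i] = (rank + i) % nranks;

  printf("step 0: send           chunk of rank %d\n", devUserRanks[nranks-1]);
  for (int j = 2; j < nranks; ++j)
    printf("step %d: recvReduceSend chunk of rank %d\n", j-1, devUserRanks[nranks-j]);
  printf("step %d: recvReduceCopy chunk of rank %d (this rank's own output)\n",
         nranks-1, devUserRanks[0]);
  return 0;
}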
diff --git a/src/collectives/reduce.cu b/src/collectives/reduce.cc
index 302d4bc..67f2fae 100644
--- a/src/collectives/reduce.cu
+++ b/src/collectives/reduce.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/reduce_scatter.cu b/src/collectives/reduce_scatter.cc
index 4ee77ef..5ad7f5f 100644
--- a/src/collectives/reduce_scatter.cu
+++ b/src/collectives/reduce_scatter.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/debug.cc b/src/debug.cc
new file mode 100644
index 0000000..03a77ae
--- /dev/null
+++ b/src/debug.cc
@@ -0,0 +1,169 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "nccl_net.h"
+#include <stdlib.h>
+#include <stdarg.h>
+
+int ncclDebugLevel = -1;
+thread_local int ncclDebugNoWarn = 0;
+uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
+FILE *ncclDebugFile = stdout;
+pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
+
+void ncclDebugInit() {
+ pthread_mutex_lock(&ncclDebugLock);
+ if (ncclDebugLevel != -1) return;
+ const char* nccl_debug = getenv("NCCL_DEBUG");
+ if (nccl_debug == NULL) {
+ ncclDebugLevel = NCCL_LOG_NONE;
+ } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
+ ncclDebugLevel = NCCL_LOG_VERSION;
+ } else if (strcasecmp(nccl_debug, "WARN") == 0) {
+ ncclDebugLevel = NCCL_LOG_WARN;
+ } else if (strcasecmp(nccl_debug, "INFO") == 0) {
+ ncclDebugLevel = NCCL_LOG_INFO;
+ } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
+ ncclDebugLevel = NCCL_LOG_ABORT;
+ } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
+ ncclDebugLevel = NCCL_LOG_TRACE;
+ }
+
+  /* Parse the NCCL_DEBUG_SUBSYS env var.
+   * This can be a comma-separated list such as INIT,COLL, or prefixed
+   * with '^' (e.g. ^INIT,COLL) to select everything except the listed
+   * subsystems.
+   */
+ char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS");
+ if (ncclDebugSubsysEnv != NULL) {
+ int invert = 0;
+ if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
+ ncclDebugMask = invert ? ~0ULL : 0ULL;
+ char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv);
+ char *subsys = strtok(ncclDebugSubsys, ",");
+ while (subsys != NULL) {
+ uint64_t mask = 0;
+ if (strcasecmp(subsys, "INIT") == 0) {
+ mask = NCCL_INIT;
+ } else if (strcasecmp(subsys, "COLL") == 0) {
+ mask = NCCL_COLL;
+ } else if (strcasecmp(subsys, "P2P") == 0) {
+ mask = NCCL_P2P;
+ } else if (strcasecmp(subsys, "SHM") == 0) {
+ mask = NCCL_SHM;
+ } else if (strcasecmp(subsys, "NET") == 0) {
+ mask = NCCL_NET;
+ } else if (strcasecmp(subsys, "GRAPH") == 0) {
+ mask = NCCL_GRAPH;
+ } else if (strcasecmp(subsys, "TUNING") == 0) {
+ mask = NCCL_TUNING;
+ } else if (strcasecmp(subsys, "ALL") == 0) {
+ mask = NCCL_ALL;
+ }
+ if (mask) {
+ if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
+ }
+ subsys = strtok(NULL, ",");
+ }
+ free(ncclDebugSubsys);
+ }
+
+ /* Parse and expand the NCCL_DEBUG_FILE path and
+ * then create the debug file. But don't bother unless the
+ * NCCL_DEBUG level is > VERSION
+ */
+ const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
+ if (ncclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
+ int c = 0;
+ char debugFn[PATH_MAX+1] = "";
+ char *dfn = debugFn;
+ while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) {
+ if (ncclDebugFileEnv[c++] != '%') {
+ *dfn++ = ncclDebugFileEnv[c-1];
+ continue;
+ }
+ switch (ncclDebugFileEnv[c++]) {
+ case '%': // Double %
+ *dfn++ = '%';
+ break;
+ case 'h': // %h = hostname
+ char hostname[1024];
+ getHostName(hostname, 1024, '.');
+ dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
+ break;
+ case 'p': // %p = pid
+ dfn += snprintf(dfn, PATH_MAX, "%d", getpid());
+ break;
+ default: // Echo everything we don't understand
+ *dfn++ = '%';
+ *dfn++ = ncclDebugFileEnv[c-1];
+ break;
+ }
+ }
+ *dfn = '\0';
+ if (debugFn[0] != '\0') {
+ FILE *file = fopen(debugFn, "w");
+ if (file != NULL) {
+ INFO(NCCL_ALL,"DEBUG file is '%s'", debugFn);
+ ncclDebugFile = file;
+ }
+ }
+ }
+
+#ifdef ENABLE_TRACE
+ ncclEpoch = std::chrono::high_resolution_clock::now();
+#endif
+ pthread_mutex_unlock(&ncclDebugLock);
+}
+
+/* Common logging function used by the INFO, WARN and TRACE macros
+ * Also exported to the dynamically loadable Net transport modules so
+ * they can share the debugging mechanisms and output files
+ */
+void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
+ if (ncclDebugLevel == -1) ncclDebugInit();
+ if (ncclDebugNoWarn == 1 && level == NCCL_LOG_WARN) level = NCCL_LOG_INFO;
+
+ char hostname[1024];
+ getHostName(hostname, 1024, '.');
+ int cudaDev;
+ cudaGetDevice(&cudaDev);
+
+ char buffer[1024];
+ size_t len = 0;
+ pthread_mutex_lock(&ncclDebugLock);
+ if (ncclDebugNoWarn && ncclDebugLevel == NCCL_LOG_WARN) printf("WARN -> INFO\n");
+ if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
+ len = snprintf(buffer, sizeof(buffer),
+ "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
+ else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
+ len = snprintf(buffer, sizeof(buffer),
+ "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
+#ifdef ENABLE_TRACE
+ else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
+ auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
+ double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
+ len = snprintf(buffer, sizeof(buffer),
+ "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
+ }
+#endif
+ if (len) {
+ va_list vargs;
+ va_start(vargs, fmt);
+ (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
+ va_end(vargs);
+ fprintf(ncclDebugFile,"%s\n", buffer);
+ fflush(ncclDebugFile);
+ }
+ pthread_mutex_unlock(&ncclDebugLock);
+
+ // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
+ if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
+ fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
+ hostname, getpid(), gettid(), cudaDev, filefunc, line);
+ abort();
+ }
+}
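The %-expansion in ncclDebugInit() above is what makes per-host, per-process log files possible, e.g. NCCL_DEBUG_FILE=nccl.%h.%p.log. A minimal standalone re-implementation of just that expansion, for illustration (not the code NCCL uses):

// Standalone illustration of the NCCL_DEBUG_FILE template expansion above:
// "%h" -> hostname, "%p" -> pid, "%%" -> literal '%'. Not NCCL code.
#include <cstdio>
#include <unistd.h>

int main() {
  const char* tmpl = "nccl.%h.%p.log";          // e.g. NCCL_DEBUG_FILE=nccl.%h.%p.log
  char hostname[256]; gethostname(hostname, sizeof(hostname));
  hostname[sizeof(hostname)-1] = '\0';
  char out[4096]; char* dst = out;
  for (int c = 0; tmpl[c] != '\0'; ) {
    if (tmpl[c] != '%') { *dst++ = tmpl[c++]; continue; }
    c++;                                        // skip the '%'
    switch (tmpl[c]) {
      case '%' : c++; *dst++ = '%'; break;
      case 'h' : c++; dst += sprintf(dst, "%s", hostname); break;
      case 'p' : c++; dst += sprintf(dst, "%d", (int)getpid()); break;
      case '\0':      *dst++ = '%'; break;      // lone trailing '%'
      default  :      *dst++ = '%'; *dst++ = tmpl[c++]; break;  // echo unknown sequences
    }
  }
  *dst = '\0';
  printf("NCCL_DEBUG_FILE=%s -> %s\n", tmpl, out);
  return 0;
}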
diff --git a/src/enqueue.cu b/src/enqueue.cc
index d283223..2239865 100644
--- a/src/enqueue.cu
+++ b/src/enqueue.cc
@@ -1,23 +1,21 @@
/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
-#include "checks.h"
-#include "param.h"
-
-#include "collectives/collectives.h"
+#include "argcheck.h"
// Only generate inline kernels for LL
#define NCCL_FUNC5(coll, op, dtype) \
(void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
+ (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
(void*)NCCL_KERN_NAME(coll##LL, op, dtype)
#define NCCL_FUNC4(coll, op, dtype) \
- (void*)NCCL_FUNC5(coll##Ring, op, dtype), \
- (void*)NCCL_FUNC5(coll##Tree, op, dtype)
+ (void*)NCCL_FUNC5(coll##Tree, op, dtype), \
+ (void*)NCCL_FUNC5(coll##Ring, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
@@ -54,7 +52,7 @@
NCCL_FUNCS3B(coll, copy)
// Must be consistent with the ncclFuncSet enum
-static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
+static void* const ncclKerns[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
@@ -87,7 +85,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par
}
ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
- params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels);
+ params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
// Set active = 2 for the last operation
for (int r=0; r<params->gridDim.x; r++) {
@@ -207,6 +205,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
channel->collCount = 0;
}
params->gridDim.x = params->blockDim.x = 0;
+ comm->lastOpCount = comm->opCount;
NCCLCHECK(transportStartProxy(comm));
return ncclSuccess;
}
@@ -228,20 +227,70 @@ ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
/* Enqueueing system : computation of kernel and proxy operations parameters */
/*****************************************************************************/
-static ncclResult_t getPatternInfo(struct ncclInfo* info) {
- if (info->coll == ncclCollBroadcast) info->pattern = ncclPatternPipelineFrom;
- else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo;
- else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing;
- else if (info->coll == ncclCollAllReduce) {
- if (info->nBytes <= info->comm->treeThreshold)
- info->pattern = ncclPatternTreeUpDown;
- else
- info->pattern = ncclPatternRingTwice;
+// Trees do not stick perfectly to the performance model for medium sizes. Applying a static
+// correction factor is not ideal but works quite well. Entries are per power of two, 64 B to 1 GB.
+static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
+ { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .5, .5, .6, .7, .8, .9, .9, 1.0, 1.0, 1.0 },
+ { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .8, .7, .7, .7, .6, .6, .7, .7, .8, .8, .9, .9, 1.0 },
+ { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .5, .5, .6, .6, .7, .8, .9 }
+};
+
+static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
+ struct ncclComm* comm = info->comm;
+ float minTime = 3600000.0; // Hopefully no operation will take an hour to complete.
+ // Find algorithm / protocol.
+ info->algorithm = -1;
+ info->protocol = -1;
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ float bw = comm->bandwidths[info->coll][a][p];
+ if (bw == 0) continue;
+ int logSize = log2i(info->nBytes>>6);
+ if (a == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[p][logSize];
+ float time = comm->latencies[info->coll][a][p] + (info->nBytes) / (1000 * bw);
+ if (time < minTime) {
+ info->algorithm = a;
+ info->protocol = p;
+ minTime = time;
+ }
+ }
}
- else {
- WARN("Unknown collective %d", info->coll);
+ if (info->algorithm == -1 || info->protocol == -1) {
+ WARN("Error : no algorithm/protocol available");
return ncclInternalError;
}
+ //if (comm->rank == 0) INFO(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %d", info->nBytes, info->algorithm, info->protocol, minTime);
+ TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
+
+ int nc = comm->nChannels;
+ int nt = comm->maxThreads[info->protocol];
+ int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
+ while (info->nBytes < nc*nt*threadThreshold) {
+ if (nc >= 2) nc--;
+ else if ((nt % 128) == 0) nt/=2;
+ else break;
+ }
+ if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync
+ info->nChannels = nc;
+ info->nThreads = nt;
+ return ncclSuccess;
+}
+
+static ncclResult_t getPatternInfo(struct ncclInfo* info) {
+ switch (info->coll) {
+ case ncclCollBroadcast:
+ info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break;
+ case ncclCollReduce:
+ info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break;
+ case ncclCollReduceScatter:
+ case ncclCollAllGather:
+ info->pattern = ncclPatternRing; break;
+ case ncclCollAllReduce:
+ info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
+ default:
+ WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
+ return ncclInternalError;
+ }
return ncclSuccess;
}
@@ -264,40 +313,9 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
return ncclSuccess;
}
-static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
- // Compute thresholds and limits that users can override
- int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD);
- int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
-
- // First compute nThreads
- int nt = NCCL_LL_MIN_NTHREADS;
- while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2;
-
- // Then compute nChannels
- int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold);
- if (nc == 0) nc = 1;
- if (nc > info->comm->nChannels) nc = info->comm->nChannels;
-
- // Check if we have a fixed LL threshold, otherwise compute it.
- int perThreadThreshold = info->comm->threadThreshold;
- if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4;
- ssize_t llThreshold = info->comm->llThreshold >= 0 ?
- info->comm->llThreshold :
- nc*nt*info->nchunksPerLoop*perThreadThreshold;
-
- if (info->nBytes <= llThreshold) {
- *llMode = 1;
- *nChannels = nc;
- *nThreads = nt;
- } else {
- *llMode = 0;
- *nChannels = info->comm->nChannels;
- *nThreads = info->comm->nThreads+1;
- }
-}
-
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
// Set nstepsPerLoop and nchunksPerLoop
+ NCCLCHECK(getAlgoInfo(info));
NCCLCHECK(getPatternInfo(info));
NCCLCHECK(getLoopInfo(info));
@@ -307,48 +325,52 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
coll->args.ThisOutput = info->recvbuff;
coll->args.comm = info->comm->devComm;
coll->args.opCount = info->comm->opCount;
+ coll->args.nChannels = info->nChannels;
+ coll->args.nThreads = info->nThreads;
- // Compute llMode, nChannels, nThreads
- int llMode;
- getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode);
-
- int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0;
- coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode);
+ coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
- int stepSize = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
- int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps;
- int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps;
+ int stepSize = (info->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : info->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
+ int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
+ int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1;
int chunkSize = stepSize*chunkSteps;
// Compute lastChunkSize
- if (treeMode == 1 && llMode == 0) {
+ if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) {
if (info->pattern == ncclPatternTreeUpDown) {
// Optimize chunkSize / nSteps
- while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
- while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
- while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
+ while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*8 && chunkSize > 131072) chunkSize /= 2;
+ while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
+ while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2;
}
// Use lastChunkSize as chunkSize
coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
- } else if (llMode == 1) {
+ } else if (info->protocol == NCCL_PROTO_LL) {
int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
- const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
- coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop);
- ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t));
+ const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
+ coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
+ ALIGN_SIZE(coll->args.lastChunkSize, info->nThreads*sizeof(uint64_t));
coll->args.lastChunkSize /= ncclTypeSize(info->datatype);
+ } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
+ int nstepsInter = 1+log2i(info->comm->nNodes);
+ while (info->nBytes / (info->nChannels*chunkSize) < nstepsInter*4 && chunkSize > 32768) chunkSize /= 2;
+ // Use lastChunkSize as chunkSize
+ coll->args.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
}
// Compute nSteps for proxies
- size_t nBytes = llMode ? info->nBytes*2 : info->nBytes;
-
- int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize)));
+ int chunkEffectiveSize = chunkSize;
+ if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2;
+ if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
+ //if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
+ int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
proxyArgs->sliceSteps = sliceSteps;
proxyArgs->chunkSteps = chunkSteps;
- proxyArgs->llMode = llMode;
+ proxyArgs->protocol = info->protocol;
proxyArgs->opCount = info->comm->opCount;
- TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
- coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads,
+ TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
+ coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
nLoops, proxyArgs->nsteps, info->comm);
return ncclSuccess;
}
@@ -365,7 +387,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
NCCLCHECK(computeColl(info, &coll, &proxyArgs));
- info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads);
+ info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
if (info->comm->userStreamSet == false) {
info->comm->userStream = info->stream;
info->comm->userStreamSet = true;
@@ -401,7 +423,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
channel->collFifoTail = opIndex;
channel->collCount++;
}
- /*if (llMode == 0)*/ info->comm->opCount++;
+ info->comm->opCount++;
return ncclSuccess;
}
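The new getAlgoInfo() replaces the old LL threshold logic with a small cost model: for every (algorithm, protocol) pair it estimates latency + nBytes / bandwidth and keeps the minimum. A toy sketch of that selection rule with invented numbers follows; the real latencies and bandwidths are filled in per communicator by src/graph/tuning.cc, not shown here.

// Toy illustration of the getAlgoInfo() selection rule. Not NCCL code; all
// latency/bandwidth values are invented for the example.
#include <cstdio>

int main() {
  const char* algoNames[2]  = {"Tree", "Ring"};
  const char* protoNames[3] = {"LL", "LL128", "Simple"};
  // Invented model values: latency in microseconds, bandwidth in GB/s.
  double latency[2][3]   = {{ 8, 10, 25}, {12, 14, 30}};
  double bandwidth[2][3] = {{ 5, 18, 22}, { 9, 40, 46}};

  for (long nBytes = 1<<10; nBytes <= 1<<28; nBytes <<= 6) {
    int bestA = 0, bestP = 0;
    double minTime = 3600000.0;                 // same "one hour" sentinel as getAlgoInfo()
    for (int a = 0; a < 2; a++)
      for (int p = 0; p < 3; p++) {
        double time = latency[a][p] + nBytes / (1000.0 * bandwidth[a][p]);
        if (time < minTime) { minTime = time; bestA = a; bestP = p; }
      }
    printf("%9ld bytes -> %s/%s, estimated %.1f us\n",
           nBytes, algoNames[bestA], protoNames[bestP], minTime);
  }
  return 0;
}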
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
new file mode 100644
index 0000000..af481d2
--- /dev/null
+++ b/src/graph/connect.cc
@@ -0,0 +1,268 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "graph.h"
+#include "trees.h"
+#include "rings.h"
+
+/******************************************************************/
+/********************* Internode connection ***********************/
+/******************************************************************/
+
+ncclResult_t ncclTopoPreset(struct ncclComm* comm,
+ struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
+ struct ncclTopoRanks* topoRanks) {
+ int rank = comm->rank;
+ int localRanks = comm->localRanks;
+ int nChannels = comm->nChannels;
+
+ for (int c=0; c<nChannels; c++) {
+ struct ncclChannel* channel = comm->channels+c;
+ channel->ring.prev = channel->ring.next = -1;
+ channel->treeUp.up = -1;
+ for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1;
+ channel->treeDn.up = -1;
+ for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1;
+
+ int* ringIntra = ringGraph->intra+c*localRanks;
+ int* treeIntra = treeGraph->intra+c*localRanks;
+
+ for (int i=0; i<localRanks; i++) {
+ if (ringIntra[i] == rank) {
+ topoRanks->ringRecv[c] = ringIntra[0];
+ topoRanks->ringSend[c] = ringIntra[localRanks-1];
+ channel->ring.prev = (i == 0) ? -1 : ringIntra[i-1];
+ channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1];
+ }
+ if (treeIntra[i] == rank) {
+ int recvIndex = 0, sendIndex = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;
+ int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
+
+ // Tree loop always flows in the same direction. Other trees are symmetric, i.e.
+ // up/down go in reverse directions
+ int sym = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP ? 0 : 1;
+
+ // Down tree is common
+ topoRanks->treeDnRecv[c] = treeIntra[recvIndex];
+ topoRanks->treeDnSend[c] = treeIntra[sendIndex];
+ channel->treeDn.up = treeIntra[prev];
+ channel->treeDn.down[0] = treeIntra[next];
+ // Up tree depends on the pattern
+ topoRanks->treeUpRecv[c] = sym ? topoRanks->treeDnSend[c] : topoRanks->treeDnRecv[c];
+ topoRanks->treeUpSend[c] = sym ? topoRanks->treeDnRecv[c] : topoRanks->treeDnSend[c];
+ channel->treeUp.down[0] = sym ? channel->treeDn.down[0] : channel->treeDn.up ;
+ channel->treeUp.up = sym ? channel->treeDn.up : channel->treeDn.down[0];
+ }
+ }
+ topoRanks->ringPrev[c] = channel->ring.prev;
+ topoRanks->ringNext[c] = channel->ring.next;
+ }
+ // Duplicate channels rings/trees
+ struct ncclChannel* channel0 = comm->channels;
+ struct ncclChannel* channel1 = channel0+nChannels;
+ memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel));
+ return ncclSuccess;
+}
+
+static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext, int* firstRanks) {
+ int nChannels = comm->nChannels;
+ int nNodes = comm->nNodes;
+ for (int c=0; c<nChannels; c++) {
+ int* recv = ringRecv+c*comm->nRanks;
+ int* send = ringSend+c*comm->nRanks;
+ int* prev = ringPrev+c*comm->nRanks;
+ int* next = ringNext+c*comm->nRanks;
+ struct ncclChannel* channel0 = comm->channels+c;
+ struct ncclChannel* channel1 = channel0+nChannels;
+ for (int n=0; n<nNodes; n++) {
+ int recvRank = recv[firstRanks[n]];
+ int prevSendRank = send[firstRanks[(n-1+nNodes)%nNodes]];
+ prev[recvRank] = prevSendRank;
+ if (comm->rank == recvRank) {
+ channel0->ring.prev = prevSendRank;
+ channel1->ring.prev = prevSendRank;
+ }
+ int sendRank = send[firstRanks[n]];
+ int nextRecvRank = recv[firstRanks[(n+1)%nNodes]];
+ next[sendRank] = nextRecvRank;
+ if (comm->rank == sendRank) {
+ channel0->ring.next = nextRecvRank;
+ channel1->ring.next = nextRecvRank;
+ }
+ }
+ TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next);
+ TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next);
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstRanks) {
+ for (int n=0; n<nNodes; n++) indexes[n] = ranks[firstRanks[n]];
+ return ncclSuccess;
+}
+
+static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) {
+ if (u0 != -1) tree0->up = indexes[u0];
+ if (u1 != -1) tree1->up = indexes[u1];
+ return ncclSuccess;
+}
+
+static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) {
+ int x = 0;
+ if (down[x] >= 0) x++;
+ if (down[x] >= 0) {
+ WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]);
+ return ncclInternalError;
+ }
+ if (r0 != -1) down[x++] = indexes[r0];
+ if (r1 != -1) down[x++] = indexes[r1];
+ return ncclSuccess;
+}
+
+static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) {
+ NCCLCHECK(addRanksDown(tree0->down, indexes, d0_0, d0_1));
+ NCCLCHECK(addRanksDown(tree1->down, indexes, d1_0, d1_1));
+ return ncclSuccess;
+}
+
+static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) {
+ if (tree->down[0] == upRank) tree->down[0] = -1;
+ if (rank == upRank) tree->up = -1;
+ return ncclSuccess;
+}
+
+static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks) {
+ const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node;
+ int* indexesSend, *indexesRecv;
+ NCCLCHECK(ncclCalloc(&indexesSend, nNodes));
+ NCCLCHECK(ncclCalloc(&indexesRecv, nNodes));
+
+ // Compute tree depth. Not an exact value but a good approximation in most
+ // cases
+ int depth = comm->nRanks/nNodes - 1 + log2i(nNodes);
+
+ int u0, d0_0, d0_1, u1, d1_0, d1_1;
+ NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
+ for (int c=0; c<nChannels; c++) {
+ struct ncclChannel* channel0 = comm->channels+c;
+ struct ncclChannel* channel1 = channel0+nChannels;
+ NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
+ NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
+ NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node]));
+ NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node]));
+ int root = indexesSend[node];
+ if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1));
+ if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1));
+ NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks));
+ NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks));
+ NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node]));
+ NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? root : indexesRecv[node]));
+ if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1));
+ if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1));
+ TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c, channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]);
+ TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]);
+ TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c, channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]);
+ TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]);
+ channel0->treeUp.depth = channel1->treeUp.depth = depth;
+ }
+ free(indexesSend);
+ free(indexesRecv);
+ return ncclSuccess;
+}
+
+// Legacy naming
+NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
+NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
+// New naming
+NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2);
+NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2);
+
+int ncclMinNchannels() {
+ int minNchannels = 0;
+ if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
+ if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
+ if (minNchannels > MAXCHANNELS) {
+ WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS);
+ minNchannels = MAXCHANNELS;
+ }
+ if (minNchannels < 0) minNchannels = 0;
+ return minNchannels;
+}
+int ncclMaxNchannels() {
+ int maxNchannels = MAXCHANNELS;
+ if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings();
+ if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
+ if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
+ if (maxNchannels < 1) {
+ WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels);
+ maxNchannels = 1;
+ }
+ return maxNchannels;
+}
+
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings) {
+ // Gather data from all ranks
+ int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
+ int nranks = comm->nRanks;
+ int nChannels = comm->nChannels;
+ NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS));
+ for (int i=0; i<nranks; i++) {
+ for (int c=0; c<nChannels;c++) {
+ ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c];
+ ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c];
+ ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c];
+ ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c];
+ treeUpRecv[c*nranks+i] = allTopoRanks[i]->treeUpRecv[c];
+ treeUpSend[c*nranks+i] = allTopoRanks[i]->treeUpSend[c];
+ treeDnRecv[c*nranks+i] = allTopoRanks[i]->treeDnRecv[c];
+ treeDnSend[c*nranks+i] = allTopoRanks[i]->treeDnSend[c];
+ }
+ }
+
+ // Connect rings and trees. This should also duplicate the channels.
+ NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks));
+ NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks));
+
+ // Duplicate ringPrev/ringNext for ncclBuildRing
+ memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
+ memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
+
+ // Duplication should be complete now
+ nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
+
+  // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
+  // Apply the max first (keeping only the first channels), then the min (duplicating channels as needed).
+ nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels);
+ int c;
+ for (c=nChannels; c<ncclMinNchannels(); c++) {
+ memcpy(ringPrev+c*nranks, ringPrev+(c-nChannels)*nranks, nranks*sizeof(int));
+ memcpy(ringNext+c*nranks, ringNext+(c-nChannels)*nranks, nranks*sizeof(int));
+ memcpy(comm->channels+c, comm->channels+c-nChannels, sizeof(struct ncclChannel));
+ }
+ nChannels = comm->nChannels = c;
+
+ // Create rings array and check all is fine
+ NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
+
+ free(ringRecv);
+ free(ringSend);
+ free(ringPrev);
+ free(ringNext);
+ free(treeUpRecv);
+ free(treeUpSend);
+ free(treeDnRecv);
+ free(treeDnSend);
+
+ return ncclSuccess;
+}
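On the channel-count knobs above: the legacy NCCL_MIN_NRINGS/NCCL_MAX_NRINGS are read first and the newer NCCL_MIN_NCHANNELS/NCCL_MAX_NCHANNELS second, so the new names win whenever both are set, and the result is clamped to the valid range. A standalone sketch of that precedence for the minimum (illustration only; the MAXCHANNELS value used here is an assumption):

// Sketch of the env precedence in ncclMinNchannels() above. Not NCCL code.
#include <cstdio>
#include <cstdlib>

static int envOr(const char* name, int dflt) {
  const char* v = getenv(name);
  return v ? atoi(v) : dflt;
}

int main() {
  const int MAXCHANNELS = 16;                       // assumed compile-time limit
  int minNchannels = 0;
  if (envOr("NCCL_MIN_NRINGS",    -2) != -2) minNchannels = envOr("NCCL_MIN_NRINGS",    -2);
  if (envOr("NCCL_MIN_NCHANNELS", -2) != -2) minNchannels = envOr("NCCL_MIN_NCHANNELS", -2);
  if (minNchannels > MAXCHANNELS) minNchannels = MAXCHANNELS;
  if (minNchannels < 0) minNchannels = 0;
  printf("effective minimum channel count: %d\n", minNchannels);
  return 0;
}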
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
new file mode 100644
index 0000000..ce1772c
--- /dev/null
+++ b/src/graph/paths.cc
@@ -0,0 +1,363 @@
+/*************************************************************************
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "graph.h"
+#include "topo.h"
+#include "comm.h"
+#include "net.h"
+
+// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
+
+struct ncclTopoNodeList {
+ struct ncclTopoNode* list[NCCL_TOPO_MAX_NODES];
+ int count;
+};
+
+static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* node, int t, int64_t id, struct ncclTopoLinkList** path) {
+ for (int i=0; i<system->nodes[t].count; i++) {
+ if (system->nodes[t].nodes[i].id == id) {
+ *path = node->paths[t]+i;
+ return ncclSuccess;
+ }
+ }
+ WARN("Could not find node of type %d id %lx\n", t, id);
+ return ncclInternalError;
+}
+
+static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
+ if (baseNode->paths[baseNode->type] == NULL) {
+ NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
+ }
+
+ // breadth-first search to set all paths to that node in the system
+ struct ncclTopoNodeList nodeList;
+ struct ncclTopoNodeList nextNodeList;
+ nodeList.count = 1; nodeList.list[0] = baseNode;
+ nextNodeList.count = 0;
+ struct ncclTopoLinkList* basePath;
+ NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
+ basePath->count = 0;
+ basePath->width = LOC_WIDTH;
+ basePath->type = LINK_LOC;
+
+ while (nodeList.count) {
+ nextNodeList.count = 0;
+ for (int n=0; n<nodeList.count; n++) {
+ struct ncclTopoNode* node = nodeList.list[n];
+ struct ncclTopoLinkList* path;
+ NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
+ for (int l=0; l<node->nlinks; l++) {
+ struct ncclTopoLink* link = node->links+l;
+ struct ncclTopoNode* remNode = link->remNode;
+ if (remNode->paths[baseNode->type] == NULL) {
+ NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
+ }
+ struct ncclTopoLinkList* remPath;
+ NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
+ int width = std::min(path->width, link->width);
+ if (remPath->width < width) {
+ // Find reverse link
+ for (int l=0; l<remNode->nlinks; l++) {
+ if (remNode->links[l].remNode == node) {
+ remPath->list[0] = remNode->links+l;
+ break;
+ }
+ }
+ if (remPath->list[0] == NULL) {
+ WARN("Failed to find reverse path from remNode id %d type %d nlinks %d to node id %d type %d",
+ remNode->id, remNode->type, remNode->nlinks, node->id, node->type);
+ return ncclInternalError;
+ }
+ // Copy the rest of the path
+ for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
+ remPath->count = path->count + 1;
+ remPath->width = width;
+
+          // Consider the path to be QPI when going through the CPU
+ // Also don't consider LINK_NET as we only care about the NIC->GPU path.
+ int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type;
+ remPath->type = std::max(path->type, type);
+
+ // Add to the list for the next iteration if not already in the list
+ // Disallow GPUs as intermediate steps for now
+ if (remNode->type != GPU) {
+ int i;
+ for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
+ if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
+ }
+ }
+ }
+ }
+ memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
+ }
+ return ncclSuccess;
+}
+
+static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* node) {
+ char line[1024];
+#ifdef ENABLE_TRACE
+ INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id);
+#else
+ sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id);
+ int offset = strlen(line);
+#endif
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+ if (node->paths[t] == NULL) continue;
+ for (int n = 0; n<system->nodes[t].count; n++) {
+#ifdef ENABLE_TRACE
+ line[0] = 0;
+ int offset = 0;
+ for (int i=0; i<node->paths[t][n].count; i++) {
+ struct ncclTopoLink* link = node->paths[t][n].list[i];
+ struct ncclTopoNode* remNode = link->remNode;
+ sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
+ offset = strlen(line);
+ }
+ INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width);
+#else
+ sprintf(line+offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type);
+ offset = strlen(line);
+#endif
+ }
+ }
+#ifndef ENABLE_TRACE
+ INFO(NCCL_GRAPH, "%s", line);
+#endif
+}
+
+ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) {
+ for (int i=0; i<system->nodes[GPU].count; i++) {
+ printNodePaths(system, system->nodes[GPU].nodes+i);
+ }
+ for (int i=0; i<system->nodes[NET].count; i++) {
+ printNodePaths(system, system->nodes[NET].nodes+i);
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) {
+ // Find the closest CPU to a GPU
+ int minHops = 0;
+ int localCpu = -1;
+ struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU];
+ for (int c=0; c<system->nodes[CPU].count; c++) {
+ int hops = paths[c].count;
+ if (minHops == 0 || hops < minHops) {
+ localCpu = c;
+ minHops = hops;
+ }
+ }
+ if (localCpu == -1) {
+ WARN("Error : could not find CPU close to GPU %d", gpu);
+ return ncclInternalError;
+ }
+ *retCpu = localCpu;
+ return ncclSuccess;
+}
+
+static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int i1, int t2, int i2) {
+ struct ncclTopoNode* cpuNode = system->nodes[CPU].nodes+c;
+ struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1;
+
+ int l=0;
+ // Node 1 -> CPU
+ for (int i=0; i<srcNode->paths[CPU][c].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[CPU][c].list[i];
+ // CPU -> Node 2
+ for (int i=0; i<cpuNode->paths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i];
+
+ // Update path characteristics
+ srcNode->paths[t2][i2].count = l;
+ srcNode->paths[t2][i2].type = LINK_QPI;
+ srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width);
+ return ncclSuccess;
+}
+
+// Remove/free paths for a given type
+static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+ for (int n=0; n<system->nodes[t].count; n++) {
+ struct ncclTopoNode* node = system->nodes[t].nodes+n;
+ free(node->paths[nodeType]);
+ node->paths[nodeType] = NULL;
+ }
+ }
+}
+
+ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
+ // Precompute paths between GPUs/NICs.
+
+ // Remove everything in case we're re-computing
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
+
+ // Set direct paths from/to CPUs. We need them in many cases.
+ for (int c=0; c<system->nodes[CPU].count; c++) {
+ NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
+ }
+
+ // Set direct paths from/to GPUs.
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ // Compute paths to GPU g
+ NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system));
+
+ if (peerInfos == NULL) continue;
+ // Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM
+ struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank;
+ for (int p=0; p<system->nodes[GPU].count; p++) {
+ if (p == g) continue;
+ struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank;
+ int p2p;
+ NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
+ if (p2p == 0) {
+ int shm;
+ NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
+ if (shm == 1) {
+ // We cannot use GPU Direct, so we need all traffic to go through a CPU
+ int cpu;
+ NCCLCHECK(getLocalCpu(system, g, &cpu));
+ NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
+ } else {
+ // We cannot communicate with that peer.
+ system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
+ }
+ }
+ }
+ }
+
+ // Set direct paths from/to NICs.
+ for (int n=0; n<system->nodes[NET].count; n++) {
+ struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;
+ NCCLCHECK(ncclTopoSetPaths(netNode, system));
+
+ if (peerInfos == NULL) continue;
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) == 0) {
+ // We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths
+ // to go through a CPU
+ int localCpu;
+ NCCLCHECK(getLocalCpu(system, g, &localCpu));
+ NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g));
+ NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n));
+ }
+ }
+ }
+
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) {
+ int *domains;
+ int64_t *ids;
+ NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
+ NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count));
+ int myDomain = 0;
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ domains[g] = g;
+ ids[g] = gpu->id;
+ for (int p=0; p<g; p++) {
+ if (gpu->paths[GPU][p].count > 0) {
+ domains[g] = std::min(domains[g], domains[p]);
+ }
+ }
+ if (gpu->rank == comm->rank) myDomain = domains[g];
+ }
+
+ int ngpus = system->nodes[GPU].count;
+ for (int i=0; i<ngpus; i++) {
+ if (domains[i] == myDomain) continue;
+ struct ncclTopoNode* gpu = NULL;
+ int g;
+ for (g=0; g<system->nodes[GPU].count /* This one varies over the loops */; g++) {
+ gpu = system->nodes[GPU].nodes+g;
+ if (gpu->id == ids[i]) break; else gpu=NULL;
+ }
+ if (gpu == NULL) {
+ WARN("Could not find id %lx", ids[i]);
+ free(domains);
+ free(ids);
+ return ncclInternalError;
+ }
+
+ // Remove GPUs I can't access (even indirectly) from my view of the node
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+ for (int n=0; n<system->nodes[t].count; n++) {
+ struct ncclTopoNode* node = system->nodes[t].nodes+n;
+ if (node == gpu) continue;
+ for (int l=0; l<node->nlinks; l++) {
+ while (l<node->nlinks && node->links[l].remNode == gpu) {
+ if (l<node->nlinks-1)
+ memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
+ node->nlinks--;
+ }
+ if (l<node->nlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) {
+ node->links[l].remNode--;
+ }
+ }
+ }
+ }
+ if (g != system->nodes[GPU].count-1)
+ memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode));
+ system->nodes[GPU].count--;
+ }
+
+ comm->localRanks = system->nodes[GPU].count;
+ if (system->nodes[GPU].count == comm->nRanks) {
+ // Trim network
+ ncclTopoRemovePathType(system, NET);
+ system->nodes[NET].count = 0;
+ }
+ free(domains);
+ free(ids);
+ return ncclSuccess;
+}
+
+static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) {
+ int nvlSpeed = 0;
+ int nvlPeers = 0;
+ int pciSpeed = 0;
+ for (int l=0; l<node->nlinks; l++) {
+ if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width;
+ if (node->links[l].remNode->type == GPU) nvlPeers++; else nvlPeers = 2;
+ if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width;
+ }
+ *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) {
+ // Compute max speed to try to accelerate the search.
+ system->maxSpeed = LOC_WIDTH;
+
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ NCCLCHECK(getGpuSpeed(system->nodes[GPU].nodes+g, &system->maxSpeed));
+ }
+ if (system->nodes[NET].count) {
+ // Try to assign one NIC per GPU
+ int netMaxSpeed = 0;
+ int netMaxSpeedCount = 0;
+ for (int n=0; n<system->nodes[NET].count; n++) {
+ int maxSpeed = 0;
+ struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ maxSpeed = std::max(maxSpeed, net->paths[GPU][g].width);
+ }
+ if (maxSpeed > netMaxSpeed) {
+ netMaxSpeed = maxSpeed;
+ netMaxSpeedCount = 1;
+ } else if (maxSpeed == netMaxSpeed) {
+ netMaxSpeedCount++;
+ }
+ }
+ system->maxSpeed = std::min(system->maxSpeed, netMaxSpeedCount*NET_WIDTH);
+ }
+ return ncclSuccess;
+}
+
+void ncclTopoFree(struct ncclTopoSystem* system) {
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
+ free(system);
+}
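The breadth-first search in ncclTopoSetPaths() above maintains two metrics per path: the width, which is the minimum width over all links on the path, and the type, which is the maximum (worst) link type, with a hop through a CPU counted as LINK_QPI. A toy sketch of that folding over a fixed hop list (illustration only; the enum values below are assumptions, the real ones live in src/graph/topo.h):

// Toy illustration of the path width/type propagation rule. Not NCCL code.
#include <cstdio>
#include <algorithm>

enum LinkType { LINK_LOC = 0, LINK_NVL = 1, LINK_PCI = 2, LINK_QPI = 3 };  // assumed ordering

struct Hop { int width; LinkType type; bool throughCpu; };

int main() {
  // Hypothetical GPU -> PCI switch -> CPU -> NIC path.
  Hop path[] = { {24, LINK_PCI, false}, {24, LINK_PCI, true}, {12, LINK_PCI, false} };
  int width = 1 << 30;
  int type  = LINK_LOC;
  for (const Hop& h : path) {
    width = std::min(width, h.width);                               // narrowest link wins
    type  = std::max(type, (int)(h.throughCpu ? LINK_QPI : h.type)); // worst link type wins
  }
  printf("path width = %d, path type = %d (0=LOC 1=NVL 2=PCI 3=QPI)\n", width, type);
  return 0;
}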
diff --git a/src/graph/rings.cc b/src/graph/rings.cc
new file mode 100644
index 0000000..5aacbb5
--- /dev/null
+++ b/src/graph/rings.cc
@@ -0,0 +1,57 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+
+#define MAXWIDTH 20
+#define PREFIXLEN 15
+#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
+void dumpLine(int* values, int nranks, const char* prefix) {
+ int prefixlen = strlen(prefix);
+ char line[STRLENGTH+1];
+ line[STRLENGTH] = '\0';
+ memset(line, ' ', STRLENGTH);
+ strncpy(line, prefix, PREFIXLEN);
+ for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
+ INFO(NCCL_INIT,"%s", line);
+}
+
+ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
+ for (int r=0; r<nrings; r++) {
+ char prefix[30];
+ /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
+ dumpLine(prev+r*nranks, nranks, prefix);
+ sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
+ dumpLine(next+r*nranks, nranks, prefix);*/
+
+ int current = rank;
+ for (int i=0; i<nranks; i++) {
+ rings[r*nranks+i] = current;
+ current = next[r*nranks+current];
+ }
+ sprintf(prefix, "Channel %02d/%02d : ", r, nrings);
+ if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
+ if (current != rank) {
+ WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
+ return ncclInternalError;
+ }
+ // Check that all ranks are there
+ for (int i=0; i<nranks; i++) {
+ int found = 0;
+ for (int j=0; j<nranks; j++) {
+ if (rings[r*nranks+j] == i) {
+ found = 1;
+ break;
+ }
+ }
+ if (found == 0) {
+ WARN("Error : ring %d does not contain rank %d", r, i);
+ return ncclInternalError;
+ }
+ }
+ }
+ return ncclSuccess;
+}
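
ncclBuildRings() above expands each channel by following the per-rank "next" table until it returns to the starting rank, then checks that the cycle covers every rank. A minimal standalone sketch of that walk on a hypothetical 4-rank next table (not taken from a real topology):

#include <cstdio>

int main() {
  const int nranks = 4;
  // next[r] = rank that follows r in the ring; here the cycle is 0->2->1->3->0
  int next[nranks] = {2, 3, 1, 0};
  int ring[nranks];
  int current = 0;                 // start the walk at our own rank, as the code above does
  for (int i = 0; i < nranks; i++) {
    ring[i] = current;
    current = next[current];
  }
  // current must be back at the start, otherwise the ring does not loop
  printf("loops back: %s, ring:", current == 0 ? "yes" : "no");
  for (int i = 0; i < nranks; i++) printf(" %d", ring[i]);
  printf("\n");                    // prints: loops back: yes, ring: 0 2 1 3
  return 0;
}
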
diff --git a/src/graph/rings.h b/src/graph/rings.h
new file mode 100644
index 0000000..c52b1ca
--- /dev/null
+++ b/src/graph/rings.h
@@ -0,0 +1,7 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next);
diff --git a/src/graph/search.cc b/src/graph/search.cc
new file mode 100644
index 0000000..3a8b4e7
--- /dev/null
+++ b/src/graph/search.cc
@@ -0,0 +1,594 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "graph.h"
+#include "topo.h"
+
+static ncclResult_t ncclTopoFollowPath(struct ncclTopoGraph* graph, struct ncclTopoLinkList* path, struct ncclTopoNode** node, int width, int typeSave) {
+ if (path->count == 0) return ncclSuccess;
+
+ *node = NULL;
+ if (width > 0) {
+ if (path->type > graph->type) return ncclSuccess;
+ graph->type = std::max(graph->type, path->type);
+ graph->nHops += path->count;
+ } else {
+ graph->type = typeSave;
+ graph->nHops -= path->count;
+ }
+
+ for (int i=0; i<path->count; i++) {
+ if (path->list[i]->width < width) {
+ // Can't follow this path, rewind and exit
+ for (int j=0; j<i; j++) path->list[j]->width += width;
+ return ncclSuccess;
+ }
+ path->list[i]->width -= width;
+ }
+ *node = path->list[path->count-1]->remNode;
+ return ncclSuccess;
+}
+
+static int gpuPciWidth(struct ncclTopoNode* gpu) {
+ for (int l=0; l<gpu->nlinks; l++) {
+ struct ncclTopoLink* gpuLink = gpu->links+l;
+ if (gpuLink->type != LINK_PCI) continue;
+ struct ncclTopoNode* pci = gpuLink->remNode;
+ for (int l=0; l<pci->nlinks; l++) {
+ struct ncclTopoLink* pciLink = pci->links+l;
+ if (pciLink->remNode != gpu) continue;
+ return std::min(gpuLink->width, pciLink->width);
+ }
+ }
+ return -1;
+}
+
+/* Choose the order in which we try the next GPUs. This is critical for the search
+ to quickly converge to the best solution even if it eventually times out. */
+struct ncclGpuScore {
+ int g; // Retain the index
+ int startIndex; // Least important
+ int intraNhops;
+ int intraWidth;
+ int interNhops;
+ int interPciWidth;
+ int interWidth; // Most important
+};
+
+static int cmpScore(const void * g1, const void * g2) {
+ struct ncclGpuScore *s1 = (struct ncclGpuScore*)g1;
+ struct ncclGpuScore *s2 = (struct ncclGpuScore*)g2;
+ int d;
+ if ((d = (s2->interWidth - s1->interWidth))) return d;
+ if ((d = (s2->interPciWidth - s1->interPciWidth))) return d;
+ if ((d = (s1->interNhops - s2->interNhops))) return d;
+ if ((d = (s2->intraWidth - s1->intraWidth))) return d;
+ if ((d = (s1->intraNhops - s2->intraNhops))) return d;
+ return s1->startIndex - s2->startIndex;
+}
+
+static int cmpIntraScores(struct ncclGpuScore* scores, int count) {
+ int intraWidth = scores[0].intraWidth;
+ int intraNhops = scores[0].intraNhops;
+ for (int i=1; i<count; i++) {
+ if (scores[i].intraWidth != intraWidth || scores[i].intraNhops != intraNhops) return 1;
+ }
+ return 0;
+}
+
+static ncclResult_t getNetPaths(struct ncclTopoSystem* system, const uint64_t flag, struct ncclTopoLinkList** netPaths) {
+ for (int n=0; n<system->nodes[NET].count; n++) {
+ if (system->nodes[NET].nodes[n].used & flag) {
+ *netPaths=system->nodes[NET].nodes[n].paths[GPU];
+ return ncclSuccess;
+ }
+ }
+ return ncclInternalError;
+}
+
+ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* gpu, int* next, int* countPtr, int sortNet) {
+ const uint64_t flag = 1ULL<<(graph->nChannels);
+ int ngpus = system->nodes[GPU].count;
+ struct ncclTopoLinkList* paths = gpu->paths[GPU];
+ struct ncclTopoLinkList* netPaths = NULL;
+ if (sortNet) NCCLCHECK(getNetPaths(system, flag, &netPaths));
+
+ struct ncclGpuScore scores[NCCL_TOPO_MAX_NODES];
+ memset(scores, 0, ngpus*sizeof(struct ncclGpuScore));
+ int start = gpu-system->nodes[GPU].nodes;
+ int count = 0;
+ for (int i=1; i<ngpus; i++) {
+ int g = (start+i)%ngpus;
+ if (paths[g].count == 0) continue; // There is no path to that GPU
+ if (system->nodes[GPU].nodes[g].used & flag) continue;
+ scores[count].g = g;
+ scores[count].startIndex = i;
+ scores[count].intraNhops = paths[g].count;
+ scores[count].intraWidth = paths[g].width;
+ if (netPaths) {
+ scores[count].interNhops = netPaths[g].count;
+ scores[count].interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g);
+ scores[count].interWidth = netPaths[g].width;
+ }
+ count++;
+ }
+
+ // Sort GPUs
+ qsort(scores, count, sizeof(struct ncclGpuScore), cmpScore);
+
+ // Check if all GPUs have the same intra-node score, in which case we go in reverse order for sortNet = -1
+ if (sortNet == -1 && cmpIntraScores(scores, count) == 0) {
+ for (int i=0; i<count; i++) next[i] = scores[count-1-i].g;
+ } else {
+ for (int i=0; i<count; i++) next[i] = scores[i].g;
+ }
+ *countPtr = count;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time);
+
+#define NCCL_SEARCH_TIMEOUT (1ULL<<20) // This should keep all searches within a second or so.
+
+#define FORCED_ORDER_PCI 1
+#define FORCED_ORDER_REPLAY 2
+
+ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int step, int* g) {
+ *g = -1;
+ if (graph->nChannels == 0) return ncclInternalError;
+ int ngpus = system->nodes[GPU].count;
+ int nextRank = graph->intra[(graph->nChannels-1)*ngpus+step+1];
+ for (int i=0; i<ngpus; i++) if (system->nodes[GPU].nodes[i].rank == nextRank) {
+ *g = i;
+ return ncclSuccess;
+ }
+ if (*g == -1) return ncclInternalError;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time);
+
+ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoLinkList* paths, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time, int g, int speed) {
+ int typeSave = graph->type;
+ const uint64_t flag = 1ULL<<(graph->nChannels);
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, speed, typeSave));
+ if (gpu) {
+ gpu->used ^= flag;
+ NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, maxSpeed, time));
+ gpu->used ^= flag;
+ if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, -speed, typeSave));
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
+ // 0. When we are trying to increase speedIntra, do not copy if the solution has fewer channels,
+ // since that would likely impact the ring algorithms too.
+ if (graph->speedIntra > graph->speedInter && graph->nChannels < refGraph->nChannels) return ncclSuccess;
+
+ // 1. Try to get better bandwidth
+ if (graph->nChannels*graph->speedIntra < refGraph->nChannels*refGraph->speedIntra) return ncclSuccess;
+ if (graph->nChannels*graph->speedIntra > refGraph->nChannels*refGraph->speedIntra) {
+ *copy = 1;
+ return ncclSuccess;
+ }
+ // 2. Give an advantage when all channels are the same
+ if (graph->nChannels > 1 && graph->sameChannels && refGraph->sameChannels == 0) {
+ *copy = 1;
+ return ncclSuccess;
+ }
+ // 3. Less hops
+ if (graph->nHops < refGraph->nHops) *copy = 1;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time) {
+ if ((*time) <= 0) return ncclSuccess;
+ (*time)--;
+
+ int ngpus = system->nodes[GPU].count;
+ if (step == ngpus) {
+ // Determine whether we found a better solution or not
+ int copy = 0;
+ int sameChannels = graph->sameChannels;
+ if (graph->nChannels > 0) {
+ int* intra = graph->intra+graph->nChannels*ngpus;
+ for (int g=0; g<ngpus; g++) if (intra[g] != intra[g-ngpus]) graph->sameChannels = 0;
+ }
+ graph->nChannels++;
+ NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, &copy));
+ if (copy) {
+ memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph));
+ if (graph->nChannels*graph->speedIntra == maxSpeed) *time = -1;
+ }
+ if (graph->nChannels < MAXCHANNELS/2) {
+ NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, maxSpeed, time));
+ }
+ graph->nChannels--;
+ graph->sameChannels = sameChannels;
+ return ncclSuccess;
+ }
+ graph->intra[graph->nChannels*ngpus+step] = gpu->rank;
+ if (step == backToNet) {
+ // first get back to NIC
+ if (system->nodes[NET].count) {
+ int maxWidth = 0;
+ struct ncclTopoLinkList* paths = gpu->paths[NET];
+ for (int n=0; n<system->nodes[NET].count; n++) {
+ if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
+ maxWidth = std::max(paths[n].width, maxWidth);
+ }
+ for (int n=0; n<system->nodes[NET].count; n++) {
+ if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
+ if (paths[n].width == maxWidth) {
+ struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+ int typeSave = graph->type;
+ NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, graph->speedInter, typeSave));
+ if (net) {
+ graph->inter[graph->nChannels*2+1] = net->id;
+ NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, maxSpeed, time));
+ NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, -graph->speedInter, typeSave));
+ }
+ }
+ }
+ }
+ } else if (step < system->nodes[GPU].count-1) {
+ // Go to next GPU
+ struct ncclTopoLinkList* paths = gpu->paths[GPU];
+ int next[NCCL_TOPO_MAX_NODES];
+ int count;
+ if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
+ next[0] = step+1;
+ count = 1;
+ } else if (forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order
+ NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next));
+ count = 1;
+ } else { // Normal search
+ NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 ));
+ }
+ for (int i=0; i<count; i++) {
+ int g = next[i];
+ int nvlink = graph->nvlink;
+ graph->nvlink &= paths[g].type <= LINK_NVL ? 1 : 0;
+ int speed = graph->speedIntra;
+ if (paths[g].type == LINK_QPI) speed = INTEL_P2P_OVERHEAD(speed);
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, step+1, backToNet, backToFirstRank, forcedOrder, maxSpeed, time, g, speed));
+ graph->nvlink = nvlink;
+ }
+ } else if (step == backToFirstRank) {
+ // Find first GPU and loop back to it
+ int g;
+ int rank = graph->intra[graph->nChannels*ngpus];
+ for (g=0; g<ngpus; g++) {
+ if (system->nodes[GPU].nodes[g].rank == rank) break;
+ }
+ if (g == ngpus) {
+ WARN("Could not find GPU with rank %d\n", rank);
+ return ncclInternalError;
+ }
+ struct ncclTopoLinkList* paths = gpu->paths[GPU];
+ struct ncclTopoNode* firstGpu = system->nodes[GPU].nodes+g;
+ int typeSave = graph->type;
+ NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, graph->speedIntra, typeSave));
+ if (firstGpu) {
+ NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, maxSpeed, time));
+ NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, -graph->speedIntra, typeSave));
+ }
+ } else {
+ // Next path
+ NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, maxSpeed, time));
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int maxSpeed, int* time) {
+ const uint64_t flag = 1ULL<<(graph->nChannels);
+ const int speed = graph->speedInter;
+ for (int n=0; n<system->nodes[NET].count; n++) {
+ struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+ struct ncclTopoNode* gpu;
+ if (net->used == 0) {
+ graph->inter[graph->nChannels*2] = net->id;
+ for (int i=0; i<system->nodes[NET].count; i++) {
+ if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
+ }
+ struct ncclTopoLinkList* paths = net->paths[GPU];
+
+ // First try the PCI order to set a reference
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, speed));
+ // Then try to replay the last channel
+ if (graph->nChannels > 0) {
+ int g;
+ NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, g, speed));
+ }
+
+ // Then try the most local GPUs
+ int maxWidth = 0, minHops = 0xfffffff;
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ if (paths[g].width > maxWidth) {
+ maxWidth = paths[g].width;
+ minHops = paths[g].count;
+ } else if (paths[g].width == maxWidth && paths[g].count < minHops) {
+ minHops = paths[g].count;
+ }
+ }
+ if (maxWidth >= speed) {
+ // In the first loop, avoid using GPUs in both directions between channels (one channel
+ // sending from that GPU and one channel receiving to that GPU), since that usually leads
+ // to lower BW.
+ for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) {
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ if (paths[g].width == maxWidth && paths[g].count == minHops) {
+ gpu = system->nodes[GPU].nodes+g;
+ int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1;
+ if (tryGpuBidir == gpuUsed) {
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, 0, maxSpeed, time, g, speed));
+ }
+ }
+ }
+ }
+ }
+ for (int i=0; i<system->nodes[NET].count; i++) {
+ if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
+ }
+ }
+ }
+ return ncclSuccess;
+}
+
+/* Search Patterns
+ *
+ * Intra-node
+ * Ring : GPU a -> GPU b -> .. -> GPU x -> GPU a
+ * (=Split Tree Loop)
+ * Tree : GPU a -> GPU b -> .. -> GPU x
+ * (=Split Tree)
+ *
+ * Inter-node
+ * Ring : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic)
+ * Tree : NET n -> GPU a -> GPU b -> .. -> GPU x
+ * `--> NET n (or m if crossNic)
+ * Split Tree : NET n -> GPU a -> GPU b -> .. -> GPU x
+ * `--> NET n (or m if crossNic)
+ * Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a
+ * `--> NET n (or m if crossNic)
+ */
+ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) {
+ if (system->nodes[NET].count) {
+ if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1;
+ else if (pattern == NCCL_TOPO_PATTERN_TREE) *backToNet = 0;
+ else *backToNet = 1;
+ if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
+ else *backToFirstRank = -1;
+ } else {
+ *backToNet = -1;
+ if (pattern == NCCL_TOPO_PATTERN_RING || pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1;
+ else *backToFirstRank = -1;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time) {
+ int backToNet, backToFirstRank;
+ NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
+ if (system->nodes[NET].count) {
+ // Start from NET
+ ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, maxSpeed, time);
+ } else {
+ // Start from GPU 0
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, graph->speedIntra));
+ if (graph->nChannels > 0) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, 0, graph->speedIntra));
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, 0, maxSpeed, time, 0, graph->speedIntra));
+ }
+ return ncclSuccess;
+}
+
+/* Parse user-defined rings. The format looks like:
+ * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
+ * Rings with a non-matching number of ranks are ignored so we can provide
+ * rings for multiple cases.
+ */
+#define MAX_ENV_RANKS 512
+static ncclResult_t parseGraph(const char* str, int* nChannelsRet, int ngpus, int* channels) {
+ int ranks[MAX_ENV_RANKS];
+ int nChannels = 0;
+ int rank = 0;
+ int offset = 0;
+ int status = 0; // 0 : between numbers, 1 : inside number
+ do {
+ int digit = str[offset] - '0';
+ if (digit >= 0 && digit <= 9) {
+ if (status == 0) {
+ ranks[rank] = digit;
+ status = 1;
+ } else {
+ ranks[rank] = ranks[rank]*10+digit;
+ }
+ } else {
+ if (status == 1) {
+ rank++;
+ if (rank == MAX_ENV_RANKS) goto end;
+ }
+ status = 0;
+ if (str[offset] == '|' || str[offset] == '\0') {
+ // Ignore if ngpus doesn't match
+ if (rank != ngpus) goto newchannel;
+
+ for (int r=0; r<ngpus; r++) {
+ int rank = ranks[r];
+ // Ignore if ranks are out of bounds
+ if (rank < 0 || rank >= ngpus) goto newchannel;
+ // Ignore if ranks are duplicate
+ for (int i=0; i<r; i++)
+ if (ranks[i] == rank) goto newchannel;
+
+ channels[nChannels*ngpus+r] = rank;
+ }
+ nChannels++;
+newchannel:
+ rank = 0;
+ }
+ }
+ } while (str[offset++] != 0);
+end:
+ *nChannelsRet = nChannels;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+ int ngpus = system->nodes[GPU].count;
+ int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
+ graph->speedIntra = graph->speedInter = 0;
+ if (graph->crossNic == 2) graph->crossNic = 0;
+ graph->nvlink = 0;
+ graph->type = LINK_LOC;
+ graph->nChannels = 0;
+ graph->sameChannels = 1;
+
+ char* str = getenv("NCCL_GRAPH");
+ if (str) {
+ NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra));
+ for (int i=0; i<graph->nChannels*ngpus; i++) {
+ // Translate gpu numbers into ranks
+ graph->intra[i] = system->nodes[GPU].nodes[graph->intra[i]].rank;
+ }
+ // TODO : let user specify NICs
+ graph->inter[0] = graph->inter[1] = 0;
+ graph->speedIntra = graph->speedInter = PCI_WIDTH+2;
+ graph->nvlink = 0;
+ if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
+ // Reverse the loop
+ for (int c=0; c<graph->nChannels; c++) {
+ for (int i=0; i<=ngpus/2; i++) {
+ int tmp = graph->intra[ngpus*c+i];
+ graph->intra[ngpus*c+i] = graph->intra[ngpus*c+(ngpus-i)%ngpus];
+ graph->intra[ngpus*c+(ngpus-i)%ngpus] = tmp;
+ }
+ }
+ }
+ if (graph->nChannels) return ncclSuccess;
+ }
+
+ if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
+
+ struct ncclTopoGraph tmpGraph;
+ memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
+ int bestSpeed = 0;
+
+ // First try crossnic, then decrease speed and finally increase speedIntra.
+ tmpGraph.speedIntra = tmpGraph.speedInter = system->maxWidth;
+ int maxSpeed = system->maxSpeed;
+ tmpGraph.pattern = graph->pattern;
+
+search:
+ int time = NCCL_SEARCH_TIMEOUT;
+ tmpGraph.nvlink = 1;
+ tmpGraph.nChannels = 0;
+ tmpGraph.sameChannels = 1;
+ NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, maxSpeed, &time));
+#if 0
+ printf("Pattern %d, crossNic %d, Speed %d/%d, type %d -> nChannels %dx%d/%d %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.type, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
+ for (int c=0; c<graph->nChannels; c++) {
+ printf("%2d : ", c);
+ for (int g=0; g<ngpus; g++) {
+ printf("%d ", graph->intra[c*ngpus+g]);
+ }
+ printf("\n");
+ }
+#endif
+ if (time == -1) goto done;
+ // We already have a solution and we timed out so lower speed will just timeout as well
+ if (time == 0 && graph->nChannels > 0) goto done;
+ if ((graph->nChannels > 0) && (bestSpeed == 0)) bestSpeed = graph->speedIntra;
+
+ if (tmpGraph.speedIntra == tmpGraph.speedInter) {
+ // First pass, we don't have a solution yet; try to go slower.
+
+ // Try a simpler tree
+ if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
+ tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+ goto search;
+ }
+ if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
+ tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
+ goto search;
+ }
+ tmpGraph.pattern = graph->pattern;
+
+ if (tmpGraph.type < LINK_QPI) {
+ tmpGraph.type += 1;
+ goto search;
+ }
+ tmpGraph.type = graph->type;
+
+ if (crossNic && tmpGraph.crossNic == 0) {
+ // Try again with crossNic if permitted
+ tmpGraph.crossNic = crossNic;
+ goto search;
+ }
+ tmpGraph.crossNic = graph->crossNic;
+
+ // Try to reduce speed per channel
+ tmpGraph.speedIntra = tmpGraph.speedInter -= 3;
+ if (tmpGraph.speedIntra >= bestSpeed/2 && tmpGraph.speedIntra >= 3) goto search;
+ }
+
+done:
+ // We have a solution now. See if we can increase speedIntra
+ if (tmpGraph.speedIntra == tmpGraph.speedInter) {
+ time = -1;
+ memcpy(&tmpGraph, graph, sizeof(tmpGraph));
+ }
+ if (time != 0 && tmpGraph.pattern != NCCL_TOPO_PATTERN_RING && tmpGraph.speedIntra == graph->speedIntra) {
+ // Try to increase the intra speed only but keeping nChannels the same
+ tmpGraph.speedIntra += 3;
+ maxSpeed = tmpGraph.speedIntra * graph->nChannels;
+ if (tmpGraph.speedIntra <= tmpGraph.speedInter*2) goto search;
+ }
+
+ if (graph->nChannels == 0) {
+ WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
+ for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].rank;
+ graph->inter[0] = graph->inter[1] = 0;
+ graph->speedIntra = graph->speedInter = 3;
+ graph->nvlink = 0;
+ graph->nChannels = 1;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+ INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %d/%d, nvlink %d, type %d, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, graph->nvlink, graph->type, graph->sameChannels);
+ int ngpus = system->nodes[GPU].count;
+
+ char line[1024];
+ for (int c=0; c<graph->nChannels; c++) {
+ sprintf(line, "%2d :", c);
+ int offset = strlen(line);
+ if (system->nodes[NET].count > 0) {
+ sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c]);
+ offset = strlen(line);
+ }
+ for (int i=0; i<ngpus; i++) {
+ sprintf(line+offset, " %s/%d", topoNodeTypeStr[GPU], graph->intra[ngpus*c+i]);
+ offset = strlen(line);
+ }
+ if (system->nodes[NET].count > 0) {
+ sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c+1]);
+ offset = strlen(line);
+ }
+ INFO(NCCL_GRAPH, "%s", line);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* dev) {
+ *dev = graph->inter[(channelId%graph->nChannels)*2+dir];
+ return ncclSuccess;
+}
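
The NCCL_GRAPH string consumed by parseGraph() above is a '|'-separated list of channels, each channel a space-separated GPU ordering; channels whose length does not match ngpus, or that contain out-of-range or duplicate entries, are silently dropped. A minimal standalone sketch of the same accept/reject rules (assuming ngpus = 4, and using std::stringstream instead of the character state machine above):

#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// Returns the accepted channels, each a permutation of 0..ngpus-1.
static std::vector<std::vector<int>> parseGraphEnv(const std::string& str, int ngpus) {
  std::vector<std::vector<int>> channels;
  std::stringstream byChannel(str);
  std::string chan;
  while (std::getline(byChannel, chan, '|')) {
    std::stringstream byRank(chan);
    std::vector<int> ranks;
    int r;
    while (byRank >> r) ranks.push_back(r);
    if ((int)ranks.size() != ngpus) continue;            // wrong length -> ignore
    std::vector<bool> seen(ngpus, false);
    bool ok = true;
    for (int v : ranks) {
      if (v < 0 || v >= ngpus || seen[v]) { ok = false; break; }  // out of range / duplicate
      seen[v] = true;
    }
    if (ok) channels.push_back(ranks);
  }
  return channels;
}

int main() {
  // Second entry is ignored (only 2 ranks), third is ignored (duplicate 1).
  auto ch = parseGraphEnv("0 1 2 3|1 0|0 1 1 3|3 2 1 0", 4);
  printf("accepted %zu channels\n", ch.size());          // prints: accepted 2 channels
  return 0;
}
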
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
new file mode 100644
index 0000000..a1b3209
--- /dev/null
+++ b/src/graph/topo.cc
@@ -0,0 +1,641 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "graph.h"
+#include "topo.h"
+#include "comm.h"
+#include "nvmlwrap.h"
+#include "net.h"
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define BUSID_SIZE (sizeof("0000:00:00.0"))
+#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+
+const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
+
+const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" };
+
+/******************************************************************/
+/******************* Graph Creation Functions *********************/
+/******************************************************************/
+static int getNumaId(char *path) {
+ char npath[PATH_MAX];
+ snprintf(npath, PATH_MAX, "%s/numa_node", path);
+ npath[PATH_MAX-1] = '\0';
+
+ int numaId = -1;
+ FILE *file = fopen(npath, "r");
+ if (file == NULL) return -1;
+ if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
+ fclose(file);
+
+ return numaId;
+}
+
+static ncclResult_t getPciPath(char* busId, char** path) {
+ for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
+ char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
+ memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
+ memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
+ *path = realpath(busPath, NULL);
+ if (*path == NULL) {
+ WARN("Could not find real path of %s", busPath);
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+// Get an int64 from a PCI path. For example, /sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
+ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
+ char* str = path+offset;
+ // Remove trailing "/"
+ if (*str == '/') str--;
+ // Find next /
+ while (*str != '/') str--;
+ str++;
+ NCCLCHECK(busIdToInt64(str, id));
+ return ncclSuccess;
+}
+
+static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) {
+ *index = -1;
+ for (int i=0; i<system->nodes[GPU].count; i++) {
+ if (system->nodes[GPU].nodes[i].id == id) {
+ *index = i;
+ }
+ }
+ return ncclSuccess;
+}
+
+
+static ncclResult_t getPath(int64_t id, char** path) {
+ char busId[] = "0000:00:00.0";
+ NCCLCHECK(int64ToBusId(id, busId));
+ NCCLCHECK(getPciPath(busId, path));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) {
+ char busId[BUSID_SIZE];
+ CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
+ NCCLCHECK(getPciPath(busId, path));
+ return ncclSuccess;
+}
+
+
+int interCpuWidth = 0;
+int cpuPciWidth = 0;
+
+static ncclResult_t getCpuWidths() {
+ // Check if already detected
+ if (interCpuWidth + cpuPciWidth) return ncclSuccess;
+
+ // Defaults
+ char cpu[256];
+ sprintf(cpu, "Generic");
+ cpuPciWidth = interCpuWidth = PCI_WIDTH;
+
+#ifdef __PPC__
+ sprintf(cpu, "ppc64");
+ interCpuWidth = P9_WIDTH;
+#endif
+#ifdef __x86_64__
+ sprintf(cpu, "x86_64");
+ union {
+ struct {
+ // CPUID 0 String register order
+ uint32_t ebx;
+ uint32_t edx;
+ uint32_t ecx;
+ };
+ char vendor[12];
+ } cpuid0;
+
+ asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0));
+ if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel");
+
+ if (strcmp(cpu, "Intel") == 0) {
+ union {
+ struct {
+ int steppingId:4;
+ int model:4;
+ int familyId:4;
+ int processorType:2;
+ int resv0:2;
+ int extModelId:4;
+ int modelId:8;
+ int resv1:4;
+ };
+ uint32_t val;
+ } cpuid1;
+ asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1));
+ if (cpuid1.familyId == 6 && cpuid1.modelId >= 0x55) { // Skylake
+ sprintf(cpu, "Intel/Skylake (or later)");
+ interCpuWidth = SKL_QPI_WIDTH;
+ } else {
+ interCpuWidth = QPI_WIDTH;
+ }
+ }
+#endif
+ INFO(NCCL_GRAPH, "%s CPU (PCI %d, InterCpu %d)", cpu, cpuPciWidth, interCpuWidth);
+ return ncclSuccess;
+}
+
+static ncclResult_t ncclTopoGetInterCpuWidth(int* width) {
+ NCCLCHECK(getCpuWidths());
+ *width = interCpuWidth;
+ return ncclSuccess;
+}
+static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) {
+ NCCLCHECK(getCpuWidths());
+ *width = cpuPciWidth;
+ return ncclSuccess;
+}
+static ncclResult_t ncclTopoGetPciWidth(int* width) {
+ *width = PCI_WIDTH;
+ return ncclSuccess;
+}
+static ncclResult_t ncclTopoGetNetWidth(int* width) {
+ *width = NET_WIDTH;
+ return ncclSuccess;
+}
+
+enum ncclNvLinkDeviceType {
+ ncclNvLinkDeviceUnknown,
+ ncclNvLinkDeviceGpu,
+ ncclNvLinkDeviceSwitch,
+ ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
+};
+
+static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
+ char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class";
+ memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
+ char* rPath = realpath(classPath, NULL);
+ int fd;
+ if ((fd = open(rPath, O_RDONLY)) == -1) {
+ // Could not find device. It might be because we're in a VM and
+ // we don't see the whole machine. This is handled silently so
+ // we don't want to print an INFO error.
+ TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
+ return ncclSystemError;
+ }
+ free(rPath);
+ char pciClass[9];
+ strncpy(pciClass, "0x000000", 9);
+ int len;
+ SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
+ SYSCHECK(close(fd), "close");
+ if (strcmp(pciClass, "0x068000") == 0) {
+ // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
+ *type = ncclNvLinkDeviceSwitch;
+ } else if (strcmp(pciClass, "0x068001") == 0) {
+ // PCI device is of type "Bridge: IBM Device 04ea"
+ *type = ncclNvLinkDeviceBridge;
+ } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
+ || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
+ *type = ncclNvLinkDeviceGpu;
+ } else {
+ *type = ncclNvLinkDeviceUnknown;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) {
+ struct ncclTopoNode* cpuNode = NULL;
+ for (int c=0; c<system->nodes[CPU].count; c++) {
+ if (system->nodes[CPU].nodes[c].id == numaId) cpuNode = system->nodes[CPU].nodes+c;
+ }
+ if (cpuNode == NULL) { // Create CPU
+ NCCLCHECK(ncclTopoCreateNode(system, &cpuNode, CPU, numaId));
+ }
+ NCCLCHECK(ncclTopoConnectNodes(node, cpuNode, linkType, linkWidth));
+ NCCLCHECK(ncclTopoConnectNodes(cpuNode, node, linkType, linkWidth));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) {
+ struct ncclTopoNode* nvsNode = NULL;
+
+ int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH;
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ int cudaMajor, cudaMinor;
+ NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor));
+ int maxNvLinks, width;
+ if (cudaMajor < 6) {
+ maxNvLinks = 0;
+ width = 0;
+ } else if (cudaMajor == 6) {
+ maxNvLinks = 4;
+ width = PASCAL_NVLINK_WIDTH;
+ } else {
+ maxNvLinks = 6;
+ width = VOLTA_NVLINK_WIDTH;
+ }
+
+ int nvlinks = 0;
+ for (int l=0; l<maxNvLinks; ++l) {
+ // Check whether we can use this NVLink for P2P
+ unsigned canP2P;
+ if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDevs[g], l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
+
+ // Make sure the Nvlink is up. The previous call should have trained the link.
+ nvmlEnableState_t isActive;
+ if ((wrapNvmlDeviceGetNvLinkState(nvmlDevs[g], l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+
+ // Try to figure out what's on the other side of the NVLink
+ nvmlPciInfo_t remoteProc;
+ if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevs[g], l, &remoteProc) != ncclSuccess) continue;
+
+ // Make a lower case copy of the bus ID for calling ncclDeviceType
+ // PCI system path is in lower case
+ char* p = remoteProc.busId;
+ char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+ lowerId[c] = tolower(p[c]);
+ if (p[c] == 0) break;
+ }
+
+ enum ncclNvLinkDeviceType type;
+ NCCLCHECK(ncclDeviceType(lowerId, &type));
+ if (type == ncclNvLinkDeviceGpu) {
+ int64_t remoteId;
+ NCCLCHECK(busIdToInt64(lowerId, &remoteId));
+ int peer;
+ NCCLCHECK(idToIndex(system, remoteId, &peer));
+ if (peer != -1) {
+ NCCLCHECK(ncclTopoConnectNodes(gpu, system->nodes[GPU].nodes+peer, LINK_NVL, width));
+ nvlinks++;
+ }
+ } else if (type == ncclNvLinkDeviceBridge) {
+ // Nvlink between GPU and CPU (PPC)
+ // Since the remote bridge does not have a valid numa_node, assume we
+ // are connected to the closest CPU.
+ char* path;
+ NCCLCHECK(getPath(gpu->id, &path));
+ int numaId = getNumaId(path);
+ free(path);
+ NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width));
+ nvlinks++;
+ } else { // Nvswitch
+ if (type == ncclNvLinkDeviceUnknown) {
+ // The NVLink is up but we couldn't find the PCI device on the other
+ // side. Assume it's an NVswitch outside a VM.
+ if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId);
+ }
+ if (nvsNode == NULL) { // Create nvswitch
+ NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0));
+ }
+ NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH));
+ NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH));
+ nvlinks++;
+ }
+ }
+ minNvlinks = std::min(minNvlinks, nvlinks);
+ minWidth = std::min(minWidth, width);
+ }
+ int pciWidth;
+ NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
+ system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth;
+ system->maxWidth = minNvlinks ? minWidth : pciWidth;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) {
+ struct ncclTopoNode* lastNode = endNode;
+ int pciWidth;
+ NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
+ // Find intermediate PCI switches
+ int slashCount = 0;
+ int offsetRC = 0;
+ while (offsetRC < strlen(path)) {
+ if (path[offsetRC] == '/') slashCount++;
+ if (slashCount == 4) break;
+ offsetRC++;
+ }
+ int offset = strlen(path);
+ slashCount = 0;
+ while (--offset > offsetRC) {
+ if (path[offset] == '/') {
+ slashCount++;
+ // Find if already existing
+ if ((slashCount%2) == 0) {
+ int64_t pciId;
+ NCCLCHECK(pciPathToInt64(path, offset, offsetRC, &pciId));
+ for (int p=0; p<system->nodes[PCI].count; p++) {
+ if (system->nodes[PCI].nodes[p].id == pciId) {
+ // Found our PCI switch. Attach and stop since the rest should already
+ // be connected
+ NCCLCHECK(ncclTopoConnectNodes(system->nodes[PCI].nodes+p, lastNode, LINK_PCI, pciWidth));
+ NCCLCHECK(ncclTopoConnectNodes(lastNode, system->nodes[PCI].nodes+p, LINK_PCI, pciWidth));
+ return ncclSuccess;
+ }
+ }
+ struct ncclTopoNode* pciNode;
+ NCCLCHECK(ncclTopoCreateNode(system, &pciNode, PCI, pciId));
+ NCCLCHECK(ncclTopoConnectNodes(pciNode, lastNode, LINK_PCI, pciWidth));
+ NCCLCHECK(ncclTopoConnectNodes(lastNode, pciNode, LINK_PCI, pciWidth));
+ lastNode = pciNode;
+ }
+ }
+ }
+ // Then attach to a CPU node
+ int numaId = getNumaId(path);
+ int width;
+ NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
+ NCCLCHECK(ncclTopoConnectCpu(system, numaId, lastNode, LINK_PCI, width));
+ return ncclSuccess;
+}
+
+// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports.
+#include <glob.h>
+#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid"
+uint64_t getIbGuid(char* path) {
+ uint64_t guid = 0ULL;
+ char guidPath[PATH_MAX];
+ snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path);
+ // PATH has a wildcard in it so use glob()
+ glob_t globbuf;
+ glob(guidPath, 0, NULL, &globbuf);
+ if (globbuf.gl_pathc > 0)
+ strncpy(guidPath, globbuf.gl_pathv[0], PATH_MAX);
+ globfree(&globbuf);
+ guidPath[PATH_MAX-1] = '\0';
+ FILE *file = fopen(guidPath, "r");
+ if (file != NULL) {
+ uint64_t a, b, c, d;
+ if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) != EOF) {
+ guid = (a << 48) + (b << 32) + (c<<16) + d;
+ TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid);
+ }
+ fclose(file);
+ }
+ return guid;
+}
+
+struct netInfo {
+ char* path;
+ int64_t nic;
+ uint64_t asic;
+ int port;
+ int net;
+};
+
+ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) {
+ for (int n=0; n<ndev; n++) {
+ struct netInfo* info = netInfos+n;
+ uint64_t ibGuid;
+ info->nic = n;
+ info->asic = n;
+ info->port = 0;
+ info->net = n;
+ if (info->path && (ibGuid = getIbGuid(info->path)) != 0) {
+ info->asic = ibGuid;
+
+ // Ignore PCI subdevice when computing the ID to merge multi-port cards
+ // and make them use the same PCI link.
+ char* path = strdup(info->path);
+ path[strlen(path)-1]='0';
+ NCCLCHECK(pciPathToInt64(path, strlen(path), 0, &info->nic));
+ free(path);
+
+ // Same PCI path -> different ports of the same NIC
+ for (int i=0; i<n; i++) if (netInfos[i].nic == info->nic) info->port++;
+
+ // Same GUID -> same network links as the other NIC
+ for (int i=0; i<n; i++) if (netInfos[i].asic == info->asic && netInfos[i].port == info->port) info->net = netInfos[i].net;
+ }
+ INFO(NCCL_GRAPH, "%s -> %x/%lx/%d/%d", info->path, info->nic, info->asic, info->port, info->net);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ char* path;
+ NCCLCHECK(getPath(gpu->id, &path));
+ NCCLCHECK(ncclTopoCreatePciPath(system, gpu, path));
+ free(path);
+ }
+
+ // Connect the NICs
+ int netDevCount;
+ NCCLCHECK(ncclNetDevices(&netDevCount));
+ int netWidth;
+ NCCLCHECK(ncclTopoGetNetWidth(&netWidth));
+
+ struct netInfo* netInfos;
+ NCCLCHECK(ncclCalloc(&netInfos, netDevCount));
+
+ for (int n=0; n<netDevCount; n++) {
+ ncclResult_t res = ncclNetPciPath(n, &netInfos[n].path);
+ if (res != ncclSuccess) netInfos[n].path = NULL;
+ }
+
+ NCCLCHECK(ncclTopoComputeNetInfo(netInfos, netDevCount));
+
+ for (int n=0; n<netDevCount; n++) {
+ struct netInfo* info = netInfos+n;
+ // Create NIC and attach it to the PCI tree
+ struct ncclTopoNode* nicNode = NULL;
+ for (int i=0; i<system->nodes[NIC].count; i++) {
+ if (system->nodes[NIC].nodes[i].id == info->nic) {
+ nicNode = system->nodes[NIC].nodes+i;
+ break;
+ }
+ }
+ if (!nicNode) {
+ NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic));
+ if (info->path) {
+ // Create the PCI path
+ NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path));
+ } else {
+ // This is probably a virtual NIC. Just attach it directly to CPU 0
+ int width;
+ NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
+ NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width));
+ }
+ }
+ free(info->path);
+
+ // Create the network side
+ struct ncclTopoNode* netNode;
+ NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, n));
+
+ // Use rank to store the net information
+ netNode->rank = info->net;
+
+ NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth));
+ NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth));
+ }
+ free(netInfos);
+
+ // And connect all CPU nodes together
+ for (int n=0; n<system->nodes[CPU].count; n++) {
+ for (int p=0; p<system->nodes[CPU].count; p++) {
+ if (n == p) continue;
+ int width;
+ NCCLCHECK(ncclTopoGetInterCpuWidth(&width));
+ NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width));
+ }
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
+ if (node->type == GPU) {
+ sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank);
+ } else {
+ sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
+ }
+ INFO(NCCL_GRAPH, "%s", line);
+ for (int i=0; i<offset; i++) line[i] = ' ';
+
+ for (int l=0; l<node->nlinks; l++) {
+ struct ncclTopoLink* link = node->links+l;
+ if (link->type == LINK_LOC) continue;
+ if (link->remNode != prevNode) {
+ sprintf(line+offset, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width);
+ int nextOffset = strlen(line);
+ if (link->type == LINK_PCI) {
+ NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
+ } else {
+ if (link->remNode->type == NET) {
+ sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->rank);
+ } else {
+ sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
+ }
+ INFO(NCCL_GRAPH, "%s", line);
+ }
+ }
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
+ INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed);
+ char line[1024];
+ for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
+ INFO(NCCL_GRAPH, "==========================================");
+ NCCLCHECK(ncclTopoPrintPaths(s));
+ return ncclSuccess;
+}
+
+static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) {
+ // Shift all links to have upLink as last link
+ if (upNode) {
+ int l=0;
+ while (node->links[l].remNode != upNode) l++;
+ struct ncclTopoLink upLink;
+ memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink));
+ while (node->links[l+1].remNode) {
+ memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink));
+ l++;
+ }
+ memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink));
+ }
+
+ // Recursively sort the PCI tree
+ for (int l=0; l<node->nlinks; l++) {
+ struct ncclTopoLink* link = node->links+l;
+ if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node));
+ }
+ return ncclSuccess;
+}
+
+// We want the graph to be organized to ease/accelerate traversal:
+// 1. NVLinks (already the case)
+// 2. PCI down
+// 3. PCI up
+// 4. QPI (already the case)
+ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
+ for (int n=0; n<system->nodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
+ struct ncclTopoSystem* s;
+ NCCLCHECK(ncclCalloc(&s, 1));
+ nvmlDevice_t* nvmlDevs;
+ int g = 0;
+ NCCLCHECK(ncclCalloc(&nvmlDevs, comm->nRanks));
+ for (int r=0; r<comm->nRanks; r++) {
+ if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
+ // Consider the GPU as outside of our node if we can't see it through NVML.
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
+ if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+g) != ncclSuccess) continue;
+ g++;
+ struct ncclTopoNode* gpuNode;
+ NCCLCHECK(ncclTopoCreateNode(s, &gpuNode, GPU, comm->peerInfo[r].busId));
+ gpuNode->rank = r;
+ }
+ }
+
+ NCCLCHECK(ncclTopoConnectNVLink(nvmlDevs, s));
+ NCCLCHECK(ncclTopoConnectPCI(s));
+
+ free(nvmlDevs);
+ NCCLCHECK(ncclTopoSortSystem(s));
+ *system = s;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink) {
+ int g1, g2;
+ NCCLCHECK(idToIndex(system, busId1, &g1));
+ NCCLCHECK(idToIndex(system, busId2, &g2));
+ *nvlink = g1 != -1 && g2 != -1 && system->nodes[GPU].nodes[g1].paths[GPU][g2].type == LINK_NVL;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) {
+ int g;
+ NCCLCHECK(idToIndex(system, busId, &g));
+ for (int i=0; i<system->nodes[GPU].count; i++) {
+ if (i == g) continue;
+ if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) {
+ *nvlink = 1;
+ return ncclSuccess;
+ }
+ }
+ *nvlink = 0;
+ return ncclSuccess;
+}
+
+static int pathDistance(struct ncclTopoLinkList* links) {
+ int distance = PATH_PIX;
+ if (links->count > 2) distance = PATH_PXB;
+ for (int l=0; l<links->count; l++) {
+ // PHB if we go through 1 CPU, SYS if we go through 2 CPUs
+ if (links->list[l]->remNode->type == CPU) distance = (distance == PATH_PHB) ? PATH_SYS : PATH_PHB;
+ }
+ return distance;
+}
+
+ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance) {
+ int g1, g2;
+ NCCLCHECK(idToIndex(system, busId1, &g1));
+ NCCLCHECK(idToIndex(system, busId2, &g2));
+ *distance = pathDistance(system->nodes[GPU].nodes[g1].paths[GPU]+g2);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) {
+ int g;
+ NCCLCHECK(idToIndex(system, busId, &g));
+ *distance = pathDistance(system->nodes[GPU].nodes[g].paths[NET]+netDev);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count) {
+ *count = system->nodes[CPU].count;
+ return ncclSuccess;
+}
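
pathDistance() above folds a path into the usual PIX/PXB/PHB/SYS buckets: more than two hops is at least PXB, crossing one CPU promotes the path to PHB, and crossing a second CPU promotes it to SYS. A minimal standalone sketch of that classification (the enum values below are placeholders for the PATH_* constants defined elsewhere in the tree):

#include <cstdio>

enum PathDist { PATH_PIX_, PATH_PXB_, PATH_PHB_, PATH_SYS_ };

// hops = number of links in the path, cpusCrossed = number of CPU nodes traversed.
static PathDist classify(int hops, int cpusCrossed) {
  PathDist d = hops > 2 ? PATH_PXB_ : PATH_PIX_;
  for (int i = 0; i < cpusCrossed; i++) d = (d == PATH_PHB_) ? PATH_SYS_ : PATH_PHB_;
  return d;
}

int main() {
  const char* names[] = {"PIX", "PXB", "PHB", "SYS"};
  printf("%s\n", names[classify(2, 0)]);  // same PCI switch            -> PIX
  printf("%s\n", names[classify(4, 0)]);  // several PCI switches       -> PXB
  printf("%s\n", names[classify(4, 1)]);  // through one CPU            -> PHB
  printf("%s\n", names[classify(6, 2)]);  // through two CPUs (sockets) -> SYS
  return 0;
}
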
diff --git a/src/graph/topo.h b/src/graph/topo.h
new file mode 100644
index 0000000..6b8a2f9
--- /dev/null
+++ b/src/graph/topo.h
@@ -0,0 +1,138 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TOPO_H_
+#define NCCL_TOPO_H_
+
+#include "graph.h"
+#include "core.h"
+
+#define LOC_WIDTH 5000
+#define PASCAL_NVLINK_WIDTH 18
+#define VOLTA_NVLINK_WIDTH 21
+#define PCI_WIDTH 12 // PCI Gen3 x16
+#define QPI_WIDTH 8
+#define SKL_QPI_WIDTH 12
+#define P9_WIDTH 32
+#define NET_WIDTH 12 // 100Gbit
+
+// Intel CPUs convert GPU P2P traffic into 64B PCI TLPs, so GPU-to-GPU
+// traffic consumes more PCI bandwidth.
+#define INTEL_P2P(speed) (speed*9/12)
+#define INTEL_P2P_OVERHEAD(speed) (speed*12/9)
+
+#define NCCL_TOPO_NODE_TYPES 6
+#define GPU 0
+#define PCI 1
+#define NVS 2
+#define CPU 3 // Actually NUMA domains
+#define NIC 4
+#define NET 5
+extern const char* topoNodeTypeStr[];
+
+#define LINK_LOC 0
+#define LINK_NVL 1
+#define LINK_PCI 2
+#define LINK_QPI 3
+#define LINK_NET 4
+extern const char* topoLinkTypeStr[];
+
+struct ncclTopoNode;
+struct ncclTopoLink {
+ int type;
+ int width;
+ struct ncclTopoNode* remNode;
+};
+#define NCCL_TOPO_MAX_LINKS 32
+#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
+#define SELECT_PATH 1
+#define SELECT_LAST 2
+
+#define NET_GDR_MASK 0x70000000
+
+struct ncclTopoLinkList {
+ struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS];
+ int count;
+ int width;
+ int type;
+};
+
+struct ncclTopoNode {
+ int type;
+ int64_t id;
+ int rank;
+ int nlinks;
+ struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
+ // Pre-computed paths to GPUs and NICs
+ struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES];
+ // Used during search
+ uint64_t used;
+};
+
+struct ncclTopoNodeSet {
+ int count;
+ struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES];
+};
+
+struct ncclTopoSystem {
+ struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
+ int maxSpeed;
+ int maxWidth;
+ int searchInitDone;
+};
+
+static ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
+ for (int i=0; i<system->nodes[type].count; i++) {
+ if (system->nodes[type].nodes[i].id == id) {
+ *node = system->nodes[type].nodes+i;
+ return ncclSuccess;
+ }
+ }
+ if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
+ WARN("Error : tried to create too many nodes of type %d\n", type);
+ return ncclInternalError;
+ }
+ struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
+ system->nodes[type].count++;
+ n->type = type;
+ n->id = id;
+ if (type == GPU) {
+ // Create link to itself (used in some corner cases)
+ n->nlinks=1;
+ n->links[0].type = LINK_LOC;
+ n->links[0].remNode = n;
+ n->links[0].width = LOC_WIDTH;
+ }
+ *node = n;
+ return ncclSuccess;
+}
+
+static ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, int width) {
+ // Aggregate links into higher width for NVLink
+ struct ncclTopoLink* link;
+ for (link = node->links; link->remNode; link++) {
+ if (link->remNode == remNode && link->type == type) break;
+ }
+ if (link->remNode == NULL) node->nlinks++;
+ link->type = type;
+ link->remNode = remNode;
+ link->width += width;
+
+ // Sort links in BW descending order
+ struct ncclTopoLink linkSave;
+ memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
+ while (link != node->links) {
+ if ((link-1)->width >= linkSave.width) break;
+ memcpy(link, link-1, sizeof(struct ncclTopoLink));
+ link--;
+ }
+ memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
+
+#endif
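
Two details of ncclTopoConnectNodes() above are worth noting: a second connection between the same pair of nodes with the same link type is merged by adding widths (which is how several NVLinks between two GPUs become one wide link), and each node's link array is kept sorted by descending width. A minimal standalone sketch of that merge-and-sort behavior on a plain vector (simplified; no remNode pointers or fixed-size arrays):

#include <algorithm>
#include <cstdio>
#include <vector>

struct Link { int remNode; int type; int width; };

static void connect(std::vector<Link>& links, int remNode, int type, int width) {
  bool merged = false;
  for (Link& l : links) {
    if (l.remNode == remNode && l.type == type) { l.width += width; merged = true; break; }  // aggregate
  }
  if (!merged) links.push_back({remNode, type, width});
  // Keep the links sorted by descending width, as ncclTopoConnectNodes does.
  std::stable_sort(links.begin(), links.end(),
                   [](const Link& a, const Link& b) { return a.width > b.width; });
}

int main() {
  std::vector<Link> links;
  connect(links, /*remNode=*/1, /*type=*/1, 21);  // first NVLink to node 1
  connect(links, 1, 1, 21);                       // second NVLink -> aggregated to width 42
  connect(links, 2, 2, 12);                       // PCIe link to another node
  for (const Link& l : links) printf("rem %d type %d width %d\n", l.remNode, l.type, l.width);
  // prints: rem 1 type 1 width 42, then rem 2 type 2 width 12
  return 0;
}
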
diff --git a/src/misc/trees.cu b/src/graph/trees.cc
index e53ea0b..722e61b 100644
--- a/src/misc/trees.cu
+++ b/src/graph/trees.cc
@@ -1,12 +1,10 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
-#include "net.h"
-#include "param.h"
+#include "nccl.h"
#define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank)
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
new file mode 100644
index 0000000..87afb2f
--- /dev/null
+++ b/src/graph/tuning.cc
@@ -0,0 +1,212 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "devcomm.h"
+#include "comm.h"
+#include "topo.h"
+
+NCCL_PARAM(Nthreads, "NTHREADS", -2);
+NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);
+
+static int getNthreads(const char* name, int env, int min, int max, int def) {
+ int nt = env;
+ if (nt > 0) {
+ if (nt % WARP_SIZE != 0) {
+ WARN("Invalid %s %d (must be a multiple of %d)", name, nt, WARP_SIZE);
+ nt = max;
+ } else if (nt > max) {
+ WARN("Invalid %s %d (maximum %d).", name, nt, max);
+ nt = max;
+ } else if (nt < min) {
+ WARN("Invalid %s %d (minimum %d).", name, nt, min);
+ nt = min;
+ }
+ } else {
+ nt = def;
+ }
+ return nt;
+}
+
+ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) {
+ int def, set;
+ if (str[0] == '^') {
+ def = 1; set = 0; str++;
+ } else {
+ def = 0; set = 1;
+ }
+ for (int i=0; i<nelems; i++) list[i] = def;
+ char* tokStr = strdup(str);
+ char* tmpStr;
+ char* token = strtok_r(tokStr, ",", &tmpStr);
+ while (token) {
+ for (int i=0; i<nelems; i++)
+ if (strcasecmp(token, elems[i]) == 0) list[i] = set;
+ token = strtok_r(NULL, ",", &tmpStr);
+ }
+ free(tokStr);
+ return ncclSuccess;
+}
+
+static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
+static const char* ncclAlgoStr[] = { "Tree", "Ring" };
+static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" };
+
+// Latencies in us, Bandwidths in GB/s
+// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
+static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 3.6, 8.4 } };
+
+// NVLink, PCI, Network
+#define NCCL_HW_NVLINK 0
+#define NCCL_HW_PCI 1
+#define NCCL_HW_NET 2
+// Tree/Simple is the latency of a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network).
+static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
+{ /* NVLINK */
+ { /* Tree (LL/LL128/Simple)*/ { .5, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { .4, 2.5, 5.7 } },
+ /* PCI */
+ { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 } },
+ /* NET */
+ { /* Tree (LL/LL128/Simple)*/ { 5.0, 7.5, 50 }, /* Ring (LL/LL128/Simple)*/ { .9, 2.5, 6.6 } }
+};
+
+// LL128 max BW for the different collectives
+static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 };
+
+ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph) {
+ int simpleDefaultThreads = (treeGraph->speedIntra*treeGraph->nChannels <= 12) ? 256 : NCCL_MAX_NTHREADS;
+ comm->maxThreads[NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
+ comm->maxThreads[NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+ comm->maxThreads[NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
+
+ INFO(NCCL_INIT, "Threads per block : %d/%d/%d", comm->maxThreads[NCCL_PROTO_LL], comm->maxThreads[NCCL_PROTO_LL128], comm->maxThreads[NCCL_PROTO_SIMPLE]);
+
+ if (comm->nRanks <= 1) return ncclSuccess;
+
+ struct ncclTopoGraph* graphs[2] = { treeGraph, ringGraph };
+ int intraHw[2], hw[2];
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->nvlink ? NCCL_HW_NVLINK : NCCL_HW_PCI;
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
+
+ for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
+ int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) :
+ coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 :
+ comm->nRanks;
+
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ if (coll != ncclCollAllReduce && a == NCCL_ALGO_TREE) continue;
+
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ int speed = comm->nNodes <= 2 ? graphs[a]->speedIntra : graphs[a]->speedInter;
+ float busBw = graphs[a]->nChannels * speed * 1.0;
+
+ // Various model refinements
+ if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/4.0;
+ if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw*120.0/128.0, ll128MaxBw[coll]);
+ if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, comm->nNodes > 1 ? 70.0 : 90.0);
+ if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.0;
+ if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0;
+
+ // Convert bus BW to algorithm BW
+ float ratio = a == NCCL_ALGO_TREE ? .5 : (1.0 * comm->nRanks) / nsteps;
+ comm->bandwidths[coll][a][p] = busBw * ratio;
+
+ comm->latencies[coll][a][p] = baseLat[a][p];
+ if (a == NCCL_ALGO_RING) {
+ float lat = hwLat[hw[a]][a][p];
+ if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) {
+ if (ringGraph->sameChannels) {
+ comm->latencies[coll][a][p] += lat;
+ } else {
+ if (p == NCCL_PROTO_SIMPLE) lat = hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
+ comm->latencies[coll][a][p] += nsteps*lat;
+ }
+ } else {
+ comm->latencies[coll][a][p] += nsteps*lat;
+ }
+ } else {
+ float intraLat = hwLat[intraHw[a]][a][p];
+ float interLat = hwLat[NCCL_HW_NET][a][p];
+ comm->latencies[coll][a][p] +=
+ 2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
+ }
+ }
+ }
+ }
+
+ // Protocols/Algorithms enable/disable, and user overrides.
+ // All are enabled except ll128 which is enabled by default only in certain cases.
+ int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
+ int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1 };
+
+ const char *protoStr = getenv("NCCL_PROTO");
+ if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
+ const char *algoStr = getenv("NCCL_ALGO");
+ if (algoStr) NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
+
+ for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ int pEnable = protoEnable[p];
+ if (pEnable == 2 && p == NCCL_PROTO_LL128) {
+ // Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption.
+ pEnable = (graphs[a]->type <= LINK_PCI) && graphs[a]->nvlink && minCompCap == 70 && maxCompCap == 70 ? 1 : 0;
+ }
+ if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
+ }
+
+ if (comm->rank == 0) {
+ char line[1024];
+ int offset = 0;
+ sprintf(line, "Latency/AlgBw |");
+ offset = strlen(line);
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ sprintf(line+offset, " %4s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
+ offset = strlen(line);
+ }
+ }
+ INFO(NCCL_TUNING, "%s", line);
+ for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
+ sprintf(line, "%13s |", ncclFuncStr[c]);
+ offset = strlen(line);
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ sprintf(line+offset, "%7.1f/%5.1f|", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
+ offset = strlen(line);
+ }
+ }
+ INFO(NCCL_TUNING, "%s", line);
+ }
+ }
+
+ // Set per-thread amount of work before we increase nThreads and nChannels
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ comm->threadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD;
+ comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
+ comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
+ }
+ comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= comm->nRanks;
+
+ // Override defaults with user env
+ char* str = getenv("NCCL_THREAD_THRESHOLDS");
+ if (str) {
+ ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2}};
+ sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p];
+ }
+ }
+ }
+
+ INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld",
+ comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL],
+ comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128],
+ comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE],
+ comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL],
+ comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128],
+ comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
+ return ncclSuccess;
+}
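
The loop above fills comm->latencies and comm->bandwidths per (collective, algorithm, protocol); the enqueue path later picks whichever combination minimizes predicted time, roughly latency + size/bandwidth. A minimal standalone sketch of that selection rule follows; the helper name and all numbers are illustrative, not the actual NCCL selection code.

#include <cstdio>
#include <cstddef>

// Illustrative stand-ins for the 2 algorithms x 3 protocols tuned above.
static const char* algoNames[2]  = { "Tree", "Ring" };
static const char* protoNames[3] = { "LL", "LL128", "Simple" };

// Pick the (algorithm, protocol) pair with the lowest predicted time,
// using time ~= latency(us) + bytes / bandwidth(GB/s converted to bytes/us).
static void pickAlgoProto(size_t nBytes,
                          const float lat[2][3], const float bw[2][3],
                          int* bestAlgo, int* bestProto) {
  float bestTime = -1.0f;
  for (int a = 0; a < 2; a++) {
    for (int p = 0; p < 3; p++) {
      if (bw[a][p] == 0) continue;                 // disabled combination
      float time = lat[a][p] + nBytes / (1000.0f * bw[a][p]);
      if (bestTime < 0 || time < bestTime) {
        bestTime = time; *bestAlgo = a; *bestProto = p;
      }
    }
  }
}

int main() {
  // Made-up latencies (us) and bandwidths (GB/s), in the spirit of the tables above.
  float lat[2][3] = { { 4.4f, 4.4f, 20.0f }, { 3.6f, 8.4f, 8.4f } };
  float bw [2][3] = { { 20.0f, 45.0f, 60.0f }, { 15.0f, 50.0f, 80.0f } };
  int a, p;
  pickAlgoProto(1 << 20, lat, bw, &a, &p);         // 1 MiB message
  printf("picked %s/%s\n", algoNames[a], protoNames[p]);
  return 0;
}
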
diff --git a/src/misc/group.cu b/src/group.cc
index c428a22..9bf8ac9 100644
--- a/src/misc/group.cu
+++ b/src/group.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -51,11 +51,6 @@ struct ncclAsyncArgs {
thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
-ncclResult_t ncclSetDevice(int cudaDev) {
- CUDACHECK(cudaSetDevice(cudaDev));
- return ncclSuccess;
-}
-
#define CHECK(a) do { \
if ((args->ret = (a)) != ncclSuccess) { \
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
@@ -65,15 +60,14 @@ ncclResult_t ncclSetDevice(int cudaDev) {
void* ncclAsyncThreadMain(void* args_) {
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
- CHECK(ncclSetDevice(args->init.cudaDev));
- CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank));
+ CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev));
return args;
}
-ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
+ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev) {
if (ncclGroupIndex >= MAX_ASYNC_OPS) {
WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
- return ncclAsyncErrCheck(ncclInternalError);
+ return ncclAsyncErrCheck(ncclInvalidUsage);
}
int index = ncclGroupIndex++;
struct ncclAsyncArgs* args = ncclGroupArgs+index;
@@ -84,8 +78,6 @@ ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm
args->init.ndev = ndev;
memcpy(&args->init.commId, &commId, sizeof(commId));
args->init.myrank = myrank;
- // We need to use threads for Init
- pthread_create(ncclGroupThreads+index, NULL, ncclAsyncThreadMain, args);
return ncclSuccess;
}
@@ -97,7 +89,7 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) {
}
if (ncclGroupIndex >= MAX_ASYNC_OPS) {
WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS);
- return ncclAsyncErrCheck(ncclInternalError);
+ return ncclAsyncErrCheck(ncclInvalidUsage);
}
ncclGroupIndex++;
args->funcType = ASYNC_FUNC_COLL;
@@ -118,12 +110,20 @@ ncclResult_t ncclGroupEnd() {
int savedDev;
CUDACHECK(cudaGetDevice(&savedDev));
int done = ncclGroupIndex;
- int doneArray[ncclGroupIndex];
+ int doneArray[MAX_ASYNC_OPS];
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
ncclResult_t ret = ncclGroupError;
if (ret != ncclSuccess) goto group_cleanup;
+ /* Launch async ncclCommInitRank */
+ for (int i=0; i<ncclGroupIndex; i++) {
+ struct ncclAsyncArgs* args = ncclGroupArgs+i;
+ if (args->funcType == ASYNC_FUNC_INIT) {
+ pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args);
+ }
+ }
+
/* Collectives are done in three steps :
* 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative]
* 2. Barrier Wait. No CUDA call is permitted
@@ -166,8 +166,8 @@ ncclResult_t ncclGroupEnd() {
if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
if (err == EBUSY) continue;
- if (err != 0) { ret = ncclSystemError; goto end; }
- if (args->ret != ncclSuccess) { ret = args->ret; goto end; }
+ if (err != 0) ret = ncclSystemError;
+ if (args->ret != ncclSuccess) ret = args->ret;
doneArray[i] = 1;
done--;
}
@@ -175,20 +175,47 @@ ncclResult_t ncclGroupEnd() {
}
goto end;
group_cleanup:
- // At least one call in the group failed. Since we want to make that group
- // an atomic operation, we need to cancel all operations.
- for (int i=0; i<ncclGroupIndex; i++) {
- struct ncclComm* comm = ncclGroupArgs[i].coll.comm;
- for (int c=0; c<comm->nChannels; c++) {
- struct ncclChannel* channel = comm->channels+c;
- for (int i=0; i<channel->collCount; i++) {
- channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
+ if (ret != ncclSuccess) {
+ // At least one call in the group failed. Since we want to make that group
+ // an atomic operation, we need to cancel all operations.
+ for (int i=0; i<ncclGroupIndex; i++) {
+ struct ncclAsyncArgs* args = ncclGroupArgs+i;
+ if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
+ if (args->init.newcomm) NCCLCHECK(ncclCommDestroy(*args->init.newcomm));
+ *args->init.newcomm = NULL;
+ } else {
+ struct ncclComm* comm = args->coll.comm;
+ for (int c=0; c<comm->nChannels; c++) {
+ struct ncclChannel* channel = comm->channels+c;
+ for (int i=0; i<channel->collCount; i++) {
+ channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
+ }
+ channel->collFifoTail = channel->collStart;
+ channel->collCount = 0;
+ }
+ /* Cancel all proxy ops : mark them as ncclProxyOpNone and they should be freed later on */
+ struct ncclProxyState* state = &comm->proxyState;
+ struct ncclProxyArgs *op, *start;
+ pthread_mutex_lock(&state->mutex);
+ op = start = state->ops;
+ while (op) {
+ if (op->opCount >= comm->lastOpCount) op->state = ncclProxyOpNone;
+ struct ncclProxyArgs* peerOp = op->nextPeer;
+ while (peerOp) {
+ if (peerOp->opCount >= comm->lastOpCount) peerOp->state = ncclProxyOpNone;
+ peerOp = peerOp->nextPeer;
+ }
+ op = op->next;
+ if (op == start) break;
+ }
+ comm->opCount = comm->lastOpCount;
+ pthread_cond_signal(&state->cond);
+ pthread_mutex_unlock(&state->mutex);
+
+ comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
+ comm->userStreamSet = false;
}
- channel->collFifoTail = channel->collStart;
- channel->collCount = 0;
}
- comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
- comm->userStreamSet = false;
}
end:
ncclGroupError = ncclSuccess;
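
With the init threads now created inside ncclGroupEnd(), a single thread can still group several ncclCommInitRank calls, one per local GPU. A usage sketch with the public API (error handling elided):

#include <nccl.h>
#include <cuda_runtime.h>

// One communicator per local GPU, initialized from a single thread.
// The per-rank init threads are created, run and joined inside ncclGroupEnd().
ncclResult_t initPerGpuComms(int nGpus, int nRanks, int firstRank,
                             ncclUniqueId id, ncclComm_t* comms) {
  ncclGroupStart();
  for (int g = 0; g < nGpus; g++) {
    cudaSetDevice(g);                                   // GPU owned by rank firstRank+g
    ncclCommInitRank(comms + g, nRanks, id, firstRank + g);
  }
  return ncclGroupEnd();                                // launches and joins the init threads
}
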
diff --git a/src/include/alloc.h b/src/include/alloc.h
new file mode 100644
index 0000000..bcdbd18
--- /dev/null
+++ b/src/include/alloc.h
@@ -0,0 +1,51 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ALLOC_H_
+#define NCCL_ALLOC_H_
+
+#include "nccl.h"
+#include "checks.h"
+#include <sys/mman.h>
+
+static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
+ CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
+ memset(*ptr, 0, size);
+ *devPtr = *ptr;
+ return ncclSuccess;
+}
+
+static inline ncclResult_t ncclCudaHostFree(void* ptr) {
+ CUDACHECK(cudaFreeHost(ptr));
+ return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
+ void* p = malloc(nelem*sizeof(T));
+ if (p == NULL) {
+ WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
+ return ncclSystemError;
+ }
+ memset(p, 0, nelem*sizeof(T));
+ *ptr = (T*)p;
+ return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
+ CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
+ CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
+ return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
+ CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
+ return ncclSuccess;
+}
+
+#endif
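
A short usage sketch of the helpers above; the Example struct is a placeholder, not an NCCL type.

#include "alloc.h"   // ncclCalloc / ncclCudaCalloc / ncclCudaHostAlloc, as defined above
#include <cstdlib>

struct Example { int ringRecv; int ringSend; };          // placeholder payload

static ncclResult_t allocExample(int nRanks) {
  Example* hostArr;                                      // zero-initialized host memory
  NCCLCHECK(ncclCalloc(&hostArr, nRanks));

  int* devFlags;                                         // zero-initialized device memory
  NCCLCHECK(ncclCudaCalloc(&devFlags, nRanks));

  void *hostPtr, *devPtr;                                // mapped host memory, GPU-visible via devPtr
  NCCLCHECK(ncclCudaHostAlloc(&hostPtr, &devPtr, 4096));

  // ... use the buffers ...

  free(hostArr);
  CUDACHECK(cudaFree(devFlags));
  NCCLCHECK(ncclCudaHostFree(hostPtr));
  return ncclSuccess;
}
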
diff --git a/src/include/argcheck.h b/src/include/argcheck.h
new file mode 100644
index 0000000..8d8b74e
--- /dev/null
+++ b/src/include/argcheck.h
@@ -0,0 +1,16 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ARGCHECK_H_
+#define NCCL_ARGCHECK_H_
+
+#include "core.h"
+#include "info.h"
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
+ncclResult_t ArgsCheck(struct ncclInfo* info);
+
+#endif
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index a1aaf50..a7d6be9 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,6 +9,7 @@
#include "nccl.h"
+ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
@@ -16,4 +17,5 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapClose(void* commState);
+ncclResult_t bootstrapAbort(void* commState);
#endif
diff --git a/src/include/channel.h b/src/include/channel.h
index 76c5e8a..e2da325 100644
--- a/src/include/channel.h
+++ b/src/include/channel.h
@@ -1,12 +1,12 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CHANNEL_H_
#define NCCL_CHANNEL_H_
-#include "core.h"
+#include "comm.h"
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
diff --git a/src/include/checks.h b/src/include/checks.h
index bf7750e..50737b0 100644
--- a/src/include/checks.h
+++ b/src/include/checks.h
@@ -1,10 +1,73 @@
/*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#ifndef NCCL_CHECKS_H_
+#define NCCL_CHECKS_H_
-ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
-ncclResult_t ArgsCheck(struct ncclInfo* info);
+#include "debug.h"
+
+// Check CUDA calls
+#define CUDACHECK(cmd) do { \
+ cudaError_t e = cmd; \
+ if( e != cudaSuccess ) { \
+ WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
+ return ncclUnhandledCudaError; \
+ } \
+} while(false)
+
+#define CUDACHECKGOTO(cmd, res, label) do { \
+ cudaError_t e = cmd; \
+ if( e != cudaSuccess ) { \
+ WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
+ res = ncclUnhandledCudaError; \
+ goto label; \
+ } \
+} while(false)
+
+#include <errno.h>
+// Check system calls
+#define SYSCHECK(call, name) do { \
+ int retval; \
+ SYSCHECKVAL(call, name, retval); \
+} while (false)
+
+#define SYSCHECKVAL(call, name, retval) do { \
+ SYSCHECKSYNC(call, name, retval); \
+ if (retval == -1) { \
+ WARN("Call to " name " failed : %s", strerror(errno)); \
+ return ncclSystemError; \
+ } \
+} while (false)
+
+#define SYSCHECKSYNC(call, name, retval) do { \
+ retval = call; \
+ if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
+ INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
+ } else { \
+ break; \
+ } \
+} while(true)
+
+// Propagate errors up
+#define NCCLCHECK(call) do { \
+ ncclResult_t res = call; \
+ if (res != ncclSuccess) { \
+ /* Print the back trace*/ \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ return res; \
+ } \
+} while (0);
+
+#define NCCLCHECKGOTO(call, res, label) do { \
+ res = call; \
+ if (res != ncclSuccess) { \
+ /* Print the back trace*/ \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ goto label; \
+ } \
+} while (0);
+
+#endif
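
The *GOTO variants support the single-cleanup-label pattern used across the code base. A compact sketch, assuming a hypothetical fillBuffer helper:

#include <cuda_runtime.h>
#include "checks.h"    // CUDACHECKGOTO / NCCLCHECKGOTO, as defined above

// Hypothetical helper, assumed to exist only for this sketch.
static ncclResult_t fillBuffer(float* buf, size_t n) { (void)buf; (void)n; return ncclSuccess; }

static ncclResult_t allocAndFill(float** out, size_t n) {
  ncclResult_t res = ncclSuccess;
  float* buf = NULL;
  CUDACHECKGOTO(cudaMalloc(&buf, n * sizeof(float)), res, fail);
  NCCLCHECKGOTO(fillBuffer(buf, n), res, fail);
  *out = buf;
  return ncclSuccess;
fail:
  if (buf) cudaFree(buf);          // single cleanup path on any failure
  return res;
}
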
diff --git a/src/collectives/collectives.h b/src/include/collectives.h
index e6b19cb..69c8e74 100644
--- a/src/collectives/collectives.h
+++ b/src/include/collectives.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,7 +7,10 @@
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
-#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll))
+#include "core.h"
+#include "info.h"
+
+#define FUNC_INDEX(coll, redop, dtype, al, pr) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
#define NCCL_COLL_NAME(coll, op, dtype) \
coll##_##op##_##dtype
@@ -22,7 +25,8 @@
#define DECL_COLL4(coll, op, dtype) \
DECL_COLL5(coll, op, dtype) \
- DECL_COLL5(coll##LL, op, dtype)
+ DECL_COLL5(coll##LL, op, dtype) \
+ DECL_COLL5(coll##LL128, op, dtype)
#define DECL_COLL3(coll, op, dtype) \
DECL_COLL4(coll##Ring, op, dtype) \
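
FUNC_INDEX now spans NCCL_NUM_ALGORITHMS and NCCL_NUM_PROTOCOLS instead of the old (al, ll) pair. A worked example of the layout, using the algorithm/protocol values defined in core.h below and the public ncclNumOps/ncclNumTypes counts:

#include <cstdio>

// Constants as in the headers in this commit.
#define NCCL_NUM_ALGORITHMS 2   // Tree=0, Ring=1
#define NCCL_NUM_PROTOCOLS  3   // LL=0, LL128=1, Simple=2
static const int ncclNumOps = 4, ncclNumTypes = 9;

#define FUNC_INDEX(coll, redop, dtype, al, pr) \
  ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))

int main() {
  // AllReduce(=4), sum(=0), float32(=7), Ring(=1), Simple(=2)
  printf("index = %d\n", FUNC_INDEX(4, 0, 7, 1, 2));   // prints 911
  // Total number of device function entries spanned by the table:
  printf("entries = %d\n", 5 * ncclNumOps * ncclNumTypes
                             * NCCL_NUM_ALGORITHMS * NCCL_NUM_PROTOCOLS); // 1080
  return 0;
}
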
diff --git a/src/include/comm.h b/src/include/comm.h
new file mode 100644
index 0000000..7164dc0
--- /dev/null
+++ b/src/include/comm.h
@@ -0,0 +1,141 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_COMM_H_
+#define NCCL_COMM_H_
+
+#include "transport.h"
+
+#if CUDART_VERSION < 9000
+struct cudaLaunchParams {
+ void *func;
+ dim3 gridDim;
+ dim3 blockDim;
+ void **args;
+ size_t sharedMem;
+ cudaStream_t stream;
+};
+#endif
+
+#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
+
+#define CACHE_LINE_SIZE 128
+#define MEM_ALIGN 4096
+#define CUDA_IPC_MIN 2097152UL
+
+// Channels / LL tuning
+#define NCCL_LL_THREAD_THRESHOLD 8
+#define NCCL_LL128_THREAD_THRESHOLD 8
+#define NCCL_SIMPLE_THREAD_THRESHOLD 64
+
+struct ncclSendMem {
+ union {
+ struct {
+ uint64_t head;
+ char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+ void* ptrExchange;
+ char pad2[CACHE_LINE_SIZE-sizeof(void*)];
+ uint64_t opCount;
+ };
+ char pad3[MEM_ALIGN];
+ };
+};
+
+struct ncclRecvMem {
+ union {
+ struct {
+ uint64_t tail;
+ char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+ uint64_t opCount;
+ char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
+ int sizesFifo[NCCL_STEPS];
+ };
+ char pad4[MEM_ALIGN];
+ };
+ ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
+ uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS];
+ char buff[1]; // Actually larger than that
+};
+
+struct ncclComm {
+ struct ncclChannel channels[MAXCHANNELS];
+
+ struct ncclPeerInfo* peerInfo;
+ struct ncclTopoSystem* topo;
+
+ void* bootstrap;
+
+ int rank; // my rank in the communicator
+ int nRanks; // number of GPUs in communicator
+ int cudaDev; // my cuda device index
+ int64_t busId; // my PCI bus ID in int format
+
+ int node;
+ int nNodes;
+ int localRanks;
+
+ enum { GROUP, PARALLEL } launchMode;
+ cudaStream_t userStream;
+ bool userStreamSet;
+ cudaEvent_t doneEvent;
+ bool checkPointers;
+
+ // Counter to make sure collectives match (needed for bcast/reduce
+ // where syncs are not symmetric).
+ uint64_t opCount;
+ uint64_t lastOpCount;
+
+ // Channels for collectives
+ int nChannels;
+
+ // Only nvlink is used for inter-GPU communication
+ int nvlink;
+
+ // Algorithm/Protocols thresholds
+ ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+ float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+ float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+ int maxThreads[NCCL_NUM_PROTOCOLS];
+
+ // An internal CUDA stream for NCCL kernel CGMD launches
+ int groupCudaStream;
+ cudaStream_t groupStream;
+
+ // Whether there has been a fatal error in this communicator.
+ ncclResult_t fatalError;
+
+ // Error reported by GPU
+ volatile ncclDevError_t* fatalDevError;
+
+ // Flag to ask NCCL kernels to abort
+ volatile uint32_t *abortFlag;
+
+ // Device side of the communicator
+ struct ncclDevComm *devComm;
+ // Host copy of the devComm (to free CUDA allocs)
+ struct ncclDevComm hostDevComm;
+
+ // Intra-process sync
+ int intraRank;
+ int intraRanks;
+ int* intraBarrier;
+ int intraPhase;
+
+ // Storage for deferred intra-process launch
+ struct cudaLaunchParams * intraParams;
+ struct cudaLaunchParams *myParams;
+ int* intraCudaDevs;
+ int* intraCGMode; // Whether we can use CUDA9 CGMD or not
+ int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
+ struct ncclColl args;
+ void* argsptr;
+
+ // Global proxy thread
+ pthread_t proxyThread;
+ struct ncclProxyState proxyState;
+};
+
+#endif
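
hostDevComm keeps a host-side copy of the fields the kernels need, while devComm points at the device mirror. A minimal sketch of that mirroring pattern using the alloc.h helpers; DevCommLike is a simplified stand-in, not the real struct:

#include <cstdint>
#include "alloc.h"    // ncclCudaCalloc / ncclCudaMemcpy (see above)

// Simplified stand-in for the device-visible part of the communicator.
struct DevCommLike { int rank; int nRanks; volatile uint32_t* abortFlag; };

// The host keeps hostCopy (to update and later free the CUDA allocations),
// while kernels only ever read the device mirror.
static ncclResult_t mirrorToDevice(DevCommLike* hostCopy, DevCommLike** devCopy) {
  NCCLCHECK(ncclCudaCalloc(devCopy, 1));            // cudaMalloc + memset on the device
  NCCLCHECK(ncclCudaMemcpy(*devCopy, hostCopy, 1)); // push the host-side fields
  return ncclSuccess;
}
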
diff --git a/src/include/core.h b/src/include/core.h
index d57d271..250f43b 100644
--- a/src/include/core.h
+++ b/src/include/core.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,385 +7,12 @@
#ifndef NCCL_CORE_H_
#define NCCL_CORE_H_
-#define NCCL_MAX_OPS 2048
-#define NCCL_STEPS 8
-
-#include "nccl.h"
-#include "transport.h"
-#include "debug.h"
-#include <cstdio>
-#include <algorithm> // std::min/std::max
+#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
-#include <cuda_runtime.h>
-
-#if CUDART_VERSION < 9000
-struct cudaLaunchParams {
- void *func;
- dim3 gridDim;
- dim3 blockDim;
- void **args;
- size_t sharedMem;
- cudaStream_t stream;
-};
-#endif
-
-#define MAXCHANNELS 16
-#define MAXTHREADS 256
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-
-// Channels / LL tuning
-#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
-#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
-#define NCCL_LL_MAX_NTHREADS MAXTHREADS
-#define NCCL_LL_MIN_NTHREADS 64
-
-#define DIVUP(x, y) \
- (((x)+(y)-1)/(y))
-#define ROUNDUP(x, y) \
- (DIVUP((x), (y))*(y))
-
-#define ALIGN_SIZE(size, align) \
- size = ((size + (align) - 1) / (align)) * (align);
-
-union ncclLLFifoLine {
- /* Flags have to be *after* data, because otherwise, an incomplete receive
- from the network may receive the flag but not the data.
- Note this is assuming that either we receive contiguous chunks of data
- (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
- struct {
- uint32_t data1;
- uint32_t flag1;
- uint32_t data2;
- uint32_t flag2;
- };
- uint64_t v[2];
- int4 i4;
-};
-
-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
-
-typedef enum {
- ncclPatternRing,
- ncclPatternRingTwice,
- ncclPatternPipelineFrom,
- ncclPatternPipelineTo,
- ncclPatternTreeUp,
- ncclPatternTreeDown,
- ncclPatternTreeUpDown
-} ncclPattern_t;
-
-typedef enum {
- ncclDevSuccess,
- ncclDevAssertedMismatch,
- ncclDevSuspectedMismatch
-} ncclDevError_t;
-
-// Used to pass NCCL call information between functions
-struct ncclInfo {
- ncclColl_t coll;
- const char* opName;
- // NCCL Coll Args
- const void* sendbuff;
- void* recvbuff;
- size_t count;
- ncclDataType_t datatype;
- ncclRedOp_t op;
- int root;
- ncclComm_t comm;
- cudaStream_t stream;
- // Algorithm details
- int chunkSteps;
- int sliceSteps;
- // Computed later
- ncclPattern_t pattern;
- size_t nBytes;
- int nstepsPerLoop;
- int nchunksPerLoop;
-};
-
-struct ncclConnInfo {
- // Regular comm mechanism
- char *buff; // Local for recv, remote for send
- uint64_t *tail; // Local for recv, remote for send
- uint64_t *head; // Local for send, remote for recv
- uint64_t *opCountLoc; // opCount of local rank
- uint64_t *opCountRem; // opCount of remote rank
-
- int direct; // Direct communication
- void **ptrExchange; // Pointer exchange for direct communication
-
- int *fifo; // Size fifo for proxy
-
- uint64_t step; // Keep where we are
-
- // Low latency mechanism
- union ncclLLFifoLine *llBuff; // Local for recv, remote for send
- uint64_t llLastCleaning;
-};
-
-struct ncclConnector {
- int connected;
- struct ncclProxyArgs *proxyAppend;
- struct ncclTransportComm* transportComm;
- void* transportResources; // Host-side resources
- struct ncclConnInfo conn;
- struct ncclComm *comm;
-};
-
-#define CACHE_LINE_SIZE 128
-#define MEM_ALIGN 4096
-#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
-
-#define NUM_LINES_PER_THREAD 8
-#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
-#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
-#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
-#define NCCL_LL_CLEAN_FREQ 0x10000000
-
-struct ncclSendMem {
- union {
- struct {
- uint64_t head;
- char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
- void* ptrExchange;
- char pad2[CACHE_LINE_SIZE-sizeof(void*)];
- uint64_t opCount;
- };
- char pad3[MEM_ALIGN];
- };
-};
-
-struct ncclRecvMem {
- union {
- struct {
- uint64_t tail;
- char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
- uint64_t opCount;
- char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
- int sizesFifo[NCCL_STEPS];
- };
- char pad4[MEM_ALIGN];
- };
- ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
- char buff[1]; // Actually larger than that
-};
-
-struct ncclRing {
- // Shortcuts for userRanks[1] and userRanks[n-1]
- int prev;
- int next;
-
- // Maps an internal nccl index to user-specified rank order. This is necessary
- // since we need to know how the user expects data to be ordered across
- // devices. Ordered from current device.
- int* userRanks;
- int* devUserRanks;
-};
-
-#define NCCL_MAX_TREE_ARITY 3
-struct ncclTree {
- int depth;
- int up;
- int down[NCCL_MAX_TREE_ARITY];
-};
-
-struct ncclPeer {
- struct ncclConnector send;
- struct ncclConnector recv;
-};
-
-struct ncclChannel {
- union {
- struct {
- struct ncclRing ring;
- struct ncclTree tree;
-
- int id;
- int nthreads;
- int buffSize;
-
- // Communication structures
- struct ncclPeer* peers;
- struct ncclPeer* devPeers;
-
- // Operation list for aggregation
- struct ncclColl* collectives;
- struct ncclColl* devCollectives;
- int collStart;
- int collCount;
- int collFifoHead; // Only used by GPU
- int collFifoTail; // Only used by CPU
- };
- int data[0x80];
- };
-};
-static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
-
-/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
-/* to make sure reads to host from the CUDA kernel are aligned. */
-/* Make sure to adjust padding at the end of ncclColl. */
-struct CollectiveArgs {
- struct ncclComm* comm;
- uint64_t opCount;
-
- // local and remote input, output, and buffer
- const void * ThisInput;
- void * ThisOutput;
-
- // general parameters
- size_t N;
- uint32_t root;
- uint8_t bid;
- uint8_t nChannels;
- uint16_t nThreads;
-
- int lastChunkSize;
-};
-struct ncclColl {
- union {
- struct {
- struct CollectiveArgs args;
- uint16_t funcIndex;
- uint16_t nextIndex;
- uint8_t active;
- };
- int data[0x10];
- };
-};
-static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
-
-struct ncclComm {
- struct ncclChannel channels[MAXCHANNELS];
-
- struct ncclPeerInfo* peerInfo;
-
- void* bootstrap;
-
- int rank; // my rank in the communicator
- int nRanks; // number of GPUs in communicator
- int cudaDev; // my cuda device index
- int nvmlDev; // my NVML device number
-
- enum { GROUP, PARALLEL } launchMode;
- cudaStream_t userStream;
- bool userStreamSet;
- cudaEvent_t doneEvent;
- bool checkPointers;
-
- // Counter to make sure collectives match (needed for bcast/reduce
- // where syncs are not symmetric).
- uint64_t opCount;
-
- // Channels for collectives
- int nChannels;
- int nThreads;
-
- // Low-latency algorithm threshold
- ssize_t llThreshold;
- ssize_t threadThreshold;
-
- // Tree algorithm threshold
- ssize_t treeThreshold;
-
- // An internal CUDA stream for NCCL kernel CGMD launches
- int groupCudaStream;
- cudaStream_t groupStream;
-
- // Whether there has been a fatal error in this communicator.
- ncclResult_t fatalError;
-
- // Error reported by GPU
- volatile ncclDevError_t* fatalDevError;
-
- // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped)
- // On device: this pointer has been obtained from cudaHostGetDevicePointer()
- volatile uint32_t *abortFlag;
-
- // Device copy of the communicator
- struct ncclComm *devComm;
-
- // Intra-process sync
- int intraRank;
- int intraRanks;
- int* intraBarrier;
- int intraPhase;
-
- // Storage for deferred intra-process launch
- struct cudaLaunchParams * intraParams;
- struct cudaLaunchParams *myParams;
- int* intraCudaDevs;
- int* intraCGMode; // Whether we can use CUDA9 CGMD or not
- int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
- struct ncclColl args;
- void* argsptr;
-
- // Global proxy thread
- pthread_t proxyThread;
- struct ncclProxyState proxyState;
-};
-
-// Check CUDA calls
-#define CUDACHECK(cmd) do { \
- cudaError_t e = cmd; \
- if( e != cudaSuccess ) { \
- WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
- return ncclUnhandledCudaError; \
- } \
-} while(false)
-
-#define CUDACHECKGOTO(cmd, res, label) do { \
- cudaError_t e = cmd; \
- if( e != cudaSuccess ) { \
- WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
- res = ncclUnhandledCudaError; \
- goto label; \
- } \
-} while(false)
-
-#include <errno.h>
-// Check system calls
-#define SYSCHECK(call, name) do { \
- int retval; \
- SYSCHECKVAL(call, name, retval); \
-} while (false)
-
-#define SYSCHECKVAL(call, name, retval) do { \
- SYSCHECKSYNC(call, name, retval); \
- if (retval == -1) { \
- WARN("Call to " name " failed : %s", strerror(errno)); \
- return ncclSystemError; \
- } \
-} while (false)
-
-#define SYSCHECKSYNC(call, name, retval) do { \
- retval = call; \
- if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
- INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
- } else { \
- break; \
- } \
-} while(true)
-
-// Propagate errors up
-#define NCCLCHECK(call) do { \
- ncclResult_t res = call; \
- if (res != ncclSuccess) { \
- /* Print the back trace*/ \
- INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
- return res; \
- } \
-} while (0);
-
-#define NCCLCHECKGOTO(call, res, label) do { \
- res = call; \
- if (res != ncclSuccess) { \
- /* Print the back trace*/ \
- INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
- goto label; \
- } \
-} while (0);
+#include <stdint.h>
+#include <algorithm> // For std::min/std::max
+#include "nccl.h"
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
@@ -403,10 +30,6 @@ struct ncclComm {
ret func(args)
#endif // end PROFAPI
-int ncclCudaCompCap();
-ncclResult_t ncclNvlinkGpu(int* nvlink);
-int64_t ncclTreeThreshold();
-
static __inline__ int ncclTypeSize(ncclDataType_t type) {
switch (type) {
case ncclInt8:
@@ -427,42 +50,22 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
}
}
-#include <sys/mman.h>
-static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
- CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
- memset(*ptr, 0, size);
- *devPtr = *ptr;
- return ncclSuccess;
-}
-
-static inline ncclResult_t ncclCudaHostFree(void* ptr) {
- CUDACHECK(cudaFreeHost(ptr));
- return ncclSuccess;
-}
+#define NCCL_NUM_FUNCTIONS 5
+typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t;
-template <typename T>
-static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
- void* p = malloc(nelem*sizeof(T));
- if (p == NULL) {
- WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
- return ncclSystemError;
- }
- memset(p, 0, nelem*sizeof(T));
- *ptr = (T*)p;
- return ncclSuccess;
-}
+#define NCCL_NUM_ALGORITHMS 2 // Tree/Ring
+#define NCCL_ALGO_TREE 0
+#define NCCL_ALGO_RING 1
-template <typename T>
-static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
- CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
- CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
- return ncclSuccess;
-}
+#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
+#define NCCL_PROTO_LL 0
+#define NCCL_PROTO_LL128 1
+#define NCCL_PROTO_SIMPLE 2
-template <typename T>
-static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
- CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
- return ncclSuccess;
-}
+#include "debug.h"
+#include "checks.h"
+#include "alloc.h"
+#include "utils.h"
+#include "param.h"
#endif // end include guard
diff --git a/src/include/cpuset.h b/src/include/cpuset.h
index f70d1d8..98b93de 100644
--- a/src/include/cpuset.h
+++ b/src/include/cpuset.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/include/debug.h b/src/include/debug.h
index 3acdf8c..89b6e42 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,15 +7,14 @@
#ifndef NCCL_DEBUG_H_
#define NCCL_DEBUG_H_
-#include <pthread.h>
+#include "core.h"
+
#include <stdio.h>
#include <chrono>
-#include <unistd.h>
#include <sys/syscall.h>
#include <limits.h>
#include <string.h>
-#include "nccl.h"
#include "nccl_net.h"
#define gettid() (pid_t) syscall(SYS_gettid)
@@ -24,10 +23,17 @@ extern int ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile;
-extern ncclResult_t getHostName(char* hostname, int maxlen);
-extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
+extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
+
+void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
-extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
+// Let code temporarily downgrade WARN into INFO
+extern thread_local int ncclDebugNoWarn;
+#define NOWARN(a, ret) do { \
+ ncclDebugNoWarn = 1; \
+ ret = a; \
+ ncclDebugNoWarn = 0; \
+} while (0)
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
@@ -39,101 +45,4 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch;
#define TRACE(...)
#endif
-#include <stdlib.h>
-
-static inline void initDebug() {
- const char* nccl_debug = getenv("NCCL_DEBUG");
- if (nccl_debug == NULL) {
- ncclDebugLevel = NCCL_LOG_NONE;
- } else if (strcasecmp(nccl_debug, "VERSION") == 0) {
- ncclDebugLevel = NCCL_LOG_VERSION;
- } else if (strcasecmp(nccl_debug, "WARN") == 0) {
- ncclDebugLevel = NCCL_LOG_WARN;
- } else if (strcasecmp(nccl_debug, "INFO") == 0) {
- ncclDebugLevel = NCCL_LOG_INFO;
- } else if (strcasecmp(nccl_debug, "ABORT") == 0) {
- ncclDebugLevel = NCCL_LOG_ABORT;
- } else if (strcasecmp(nccl_debug, "TRACE") == 0) {
- ncclDebugLevel = NCCL_LOG_TRACE;
- }
-
- /* Parse the NCCL_DEBUG_SUBSYS env var
- * This can be a comma separated list such as INIT,COLL
- * or ^INIT,COLL etc
- */
- char* nccl_debug_subsys = getenv("NCCL_DEBUG_SUBSYS");
- if (nccl_debug_subsys != NULL) {
- char *subsys = strtok(nccl_debug_subsys, ",");
- while (subsys != NULL) {
- int invert = 0;
- uint64_t mask = 0;
- if (subsys[0] == '^') { invert = 1; subsys++; }
- if (strcasecmp(subsys, "INIT") == 0) {
- mask = NCCL_INIT;
- } else if (strcasecmp(subsys, "COLL") == 0) {
- mask = NCCL_COLL;
- } else if (strcasecmp(subsys, "P2P") == 0) {
- mask = NCCL_P2P;
- } else if (strcasecmp(subsys, "SHM") == 0) {
- mask = NCCL_SHM;
- } else if (strcasecmp(subsys, "NET") == 0) {
- mask = NCCL_NET;
- } else if (strcasecmp(subsys, "ALL") == 0) {
- mask = NCCL_ALL;
- }
- if (mask) {
- if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
- }
- subsys = strtok(NULL, ",");
- }
- }
-
- /* Parse and expand the NCCL_DEBUG_FILE path and
- * then create the debug file. But don't bother unless the
- * NCCL_DEBUG level is > VERSION
- */
- const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE");
- if (ncclDebugLevel > NCCL_LOG_VERSION && nccl_debug_file != NULL) {
- int c = 0;
- char debug_fn[PATH_MAX+1] = "";
- char *dfn = debug_fn;
- while (nccl_debug_file[c] != '\0' && c < PATH_MAX) {
- if (nccl_debug_file[c++] != '%') {
- *dfn++ = nccl_debug_file[c-1];
- continue;
- }
- switch (nccl_debug_file[c++]) {
- case '%': // Double %
- *dfn++ = '%';
- break;
- case 'h': // %h = hostname
- char hostname[1024];
- getHostName(hostname, 1024);
- dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
- break;
- case 'p': // %p = pid
- dfn += snprintf(dfn, PATH_MAX, "%d", getpid());
- break;
- default: // Echo everything we don't understand
- *dfn++ = '%';
- *dfn++ = nccl_debug_file[c-1];
- break;
- }
- }
- *dfn = '\0';
- if (debug_fn[0] != '\0') {
- FILE *file = fopen(debug_fn, "w");
- if (file != NULL) {
- INFO(NCCL_ALL,"DEBUG file is '%s'", debug_fn);
- ncclDebugFile = file;
- }
- }
- }
- pthread_mutex_init(&ncclDebugOutputLock, NULL);
-
-#ifdef ENABLE_TRACE
- ncclEpoch = std::chrono::high_resolution_clock::now();
-#endif
-}
-
#endif
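
NOWARN lets probing code run a call that is expected to fail on some systems without emitting a warning. A small usage sketch; probeFn is a placeholder:

#include "debug.h"    // NOWARN, as declared above

// Placeholder for a probe that legitimately fails on some systems.
static ncclResult_t probeFn() { return ncclSystemError; }

static ncclResult_t probeQuietly(int* supported) {
  ncclResult_t res;
  NOWARN(probeFn(), res);                 // a failure inside the call is logged as INFO, not WARN
  *supported = (res == ncclSuccess) ? 1 : 0;
  return ncclSuccess;
}
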
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
new file mode 100644
index 0000000..46d236b
--- /dev/null
+++ b/src/include/devcomm.h
@@ -0,0 +1,214 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_DEVICE_H_
+#define NCCL_DEVICE_H_
+
+#include "nccl.h"
+#include <stdint.h>
+
+#define NCCL_MAX_OPS 2048
+#define NCCL_STEPS 8
+
+#define DIVUP(x, y) \
+ (((x)+(y)-1)/(y))
+#define ROUNDUP(x, y) \
+ (DIVUP((x), (y))*(y))
+
+#define ALIGN_SIZE(size, align) \
+ size = ((size + (align) - 1) / (align)) * (align);
+
+union ncclLLFifoLine {
+ /* Flags have to be *after* data, because otherwise, an incomplete receive
+ from the network may receive the flag but not the data.
+ Note this is assuming that either we receive contiguous chunks of data
+ (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
+ struct {
+ uint32_t data1;
+ uint32_t flag1;
+ uint32_t data2;
+ uint32_t flag2;
+ };
+ uint64_t v[2];
+ int4 i4;
+};
+
+#define WARP_SIZE 32
+#define MAXCHANNELS 32
+#define NCCL_MAX_NTHREADS 512
+#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
+#define NCCL_LL_LINES_PER_THREAD 8
+#define NCCL_LL_SLICE_LINES (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
+#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
+#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
+#ifdef TEST_LL_CLEANUP
+#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
+#define NCCL_LL_FLAG_MAX 0x100
+#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX))
+#else
+#define NCCL_LL_CLEAN_MASK 0x7ffffff8
+#define NCCL_LL_FLAG(a) ((uint32_t)(a))
+#endif
+// Make sure the clean mask will last for at least NCCL_STEPS
+static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
+
+#define NCCL_LL128_LINESIZE 128
+#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
+#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
+
+#define NCCL_LL128_MAX_NTHREADS 640
+#define NCCL_LL128_ELEMS_PER_THREAD 120
+
+// Receiving from up to 3 sources is more compute intensive than sending
+// to 3 dests. Use 70% for reduce and 30% for bcast.
+#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)
+
+#define NCCL_LL128_SLICE_ELEMS (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
+#define NCCL_LL128_BUFF_ELEMS (NCCL_LL128_SLICE_ELEMS*NCCL_STEPS)
+#define NCCL_LL128_BUFF_SIZE (NCCL_LL128_BUFF_ELEMS*sizeof(uint64_t))
+
+#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
+#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
+
+struct ncclConnInfo {
+ // Regular comm mechanism
+ char *buff; // Local for recv, remote for send
+ uint64_t *tail; // Local for recv, remote for send
+ uint64_t *head; // Local for send, remote for recv
+ uint64_t *opCountLoc; // opCount of local rank
+ uint64_t *opCountRem; // opCount of remote rank
+
+ int direct; // Direct communication
+ void **ptrExchange; // Pointer exchange for direct communication
+
+ int *fifo; // Size fifo for proxy
+
+ uint64_t step; // Keep where we are
+
+ // Low latency mechanism
+ union ncclLLFifoLine *llBuff; // Local for recv, remote for send
+ uint64_t llLastCleaning;
+
+ // High bandwidth, low latency protocol
+ uint64_t* ll128Buff; // Local for recv, remote for send
+};
+
+struct ncclConnector {
+ int connected;
+ struct ncclProxyArgs *proxyAppend;
+ struct ncclTransportComm* transportComm;
+ void* transportResources; // Host-side resources
+ struct ncclConnInfo conn;
+ struct ncclComm *comm;
+};
+
+struct ncclRing {
+ // Shortcuts for userRanks[1] and userRanks[n-1]
+ int prev;
+ int next;
+
+ // Maps an internal nccl index to user-specified rank order. This is necessary
+ // since we need to know how the user expects data to be ordered across
+ // devices. Ordered from current device.
+ int* userRanks;
+ int* devUserRanks;
+};
+
+
+#define NCCL_MAX_TREE_ARITY 3
+struct ncclTree {
+ int depth;
+ int up;
+ int down[NCCL_MAX_TREE_ARITY];
+};
+
+struct ncclPeer {
+ struct ncclConnector send;
+ struct ncclConnector recv;
+};
+
+struct ncclDevComm;
+
+/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
+/* to make sure reads to host from the CUDA kernel are aligned. */
+/* Make sure to adjust padding at the end of ncclColl. */
+struct CollectiveArgs {
+ struct ncclDevComm* comm;
+ uint64_t opCount;
+
+ // local and remote input, output, and buffer
+ const void * ThisInput;
+ void * ThisOutput;
+
+ // general parameters
+ size_t N;
+ uint32_t root;
+ uint8_t bid;
+ uint8_t nChannels;
+ uint16_t nThreads;
+
+ int lastChunkSize;
+};
+struct ncclColl {
+ union {
+ struct {
+ struct CollectiveArgs args;
+ uint16_t funcIndex;
+ uint16_t nextIndex;
+ uint8_t active;
+ };
+ int data[0x10];
+ };
+};
+static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
+
+struct ncclChannel {
+ union {
+ struct {
+ struct ncclRing ring;
+ struct ncclTree treeUp;
+ struct ncclTree treeDn;
+
+ int id;
+ int nthreads;
+ int buffSize;
+
+ // Communication structures
+ struct ncclPeer* peers;
+ struct ncclPeer* devPeers;
+
+ // Operation list for aggregation
+ struct ncclColl* collectives;
+ struct ncclColl* devCollectives;
+ int collStart;
+ int collCount;
+ int collFifoHead; // Only used by GPU
+ int collFifoTail; // Only used by CPU
+ };
+ int data[0x80];
+ };
+};
+static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
+
+typedef enum {
+ ncclDevSuccess,
+ ncclDevAssertedMismatch,
+ ncclDevSuspectedMismatch
+} ncclDevError_t;
+
+struct ncclDevComm {
+ int rank;
+ int nRanks;
+
+ // Flag to ask NCCL kernels to abort
+ volatile uint32_t *abortFlag;
+ volatile ncclDevError_t *fatalDevError;
+
+ // Channels, device side
+ struct ncclChannel* channels;
+};
+
+#endif
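
Each ncclLLFifoLine carries 8 bytes of payload plus a per-step flag after each data word, so the receiver can poll the flags instead of a separate tail pointer. A host-side sketch of that handshake; the real device code uses volatile 128-bit accesses, omitted here:

#include <cstdint>
#include <cstdio>

// Same layout as ncclLLFifoLine above: each flag follows its data word.
union LLLine {
  struct { uint32_t data1, flag1, data2, flag2; };
  uint64_t v[2];
};

// Writer: publish one 8-byte element together with the flag for this step.
static void llWrite(LLLine* line, uint64_t val, uint32_t flag) {
  line->data1 = (uint32_t)val;         line->flag1 = flag;
  line->data2 = (uint32_t)(val >> 32); line->flag2 = flag;
}

// Reader: the element is valid once both flags match the expected step flag.
static bool llReady(const LLLine* line, uint32_t flag) {
  return line->flag1 == flag && line->flag2 == flag;
}

int main() {
  LLLine line = {};
  uint32_t step = 1;                    // NCCL_LL_FLAG(step) in the real code
  llWrite(&line, 0x1122334455667788ull, step);
  if (llReady(&line, step))
    printf("got 0x%llx\n",
           (unsigned long long)(line.data1 | ((uint64_t)line.data2 << 32)));
  return 0;
}
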
diff --git a/src/include/enqueue.h b/src/include/enqueue.h
index 4db7094..cea486e 100644
--- a/src/include/enqueue.h
+++ b/src/include/enqueue.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,8 +7,9 @@
#ifndef NCCL_ENQUEUE_H_
#define NCCL_ENQUEUE_H_
-#include "core.h"
+#include "comm.h"
#include "group.h"
+#include "collectives.h"
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
diff --git a/src/include/graph.h b/src/include/graph.h
new file mode 100644
index 0000000..3c8ba19
--- /dev/null
+++ b/src/include/graph.h
@@ -0,0 +1,94 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_GRAPH_H_
+#define NCCL_GRAPH_H_
+
+#include "nccl.h"
+#include "devcomm.h"
+#include <limits.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <stdio.h>
+
+enum ncclPathDist {
+ PATH_PIX = 0,
+ PATH_PXB = 1,
+ PATH_PHB = 2,
+ PATH_NODE = 3,
+ PATH_SYS = 4,
+ PATH_ARRAY_SIZE = 5
+};
+
+extern const char* pathDists[PATH_ARRAY_SIZE];
+
+ncclResult_t ncclTopoCudaPath(int cudaDev, char** path);
+
+struct ncclTopoSystem;
+// Build the topology
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system);
+ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
+ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
+
+ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
+void ncclTopoFree(struct ncclTopoSystem* system);
+ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
+ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system);
+
+// Query topology
+ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink);
+ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink);
+ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance);
+ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* net);
+ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance);
+ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count);
+
+#define NCCL_TOPO_MAX_NODES 256
+
+#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
+#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Split tree (send/recv from different ranks) flowing in both directions
+#define NCCL_TOPO_PATTERN_TREE 3 // Simple tree (send/recv from same rank) flowing in both directions
+#define NCCL_TOPO_PATTERN_RING 4 // Ring
+struct ncclTopoGraph {
+ // Input / output
+ int pattern;
+ int crossNic;
+ // Output
+ int nChannels;
+ int speedIntra;
+ int speedInter;
+ int type;
+ int nvlink;
+ int sameChannels;
+ int nHops;
+ int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES];
+ int inter[MAXCHANNELS*2];
+};
+ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
+
+ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
+
+struct ncclTopoRanks {
+ int ringRecv[MAXCHANNELS];
+ int ringSend[MAXCHANNELS];
+ int ringPrev[MAXCHANNELS];
+ int ringNext[MAXCHANNELS];
+ int treeUpRecv[MAXCHANNELS];
+ int treeUpSend[MAXCHANNELS];
+ int treeDnRecv[MAXCHANNELS];
+ int treeDnSend[MAXCHANNELS];
+};
+
+ncclResult_t ncclTopoPreset(struct ncclComm* comm,
+ struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
+ struct ncclTopoRanks* topoRanks);
+
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
+ struct ncclTopoRanks** allTopoRanks, int* rings);
+
+ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph);
+
+#endif
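
ncclTopoCompute fills the Output fields of ncclTopoGraph from the Input ones. A sketch of how the init path would drive it for a ring search; this is internal API and the input values are illustrative:

#include "graph.h"    // ncclTopoGraph / ncclTopoCompute, as declared above
#include "checks.h"

// Compute a ring graph for the detected system, allowing cross-NIC rings.
static ncclResult_t computeRingGraph(struct ncclTopoSystem* system,
                                     struct ncclTopoGraph* ringGraph) {
  ringGraph->pattern  = NCCL_TOPO_PATTERN_RING;   // input
  ringGraph->crossNic = 1;                        // input
  NCCLCHECK(ncclTopoCompute(system, ringGraph));  // fills nChannels, speeds, intra/inter
  NCCLCHECK(ncclTopoPrintGraph(system, ringGraph));
  return ncclSuccess;
}
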
diff --git a/src/include/group.h b/src/include/group.h
index 76da30f..239b05f 100644
--- a/src/include/group.h
+++ b/src/include/group.h
@@ -8,14 +8,14 @@
#define NCCL_GROUP_H_
#include "nccl.h"
-#include "core.h"
+#include "comm.h"
bool ncclAsyncMode();
ncclResult_t ncclAsyncErrCheck(ncclResult_t ret);
-typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
+typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
-ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank);
+ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h
index 4f3e831..0943f99 100644
--- a/src/include/ibvwrap.h
+++ b/src/include/ibvwrap.h
@@ -4,7 +4,7 @@
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
*
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/include/info.h b/src/include/info.h
new file mode 100644
index 0000000..9461759
--- /dev/null
+++ b/src/include/info.h
@@ -0,0 +1,50 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_INFO_H_
+#define NCCL_INFO_H_
+
+#include "nccl.h"
+#include "core.h"
+
+typedef enum {
+ ncclPatternRing,
+ ncclPatternRingTwice,
+ ncclPatternPipelineFrom,
+ ncclPatternPipelineTo,
+ ncclPatternTreeUp,
+ ncclPatternTreeDown,
+ ncclPatternTreeUpDown
+} ncclPattern_t;
+
+// Used to pass NCCL call information between functions
+struct ncclInfo {
+ ncclFunc_t coll;
+ const char* opName;
+ // NCCL Coll Args
+ const void* sendbuff;
+ void* recvbuff;
+ size_t count;
+ ncclDataType_t datatype;
+ ncclRedOp_t op;
+ int root;
+ ncclComm_t comm;
+ cudaStream_t stream;
+ // Algorithm details
+ int chunkSteps;
+ int sliceSteps;
+ // Computed later
+ int algorithm;
+ int protocol;
+ ncclPattern_t pattern;
+ int nChannels;
+ int nThreads;
+ size_t nBytes;
+ int nstepsPerLoop;
+ int nchunksPerLoop;
+};
+
+#endif
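
Each public collective fills one ncclInfo and hands it to ncclEnqueueCheck. A sketch in the shape of the allreduce entry point; the chunk/slice step counts here are illustrative:

#include "info.h"      // struct ncclInfo, as above
#include "enqueue.h"   // ncclEnqueueCheck

// Shape of a collective entry point: fill ncclInfo, then enqueue.
ncclResult_t myAllReduce(const void* sendbuff, void* recvbuff, size_t count,
                         ncclDataType_t datatype, ncclRedOp_t op,
                         ncclComm_t comm, cudaStream_t stream) {
  struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
    sendbuff, recvbuff, count, datatype, op, 0 /*root*/, comm, stream,
    4 /*chunkSteps, illustrative*/, 2 /*sliceSteps, illustrative*/ };
  return ncclEnqueueCheck(&info);   // remaining fields are computed later
}
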
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index 89edbf5..d6ae9f8 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -15,7 +15,7 @@
#define NCCL_PTR_CUDA 0x2
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_ALL=~0} ncclDebugLogSubSys;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
@@ -80,12 +80,13 @@ typedef struct {
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+ // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
- // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ // Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
- // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ // Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
diff --git a/src/include/net.h b/src/include/net.h
index e75e6bb..3d37c8c 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -13,16 +13,10 @@
extern ncclNet_t* ncclNet;
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
-/* Socket Interface Selection type */
-typedef enum { findSubnetIf = -1,
- dontCareIf = -2
-} ncclSocketIfSl_t;
-
// Translation to external API
static const char* ncclNetName() { return ncclNet->name; }
static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; }
-static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; }
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
@@ -36,7 +30,37 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
-extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str);
+#define GPU_BUF_SIZE (2*1024*1024)
+static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) {
+ int support;
+ NCCLCHECK(ncclNet->ptrSupport(dev, &support));
+ *supportedTypes = support & ~NCCL_PTR_CUDA;
+  // The network supports GPU Direct RDMA; verify the GPU supports it as well.
+ if (support & NCCL_PTR_CUDA) {
+ void *lComm = NULL, *sComm = NULL, *rComm = NULL;
+ ncclNetHandle_t handle;
+ void* gpuPtr = NULL;
+ void* mHandle = NULL;
+ ncclResult_t res;
+ NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), res, cleanup);
+ NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), res, cleanup);
+ NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), res, cleanup);
+ CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), res, cleanup);
+ NOWARN(ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res);
+ if (res != ncclSuccess) goto cleanup;
+ NCCLCHECKGOTO(ncclNetDeregMr(sComm, mHandle), res, cleanup);
+ NCCLCHECKGOTO(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res, cleanup);
+ NCCLCHECKGOTO(ncclNetDeregMr(rComm, mHandle), res, cleanup);
+ *supportedTypes |= NCCL_PTR_CUDA;
+cleanup:
+ if (gpuPtr) cudaFree(gpuPtr);
+ if (rComm) ncclNetCloseRecv(rComm);
+ if (sComm) ncclNetCloseSend(sComm);
+ if (lComm) ncclNetCloseListen(lComm);
+ }
+ return ncclSuccess;
+}
+
extern ncclNet_t ncclNetIb;
extern ncclNet_t ncclNetSocket;
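
ncclNetPtrSupport now only advertises NCCL_PTR_CUDA after a successful trial registration of a GPU buffer. A caller-side sketch of how a transport would use the result; the function name is illustrative:

#include "net.h"    // ncclNetPtrSupport / NCCL_PTR_*, as above

// Decide whether send/recv buffers can be handed to the NIC directly
// (GPU Direct RDMA) or must be staged through host memory first.
static ncclResult_t chooseBufferLocation(int netDev, int* useGdr) {
  int supported = 0;
  NCCLCHECK(ncclNetPtrSupport(netDev, &supported));
  *useGdr = (supported & NCCL_PTR_CUDA) ? 1 : 0;   // 0 -> stage through NCCL_PTR_HOST
  return ncclSuccess;
}
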
diff --git a/src/include/nvlink.h b/src/include/nvlink.h
deleted file mode 100644
index 1baf9e5..0000000
--- a/src/include/nvlink.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_NVLINK_H_
-#define NCCL_NVLINK_H_
-
-#include <sys/stat.h>
-#include <fcntl.h>
-#include "nvmlwrap.h"
-#include "topo.h"
-
-#define CONNECT_NVLINK 0x10
-#define CONNECT_NVSWITCH 0x100
-
-enum ncclNvLinkDeviceType {
- ncclNvLinkDeviceGpu,
- ncclNvLinkDeviceSwitch,
-};
-
-static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
- char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class";
- memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
- char* rPath = realpath(classPath, NULL);
- int fd;
- SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
- free(rPath);
- char pciClass[9];
- strncpy(pciClass, "0x000000", 9);
- int len;
- SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
- SYSCHECK(close(fd), "close");
- if (strcmp(pciClass, "0x068000") == 0) {
- // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
- *type = ncclNvLinkDeviceSwitch;
- } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
- || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
- *type = ncclNvLinkDeviceGpu;
- } else {
- // Ignore if we don't know what's on the other side.
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
-/* Get the maximum number of NVLinks based on the GPU generation */
-static ncclResult_t getMaxNvlinks(int* maxLinks) {
- int cudaDev;
- CUDACHECK(cudaGetDevice(&cudaDev));
- int ccMajor;
- CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev));
- // 6 for Volta, 4 for Pascal
- *maxLinks = (ccMajor > 6) ? 6 : 4;
- // INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks);
- return ncclSuccess;
-}
-
-static int getNvlinkGpu(const char* busId1, const char* busId2) {
- // Determine if that connection is through NVLink
- int links = 0;
- int nvswitch_links = 0;
- int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
- nvmlDevice_t nvmlDev;
- ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId1, &nvmlDev);
- if (res != ncclSuccess) return 0;
-
- for(int l=0; l<maxNvLinks; ++l) {
- // Check whether we can use this NVLink for P2P
- unsigned canP2P;
- if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
-
- // Make sure the Nvlink is up. The previous call should have trained the link.
- nvmlEnableState_t isActive;
- if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
-
- // Try to figure out what's on the other side of the NVLink
- nvmlPciInfo_t remoteProc;
- if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
-
- // Old versions of NVML return a lowercase PCI ID
- char* p = remoteProc.busId;
- for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
- if (p[c] == 0) break;
- p[c] = toupper(p[c]);
- }
-
- if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
- links++;
- } else {
- // Make a lower case copy of the bus ID for calling ncclDeviceType
- // PCI system path is in lower case
- char* p = remoteProc.busId;
- char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
- if (p[c] == 0) break;
- lowerId[c] = tolower(p[c]);
- }
-
- // Determine if the remote side is NVswitch or a GPU
- enum ncclNvLinkDeviceType type;
- ncclResult_t ret = ncclDeviceType(lowerId, &type);
- if (ret == ncclSuccess) {
- if (type == ncclNvLinkDeviceSwitch) {
- //TODO: we are making an assumption that all GPUs are connected to this switch
- //This assumption may change for future architectures
- nvswitch_links++;
- } else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) {
- links++;
- }
- } else {
- // The NVLink is up but we couldn't find the PCI device on the other
- // side. Assume it's an NVswitch outside a VM.
- if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch");
- nvswitch_links++;
- }
- }
- }
- return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
-}
-
-#endif
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
index 0b6198a..01bbb7f 100644
--- a/src/include/nvmlwrap.h
+++ b/src/include/nvmlwrap.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,18 +9,31 @@
#include "nccl.h"
-//#define NVML_DIRECT 1
-#ifdef NVML_DIRECT
-#include "nvml.h"
+// The NVML library doesn't appear to be thread-safe
+#include <pthread.h>
+extern pthread_mutex_t nvmlLock;
+#define NVMLLOCK() pthread_mutex_lock(&nvmlLock)
+#define NVMLUNLOCK() pthread_mutex_unlock(&nvmlLock)
+
+#define NVMLLOCKCALL(cmd, ret) do { \
+ NVMLLOCK(); \
+ ret = cmd; \
+ NVMLUNLOCK(); \
+} while(false)
#define NVMLCHECK(cmd) do { \
- nvmlReturn_t e = cmd; \
+ nvmlReturn_t e; \
+ NVMLLOCKCALL(cmd, e); \
if( e != NVML_SUCCESS ) { \
WARN("NVML failure '%s'", nvmlErrorString(e)); \
return ncclSystemError; \
} \
} while(false)
+//#define NVML_DIRECT 1
+#ifdef NVML_DIRECT
+#include "nvml.h"
+
static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; }
static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; }
static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; }
@@ -57,6 +70,10 @@ static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned i
NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
return ncclSuccess;
}
+static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
+ NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));
+ return ncclSuccess;
+}
#else
// Dynamically handle dependencies on NVML
@@ -139,6 +156,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
+ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
#endif // NVML_DIRECT
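
The nvmlwrap.h change above routes every NVML call through NVMLLOCKCALL because NVML is not assumed to be thread-safe; the lock is held around the call itself and the error check happens afterwards. A self-contained sketch of that serialize-every-call macro pattern, with a dummy function standing in for a real NVML entry point, could look like this:

/* Illustrative only: dummyQuery() stands in for an NVML call. */
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t apiLock = PTHREAD_MUTEX_INITIALIZER;

#define LOCKCALL(cmd, ret) do { \
  pthread_mutex_lock(&apiLock); \
  ret = cmd; \
  pthread_mutex_unlock(&apiLock); \
} while (0)

#define APICHECK(cmd) do { \
  int e; \
  LOCKCALL(cmd, e); \
  if (e != 0) { \
    fprintf(stderr, "API failure %d\n", e); \
    return -1; \
  } \
} while (0)

static int dummyQuery(int* value) { *value = 42; return 0; }

/* The wrapper runs the call under the lock and checks its result. */
static int wrappedQuery(int* value) {
  APICHECK(dummyQuery(value));
  return 0;
}

int main(void) {
  int v;
  if (wrappedQuery(&v) == 0) printf("value %d\n", v);
  return 0;
}
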
diff --git a/src/include/param.h b/src/include/param.h
index dd5f697..5431757 100644
--- a/src/include/param.h
+++ b/src/include/param.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) {
s++;
strncpy(envValue, line+s, 1024);
setenv(envVar, envValue, 0);
- char *str = getenv(envVar);
}
if (line) free(line);
fclose(file);
diff --git a/src/include/rings.h b/src/include/rings.h
deleted file mode 100644
index 43fc595..0000000
--- a/src/include/rings.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_RINGS_H_
-#define NCCL_RINGS_H_
-
-static int getDefaultThreads() {
- // On Kepler, rings are doubled later.
- return ncclCudaCompCap() == 3 ? 128 : 256;
-}
-
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
-
-#endif
diff --git a/src/include/shm.h b/src/include/shm.h
index 4fb49cb..9cd9d05 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/include/socket.h b/src/include/socket.h
index fb5cfc0..96bf5db 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -18,8 +18,9 @@
#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
-#define SLEEP_INT 1000 // sleep interval in usec
-#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec)
+#define SLEEP_INT 1000 // connection retry sleep interval in usec
+#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
+#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
/* Common socket address storage structure for IPv4/IPv6 */
union socketAddress {
@@ -41,7 +42,7 @@ static inline const char *socketToString(struct sockaddr *saddr, char *buf) {
return buf;
}
-static inline short socketToPort(struct sockaddr *saddr) {
+static inline uint16_t socketToPort(struct sockaddr *saddr) {
return ntohs(saddr->sa_family == AF_INET ? ((struct sockaddr_in*)saddr)->sin_port : ((struct sockaddr_in6*)saddr)->sin6_port);
}
@@ -65,6 +66,9 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
#endif
struct netIf userIfs[MAX_IFS];
bool searchNot = prefixList && prefixList[0] == '^';
+ if (searchNot) prefixList++;
+ bool searchExact = prefixList && prefixList[0] == '=';
+ if (searchExact) prefixList++;
int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
int found = 0;
@@ -91,7 +95,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
}
// check against user specified interfaces
- if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs) ^ searchNot)) {
+ if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
continue;
}
@@ -116,17 +120,17 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
return found;
}
-static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
+static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) {
/* Check family first */
int family = local_if.ifa_addr->sa_family;
- if (family != remote.sa.sa_family) {
+ if (family != remote->sa.sa_family) {
return false;
}
if (family == AF_INET) {
struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
- struct sockaddr_in& remote_addr = remote.sin;
+ struct sockaddr_in& remote_addr = remote->sin;
struct in_addr local_subnet, remote_subnet;
local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
@@ -134,7 +138,7 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
} else if (family == AF_INET6) {
struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
- struct sockaddr_in6& remote_addr = remote.sin6;
+ struct sockaddr_in6& remote_addr = remote->sin6;
struct in6_addr& local_in6 = local_addr->sin6_addr;
struct in6_addr& mask_in6 = mask->sin6_addr;
struct in6_addr& remote_in6 = remote_addr.sin6_addr;
@@ -159,8 +163,11 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) {
}
}
-static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) {
- char line[1024], line_a[1024];
+static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
+#ifdef ENABLE_TRACE
+ char line[1024];
+#endif
+ char line_a[1024];
int found = 0;
struct ifaddrs *interfaces, *interface;
getifaddrs(&interfaces);
@@ -184,13 +191,13 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd
// Store the interface name
strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
- INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a));
+ TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr->sa), line_a));
found++;
if (found == maxIfs) break;
}
if (found == 0) {
- WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr.sa), line_a));
+ WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr->sa), line_a));
}
freeifaddrs(interfaces);
return found;
@@ -295,7 +302,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam
// Try to find interface that is in the same subnet as the IP in comm id
union socketAddress idAddr;
GetSocketAddrFromString(&idAddr, commId);
- nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, idAddr, ifNameMaxSize, maxIfs);
+ nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
}
}
// Then look for anything else (but not docker or lo)
@@ -322,7 +329,11 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
if (socketToPort(&localAddr->sa)) {
// Port is forced by env. Make sure we get the port.
int opt = 1;
+#if defined(SO_REUSEPORT)
SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
+#else
+ SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
+#endif
}
// localAddr port should be 0 (Any port)
@@ -370,14 +381,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
#endif
int ret;
- int retries = 0;
+ int timedout_retries = 0;
+ int refused_retries = 0;
retry:
SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
if (ret == 0) return ncclSuccess;
- if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
- INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \
- usleep(SLEEP_INT);
- goto retry;
+ if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
+ if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
+ (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
+ if (refused_retries % 1000 == 0) INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
+ usleep(SLEEP_INT);
+ goto retry;
+ }
}
WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
return ncclSystemError;
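
connectAddress now keeps two separate retry budgets: ECONNREFUSED is retried many times with a short sleep (the peer may simply not be listening yet), while ETIMEDOUT is retried only a few times since each attempt can already block for tens of seconds. A reduced sketch of that policy, using a stub in place of a real connect() call, might look like this:

/* Illustrative retry policy only; connect_once() is a stub, not a real socket. */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

#define SLEEP_USEC          1000
#define MAX_REFUSED_RETRIES 20000  /* ~20s worth of ECONNREFUSED retries */
#define MAX_TIMEDOUT_RETRIES    3  /* each ETIMEDOUT can already block ~20s */

/* Stub: fail with ECONNREFUSED twice, then succeed. */
static int connect_once(void) {
  static int calls = 0;
  if (calls++ < 2) { errno = ECONNREFUSED; return -1; }
  return 0;
}

static int connect_with_retry(void) {
  int refused = 0, timedout = 0;
  for (;;) {
    if (connect_once() == 0) return 0;
    if (errno == ECONNREFUSED && ++refused  < MAX_REFUSED_RETRIES) { usleep(SLEEP_USEC); continue; }
    if (errno == ETIMEDOUT   && ++timedout < MAX_TIMEDOUT_RETRIES) { usleep(SLEEP_USEC); continue; }
    return -1;  /* any other error, or retry budget exhausted */
  }
}

int main(void) {
  printf("connect %s\n", connect_with_retry() == 0 ? "succeeded" : "failed");
  return 0;
}
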
@@ -385,12 +400,12 @@ retry:
#define NCCL_SOCKET_SEND 0
#define NCCL_SOCKET_RECV 1
-static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) {
+static ncclResult_t socketProgressOpt(int op, int fd, void* ptr, int size, int* offset, int block) {
int bytes = 0;
char* data = (char*)ptr;
do {
- if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), MSG_DONTWAIT);
- if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), MSG_DONTWAIT);
+ if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
+ if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
if (op == NCCL_SOCKET_RECV && bytes == 0) {
WARN("Net : Connection closed by remote peer");
return ncclSystemError;
@@ -408,9 +423,13 @@ static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* off
return ncclSuccess;
}
+static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) {
+ return socketProgressOpt(op, fd, ptr, size, offset, 0);
+}
+
static ncclResult_t socketWait(int op, int fd, void* ptr, int size, int* offset) {
while (*offset < size)
- NCCLCHECK(socketProgress(op, fd, ptr, size, offset));
+ NCCLCHECK(socketProgressOpt(op, fd, ptr, size, offset, 1));
return ncclSuccess;
}
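
socketProgress and socketWait now share a single implementation, socketProgressOpt, whose last argument merely switches recv/send between MSG_DONTWAIT and blocking mode. A stripped-down sketch of that one-implementation-two-frontends pattern over plain POSIX sockets (without the NCCL error macros) could look like this:

/* Illustrative only: one helper serves both the polling and the blocking path. */
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>

enum { OP_SEND = 0, OP_RECV = 1 };

/* Move bytes toward 'size'; block only if 'block' is non-zero.
 * Returns 0 on progress (possibly none in polling mode), -1 on error or EOF. */
static int progressOpt(int op, int fd, void* ptr, int size, int* offset, int block) {
  char* data = (char*)ptr;
  int flags = block ? 0 : MSG_DONTWAIT;
  while (*offset < size) {
    ssize_t bytes = (op == OP_RECV)
        ? recv(fd, data + *offset, size - *offset, flags)
        : send(fd, data + *offset, size - *offset, flags);
    if (op == OP_RECV && bytes == 0) return -1;  /* peer closed the connection */
    if (bytes < 0) {
      if (!block && (errno == EAGAIN || errno == EWOULDBLOCK)) return 0;  /* no data yet */
      return -1;
    }
    *offset += (int)bytes;
    if (!block) return 0;  /* polling mode: single pass */
  }
  return 0;
}

/* The two front ends only differ in the blocking flag. */
static int progress(int op, int fd, void* p, int size, int* off) { return progressOpt(op, fd, p, size, off, 0); }
static int waitAll (int op, int fd, void* p, int size, int* off) { return progressOpt(op, fd, p, size, off, 1); }
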
diff --git a/src/include/topo.h b/src/include/topo.h
deleted file mode 100644
index e824a81..0000000
--- a/src/include/topo.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_TOPO_H_
-#define NCCL_TOPO_H_
-
-#include "nccl.h"
-#include <limits.h>
-#include <stdlib.h>
-#include <ctype.h>
-
-#define BUSID_SIZE (sizeof("0000:00:00.0"))
-#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
-
-static ncclResult_t getCudaPath(int cudaDev, char** path) {
- char busId[BUSID_SIZE];
- CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
- for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
- char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
- memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
- memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
- *path = realpath(busPath, NULL);
- if (*path == NULL) {
- WARN("Could not find real path of %s", busPath);
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
-enum ncclPathDist {
- PATH_PIX = 0,
- PATH_PXB = 1,
- PATH_PHB = 2,
- PATH_SOC = 3
-};
-
-static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
-
-static int pciDistance(char* path1, char* path2) {
- int score = 0;
- int depth = 0;
- int same = 1;
- for (int i=0; i<strlen(path1); i++) {
- if (path1[i] != path2[i]) same = 0;
- if (path1[i] == '/') {
- depth++;
- if (same == 1) score++;
- }
- }
- if (score <= 3) return PATH_SOC;
- if (score == 4) return PATH_PHB;
- if (score == depth-1) return PATH_PIX;
- return PATH_PXB;
-}
-
-#endif
diff --git a/src/include/transport.h b/src/include/transport.h
index 6231a71..8f9bf0e 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -7,11 +7,15 @@
#ifndef NCCL_TRANSPORT_H_
#define NCCL_TRANSPORT_H_
-#include "nccl.h"
-#include <stdint.h>
+#include "devcomm.h"
+#include "graph.h"
#include "nvmlwrap.h"
+#include "core.h"
#define NTRANSPORTS 3
+#define TRANSPORT_P2P 0
+#define TRANSPORT_SHM 1
+#define TRANSPORT_NET 2
extern struct ncclTransport ncclTransports[];
@@ -23,21 +27,19 @@ struct ncclComm;
struct ncclPeerInfo {
int rank;
int cudaDev;
- int nvmlDev;
+ int gdrSupport;
uint64_t hostHash;
uint64_t pidHash;
- char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ dev_t shmDev;
+ int64_t busId;
};
-// Used to hold the transport connection values
-typedef int64_t ncclTvalue_t;
-
#define CONNECT_SIZE 128
struct ncclConnect {
char data[CONNECT_SIZE];
};
-enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone };
+enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
struct ncclProxyArgs;
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
@@ -50,7 +52,7 @@ struct ncclProxyArgs {
int chunkSteps;
int nsteps;
uint64_t opCount;
- int llMode;
+ int protocol;
int state; // add component before this line -- it is left out during initialization
// Internal state
@@ -77,7 +79,7 @@ struct ncclProxyState {
};
struct ncclTransportComm {
- ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
+ ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
@@ -85,8 +87,7 @@ struct ncclTransportComm {
struct ncclTransport {
const char name[4];
- ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*);
- ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
+ ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*);
struct ncclTransportComm send;
struct ncclTransportComm recv;
};
@@ -117,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) {
}
}
-inline void transportProxyIdle(int idle) {
- sched_yield();
-}
-
#endif
diff --git a/src/include/trees.h b/src/include/trees.h
index 1a151d1..7eadd85 100644
--- a/src/include/trees.h
+++ b/src/include/trees.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/include/utils.h b/src/include/utils.h
index 5a6a588..266abca 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,7 +10,16 @@
#include "nccl.h"
#include <stdint.h>
-ncclResult_t getHostName(char* hostname, int maxlen);
+int ncclCudaCompCap();
+
+// PCI Bus ID <-> int64 conversion functions
+ncclResult_t int64ToBusId(int64_t id, char* busId);
+ncclResult_t busIdToInt64(char* busId, int64_t* id);
+
+ncclResult_t getBusId(int cudaDev, int64_t *busId);
+
+ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
+uint64_t getHash(const char* string, int n);
uint64_t getHostHash();
uint64_t getPidHash();
@@ -20,6 +29,12 @@ struct netIf {
};
int parseStringList(const char* string, struct netIf* ifList, int maxList);
-bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize);
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
+
+static long log2i(long n) {
+ long l = 0;
+ while (n>>=1) l++;
+ return l;
+}
#endif
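
The new log2i helper in utils.h computes floor(log2(n)) by counting how many times n can be shifted right; for instance log2i(7) is 2, log2i(8) is 3, and, as written, log2i(0) and log2i(1) both return 0. A quick standalone check of those values:

/* Quick check of the log2i behaviour described above (sketch, not part of the diff). */
#include <stdio.h>

static long log2i(long n) {
  long l = 0;
  while (n >>= 1) l++;
  return l;
}

int main(void) {
  long tests[] = {1, 2, 7, 8, 1023, 1024};
  for (unsigned i = 0; i < sizeof(tests)/sizeof(tests[0]); i++)
    printf("log2i(%ld) = %ld\n", tests[i], log2i(tests[i]));  /* 0 1 2 3 9 10 */
  return 0;
}
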
diff --git a/src/init.cc b/src/init.cc
new file mode 100644
index 0000000..627f6c7
--- /dev/null
+++ b/src/init.cc
@@ -0,0 +1,934 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl.h"
+#include "channel.h"
+#include "nvmlwrap.h"
+#include "bootstrap.h"
+#include "transport.h"
+#include "group.h"
+#include "net.h"
+#include "enqueue.h"
+#include "graph.h"
+#include "argcheck.h"
+#include "cpuset.h"
+#include <sched.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <dlfcn.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#define STR2(v) #v
+#define STR(v) STR2(v)
+
+#ifdef ENABLE_TRACE
+std::chrono::high_resolution_clock::time_point ncclEpoch;
+#endif
+
+#if CUDART_VERSION >= 9020
+#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
+#else
+#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
+#endif
+
+NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
+
+NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
+
+ncclNet_t* ncclNet = NULL;
+
+// Returns ncclInternalError if anything fails, causing that network to be ignored.
+ncclResult_t initNet(ncclNet_t* net) {
+ int ndev;
+ if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
+ if (net->devices(&ndev) != ncclSuccess) return ncclInternalError;
+ if (ndev <= 0) return ncclSystemError;
+ return ncclSuccess;
+}
+
+ncclResult_t initNetPlugin(ncclNet_t** net) {
+ void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
+ if (netPluginLib == NULL) {
+    // dlopen is not guaranteed to set errno, and dlerror only returns a
+    // string, so checking errno is a best-effort way to provide a more
+    // helpful error message.
+ if (errno == ENOENT) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so), using internal implementation");
+ } else {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
+ }
+ return ncclSuccess;
+ }
+ ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
+ if (extNet == NULL) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
+ goto cleanup;
+ }
+ if (initNet(extNet) == ncclSuccess) {
+ *net = extNet;
+ return ncclSuccess;
+ }
+cleanup:
+ if (netPluginLib != NULL) dlclose(netPluginLib);
+ return ncclSuccess;
+}
+
+ncclResult_t initNet() {
+ // Always initialize bootstrap network
+ NCCLCHECK(bootstrapNetInit());
+
+ NCCLCHECK(initNetPlugin(&ncclNet));
+ if (ncclNet != NULL) return ncclSuccess;
+ if (initNet(&ncclNetIb) == ncclSuccess) {
+ ncclNet = &ncclNetIb;
+ } else {
+ NCCLCHECK(initNet(&ncclNetSocket));
+ ncclNet = &ncclNetSocket;
+ }
+ return ncclSuccess;
+}
+
+pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
+static bool initialized = false;
+static ncclResult_t ncclInit() {
+ if (initialized) return ncclSuccess;
+ pthread_mutex_lock(&initLock);
+ if (!initialized) {
+ initEnv();
+ initNet();
+ initialized = true;
+ }
+ pthread_mutex_unlock(&initLock);
+ return ncclSuccess;
+}
+
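+
The ncclInit above uses the familiar check / lock / re-check structure: the cheap initialized test short-circuits every call after the first, while the mutex guarantees initEnv and initNet run exactly once even if several threads race into the first call (pthread_once would be a stricter alternative for the unsynchronized fast-path read). The same structure in isolation:

/* Minimal sketch of the check / lock / re-check one-time init used by ncclInit. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
static int initialized = 0;

static void doExpensiveInit(void) { printf("initialized once\n"); }

static void ensureInit(void) {
  if (initialized) return;            /* fast path after the first call */
  pthread_mutex_lock(&initLock);
  if (!initialized) {                 /* re-check under the lock */
    doExpensiveInit();
    initialized = 1;
  }
  pthread_mutex_unlock(&initLock);
}

int main(void) {
  ensureInit();
  ensureInit();                       /* prints only once */
  return 0;
}
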
+NCCL_API(ncclResult_t, ncclGetVersion, int* version);
+ncclResult_t ncclGetVersion(int* version) {
+ if (version == NULL) return ncclInvalidArgument;
+ *version = NCCL_VERSION_CODE;
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
+ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
+ NCCLCHECK(ncclInit());
+ NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
+ return bootstrapGetUniqueId(out);
+}
+
+// Prevent compiler from optimizing out these operations
+#ifdef __clang__
+#define NCCL_NO_OPTIMIZE __attribute__((optnone))
+#else
+#define NCCL_NO_OPTIMIZE __attribute__((optimize("O0")))
+#endif
+
+void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) {
+ comm->rank = comm->cudaDev = comm->busId = comm->nRanks = -1;
+}
+
+#undef NCCL_NO_OPTIMIZE
+
+static ncclResult_t commFree(ncclComm_t comm) {
+ if (comm == NULL)
+ return ncclSuccess;
+
+ free(comm->peerInfo);
+ ncclTopoFree(comm->topo);
+
+ if (comm->bootstrap)
+ NCCLCHECK(bootstrapClose(comm->bootstrap));
+
+ CUDACHECK(cudaFree(comm->hostDevComm.channels));
+ CUDACHECK(cudaFree(comm->devComm));
+
+ for (int channel=0; channel<comm->nChannels; channel++)
+ NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
+
+ if (comm->doneEvent != NULL)
+ CUDACHECK(cudaEventDestroy(comm->doneEvent));
+
+ if (comm->launchMode == ncclComm::GROUP) {
+ CUDACHECK(cudaStreamDestroy(comm->groupStream));
+ }
+
+ // Last rank frees shared resources between threads
+ int isLast;
+ NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
+ if (isLast) {
+ free(comm->intraBarrier);
+ free(comm->intraParams);
+ free(comm->intraCudaDevs);
+ free(comm->intraCGMode);
+ free(comm->intraCC);
+ }
+ CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
+ CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
+
+ // Poison comm to try and catch a double free
+ commPoison(comm);
+
+ free(comm);
+ return ncclSuccess;
+}
+
+static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
+ if (ndev < 1) {
+ WARN("invalid device count (%d) requested", ndev);
+ return ncclInvalidArgument;
+ }
+ if (rank >= ndev || rank < 0) {
+ WARN("rank %d exceeds ndev=%d", rank, ndev);
+ return ncclInvalidArgument;
+ }
+
+ // Try to create a CUDA object right away. If there is something wrong with
+  // the device we're on (failure cause #1), better know it early.
+ cudaEvent_t doneEvent;
+ CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));
+
+ struct ncclComm* comm;
+ NCCLCHECK(ncclCalloc(&comm, 1));
+
+  comm->rank = comm->hostDevComm.rank = rank;
+ comm->nRanks = comm->hostDevComm.nRanks = ndev;
+ cudaGetDevice(&comm->cudaDev);
+ NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
+ TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x", comm, rank, ndev, comm->cudaDev, comm->busId);
+
+ comm->doneEvent = doneEvent;
+ comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
+#if CUDART_VERSION >= 9020
+ comm->groupCudaStream = ncclParamGroupCudaStream();
+#else
+ // Don't allow the user to overload the default setting in older CUDA builds
+ comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
+#endif
+ comm->fatalError = ncclSuccess;
+
+ NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t)));
+ *comm->fatalDevError = ncclDevSuccess;
+
+ NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
+ *comm->abortFlag = 0;
+
+ comm->argsptr = &comm->args;
+
+ *comret = comm;
+ return ncclSuccess;
+}
+
+static ncclResult_t devCommSetup(ncclComm_t comm) {
+ // Duplicate the channels on the device
+ NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels));
+ NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels));
+
+ // Copy userRanks and peers
+ for (int r=0; r<comm->nChannels; r++) {
+ NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
+ NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
+ }
+
+ // Duplicate the dev comm on the device
+ NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
+ NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1));
+ return ncclSuccess;
+}
+
+// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
+#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
+static void showVersion() {
+ static int shown = 0;
+ if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
+ printf("%s\n", VERSION_STRING);
+ fflush(stdout);
+ if (ncclDebugFile != stdout)
+ INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
+ shown = 1;
+ }
+}
+
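+
VERSION_STRING depends on the two-level STR/STR2 trick: the extra macro layer forces NCCL_MAJOR and friends to be expanded before # stringizes them, so the string contains the version numbers rather than the macro names. A compact illustration of why the second level is needed:

/* Why two macro levels are needed for stringizing macro values (illustration only). */
#include <stdio.h>

#define MAJOR 2
#define MINOR 5

#define STR2(v) #v
#define STR(v) STR2(v)

#define BAD_VERSION  STR2(MAJOR) "." STR2(MINOR)  /* stringizes the names: "MAJOR.MINOR" */
#define GOOD_VERSION STR(MAJOR)  "." STR(MINOR)   /* expands first: "2.5" */

int main(void) {
  printf("without expansion: %s\n", BAD_VERSION);
  printf("with expansion:    %s\n", GOOD_VERSION);
  return 0;
}
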
+static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
+ info->rank = comm->rank;
+ CUDACHECK(cudaGetDevice(&info->cudaDev));
+ info->hostHash=getHostHash()+commHash;
+ info->pidHash=getPidHash()+commHash;
+
+ // Get the device MAJOR:MINOR of /dev/shm so we can use that
+ // information to decide whether we can use SHM for inter-process
+ // communication in a container environment
+ struct stat statbuf;
+ SYSCHECK(stat("/dev/shm", &statbuf), "stat");
+ info->shmDev = statbuf.st_dev;
+
+ info->busId = comm->busId;
+ int netDevs;
+
+ NCCLCHECK(ncclNetDevices(&netDevs));
+ for (int n=0; n<netDevs; n++) {
+ int ptrSupport;
+ NCCLCHECK(ncclNetPtrSupport(n, &ptrSupport));
+ if (ptrSupport & NCCL_PTR_CUDA) info->gdrSupport |= (1 << n);
+ }
+ return ncclSuccess;
+}
+
+template <int type>
+static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
+ for (int t=0; t<NTRANSPORTS; t++) {
+ struct ncclTransport *transport = ncclTransports+t;
+ struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
+ int ret = 0;
+ NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo));
+ if (ret) {
+ connector->transportComm = transportComm;
+ NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId));
+ return ncclSuccess;
+ }
+ }
+ WARN("No transport found !");
+ return ncclInternalError;
+}
+
+static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) {
+ TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
+ NCCLCHECK(initChannel(comm, channelId));
+
+ struct ncclRing* ring = &comm->channels[channelId].ring;
+ // Reorganize ranks to start with rank.
+ int shift;
+ for (shift = 0; shift<nranks; shift++) {
+ if (ringRanks[shift] == rank) {
+ break;
+ }
+ }
+ for (int i=0; i<nranks; i++) {
+ ring->userRanks[i] = ringRanks[(i+shift)%nranks];
+ }
+ return ncclSuccess;
+}
+
+void* waitForNonNullPtr(void* p) {
+ volatile void** ptr = (volatile void**) p;
+ while (*ptr == NULL) sched_yield();
+ return (void*)*ptr;
+}
+
+ncclResult_t initParams(struct ncclComm* comm) {
+ struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
+ params->args = &comm->argsptr;
+ params->stream = NULL;
+ params->sharedMem = 0;
+ params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
+ params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1;
+ return ncclSuccess;
+}
+
+// Allocate/Set Intra Process Structures and set CG options
+ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
+ comm->intraRank = rank;
+ comm->intraRanks = ranks;
+ comm->intraPhase = 0;
+
+ // Alloc shared structures
+ if (rank == 0) {
+ assert(comm == comm0);
+ int* bar;
+ NCCLCHECK(ncclCalloc(&bar, 2));
+ bar[0] = bar[1] = 0;
+ comm->intraBarrier = bar;
+ NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
+ NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
+ int* CGMode;
+ NCCLCHECK(ncclCalloc(&CGMode, 1));
+ *CGMode = 0x11;
+ comm->intraCGMode = CGMode;
+ int* CC;
+ NCCLCHECK(ncclCalloc(&CC, 1));
+ *CC = ncclCudaCompCap();
+ comm->intraCC = CC;
+ } else {
+ comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
+ comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
+ comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
+ comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
+ comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
+ }
+ comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
+ NCCLCHECK(initParams(comm));
+
+ int cgMdLaunch = 0;
+
+ // Set CG Mode
+ comm->launchMode = ncclComm::GROUP;
+ char* str = getenv("NCCL_LAUNCH_MODE");
+ if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
+ comm->launchMode = ncclComm::PARALLEL;
+ }
+ if (comm->launchMode == ncclComm::GROUP) {
+ CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking));
+#if CUDART_VERSION >= 9000
+ if (*comm->intraCC && (ncclCudaCompCap() == *comm->intraCC)) {
+ // Check whether the GPU supports Cooperative Group Multi Device Launch
+ (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
+ }
+#endif
+ }
+
+ // Disable cgMdLaunch if any rank does not support it
+ if (cgMdLaunch == 0) {
+ *comm->intraCGMode = 0x10;
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
+ TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
+ uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
+ struct ncclConnect connect;
+ struct ncclConnector* conn;
+ for (int i=0; i<nrecv; i++) {
+ int peer = peerRecv[i];
+ if (peer == -1) continue;
+ conn = &channel->peers[peer].recv;
+ if (conn->connected) { ++nSkippedRecv; continue; }
+ memset(&connect, 0, sizeof(connect));
+ NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
+ NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ }
+ for (int i=0; i<nsend; i++) {
+ int peer = peerSend[i];
+ if (peer == -1) continue;
+ conn = &channel->peers[peer].send;
+ if (conn->connected) { ++nSkippedSend; continue; }
+ memset(&connect, 0, sizeof(connect));
+ NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
+ NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ }
+ for (int i=0; i<nsend; i++) {
+ int peer = peerSend[i];
+ if (peer == -1) continue;
+ conn = &channel->peers[peer].send;
+ if (conn->connected) {++nSkippedSend; continue; }
+ memset(&connect, 0, sizeof(connect));
+ NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ NCCLCHECK(conn->transportComm->connect(&connect, conn));
+ conn->connected = 1;
+ }
+ for (int i=0; i<nrecv; i++) {
+ int peer = peerRecv[i];
+ if (peer == -1) continue;
+ conn = &channel->peers[peer].recv;
+ if (conn->connected) {++nSkippedRecv; continue; }
+ memset(&connect, 0, sizeof(connect));
+ NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ NCCLCHECK(conn->transportComm->connect(&connect, conn));
+ conn->connected = 1;
+ }
+ TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
+ return ncclSuccess;
+}
+
+NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
+
+static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
+ // We use 3 AllGathers
+ // 1. { peerInfo, comm }
+ // 2. ConnectTransport[nranks], ConnectValue[nranks]
+ // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
+
+ int rank = comm->rank;
+ int nranks = comm->nRanks;
+ uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
+ TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
+ NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
+
+ // AllGather1 - begin
+ struct {
+ struct ncclPeerInfo peerInfo;
+ struct ncclComm* comm;
+ } *allGather1Data;
+
+ NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
+ allGather1Data[rank].comm = comm;
+ struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
+ NCCLCHECK(fillInfo(comm, myInfo, commHash));
+ NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
+
+ NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
+ for (int i = 0; i < nranks; i++) {
+ memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
+ if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
+ WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId);
+ return ncclInvalidUsage;
+ }
+ }
+ // AllGather1 data is used again below
+ // AllGather1 - end
+
+ // Topo detection / System graph creation
+ NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo));
+ // Compute paths between GPUs and NICs
+ NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
+ // Remove inaccessible GPUs and unused NICs
+ NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
+ // Recompute paths after trimming
+ NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
+ // Compute max speed to accelerate search
+ NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo));
+ // Print final topology
+ NCCLCHECK(ncclTopoPrint(comm->topo));
+
+ // Get rings and trees
+ struct ncclTopoGraph treeGraph;
+ treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+ treeGraph.crossNic = ncclParamCrossNic();
+  // With trees on 2 nodes, we only communicate half the data between nodes.
+ NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
+ NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
+ struct ncclTopoGraph ringGraph;
+ ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
+ ringGraph.crossNic = ncclParamCrossNic();
+ NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph));
+ NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph));
+
+ // AllGather3 - begin
+
+ struct {
+ int cudaCompCap;
+ int fullCudaCompCap;
+ int nvlink;
+ int nChannels;
+ struct {
+ int sameChannels;
+ int speedIntra;
+ int speedInter;
+ int nvlink;
+ } tree;
+ struct {
+ int sameChannels;
+ int speedIntra;
+ int speedInter;
+ int nvlink;
+ } ring;
+ struct ncclTopoRanks topoRanks;
+ } *allGather3Data;
+
+ NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
+ allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
+ allGather3Data[rank].nvlink = treeGraph.nvlink;
+ allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
+ allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
+ allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
+ allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
+ allGather3Data[rank].tree.nvlink = treeGraph.nvlink;
+ allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
+ allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
+ allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
+ allGather3Data[rank].ring.nvlink = ringGraph.nvlink;
+
+ NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks));
+
+ NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
+
+ // Determine nNodes, firstRanks, ...
+ int* nodesFirstRank;
+ NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
+ for (int i=0; i<nranks; i++) {
+ int node = -1;
+ int firstRank = allGather3Data[i].topoRanks.ringRecv[0];
+ for (int n=0; n<comm->nNodes; n++) {
+ if (nodesFirstRank[n] == firstRank) node = n;
+ }
+ if (node == -1) {
+ node = comm->nNodes++;
+ nodesFirstRank[node] = firstRank;
+ }
+ if (i == comm->rank) comm->node = node;
+ }
+
+ // Determine the minimum CUDA Compute capability of all GPUs
+ int myCompCap = allGather3Data[rank].cudaCompCap;
+ int minCompCap = myCompCap, maxCompCap = myCompCap;
+ for (int i = 0; i < nranks; i++) {
+ minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
+ maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap);
+ }
+
+ comm->nvlink = 1;
+ for (int i = 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink;
+
+ int nChannelsOrig = comm->nChannels;
+ struct ncclTopoRanks** allTopoRanks;
+ NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
+ for (int i=0; i<nranks; i++) {
+ allTopoRanks[i] = &allGather3Data[i].topoRanks;
+ // Make sure we align all ranks so that the tuning is consistent across ranks
+ treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
+ treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
+ treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
+ treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
+ treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink);
+ ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
+ ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
+ ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
+ ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink);
+ }
+
+ if (comm->nChannels < nChannelsOrig) {
+ // We started duplicating channels during Preset(), so we need to move the
+ // duplicated channels since we have removed some.
+ for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
+ }
+
+ int *rings;
+ NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
+
+ NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
+
+ free(allTopoRanks);
+ free(nodesFirstRank);
+ free(allGather3Data);
+
+ // AllGather3 - end
+
+ TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
+
+ NCCLCHECK(ncclSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph));
+
+ char line[1024];
+ line[0]='\0';
+ for (int c=0; c<comm->nChannels; c++) {
+ struct ncclTree* treeUp = &comm->channels[c].treeUp;
+ struct ncclTree* treeDn = &comm->channels[c].treeDn;
+ snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d",
+ c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up,
+ treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]);
+ }
+ line[1023] = '\0';
+ INFO(NCCL_INIT, "Trees%s", line);
+
+ // Connect with prev/next for each ring
+ struct ncclConnect *connect;
+ NCCLCHECK(ncclCalloc(&connect, 2));
+ for (int c=0; c<comm->nChannels; c++) {
+ struct ncclChannel* channel = comm->channels+c;
+ NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks));
+ if (comm->nRanks == 1) continue;
+ NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
+ NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up));
+ NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down));
+ }
+ TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
+ free(connect);
+ free(rings);
+
+ // Compute intra ranks (using AllGather1 data)
+ int intraRank0 = -1, intraRank = -1, intraRanks = 0;
+ for (int i = 0; i < nranks; i++) {
+ if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
+ (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
+ if (intraRanks == 0) intraRank0 = i;
+ if (i == rank) intraRank = intraRanks;
+ intraRanks++;
+ }
+ }
+ TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+ rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+ if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
+ WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+ rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+ return ncclInternalError;
+ }
+ NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
+
+ // Done with AllGather1 data
+ free(allGather1Data);
+
+ if (comm->nNodes) NCCLCHECK(transportCreateProxy(comm));
+
+ TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
+ return ncclSuccess;
+}
+
+static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
+ CPU_ZERO_S(sizeof(cpu_set_t), mask);
+ char* cudaPath;
+ NCCLCHECK(ncclTopoCudaPath(cudaDev, &cudaPath));
+ char path[PATH_MAX];
+ strncpy(path, cudaPath, PATH_MAX-1);
+ snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus");
+ path[PATH_MAX-1] = '\0';
+ int fd;
+ SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
+ char affinityStr[sizeof(cpu_set_t)*2 + 1];
+ int r = read(fd, affinityStr, sizeof(cpu_set_t)*2);
+ if (r > 0) {
+ affinityStr[r] = '\0';
+ NCCLCHECK(ncclStrToCpuset(affinityStr, mask));
+ }
+ close(fd);
+ free(cudaPath);
+ return ncclSuccess;
+}
+
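+
getCpuGpuAffinity above reads the GPU's local_cpus sysfs attribute and converts it to a cpu_set_t via ncclStrToCpuset, which is not shown in this diff. On Linux that attribute is a comma-separated hexadecimal CPU mask; a hedged standalone sketch of such a conversion (the actual ncclStrToCpuset parsing may differ) could look like this:

#define _GNU_SOURCE          /* for CPU_ZERO/CPU_SET/CPU_COUNT */
#include <sched.h>
#include <stdio.h>
#include <string.h>

/* Convert a sysfs-style hex mask such as "00000000,0000f00f" into a cpu_set_t.
 * The rightmost hex digit describes CPUs 0-3, the next one CPUs 4-7, and so on.
 * Illustrative only -- not the actual ncclStrToCpuset implementation. */
static void hexMaskToCpuset(const char* str, cpu_set_t* set) {
  CPU_ZERO(set);
  int cpu = 0;
  for (int i = (int)strlen(str) - 1; i >= 0; i--) {
    char c = str[i];
    if (c == ',') continue;                       /* group separator */
    int v = (c >= '0' && c <= '9') ? c - '0' :
            (c >= 'a' && c <= 'f') ? c - 'a' + 10 :
            (c >= 'A' && c <= 'F') ? c - 'A' + 10 : 0;
    for (int b = 0; b < 4; b++, cpu++)
      if (v & (1 << b)) CPU_SET(cpu, set);
  }
}

int main(void) {
  cpu_set_t set;
  hexMaskToCpuset("0000f00f", &set);
  printf("CPUs in mask: %d\n", CPU_COUNT(&set));  /* 8 */
  return 0;
}
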
+NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
+
+static ncclResult_t setCpuAffinity(int cudaDev) {
+ // Query the CPU affinity set we were provided
+ cpu_set_t mask;
+ SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
+
+#ifdef ENABLE_TRACE
+ {
+ char affinityStr[sizeof(cpu_set_t)*2];
+ NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
+ TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr);
+ }
+#endif
+
+ // Find the CPUs that are local to the supplied GPU
+ cpu_set_t gpuMask;
+ NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
+
+#ifdef ENABLE_TRACE
+ {
+ char affinityStr[sizeof(cpu_set_t)*2];
+ NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr));
+ TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr);
+ }
+#endif
+
+ cpu_set_t finalMask;
+ if (ncclParamIgnoreCpuAffinity())
+ // Ignore the CPU affinity set and use the GPU one instead
+ finalMask = gpuMask;
+ else
+ // Use a subset of the GPU affinity set
+ CPU_AND(&finalMask, &mask, &gpuMask);
+
+  // If there is a non-empty set, use it to set affinity
+ if (CPU_COUNT(&finalMask)) {
+ char affinityStr[sizeof(cpu_set_t)*2];
+ NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
+ INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr);
+ SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) {
+ cpu_set_t affinitySave;
+ sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+
+ NCCLCHECK(wrapNvmlSymbols());
+ NCCLCHECK(wrapNvmlInit());
+
+  // Make sure all host memory allocations are close to the GPU
+ CUDACHECK(cudaSetDevice(cudaDev));
+ NCCLCHECK(setCpuAffinity(cudaDev));
+ ncclResult_t res;
+
+ NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
+ NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
+ NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
+
+ sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
+
+ INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
+
+ return ncclSuccess;
+cleanup:
+ if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap);
+ *newcomm = NULL;
+ sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ return res;
+}
+
+static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) {
+ ncclResult_t res;
+ char* env = getenv("NCCL_COMM_ID");
+ if (env && myrank == 0) {
+ NCCLCHECKGOTO(bootstrapCreateRoot(&commId, true), res, end);
+ }
+
+ NCCLCHECKGOTO(ncclInit(), res, end);
+ if (myrank == 0) showVersion();
+
+ // Make sure the CUDA runtime is initialized.
+ CUDACHECKGOTO(cudaFree(NULL), res, end);
+
+ NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, end);
+ if (nranks < 1 || myrank < 0 || myrank >= nranks) {
+ WARN("Invalid rank requested : %d/%d", myrank, nranks);
+ res = ncclInvalidArgument;
+ goto end;
+ }
+
+ if (ncclAsyncMode()) {
+ NCCLCHECKGOTO(ncclAsyncInit(ncclCommInitRankSync, newcomm, nranks, commId, myrank, cudaDev), res, end);
+ } else {
+ NCCLCHECKGOTO(ncclCommInitRankSync(newcomm, nranks, commId, myrank, cudaDev), res, end);
+ }
+end:
+ if (ncclAsyncMode()) return ncclAsyncErrCheck(res);
+ else return res;
+}
+
+NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
+ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
+ int cudaDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev));
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
+ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
+ NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
+ if (ndev < 0) {
+ WARN("Invalid device count requested : %d", ndev);
+ return ncclInvalidArgument;
+ }
+
+ ncclUniqueId uniqueId;
+ NCCLCHECK(ncclGetUniqueId(&uniqueId));
+ NCCLCHECK(ncclGroupStart());
+ for (int i=0; i<ndev; i++) {
+    // Ignore return codes; we need to call ncclGroupEnd to clean up anyway
+ ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i);
+ }
+ NCCLCHECK(ncclGroupEnd());
+ return ncclSuccess;
+}
+
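+
ncclCommInitAll above is the single-process convenience path: it generates a unique id itself and initializes one communicator per listed device inside a ncclGroupStart/ncclGroupEnd pair. A rough usage sketch against the public API (error handling mostly elided) looks like this:

/* Rough usage sketch of the single-process path; error handling omitted. */
#include <nccl.h>
#include <stdio.h>

int main(void) {
  const int ndev = 2;
  int devs[2] = {0, 1};
  ncclComm_t comms[2];

  /* One call creates a communicator per listed CUDA device. */
  if (ncclCommInitAll(comms, ndev, devs) != ncclSuccess) {
    fprintf(stderr, "ncclCommInitAll failed\n");
    return 1;
  }

  /* ... launch collectives on each comm/stream pair here ... */

  for (int i = 0; i < ndev; i++) ncclCommDestroy(comms[i]);
  return 0;
}
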
+static ncclResult_t commDestroy(ncclComm_t comm) {
+ int savedDevice;
+#ifdef ENABLE_TRACE
+ int rank = comm->rank;
+#endif
+ CUDACHECK(cudaGetDevice(&savedDevice));
+ int commDevice = comm->cudaDev;
+
+ if (savedDevice != commDevice) {
+ CUDACHECK(cudaSetDevice(commDevice));
+ }
+
+ TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError);
+
+ CUDACHECK(cudaStreamSynchronize(comm->groupStream));
+ NCCLCHECK(transportDestroyProxy(comm));
+ NCCLCHECK(commFree(comm));
+
+ if (savedDevice != commDevice)
+ CUDACHECK(cudaSetDevice(savedDevice));
+
+ TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
+
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
+ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+ if (comm == NULL)
+ return ncclSuccess;
+
+ TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %x", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
+
+ // Try and prevent a double free of the comm struct (user error)
+ if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
+ WARN("comm %p has already been destroyed", comm);
+ return ncclInvalidArgument;
+ }
+
+ return commDestroy(comm);
+}
+
+NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
+ncclResult_t ncclCommAbort(ncclComm_t comm) {
+ if (comm == NULL)
+ return ncclSuccess;
+
+ // Ask anything that might still be running on the device to quit
+ *comm->abortFlag = 1;
+
+ return commDestroy(comm);
+}
+
+NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
+const char* ncclGetErrorString(ncclResult_t code) {
+ switch (code) {
+ case ncclSuccess : return "no error";
+ case ncclUnhandledCudaError : return "unhandled cuda error";
+ case ncclSystemError : return "unhandled system error";
+ case ncclInternalError : return "internal error";
+ case ncclInvalidArgument : return "invalid argument";
+ case ncclInvalidUsage : return "invalid usage";
+ default : return "unknown result code";
+ }
+}
+
+NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
+ NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
+ NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
+
+ // Check device reported error
+ static ncclDevError_t printedDevErr = ncclDevSuccess;
+ switch(*comm->fatalDevError) {
+ case ncclDevSuccess :
+ break;
+ case ncclDevAssertedMismatch :
+ if (printedDevErr != ncclDevAssertedMismatch) {
+ WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+ printedDevErr = ncclDevAssertedMismatch;
+ }
+ if (comm->fatalError == ncclSuccess) {
+ comm->fatalError = ncclInvalidUsage;
+ }
+ break;
+ case ncclDevSuspectedMismatch :
+ if (printedDevErr != ncclDevSuspectedMismatch) {
+ WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+ printedDevErr = ncclDevSuspectedMismatch;
+ }
+ break;
+ default:
+ WARN("Unknown device error %d", *comm->fatalDevError);
+ return ncclInternalError;
+ }
+ *asyncError = comm->fatalError;
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
+ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
+ NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
+ NCCLCHECK(PtrCheck(count, "CommCount", "count"));
+ *count = comm->nRanks;
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
+ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
+ NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
+ NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
+ *devid = comm->cudaDev;
+ return ncclSuccess;
+}
+
+NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
+ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
+ NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
+ NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
+ *rank = comm->rank;
+ return ncclSuccess;
+}
diff --git a/src/init.cu b/src/init.cu
deleted file mode 100644
index 75822e6..0000000
--- a/src/init.cu
+++ /dev/null
@@ -1,1240 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "nccl.h"
-#include "core.h"
-#include "channel.h"
-#include "param.h"
-#include "nvmlwrap.h"
-#include "rings.h"
-#include "trees.h"
-#include "bootstrap.h"
-#include "transport.h"
-#include "group.h"
-#include "utils.h"
-#include "net.h"
-#include "checks.h"
-#include "enqueue.h"
-#include "topo.h"
-#include "nvlink.h"
-#include "cpuset.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sched.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <cuda_runtime.h>
-#include <string.h>
-#include <errno.h>
-#include <assert.h>
-#include <dlfcn.h>
-
-#define STR2(v) #v
-#define STR(v) STR2(v)
-
-int ncclDebugLevel;
-uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT
-pthread_mutex_t ncclDebugOutputLock;
-FILE *ncclDebugFile = stdout;
-
-#ifdef ENABLE_TRACE
-std::chrono::high_resolution_clock::time_point ncclEpoch;
-#endif
-
-#if CUDART_VERSION >= 9200
-#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
-#else
-#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
-#endif
-
-NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
-
-NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
-
-ncclNet_t* ncclNet = NULL;
-
-// We define this as weak to let tests redefine their own
-#pragma weak ncclNvlinkGpu
-ncclResult_t ncclNvlinkGpu(int* nvlink) {
- int cudaDev;
- CUDACHECK(cudaGetDevice(&cudaDev));
- char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
- *nvlink = getNvlinkGpu(busId, NULL);
- return ncclSuccess;
-}
-// We define this as weak to let tests redefine their own
-#pragma weak ncclCudaCompCap
-int ncclCudaCompCap() {
- int cudaDev;
- if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
- int ccMajor;
- if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
- return ccMajor;
-}
-int ncclCudaFullCompCap() {
- int cudaDev;
- if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
- int ccMajor, ccMinor;
- if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
- if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0;
- return ccMajor*10+ccMinor;
-}
-
-// Returns ncclInternalError if anything fails, causing that network to be ignored.
-ncclResult_t initNet(ncclNet_t* net) {
- int ndev;
- if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
- if (net->devices(&ndev) != ncclSuccess) return ncclInternalError;
- if (ndev <= 0) return ncclSystemError;
- return ncclSuccess;
-}
-
-ncclResult_t initNetPlugin(ncclNet_t** net) {
- void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
- if (netPluginLib == NULL) {
- // dlopen is not guaranteed to set errno, and dlerror only gives us a
- // string, so it doesn't hurt to also check errno to try to provide a
- // better error message
- if (errno == ENOENT) {
- INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so).");
- } else {
- INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
- }
- return ncclSuccess;
- }
- ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
- if (extNet == NULL) {
- INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
- goto cleanup;
- }
- if (initNet(extNet) == ncclSuccess) {
- *net = extNet;
- return ncclSuccess;
- }
-cleanup:
- if (netPluginLib != NULL) dlclose(netPluginLib);
- return ncclSuccess;
-}
-
-ncclResult_t initNet() {
- // Always initialize sockets as we use it for bootstrap
- NCCLCHECK(initNet(&ncclNetSocket));
-
- NCCLCHECK(initNetPlugin(&ncclNet));
- if (ncclNet != NULL) return ncclSuccess;
- if (initNet(&ncclNetIb) == ncclSuccess) {
- ncclNet = &ncclNetIb;
- } else {
- ncclNet = &ncclNetSocket;
- }
- return ncclSuccess;
-}
-
-NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
-NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2);
-NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", -2);
-
-int ncclThreadThreshold(int minCompCap, int multiNode) {
- int threshold = ncclParamThreadThreshold();
- if (threshold == -2) { // user has not set this env variable
- threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD;
- // multiply by 2 if running on multiple nodes
- if (multiNode) {
- threshold *= 2;
- }
- }
- return threshold;
-}
-
-pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
-static bool initialized = false;
-static ncclResult_t ncclInit() {
- if (initialized) return ncclSuccess;
- pthread_mutex_lock(&initLock);
- if (!initialized) {
- initEnv();
- initDebug();
- initNet();
- initialized = true;
- }
- pthread_mutex_unlock(&initLock);
- return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclGetVersion, int* version);
-ncclResult_t ncclGetVersion(int* version) {
- if (version == NULL) return ncclInvalidArgument;
- *version = NCCL_VERSION_CODE;
- return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
-ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
- NCCLCHECK(ncclInit());
- NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
- return bootstrapGetUniqueId(out);
-}
-
-static ncclResult_t commFree(ncclComm_t comm) {
- if (comm == NULL)
- return ncclSuccess;
-
- free(comm->peerInfo);
-
- if (comm->bootstrap)
- NCCLCHECK(bootstrapClose(comm->bootstrap));
-
- CUDACHECK(cudaFree(comm->devComm));
-
- for (int channel=0; channel<comm->nChannels; channel++)
- NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
-
- if (comm->doneEvent != NULL)
- CUDACHECK(cudaEventDestroy(comm->doneEvent));
-
- if (comm->launchMode == ncclComm::GROUP) {
- CUDACHECK(cudaStreamDestroy(comm->groupStream));
- }
-
- // Last rank frees shared resources between threads
- int isLast;
- NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
- if (isLast) {
- free(comm->intraBarrier);
- free(comm->intraParams);
- free(comm->intraCudaDevs);
- free(comm->intraCGMode);
- free(comm->intraCC);
- }
- CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
- CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
-
- free(comm);
- return ncclSuccess;
-}
-
-static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
- if (ndev < 1) {
- WARN("invalid device count (%d) requested", ndev);
- return ncclInvalidArgument;
- }
- if (rank >= ndev || rank < 0) {
- WARN("rank %d exceeds ndev=%d", rank, ndev);
- return ncclInvalidArgument;
- }
-
- // Try to create a CUDA object right away. If there is something wrong with
- // the device we're on (failure cause #1), it's better to know it early.
- cudaEvent_t doneEvent;
- CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming));
-
- struct ncclComm* comm;
- NCCLCHECK(ncclCalloc(&comm, 1));
-
- comm->rank = rank;
- comm->nRanks = ndev;
- cudaGetDevice(&comm->cudaDev);
- getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
- INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
-
- comm->doneEvent = doneEvent;
- comm->llThreshold = ncclParamLlThreshold();
- comm->treeThreshold = ncclParamTreeThreshold();
- comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
-#if CUDART_VERSION >= 9200
- comm->groupCudaStream = ncclParamGroupCudaStream();
-#else
- // Don't allow the user to overload the default setting in older CUDA builds
- comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
-#endif
- comm->fatalError = ncclSuccess;
-
- CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped));
- *comm->fatalDevError = ncclDevSuccess;
-
- CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped));
- *comm->abortFlag = 0;
-
- comm->argsptr = &comm->args;
-
- *comret = comm;
- return ncclSuccess;
-}
-
-static ncclResult_t devCommSetup(ncclComm_t comm) {
- // Fully duplicate the comm on the device
- NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
- // Copy the comm on the device
- NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
- // Copy userRanks
- for (int r=0; r<comm->nChannels; r++) {
- NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
- NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
- }
- // Copy the device-accessible pointer to comm->abortFlag
- void *devAbortFlag;
- CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0));
- CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice));
- // Copy the device-accessible pointer to comm->fatalDevError
- void *devFatalError;
- CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0));
- CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice));
- return ncclSuccess;
-}
-
-// Pre-process the string so that running "strings" on the lib can quickly reveal the version.
-#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR)
-static void showVersion() {
- static int shown = 0;
- if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
- printf("%s\n", VERSION_STRING);
- fflush(stdout);
- if (ncclDebugFile != stdout)
- INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files
- shown = 1;
- }
-}
-
-static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) {
- info->rank = rank;
- CUDACHECK(cudaGetDevice(&info->cudaDev));
- NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev));
- info->hostHash=getHostHash();
- info->pidHash=getPidHash();
-
- // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
- // cudaDev is a CUDA runtime dev number which could be different from the
- // NVML device number. Then we get the busID from NVML to be sure it is
- // consistent with NVML remote PCI bus Ids.
- CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
- nvmlDevice_t nvmlDevice;
- NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
- nvmlPciInfo_t pciInfo;
- NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
- strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
- return ncclSuccess;
-}
-
-template <int type>
-static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
- for (int t=0; t<NTRANSPORTS; t++) {
- struct ncclTransport *transport = ncclTransports+t;
- struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
- ncclTvalue_t ret = 0;
- NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo));
- if (ret > 0) {
- connector->transportComm = transportComm;
- NCCLCHECK(transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId));
- return ncclSuccess;
- }
- }
- WARN("No transport found !");
- return ncclInternalError;
-}
-
-static int log2(int n) {
- int l = 0;
- while (n>>=1) l++;
- return l;
-}
-
-static ncclResult_t ncclTreeThreshold(int nnodes, int nranks, int nChannels, ssize_t *treeThreshold) {
- int nvlink;
- NCCLCHECK(ncclNvlinkGpu(&nvlink));
- float ringbw = nvlink ? 5000*nChannels : 5000; // approx, in MB/s or B/us
- float ringlatinter = 6;
- float treelatintra = 4;
- float treelatinter = 15;
- float treebw;
- if (!nvlink) {
- treebw = ringbw * 2 / 3;
- } else {
- treebw = ringbw * 3 / 4;
- if (nnodes == 2) treebw *= 2;
- }
- float ringlat = ringlatinter*(nranks-1);
- float treelat = treelatinter*log2(nnodes)+treelatintra*(nranks/nnodes-1);
- if (nnodes < 2 || ringlat <= treelat)
- *treeThreshold = 0;
- else if (treebw > ringbw)
- *treeThreshold = 0x7fffffffffffffff;
- else
- *treeThreshold = (ssize_t)(((ringbw*treebw/(ringbw-treebw)))*(ringlat-treelat));
- return ncclSuccess;
-}
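
As a worked example of the crossover formula above (illustrative numbers only): with NVLink and 4 channels, ringbw = 5000*4 = 20000 B/us; on 4 nodes, treebw = 20000*3/4 = 15000 B/us; with 32 ranks, ringlat = 6*(32-1) = 186 us and treelat = 15*log2(4) + 4*(32/4-1) = 58 us, so trees are used up to roughly (20000*15000/(20000-15000))*(186-58) = 60000*128 ≈ 7.7 MB. With only 2 NVLink-connected nodes, treebw is doubled and exceeds ringbw, so trees are enabled for all sizes.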
-
-static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks, int* treeMasters) {
- TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
- NCCLCHECK(initChannel(comm, channelId));
-
- struct ncclChannel* channel = comm->channels+channelId;
- struct ncclRing* ring = &channel->ring;
-
- // Reorganize ranks to start with rank.
- int shift;
- for (shift = 0; shift<nranks; shift++) {
- if (ringRanks[shift] == rank) {
- break;
- }
- }
- for (int i=0; i<nranks; i++) {
- ring->userRanks[i] = ringRanks[(i+shift)%nranks];
- }
- int prev = ring->prev = ring->userRanks[nranks-1];
- int next = ring->next = ring->userRanks[1];
-
- struct ncclTree* tree = &channel->tree;
- tree->up = -1;
- tree->down[0] = tree->down[1] = tree->down[2] = -1;
-
- //
- // Find per-node masters and connect them via a binary tree
- //
-
- int nMasters = 0;
- for (int r=0; r<nranks; r++) nMasters += treeMasters[r];
- if (nMasters == 0) {
- nMasters = 1;
- treeMasters[0] = 1;
- }
-
- if (comm->treeThreshold == -2)
- NCCLCHECK(ncclTreeThreshold(nMasters, comm->nRanks, comm->nChannels, &comm->treeThreshold));
-
- if (comm->treeThreshold > 0) {
- // Compute tree depth. Not an exact value but a good approximation in most
- // cases and consistent across nodes
- tree->depth = nranks/nMasters + log2(nMasters);
-
- // Find my master: go backwards in the ring to find my root
- int master = 0;
- for (int i = 0; i<nranks; i++) {
- int r = ring->userRanks[(nranks-i)%nranks];
- if (treeMasters[r]) {
- master = r;
- break;
- }
- }
-
- int ranks[nMasters];
- int i = 0, masterIndex = -1;
- // Build binary tree
- for (int r=0; r<nranks; r++) {
- // Create index table
- if (r == master) masterIndex = i;
- if (treeMasters[r]) ranks[i++] = r;
- }
- int btreeUp, btreeDown0, btreeDown1;
- int u0, d0_0, d0_1, u1, d1_0, d1_1;
- NCCLCHECK(ncclGetDtree(nMasters, masterIndex, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
- if (channelId < DIVUP(comm->nChannels, 2)) {
- btreeUp = u0; btreeDown0 = d0_0; btreeDown1 = d0_1;
- } else {
- btreeUp = u1; btreeDown0 = d1_0; btreeDown1 = d1_1;
- }
-
- //
- // Now build the full tree, combining the intra-node ring and the
- // inter-node binary tree.
- //
-
- if (rank == master) {
- int nDown = 0;
- if (btreeUp != -1) tree->up = ranks[btreeUp];
- if (treeMasters[next] == 0) tree->down[nDown++] = next;
- if (btreeDown0 != -1) tree->down[nDown++] = ranks[btreeDown0];
- if (btreeDown1 != -1) tree->down[nDown++] = ranks[btreeDown1];
- } else {
- tree->up = prev;
- if (treeMasters[next] == 0) tree->down[0] = next;
- }
- }
-
- TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
- return ncclSuccess;
-}
-
-static ncclResult_t fillConnect(struct ncclPeerInfo* peerInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
- for (int r=0; r<nranks; r++) {
- connectTransport[r] = -1;
- for (int t=0; t<NTRANSPORTS; t++) {
- NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, peerInfo+rank, peerInfo+r));
- if (connectValue[r] > 0) {
- connectTransport[r] = t;
- break;
- }
- }
- }
- return ncclSuccess;
-}
-
-#define MAXWIDTH 20
-#define PREFIXLEN 15
-#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
-void dumpMatrix(int* connectMatrix, int nranks) {
- char line[STRLENGTH+1];
- line[STRLENGTH] = '\0';
- memset(line, ' ', STRLENGTH);
- for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j);
- INFO(NCCL_INIT,"%s", line);
- for (int i=0; i<nranks; i++) {
- memset(line, ' ', STRLENGTH);
- sprintf(line, "%3d ", i);
- for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]);
- INFO(NCCL_INIT,"%s", line);
- }
-}
-
-void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) {
- char line[STRLENGTH+1];
- line[STRLENGTH] = '\0';
- memset(line, ' ', STRLENGTH);
- for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4d", j);
- INFO(NCCL_INIT,"%s", line);
- for (int i=0; i<nranks; i++) {
- memset(line, ' ', STRLENGTH);
- sprintf(line, "%3d ", i);
- for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4o", (int)connectMatrix[i*nranks+j]);
- INFO(NCCL_INIT,"%s", line);
- }
-}
-
-
-void dumpLine(int* values, int nranks, const char* prefix) {
- int prefixlen = strlen(prefix);
- char line[STRLENGTH+1];
- line[STRLENGTH] = '\0';
- memset(line, ' ', STRLENGTH);
- strncpy(line, prefix, PREFIXLEN);
- for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]);
- INFO(NCCL_INIT,"%s", line);
-}
-
-static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
- for (int r=0; r<nrings; r++) {
- char prefix[30];
- /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
- dumpLine(prev+r*nranks, nranks, prefix);
- sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
- dumpLine(next+r*nranks, nranks, prefix);*/
-
- int current = rank;
- for (int i=0; i<nranks; i++) {
- rings[r*nranks+i] = current;
- current = next[r*nranks+current];
- }
- sprintf(prefix, "Channel %02d : ", r);
- if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
- if (current != rank) {
- WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
- return ncclInternalError;
- }
- // Check that all ranks are there
- for (int i=0; i<nranks; i++) {
- int found = 0;
- for (int j=0; j<nranks; j++) {
- if (rings[r*nranks+j] == i) {
- found = 1;
- break;
- }
- }
- if (found == 0) {
- WARN("Error : ring %d does not contain rank %d", r, i);
- return ncclInternalError;
- }
- }
- }
- return ncclSuccess;
-}
-
-void* waitForNonNullPtr(void* p) {
- volatile void** ptr = (volatile void**) p;
- while (*ptr == NULL) sched_yield();
- return (void*)*ptr;
-}
-
-ncclResult_t initParams(struct ncclComm* comm) {
- struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
- params->args = &comm->argsptr;
- params->stream = NULL;
- params->sharedMem = 0;
- params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
- params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1;
- return ncclSuccess;
-}
-
-// Allocate/Set Intra Process Structures and set CG options
-ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) {
- comm->intraRank = rank;
- comm->intraRanks = ranks;
- comm->intraPhase = 0;
-
- // Alloc shared structures
- if (rank == 0) {
- assert(comm == comm0);
- int* bar;
- NCCLCHECK(ncclCalloc(&bar, 2));
- bar[0] = bar[1] = 0;
- comm->intraBarrier = bar;
- NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
- NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
- int* CGMode;
- NCCLCHECK(ncclCalloc(&CGMode, 1));
- *CGMode = 0x11;
- comm->intraCGMode = CGMode;
- int* CC;
- NCCLCHECK(ncclCalloc(&CC, 1));
- *CC = ncclCudaFullCompCap();
- comm->intraCC = CC;
- } else {
- comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
- comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
- comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
- comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
- comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
- }
- comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
- NCCLCHECK(initParams(comm));
-
- int cgMdLaunch = 0;
-
- // Set CG Mode
- comm->launchMode = ncclComm::GROUP;
- char* str = getenv("NCCL_LAUNCH_MODE");
- if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
- comm->launchMode = ncclComm::PARALLEL;
- }
- if (comm->launchMode == ncclComm::GROUP) {
- CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking));
-#if CUDART_VERSION >= 9000
- if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) {
- // Check whether the GPU supports Cooperative Group Multi Device Launch
- (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev);
- }
-#endif
- }
-
- // Disable cgMdLaunch if any rank does not support it
- if (cgMdLaunch == 0) {
- *comm->intraCGMode = 0x10;
- }
- return ncclSuccess;
-}
-
-static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
- TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
- uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
- struct ncclConnect connect;
- struct ncclConnector* conn;
- for (int i=0; i<nrecv; i++) {
- int peer = peerRecv[i];
- if (peer == -1) continue;
- conn = &channel->peers[peer].recv;
- if (conn->connected) { ++nSkippedRecv; continue; }
- NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
- NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
- }
- for (int i=0; i<nsend; i++) {
- int peer = peerSend[i];
- if (peer == -1) continue;
- conn = &channel->peers[peer].send;
- if (conn->connected) { ++nSkippedSend; continue; }
- NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
- NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
- }
- for (int i=0; i<nsend; i++) {
- int peer = peerSend[i];
- if (peer == -1) continue;
- conn = &channel->peers[peer].send;
- if (conn->connected) {++nSkippedSend; continue; }
- NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
- NCCLCHECK(conn->transportComm->connect(&connect, conn));
- conn->connected = 1;
- }
- for (int i=0; i<nrecv; i++) {
- int peer = peerRecv[i];
- if (peer == -1) continue;
- conn = &channel->peers[peer].recv;
- if (conn->connected) {++nSkippedRecv; continue; }
- NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
- NCCLCHECK(conn->transportComm->connect(&connect, conn));
- conn->connected = 1;
- }
- TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
- return ncclSuccess;
-}
-
-static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
- // We use 3 AllGathers
- // 1. { peerInfo, comm }
- // 2. ConnectTransport[nranks], ConnectValue[nranks]
- // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
-
- int rank = comm->rank;
- int nranks = comm->nRanks;
- TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
- NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
-
- // AllGather1 - begin
- struct {
- struct ncclPeerInfo peerInfo;
- struct ncclComm* comm;
- } *allGather1Data;
-
- NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
- allGather1Data[rank].comm = comm;
- NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
- NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
-
- NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
- for (int i = 0; i < nranks; i++) {
- memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
- }
- // AllGather1 data is used again below
- // AllGather1 - end
-
- // AllGather2 - begin
- size_t allGather2DataRowSize = sizeof(int)*nranks + sizeof(ncclTvalue_t)*nranks;
- void *allGather2Data;
- NCCLCHECK(ncclCalloc((char **)&allGather2Data, allGather2DataRowSize*nranks));
- int *myTransportRow = (int *)((char *)allGather2Data + allGather2DataRowSize*rank);
- ncclTvalue_t *myValueRow = (ncclTvalue_t *)(myTransportRow + nranks);
-
- NCCLCHECK(fillConnect(comm->peerInfo, nranks, rank, myTransportRow, myValueRow));
- NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather2Data, allGather2DataRowSize));
-
- int* connectTransport;
- ncclTvalue_t* connectValue;
- NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
- NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
- for (int i = 0; i < nranks; i++) {
- memcpy(connectTransport + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize, sizeof(int)*nranks);
- memcpy(connectValue + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize + nranks*sizeof(int), sizeof(ncclTvalue_t)*nranks);
- }
- free(allGather2Data);
- // AllGather2 - end
-
- //if (rank == 0) dumpMatrix(connectTransport, nranks);
- //if (rank == 0) dumpMatrixTvalue(connectValue, nranks);
-
- // Get my rings
- int nrings;
- int* prev, *next, *treeIn, *treeOut;
- NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
- NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
- NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
- NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
- comm->nThreads = getDefaultThreads();
- NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
- TRACE(NCCL_INIT, "rank %d nranks %d - BUILD %d RINGS", rank, nranks, nrings);
- assert(nrings <= MAXCHANNELS);
- free(connectTransport);
- free(connectValue);
-
- // AllGather3 - begin
- struct {
- int nThreads;
- int nrings;
- int cudaCompCap;
- int prev[MAXCHANNELS];
- int next[MAXCHANNELS];
- } *allGather3Data;
-
- NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
- allGather3Data[rank].nThreads = comm->nThreads;
- allGather3Data[rank].nrings = nrings;
- allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
- for (int r=0; r<nrings; r++) {
- allGather3Data[rank].prev[r] = *(prev+r*nranks+rank);
- allGather3Data[rank].next[r] = *(next+r*nranks+rank);
- }
- NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
-
- // Find max nThreads
- for (int i=0; i<nranks; i++)
- comm->nThreads = std::max(allGather3Data[i].nThreads, comm->nThreads);
-
- // Determine the minimum CUDA Compute capability of all GPUs
- int myCompCap = allGather3Data[rank].cudaCompCap;
- int minCompCap = myCompCap;
- for (int i = 0; i < nranks; i++)
- minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
-
- // Determine thread threshold across all GPUs
- int nnodes = 0;
- for (int r=0; r<nranks; r++) nnodes += treeIn[r];
- comm->threadThreshold = ncclThreadThreshold(minCompCap, nnodes);
-
- // Find min nrings across ranks
- for (int i=0; i<nranks; i++)
- nrings = std::min(allGather3Data[i].nrings, nrings);
- comm->nChannels = nrings;
-
- // Unpack the per ring prev/next arrays
- for (int i = 0; i < nranks; i++) {
- for (int r = 0; r < nrings; r++) {
- prev[r*nranks+i] = allGather3Data[i].prev[r];
- next[r*nranks+i] = allGather3Data[i].next[r];
- }
- }
- free(allGather3Data);
- // AllGather3 - end
-
- int *rings;
- NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
- NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
- free(prev);
- free(next);
- TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d RINGS", rank, nranks, nrings);
-
- // Connect with prev/next for each ring
- struct ncclConnect *connect;
- NCCLCHECK(ncclCalloc(&connect, 2));
- for (int r=0; r<nrings; r++) {
- struct ncclChannel* channel = comm->channels+r;
- NCCLCHECK(setupChannel(comm, r, rank, nranks, rings+r*nranks, treeIn+r*nranks));
- NCCLCHECK(p2pSetup(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
- NCCLCHECK(p2pSetup(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up));
- NCCLCHECK(p2pSetup(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down));
- }
- if (comm->treeThreshold > 0) {
- char line[1024];
- line[0]='\0';
- for (int c=0; c<nrings; c++) {
- struct ncclTree* tree = &comm->channels[c].tree;
- snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d/%d/%d",
- c, tree->up, rank, tree->down[0], tree->down[1], tree->down[2]);
- }
- line[1023] = '\0';
- INFO(NCCL_INIT, "Trees%s", line);
- }
- if (rank == 0) {
- char treeline[64];
- snprintf(treeline, 64, "enabled up to size %ld", comm->treeThreshold);
- INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees %s", comm->nThreads, minCompCap,
- comm->treeThreshold == 0 ? "disabled" :
- comm->treeThreshold == 0x7fffffffffffffff ? "enabled for all sizes" :
- treeline);
- }
-
- TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, nrings);
- free(connect);
- free(rings);
- free(treeIn);
- free(treeOut);
-
- // Compute intra ranks (using AllGather1 data)
- int intraRank0 = -1, intraRank = -1, intraRanks = 0;
- for (int i = 0; i < nranks; i++) {
- if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
- (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
- if (intraRanks == 0) intraRank0 = i;
- if (i == rank) intraRank = intraRanks;
- intraRanks++;
- }
- }
- TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
- rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
- if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
- WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
- rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
- return ncclInternalError;
- }
- NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
-
- // Done with AllGather1 data
- free(allGather1Data);
-
- if (nnodes) NCCLCHECK(transportCreateProxy(comm));
-
- TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
- return ncclSuccess;
-}
-
-static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
- CPU_ZERO_S(sizeof(cpu_set_t), mask);
- char* cudaPath;
- NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
- char path[PATH_MAX];
- strncpy(path, cudaPath, PATH_MAX-1);
- snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus");
- path[PATH_MAX-1] = '\0';
- int fd;
- SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
- char affinityStr[sizeof(cpu_set_t)*2];
- int r = read(fd, affinityStr, sizeof(cpu_set_t)*2);
- if (r > 0)
- NCCLCHECK(ncclStrToCpuset(affinityStr, mask));
- close(fd);
- free(cudaPath);
- return ncclSuccess;
-}
-
-static ncclResult_t setCpuAffinity(int cudaDev) {
- // Work within the envelope we were provided
- cpu_set_t mask;
- SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
-
- // Find the subpart that is local to our GPU
- cpu_set_t gpuMask;
- NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
- cpu_set_t finalMask;
- CPU_AND(&finalMask, &mask, &gpuMask);
-
- // If those are not disjoint, try to stay local
- if (CPU_COUNT(&finalMask)) {
- char affinityStr[sizeof(cpu_set_t)*2];
- NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
- INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr);
- SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
- }
- return ncclSuccess;
-}
-
-ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
- cpu_set_t affinitySave;
- sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
- NCCLCHECK(wrapNvmlSymbols());
- NCCLCHECK(wrapNvmlInit());
-
- // Make sure all host memory allocations are close to the GPU
- int cudaDev;
- CUDACHECK(cudaGetDevice(&cudaDev));
- NCCLCHECK(setCpuAffinity(cudaDev));
- ncclResult_t res;
-
- NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
- NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
- NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
-
- sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
- NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
-
- INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->nvmlDev);
-
- return ncclSuccess;
-cleanup:
- *newcomm = NULL;
- sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
- return res;
-}
-
-NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
-ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
- char* env = getenv("NCCL_COMM_ID");
- if (env && myrank == 0) {
- NCCLCHECK(bootstrapCreateRoot(&commId, true));
- }
-
- NCCLCHECK(ncclInit());
- if (myrank == 0) showVersion();
-
- // Make sure the CUDA runtime is initialized.
- CUDACHECK(cudaFree(NULL));
-
- NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm"));
- if (nranks < 1 || myrank < 0 || myrank >= nranks) {
- WARN("Invalid rank requested : %d/%d", myrank, nranks);
- return ncclInvalidArgument;
- }
-
- if (ncclAsyncMode()) {
- int cudaDev;
- CUDACHECK(cudaGetDevice(&cudaDev));
- return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank);
- } else {
- return ncclCommInitRankSync(newcomm, nranks, commId, myrank);
- }
-}
-
-static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
- struct ncclPeerInfo* allInfo;
- NCCLCHECK(ncclCalloc(&allInfo, nranks));
- for (int rank=0; rank<nranks; rank++) {
- CUDACHECK(cudaSetDevice(devs[rank]));
- NCCLCHECK(fillInfo(allInfo+rank, rank));
- }
-
- int* connectTransport;
- ncclTvalue_t* connectValue;
- NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
- NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
- for (int rank=0; rank<nranks; rank++)
- NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
-
- int* prev, *prevFinal, *next, *nextFinal, *treeIn, *treeOut;
- NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
- NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXCHANNELS));
- NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
- NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXCHANNELS));
- NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
- NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
- int nrings = MAXCHANNELS;
- int nthreads=0;
- int myCompCap = ncclCudaCompCap();
- int minCompCap = myCompCap;
- for (int rank=0; rank<nranks; rank++) {
- CUDACHECK(cudaSetDevice(devs[rank]));
- int nringsRank;
- int nthreadsRank = getDefaultThreads();
- myCompCap = ncclCudaCompCap();
- NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
- nrings = std::min(nrings, nringsRank);
- nthreads = std::max(nthreads, nthreadsRank);
- minCompCap = std::min(minCompCap, myCompCap);
- for (int ring=0; ring<nrings; ring++) {
- int index = ring*nranks+rank;
- prevFinal[index] = prev[index];
- nextFinal[index] = next[index];
- }
- }
- free(connectTransport);
- free(connectValue);
- free(prev);
- free(next);
-
- INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees disabled", nthreads, minCompCap);
-
- int* rings;
- NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
- NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
- free(prevFinal);
- free(nextFinal);
-
- // Determine thread threshold across all GPUs
- int threadThreshold = ncclThreadThreshold(minCompCap, 0);
-
- for (int rank=0; rank<nranks; rank++) {
- comms[rank]->nChannels = nrings;
- comms[rank]->nThreads = nthreads;
- comms[rank]->threadThreshold = threadThreshold;
- }
-
- for (int r=0; r<nrings; r++) {
- struct ncclConnect connect[2*nranks];
- int* ringRanks = rings+r*nranks;
- for (int rank=0; rank<nranks; rank++) {
- CUDACHECK(cudaSetDevice(devs[rank]));
- struct ncclChannel* channel = comms[rank]->channels+r;
- struct ncclRing *ring = &channel->ring;
- NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn));
- // Make sure we don't use trees; we cannot use them with initAll
- comms[rank]->treeThreshold = 0;
- int prev = channel->ring.prev = ring->userRanks[nranks-1];
- int next = channel->ring.next = ring->userRanks[1];
- struct ncclConnector* recv = &channel->peers[prev].recv;
- struct ncclConnector* send = &channel->peers[next].send;
- NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+rank*2+0, recv, channel->buffSize, channel->id));
- NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id));
- }
- for (int rank=0; rank<nranks; rank++) {
- CUDACHECK(cudaSetDevice(devs[rank]));
- struct ncclChannel* channel = comms[rank]->channels+r;
- struct ncclRing *ring = &channel->ring;
- struct ncclConnector* recv = &channel->peers[ring->prev].recv;
- struct ncclConnector* send = &channel->peers[ring->next].send;
- NCCLCHECK(recv->transportComm->connect(connect+ring->prev*2+1, recv));
- NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
- }
- }
- free(allInfo);
- free(rings);
- free(treeIn);
- free(treeOut);
- return ncclSuccess;
-}
-
-
-NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist);
-ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
- NCCLCHECK(ncclInit());
- NCCLCHECK(wrapNvmlSymbols());
- NCCLCHECK(wrapNvmlInit());
- showVersion();
-
- INFO(NCCL_INIT,"nranks %d", ndev);
-
- NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms"));
- if (ndev < 1) {
- WARN("Invalid device count requested : %d", ndev);
- return ncclInvalidArgument;
- }
-
- ncclResult_t res;
- int savedDevice;
- int rank, cudaDev;
- ncclComm_t comm = NULL;
- int ncclDevList[ndev];
- for (int i=0; i<ndev; i++) {
- ncclDevList[i] = devlist ? devlist[i] : i;
- }
-
- cudaGetDevice(&savedDevice);
-
- for(rank=0; rank<ndev; ++rank)
- comms[rank] = NULL;
-
- cpu_set_t affinitySave;
- sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
- for (rank=0; rank<ndev; ++rank) {
- cudaDev = ncclDevList[rank];
- CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
-
- NCCLCHECK(setCpuAffinity(cudaDev));
-
- NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
- comms[rank] = comm;
-
- NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup);
- }
-
- sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
- NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup);
-
- for(rank=0; rank<ndev; ++rank) {
- cudaDev = ncclDevList[rank];
- CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
- NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup);
- }
-
- res = ncclSuccess;
- goto final;
-
-cleanup:
- for(rank=0; rank<ndev; ++rank) {
- if(comms[rank] != NULL) {
- commFree(comms[rank]);
- }
- }
-
-final:
- if(wrapNvmlShutdown() != ncclSuccess)
- INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
- cudaSetDevice(savedDevice);
- sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
- return res;
-}
-
-
-static ncclResult_t commDestroy(ncclComm_t comm) {
- int savedDevice;
- CUDACHECK(cudaGetDevice(&savedDevice));
- int commDevice = comm->cudaDev;
- int rank = comm->rank;
-
- if (savedDevice != commDevice) {
- CUDACHECK(cudaSetDevice(commDevice));
- }
-
- TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError);
-
- CUDACHECK(cudaStreamSynchronize(comm->groupStream));
- NCCLCHECK(transportDestroyProxy(comm));
- NCCLCHECK(commFree(comm));
-
- if (savedDevice != commDevice)
- CUDACHECK(cudaSetDevice(savedDevice));
-
- INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
-
- return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
-ncclResult_t ncclCommDestroy(ncclComm_t comm) {
- if (comm == NULL)
- return ncclSuccess;
-
- return commDestroy(comm);
-}
-
-NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
-ncclResult_t ncclCommAbort(ncclComm_t comm) {
- if (comm == NULL)
- return ncclSuccess;
-
- // Ask anything that might still be running on the device to quit
- *comm->abortFlag = 1;
-
- return commDestroy(comm);
-}
-
-NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
-const char* ncclGetErrorString(ncclResult_t code) {
- switch (code) {
- case ncclSuccess : return "no error";
- case ncclUnhandledCudaError : return "unhandled cuda error";
- case ncclSystemError : return "unhandled system error";
- case ncclInternalError : return "internal error";
- case ncclInvalidArgument : return "invalid argument";
- case ncclInvalidUsage : return "invalid usage";
- default : return "unknown result code";
- }
-}
-
-NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
-ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
- NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
- NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
-
- // Check device reported error
- static ncclDevError_t printedDevErr = ncclDevSuccess;
- switch(*comm->fatalDevError) {
- case ncclDevSuccess :
- break;
- case ncclDevAssertedMismatch :
- if (printedDevErr != ncclDevAssertedMismatch) {
- WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
- printedDevErr = ncclDevAssertedMismatch;
- }
- if (comm->fatalError == ncclSuccess) {
- comm->fatalError = ncclInvalidUsage;
- }
- break;
- case ncclDevSuspectedMismatch :
- if (printedDevErr != ncclDevSuspectedMismatch) {
- WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
- printedDevErr = ncclDevSuspectedMismatch;
- }
- break;
- default:
- WARN("Unknown device error %d", *comm->fatalDevError);
- return ncclInternalError;
- }
- *asyncError = comm->fatalError;
- return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
-ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
- NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
- NCCLCHECK(PtrCheck(count, "CommCount", "count"));
- *count = comm->nRanks;
- return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid);
-ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
- NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm"));
- NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid"));
- *devid = comm->cudaDev;
- return ncclSuccess;
-}
-
-NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank);
-ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
- NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm"));
- NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank"));
- *rank = comm->rank;
- return ncclSuccess;
-}
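
For reference, a minimal sketch of how the initialization entry points defined in this file are typically driven from an application (illustrative only; myRank and nRanks come from the job launcher, and bcastUniqueId() is a hypothetical stand-in for whatever out-of-band broadcast the application uses):

#include "nccl.h"

// Multi-process path: rank 0 creates the unique id, shares it out of band,
// then every process creates its communicator with ncclCommInitRank().
void initMultiProcess(int myRank, int nRanks, ncclComm_t* comm) {
  ncclUniqueId id;
  if (myRank == 0) ncclGetUniqueId(&id);
  bcastUniqueId(&id);  // hypothetical helper, e.g. an MPI_Bcast of the id bytes
  ncclCommInitRank(comm, nRanks, id, myRank);
}

// Single-process path: one communicator per local device via ncclCommInitAll().
void initSingleProcess(ncclComm_t comms[4]) {
  int devs[4] = {0, 1, 2, 3};
  ncclCommInitAll(comms, 4, devs);
}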
diff --git a/src/misc/checks.cu b/src/misc/argcheck.cc
index a07e577..67931f8 100644
--- a/src/misc/checks.cu
+++ b/src/misc/argcheck.cc
@@ -1,10 +1,11 @@
/*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "checks.h"
+#include "argcheck.h"
+#include "comm.h"
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
cudaPointerAttributes attr;
diff --git a/src/misc/ibvwrap.cu b/src/misc/ibvwrap.cc
index 7ac3431..f47c141 100644
--- a/src/misc/ibvwrap.cu
+++ b/src/misc/ibvwrap.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/misc/nvmlwrap.cu b/src/misc/nvmlwrap.cc
index 635f332..34ed0aa 100644
--- a/src/misc/nvmlwrap.cu
+++ b/src/misc/nvmlwrap.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -16,6 +16,7 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
static nvmlReturn_t (*nvmlInternalShutdown)(void);
static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
+static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device);
static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
@@ -23,7 +24,10 @@ static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t dev
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
+static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor);
+// Used to make the NVML library calls thread safe
+pthread_mutex_t nvmlLock = PTHREAD_MUTEX_INITIALIZER;
ncclResult_t wrapNvmlSymbols(void) {
if (nvmlState == nvmlInitialized)
@@ -70,12 +74,14 @@ ncclResult_t wrapNvmlSymbols(void) {
LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
+ LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByIndex", nvmlInternalDeviceGetHandleByIndex);
LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
+ LOAD_SYM(nvmlhandle, "nvmlDeviceGetCudaComputeCapability", nvmlInternalDeviceGetCudaComputeCapability);
nvmlState = nvmlInitialized;
return ncclSuccess;
@@ -85,6 +91,7 @@ teardown:
nvmlInternalShutdown = NULL;
nvmlInternalDeviceGetHandleByPciBusId = NULL;
nvmlInternalDeviceGetIndex = NULL;
+ nvmlInternalDeviceGetHandleByIndex = NULL;
nvmlInternalDeviceGetPciInfo = NULL;
nvmlInternalDeviceGetMinorNumber = NULL;
nvmlInternalDeviceGetNvLinkState = NULL;
@@ -130,7 +137,8 @@ ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_
WARN("lib wrapper not initialized.");
return ncclInternalError;
}
- nvmlReturn_t ret = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device);
+ nvmlReturn_t ret;
+ NVMLLOCKCALL(nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device), ret);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ",
nvmlInternalErrorString(ret));
@@ -144,7 +152,8 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
WARN("lib wrapper not initialized.");
return ncclInternalError;
}
- nvmlReturn_t ret = nvmlInternalDeviceGetIndex(device, index);
+ nvmlReturn_t ret;
+ NVMLLOCKCALL(nvmlInternalDeviceGetIndex(device, index), ret);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceGetIndex() failed: %s ",
nvmlInternalErrorString(ret));
@@ -153,12 +162,28 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
return ncclSuccess;
}
+ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) {
+ if (nvmlInternalDeviceGetHandleByIndex == NULL) {
+ WARN("lib wrapper not initialized.");
+ return ncclInternalError;
+ }
+ nvmlReturn_t ret;
+ NVMLLOCKCALL(nvmlInternalDeviceGetHandleByIndex(index, device), ret);
+ if (ret != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetHandleByIndex() failed: %s ",
+ nvmlInternalErrorString(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
if (nvmlInternalDeviceGetPciInfo == NULL) {
WARN("lib wrapper not initialized.");
return ncclInternalError;
}
- nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
+ nvmlReturn_t ret;
+ NVMLLOCKCALL(nvmlInternalDeviceGetPciInfo(device, pci), ret);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceGetPciInfo() failed: %s ",
nvmlInternalErrorString(ret));
@@ -172,7 +197,8 @@ ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* min
WARN("lib wrapper not initialized.");
return ncclInternalError;
}
- nvmlReturn_t ret = nvmlInternalDeviceGetMinorNumber(device, minorNumber);
+ nvmlReturn_t ret;
+ NVMLLOCKCALL(nvmlInternalDeviceGetMinorNumber(device, minorNumber), ret);
if (ret != NVML_SUCCESS) {
WARN("nvmlDeviceGetMinorNumber() failed: %s ",
nvmlInternalErrorString(ret));
@@ -186,7 +212,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
/* Do not warn, this symbol is optional. */
return ncclInternalError;
}
- nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
+ nvmlReturn_t ret;
+ NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkState(device, link, isActive), ret);
if (ret != NVML_SUCCESS) {
if (ret != NVML_ERROR_NOT_SUPPORTED)
INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
@@ -201,7 +228,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned
/* Do not warn, this symbol is optional. */
return ncclInternalError;
}
- nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci);
+ nvmlReturn_t ret;
+ NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci), ret);
if (ret != NVML_SUCCESS) {
if (ret != NVML_ERROR_NOT_SUPPORTED)
INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
@@ -217,7 +245,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
/* Do not warn, this symbol is optional. */
return ncclInternalError;
}
- nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult);
+ nvmlReturn_t ret;
+ NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult), ret);
if (ret != NVML_SUCCESS) {
if (ret != NVML_ERROR_NOT_SUPPORTED)
INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
@@ -226,4 +255,19 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int
}
return ncclSuccess;
}
+
+ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
+ if (nvmlInternalDeviceGetCudaComputeCapability == NULL) {
+ WARN("lib wrapper not initialized.");
+ return ncclInternalError;
+ }
+ nvmlReturn_t ret;
+ NVMLLOCKCALL(nvmlInternalDeviceGetCudaComputeCapability(device, major, minor), ret);
+ if (ret != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetCudaComputeCapability() failed: %s ",
+ nvmlInternalErrorString(ret));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
#endif
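
The NVMLLOCKCALL macro used in the hunks above is not shown in this diff; a plausible sketch, assuming it simply serializes each NVML call with the nvmlLock mutex introduced earlier and stores the return code, could look like:

// Hypothetical sketch only -- the actual definition lives elsewhere in the tree.
#define NVMLLOCKCALL(cmd, ret) do {    \
    pthread_mutex_lock(&nvmlLock);     \
    (ret) = (cmd);                     \
    pthread_mutex_unlock(&nvmlLock);   \
  } while (0)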
diff --git a/src/misc/rings.cu b/src/misc/rings.cu
deleted file mode 100644
index a7b122c..0000000
--- a/src/misc/rings.cu
+++ /dev/null
@@ -1,385 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "param.h"
-
-#define NCCL_MAX_SCORE 7
-
-/* Parse user-defined rings. Format is like:
- * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
- * Rings with a non-matching number of ranks are ignored so we can provide
- * rings for multiple cases.
- */
-#define MAX_ENV_RANKS 512
-static ncclResult_t parseRings(const char* str, int* nringsRet, int nranks, int* prev, int* next) {
- int ranks[MAX_ENV_RANKS];
- int nrings = 0;
- int rank = 0;
- int offset = 0;
- int status = 0; // 0 : between numbers, 1 : inside number
- do {
- int digit = str[offset] - '0';
- if (digit >= 0 && digit <= 9) {
- if (status == 0) {
- ranks[rank] = digit;
- status = 1;
- } else {
- ranks[rank] = ranks[rank]*10+digit;
- }
- } else {
- if (status == 1) {
- rank++;
- if (rank == MAX_ENV_RANKS) goto end;
- }
- status = 0;
- if (str[offset] == '|' || str[offset] == '\0') {
- int prevRank = ranks[rank-1];
- // Ignore rings if nranks doesn't match
- if (rank != nranks) goto newring;
-
- for (int r=0; r<nranks; r++) {
- int rank = ranks[r];
- // Ignore rings with ranks out of bounds
- if (rank < 0 || rank >= nranks) goto newring;
- // Ignore rings with duplicate ranks
- for (int i=0; i<r; i++)
- if (ranks[i] == rank) goto newring;
-
- next[nrings*nranks+prevRank] = rank;
- prev[nrings*nranks+rank] = prevRank;
- prevRank = rank;
- }
- nrings++;
-newring:
- rank = 0;
- }
- }
- } while (str[offset++] != 0);
-end:
- *nringsRet = nrings;
- return ncclSuccess;
-}
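
As an illustration of the format parsed above (hypothetical values): setting NCCL_RINGS="0 1 2 3|3 2 1 0" on 4 ranks yields two rings; for the first one, parseRings fills next = {1,2,3,0} and prev = {3,0,1,2}, with the last listed rank wrapping back to the first.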
-
-/*
- * Ring creation algorithm
- *
- * First, we establish hierarchical coordinates depending on the way ranks can
- * communicate. After fillCoords, we have for each rank a unique 3-int array
- * { node, pci_domain, rank } corresponding to the three transports :
- * { 2[NET], 1[SHM], 0[P2P] }.
- * Also, we renumber ranks (to indexes) based on their growing coordinates.
- *
- * Then, we ask transports to connect groups together. We start with net, then
- * shm, then p2p. We maintain two arrays, prev and next, where values are equal
- * to -1 when ranks are not yet connected, and a rank otherwise. We never
- * connect ranks outside our group, meaning that on 4 nodes of 2 sockets of 4
- * ranks, if we are rank 13, we should see something like (provided we have a
- * single net interface, hence a single ring) :
- *
- * Connecting all nodes <13>
- * 2[NET] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 -1 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
- * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 -1 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
- *
- * Connecting P2P domains with shared memory <13>
- * 1[SHM] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
- * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
- *
- * Connecting ranks (only inside the P2P domain) <13>
- * 0[P2P] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 12 13 14 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
- * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 13 14 15 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0
- *
- * Hence, when we ask a transport to connect groups, we provide it with a subview of the ranks (except for net,
- * which always sees the full world). That way, P2P can brute-force all combinations inside the node without
- * risking a combinatorial explosion, and we scale better.
- *
- * Finally, we loop over network scores, trying to create rings with high scores (i.e. good locality), and we
- * decrease the score until we get at least one ring.
- */
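
For instance (hypothetical topology), on 2 nodes of 4 P2P-connected GPUs each, global rank 5 -- the second GPU on the second node -- would get coordinates { node=1, pci_domain=0, rank=1 }, and ranks are then renumbered to indexes in increasing coordinate order.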
-
-static void recIsConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
- connected[rank] = 1;
- for (int r=0; r<nranks; r++) {
- if (connected[r] == 0 && matrix[rank*nranks+r] == transport) {
- recIsConnected(r, connected, nranks, matrix, transport);
- }
- }
-}
-
-static void isConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
- for (int r=0; r<nranks; r++) connected[r] = 0;
- recIsConnected(rank, connected, nranks, matrix, transport);
-}
-
-#define NEW_IDX(rank) do { \
- rankToIdx[rank] = idx; \
- idxToRank[idx] = rank; \
- for (int t=0; t<NTRANSPORTS; t++) coords[rank*NTRANSPORTS+t] = current[t]; \
- idx++; \
-} while (0)
-
-int findConnected(int rank, int* matrix, int nranks, int transport, int* coords) {
- for (int r=0; r<nranks; r++) {
- if (coords[r*NTRANSPORTS] == -1 && matrix[rank*nranks+r] == transport) return r;
- }
- return -1;
-}
-
-static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankToIdx, int* idxToRank) {
- int current[NTRANSPORTS];
- int* p2pConnected;
- NCCLCHECK(ncclCalloc(&p2pConnected, nranks));
- for (int i=0; i<NTRANSPORTS; i++) current[i] = 0;
- int curRank = 0, idx = 0;
- while (1) {
- // P2P is handled separately as there is no level below it and we need to
- // cover the case of being connected to another GPU indirectly.
- // So we detect all GPUs in the same P2P domain once and add them all at
- // once.
- isConnected(curRank, p2pConnected, nranks, matrix, 0);
- for (int r=0; r<nranks; r++) {
- if (p2pConnected[r]) {
- NEW_IDX(r);
- curRank = r;
- current[0]++;
- }
- }
- current[0] = 0;
-
- if (idx == nranks) {
- free(p2pConnected);
- return ncclSuccess;
- }
-
- // Find next group, either connected through SHM or NET.
- int rank;
- int transport = 1;
- while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) {
- current[transport] = 0;
- transport++;
- if (transport == NTRANSPORTS) {
- WARN("Error : Could not find transport to connect next group\n");
- free(p2pConnected);
- return ncclInternalError; }
- }
- curRank = rank;
- current[transport]++;
- }
-}
-
-NCCL_PARAM(MinNrings, "MIN_NRINGS", 0);
-NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0);
-
-/* Users can force the number of threads with an environment variable */
-NCCL_PARAM(Nthreads, "NTHREADS", -2);
-ncclResult_t getEnvThreads(int* nthreads) {
- int64_t nt = ncclParamNthreads();
- if (nt != -2)
- *nthreads = nt;
- return ncclSuccess;
-}
-
-static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) {
- if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS;
- for (int r=nrings; r<newNrings; r++) {
- for (int i=0; i<nranks; i++) {
- a[r*nranks+i] = a[(r-nrings)*nranks+i];
- b[r*nranks+i] = b[(r-nrings)*nranks+i];
- c[r*nranks+i] = c[(r-nrings)*nranks+i];
- d[r*nranks+i] = d[(r-nrings)*nranks+i];
- }
- }
- return newNrings;
-}
-/* Main ring creation function */
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) {
- *nrings = 0;
-
- if (nranks == 1) return ncclSuccess;
-
- char* str = getenv("NCCL_RINGS");
- if (str && strlen(str)>0) {
- int ret = parseRings(str, nrings, nranks, prev, next);
- if (ret == ncclSuccess && *nrings > 0) {
- if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
- NCCLCHECK(getEnvThreads(nthreads));
- for (int r = 0; r<*nrings; r++) {
- for (int i = 0; i<nranks; i++) {
- if (transports[i*nranks+prev[i]] == 2) treeIn[i] = 1;
- if (transports[i*nranks+next[i]] == 2) treeOut[i] = 1;
- }
- }
- return ncclSuccess;
- }
- if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
- *nrings = 0;
- }
-
- // Compute hierarchical topology groups, indexes, and rank<->index tables
- int* coords, *globalIdxToRank, *globalRankToIdx;
- NCCLCHECK(ncclCalloc(&coords, nranks*NTRANSPORTS));
- for (int i=0; i<nranks*NTRANSPORTS; i++) coords[i] = -1;
- NCCLCHECK(ncclCalloc(&globalIdxToRank, nranks));
- NCCLCHECK(ncclCalloc(&globalRankToIdx, nranks));
-
- NCCLCHECK(fillCoords(nranks, transports, coords, globalRankToIdx, globalIdxToRank));
-
- // Start with a high score, then decrease until we find rings
- int minScore = NCCL_MAX_SCORE;
- int nringsTmp;
- int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
- NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS));
- NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS));
- NCCLCHECK(ncclCalloc(&idxToRank, nranks));
- NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
- NCCLCHECK(ncclCalloc(&groups, nranks));
- NCCLCHECK(ncclCalloc(&subgroups, nranks));
-
- int nThreads;
- do {
- nThreads = *nthreads;
- for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1;
- nringsTmp = MAXCHANNELS;
- // Loop over transports to connect groups
- for (int t=NTRANSPORTS-1; t>=0; t--) {
- for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;
-
- int nidx = 0;
- for (int i=0; i<nranks; i++) {
- // Extract only ranks in the same local area as rank
- // We need to extract them in the topological order, hence we iterate over indexes, not ranks
- int r = globalIdxToRank[i];
- int sameLocal = 1;
- for (int tr = NTRANSPORTS-1; tr > t; tr--) if (coords[r*NTRANSPORTS+tr] != coords[rank*NTRANSPORTS+tr]) sameLocal = 0;
- if (!sameLocal) continue;
-
- groups[nidx] = coords[r*NTRANSPORTS+t];
- subgroups[nidx] = t ? coords[r*NTRANSPORTS+t-1] : nidx;
- rankToIdx[r] = nidx;
- idxToRank[nidx] = r;
- nidx++;
- }
-
- int ngroups = groups[nidx-1] + 1; // Coords should be ordered
-
- ncclTvalue_t* subvalues;
- int *subprev, *subnext;
- NCCLCHECK(ncclCalloc(&subvalues, nidx*nidx));
- NCCLCHECK(ncclCalloc(&subprev, nidx*nringsTmp));
- NCCLCHECK(ncclCalloc(&subnext, nidx*nringsTmp));
- if (ngroups > 1) {
- /* Extract subvalues */
- for (int i=0; i<nidx; i++) {
- for (int j=0; j<nidx; j++) {
- if (transports[idxToRank[i]*nranks+idxToRank[j]] == t)
- subvalues[i*nidx+j] = values[idxToRank[i]*nranks+idxToRank[j]];
- else
- subvalues[i*nidx+j] = 0;
- }
- }
- /* Extract subprev/subnext */
- for (int i=0; i<nidx*nringsTmp; i++) {
- subprev[i] = subnext[i] = -1;
- }
- for (int r=0; r<nringsTmp; r++) {
- int start = -1, end = -1;
- for (int i=0; i<nranks; i++) {
- if (rankToIdx[i] == -1) continue;
- if (prevTmp[r*nranks+i] != -1) start = i;
- if (nextTmp[r*nranks+i] != -1) end = i;
- }
- if (start != -1 && end != -1) {
- subprev[r*nidx+rankToIdx[start]] = rankToIdx[end];
- subnext[r*nidx+rankToIdx[end]] = rankToIdx[start];
- }
- }
- /* Get rings */
- NCCLCHECK(ncclTransports[t].getRings(nidx, groups, subgroups, subvalues, &nringsTmp, subprev, subnext, minScore, &nThreads));
- /* Merge subprev/subnext into prev/next */
- for (int r=0; r<nringsTmp; r++) {
- for (int i=0; i<nidx; i++) {
- if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
- if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
- if (t == NTRANSPORTS-1) {
- // Save node-level masters for trees
- treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
- treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
- }
- }
- }
- //for (int r=0; r<nringsTmp; r++) {
- //printf("[%d] [%d] [%d] [%d] Prev ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", prevTmp[r*nranks+i]); printf("\n");
- //printf("[%d] [%d] [%d] [%d] Next ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", nextTmp[r*nranks+i]); printf("\n");
- //}
- }
- free(subvalues);
- free(subprev);
- free(subnext);
- if (nringsTmp == 0) break;
- }
- minScore--;
- if (nringsTmp > *nrings) {
- *nrings = nringsTmp;
- for (int i=0; i<nranks*(*nrings); i++) {
- prev[i] = prevTmp[i];
- next[i] = nextTmp[i];
- }
- }
- } while (nringsTmp == 0 && minScore);
-
- free(coords);
- free(globalRankToIdx);
- free(globalIdxToRank);
- free(prevTmp);
- free(nextTmp);
- free(idxToRank);
- free(rankToIdx);
- free(groups);
- free(subgroups);
-
- *nthreads = nThreads;
-
- /* Duplicate the rings in case of multinode+NVLink */
- int nnodes = 0;
- for (int r=0; r<nranks; r++) nnodes += treeIn[r];
- int nvlink;
- NCCLCHECK(ncclNvlinkGpu(&nvlink));
- if (nnodes > 1 && nvlink) {
- *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut);
- }
-
- if (*nrings == 0) {
- WARN("Could not create rings, falling back on simple ring");
- *nrings = 1;
- prev[rank] = (rank-1+nranks) % nranks;
- next[rank] = (rank+1)%nranks;
- }
-
- int maxNrings = ncclParamMaxNrings();
- int minNrings = ncclParamMinNrings();
- if (maxNrings > 0 && minNrings > maxNrings) {
- if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
- minNrings = 0;
- }
- if (minNrings > MAXCHANNELS) {
- if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS);
- minNrings = MAXCHANNELS;
- }
- if (maxNrings > 0 && maxNrings <= *nrings) {
- if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
- *nrings = maxNrings;
- } else {
- int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
- if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
- if (minNrings > 0 && minNrings > *nrings) {
- if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
- *nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut);
- }
- }
-
- NCCLCHECK(getEnvThreads(nthreads));
- return ncclSuccess;
-}
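The removed ring-construction code above encodes each ring as a pair of nranks-sized arrays: prev[r*nranks+i] and next[r*nranks+i] give rank i's neighbors on ring r, with -1 meaning "not connected yet". A minimal standalone sketch of the fallback single ring built at the end of ncclGetRings (the world size is made up; the real code fills only the calling rank's own entry, each rank computing its own view):

#include <stdio.h>

/* Toy illustration of the prev/next ring encoding: the fallback single
 * ring simply chains rank i between (i-1) and (i+1) modulo nranks. */
int main(void) {
  enum { nranks = 8 };               /* made-up world size */
  int prev[nranks], next[nranks];
  for (int rank = 0; rank < nranks; rank++) {
    prev[rank] = (rank - 1 + nranks) % nranks;
    next[rank] = (rank + 1) % nranks;
  }
  for (int rank = 0; rank < nranks; rank++)
    printf("rank %d : prev %d next %d\n", rank, prev[rank], next[rank]);
  return 0;
}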
diff --git a/src/misc/utils.cc b/src/misc/utils.cc
new file mode 100644
index 0000000..5158529
--- /dev/null
+++ b/src/misc/utils.cc
@@ -0,0 +1,194 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "utils.h"
+#include "core.h"
+
+#include "nvmlwrap.h"
+
+// Get current Compute Capability
+int ncclCudaCompCap() {
+ int cudaDev;
+ if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0;
+ int ccMajor, ccMinor;
+ if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0;
+ if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0;
+ return ccMajor*10+ccMinor;
+}
+
+ncclResult_t int64ToBusId(int64_t id, char* busId) {
+ sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf));
+ return ncclSuccess;
+}
+
+ncclResult_t busIdToInt64(char* busId, int64_t* id) {
+ const int size = strlen(busId);
+ char* hexStr;
+ NCCLCHECK(ncclCalloc(&hexStr, size));
+ int hexOffset = 0;
+ for (int i=0; i<size; i++) {
+ char c = busId[i];
+ if (c == '.' || c == ':') continue;
+ if ((c >= '0' && c <= '9') ||
+ (c >= 'A' && c <= 'F') ||
+ (c >= 'a' && c <= 'f')) {
+ hexStr[hexOffset++] = busId[i];
+ } else break;
+ }
+ hexStr[hexOffset] = '\0';
+ *id = strtol(hexStr, NULL, 16);
+ free(hexStr);
+ return ncclSuccess;
+}
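The two conversions above are inverses of each other: busIdToInt64 keeps only the hex digits of the bus ID string and parses them, while int64ToBusId re-inserts the separators from the packed value (function in bits 0-3, device in bits 4-11, bus in bits 12-19, domain above that). A quick round-trip check with a made-up bus ID:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* "0000:af:00.0" -> hex digits "0000af000" -> 0xaf000 (bus ID is made up) */
  int64_t id = 0xaf000;
  /* Same format string as int64ToBusId() above; prints 0000:af:00.0 */
  printf("%04lx:%02lx:%02lx.%01lx\n",
         (unsigned long)(id >> 20), (unsigned long)((id & 0xff000) >> 12),
         (unsigned long)((id & 0xff0) >> 4), (unsigned long)(id & 0xf));
  return 0;
}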
+
+// Convert a logical cudaDev index to its PCI bus ID (packed as an int64)
+ncclResult_t getBusId(int cudaDev, int64_t *busId) {
+  // On most systems, the PCI bus ID comes back in the 0000:00:00.0
+  // format. We still need to allocate enough space in case the PCI domain
+  // goes higher.
+ char busIdStr[] = "00000000:00:00.0";
+ CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev));
+ NCCLCHECK(busIdToInt64(busIdStr, busId));
+ return ncclSuccess;
+}
+
+ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
+ if (gethostname(hostname, maxlen) != 0) {
+ strncpy(hostname, "unknown", maxlen);
+ return ncclSystemError;
+ }
+ int i = 0;
+ while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++;
+ hostname[i] = '\0';
+ return ncclSuccess;
+}
+
+uint64_t getHash(const char* string, int n) {
+ // Based on DJB2, result = result * 33 + char
+ uint64_t result = 5381;
+ for (int c = 0; c < n; c++) {
+ result = ((result << 5) + result) + string[c];
+ }
+ return result;
+}
+
+/* Generate a hash of the unique identifying string for this host
+ * that will be unique for both bare-metal and container instances
+ * Equivalent of a hash of:
+ *
+ * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
+ *
+ * This string can be overridden by using the NCCL_HOSTID env var.
+ */
+#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
+uint64_t getHostHash(void) {
+ char hostHash[1024];
+ char *hostId;
+
+  // The fallback is the full hostname if something fails
+ (void) getHostName(hostHash, sizeof(hostHash), '\0');
+ int offset = strlen(hostHash);
+
+ if ((hostId = getenv("NCCL_HOSTID")) != NULL) {
+ strncpy(hostHash, hostId, sizeof(hostHash));
+ } else {
+ FILE *file = fopen(HOSTID_FILE, "r");
+ if (file != NULL) {
+ char *p;
+ if (fscanf(file, "%ms", &p) == 1) {
+ strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
+ free(p);
+ }
+      fclose(file);
+    }
+ }
+
+ // Make sure the string is terminated
+ hostHash[sizeof(hostHash)-1]='\0';
+
+ TRACE(NCCL_INIT,"unique hostname '%s'", hostHash);
+
+ return getHash(hostHash, strlen(hostHash));
+}
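getHostHash therefore hashes the concatenation of the hostname and the contents of /proc/sys/kernel/random/boot_id (or, if set, the NCCL_HOSTID value verbatim) with the DJB2 variant defined above. A self-contained sketch of the same hash over a made-up identifier string:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same DJB2 variant as getHash() above: result = result * 33 + byte. */
static uint64_t djb2(const char* string, int n) {
  uint64_t result = 5381;
  for (int c = 0; c < n; c++) result = ((result << 5) + result) + string[c];
  return result;
}

int main(void) {
  /* Hypothetical "hostname + boot_id" string; NCCL_HOSTID would replace it. */
  const char* id = "node0" "7dc9d961-6a8b-4c6d-9f2e-000000000000";
  printf("host hash = %" PRIx64 "\n", djb2(id, (int)strlen(id)));
  return 0;
}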
+
+/* Generate a hash of the unique identifying string for this process
+ * that will be unique for both bare-metal and container instances
+ * Equivalent of a hash of:
+ *
+ * $$ $(readlink /proc/self/ns/pid)
+ */
+uint64_t getPidHash(void) {
+ char pname[1024];
+ // Start off with our pid ($$)
+ sprintf(pname, "%ld", (long) getpid());
+ int plen = strlen(pname);
+ int len = readlink("/proc/self/ns/pid", pname+plen, sizeof(pname)-1-plen);
+ if (len < 0) len = 0;
+
+ pname[plen+len]='\0';
+ TRACE(NCCL_INIT,"unique PID '%s'", pname);
+
+ return getHash(pname, strlen(pname));
+}
+
+int parseStringList(const char* string, struct netIf* ifList, int maxList) {
+ if (!string) return 0;
+
+ const char* ptr = string;
+
+ int ifNum = 0;
+ int ifC = 0;
+ char c;
+ do {
+ c = *ptr;
+ if (c == ':') {
+ if (ifC > 0) {
+ ifList[ifNum].prefix[ifC] = '\0';
+ ifList[ifNum].port = atoi(ptr+1);
+ ifNum++; ifC = 0;
+ }
+ while (c != ',' && c != '\0') c = *(++ptr);
+ } else if (c == ',' || c == '\0') {
+ if (ifC > 0) {
+ ifList[ifNum].prefix[ifC] = '\0';
+ ifList[ifNum].port = -1;
+ ifNum++; ifC = 0;
+ }
+ } else {
+ ifList[ifNum].prefix[ifC] = c;
+ ifC++;
+ }
+ ptr++;
+ } while (ifNum < maxList && c);
+ return ifNum;
+}
+
+static bool matchIf(const char* string, const char* ref, bool matchExact) {
+ // Make sure to include '\0' in the exact case
+ int matchLen = matchExact ? strlen(string) + 1 : strlen(ref);
+ return strncmp(string, ref, matchLen) == 0;
+}
+
+static bool matchPort(const int port1, const int port2) {
+ if (port1 == -1) return true;
+ if (port2 == -1) return true;
+ if (port1 == port2) return true;
+ return false;
+}
+
+
+bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) {
+ // Make an exception for the case where no user list is defined
+ if (listSize == 0) return true;
+
+ for (int i=0; i<listSize; i++) {
+ if (matchIf(string, ifList[i].prefix, matchExact)
+ && matchPort(port, ifList[i].port)) {
+ return true;
+ }
+ }
+ return false;
+}
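A short usage sketch of the two helpers above, as the IB transport further down uses them (device names are made up; struct netIf and the declarations come from utils.h). Passing matchExact=true is what the new '=' prefix of NCCL_IB_HCA requests in net_ib.cc below; without it, a prefix such as "mlx5_1" would also match a hypothetical device named "mlx5_10".

#include "utils.h"   /* struct netIf, parseStringList, matchIfList */

void selectHcaExample(void) {
  struct netIf ifs[8];
  /* "mlx5_0:1,mlx5_1" -> { "mlx5_0" port 1, "mlx5_1" any port } */
  int n = parseStringList("mlx5_0:1,mlx5_1", ifs, 8);
  /* Exact name match; port -1 matches any port. */
  bool picked = matchIfList("mlx5_1", -1, ifs, n, true);
  (void)picked;
}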
diff --git a/src/misc/utils.cu b/src/misc/utils.cu
deleted file mode 100644
index c618e71..0000000
--- a/src/misc/utils.cu
+++ /dev/null
@@ -1,203 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "utils.h"
-#include "debug.h"
-#include "nccl_net.h"
-#include <unistd.h>
-#include <string.h>
-#include <stdarg.h>
-
-#include "nvmlwrap.h"
-#include "core.h"
-
-// Convert a logical cudaDev index to the NVML device minor number
-ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
- char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- nvmlDevice_t nvmlDevice;
- unsigned int dev;
- *nvmlDev = -1;
- CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
- NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice));
- NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev));
-
- *nvmlDev = dev;
-
- return ncclSuccess;
-}
-
-ncclResult_t getHostName(char* hostname, int maxlen) {
- if (gethostname(hostname, maxlen) != 0) {
- strncpy(hostname, "unknown", maxlen);
- return ncclSystemError;
- }
- int i = 0;
- while ((hostname[i] != '.') && (hostname[i] != '\0') && (i < maxlen-1)) i++;
- hostname[i] = '\0';
- return ncclSuccess;
-}
-
-/* Common logging function used by the INFO, WARN and TRACE macros
- * Also exported to the dynamically loadable Net transport modules so
- * they can share the debugging mechanisms and output files
- */
-void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
- if (ncclDebugLevel <= NCCL_LOG_NONE) return;
-
- char hostname[1024];
- getHostName(hostname, 1024);
- int cudaDev;
- cudaGetDevice(&cudaDev);
-
- char buffer[1024];
- size_t len = 0;
- pthread_mutex_lock(&ncclDebugOutputLock);
- if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
- len = snprintf(buffer, sizeof(buffer),
- "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
- else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
- len = snprintf(buffer, sizeof(buffer),
- "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
-#ifdef ENABLE_TRACE
- else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
- auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
- double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
- len = snprintf(buffer, sizeof(buffer),
- "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
- }
-#endif
- if (len) {
- va_list vargs;
- va_start(vargs, fmt);
- (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
- va_end(vargs);
- fprintf(ncclDebugFile,"%s\n", buffer);
- fflush(ncclDebugFile);
- }
- pthread_mutex_unlock(&ncclDebugOutputLock);
-
- // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
- if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
- fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
- hostname, getpid(), gettid(), cudaDev, filefunc, line);
- abort();
- }
-}
-
-uint64_t getHash(const char* string) {
- // Based on DJB2, result = result * 33 + char
- uint64_t result = 5381;
- for (int c = 0; string[c] != '\0'; c++) {
- result = ((result << 5) + result) + string[c];
- }
- return result;
-}
-
-/* Generate a hash of the unique identifying string for this host
- * that will be unique for both bare-metal and container instances
- * Equivalent of a hash of;
- *
- * $(hostname) $(readlink /proc/self/ns/uts) $(readlink /proc/self/ns/mnt)
- */
-uint64_t getHostHash(void) {
- char uname[1024];
- // Start off with the hostname
- (void) getHostName(uname, sizeof(uname));
- int offset = strlen(uname);
- int len;
- // $(readlink /proc/self/ns/uts)
- len = readlink("/proc/self/ns/uts", uname+offset, sizeof(uname)-1-offset);
- if (len < 0) len = 0;
- offset += len;
- // $(readlink /proc/self/ns/mnt)
- len = readlink("/proc/self/ns/mnt", uname+offset, sizeof(uname)-1-offset);
- if (len < 0) len = 0;
- offset += len;
- // Trailing '\0'
- uname[offset]='\0';
- TRACE(NCCL_INIT,"unique hostname '%s'", uname);
-
- return getHash(uname);
-}
-
-/* Generate a hash of the unique identifying string for this process
- * that will be unique for both bare-metal and container instances
- * Equivalent of a hash of;
- *
- * $$ $(readlink /proc/self/ns/pid)
- */
-uint64_t getPidHash(void) {
- char pname[1024];
- // Start off with our pid ($$)
- sprintf(pname, "%ld", (long) getpid());
- int plen = strlen(pname);
- int len = readlink("/proc/self/ns/pid", pname+plen, sizeof(pname)-1-plen);
- if (len < 0) len = 0;
-
- pname[plen+len]='\0';
- TRACE(NCCL_INIT,"unique PID '%s'", pname);
-
- return getHash(pname);
-}
-
-int parseStringList(const char* string, struct netIf* ifList, int maxList) {
- if (!string) return 0;
-
- const char* ptr = string;
- // Ignore "^" prefix, will be detected outside of this function
- if (ptr[0] == '^') ptr++;
-
- int ifNum = 0;
- int ifC = 0;
- char c;
- do {
- c = *ptr;
- if (c == ':') {
- if (ifC > 0) {
- ifList[ifNum].prefix[ifC] = '\0';
- ifList[ifNum].port = atoi(ptr+1);
- ifNum++; ifC = 0;
- }
- while (c != ',' && c != '\0') c = *(++ptr);
- } else if (c == ',' || c == '\0') {
- if (ifC > 0) {
- ifList[ifNum].prefix[ifC] = '\0';
- ifList[ifNum].port = -1;
- ifNum++; ifC = 0;
- }
- } else {
- ifList[ifNum].prefix[ifC] = c;
- ifC++;
- }
- ptr++;
- } while (ifNum < maxList && c);
- return ifNum;
-}
-
-static bool matchPrefix(const char* string, const char* prefix) {
- return (strncmp(string, prefix, strlen(prefix)) == 0);
-}
-
-static bool matchPort(const int port1, const int port2) {
- if (port1 == -1) return true;
- if (port2 == -1) return true;
- if (port1 == port2) return true;
- return false;
-}
-
-
-bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize) {
- // Make an exception for the case where no user list is defined
- if (listSize == 0) return true;
-
- for (int i=0; i<listSize; i++) {
- if (matchPrefix(string, ifList[i].prefix)
- && matchPort(port, ifList[i].port)) {
- return true;
- }
- }
- return false;
-}
diff --git a/src/nccl.h.in b/src/nccl.h.in
index 985274e..f07e0a4 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -41,7 +41,7 @@ typedef enum { ncclSuccess = 0,
* This integer is coded with the MAJOR, MINOR and PATCH level of the
* NCCL library
*/
-ncclResult_t ncclGetVersion(int *version);
+ncclResult_t ncclGetVersion(int *version);
ncclResult_t pncclGetVersion(int *version);
/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
@@ -244,7 +244,8 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou
* Start a group call. All subsequent calls to NCCL may not block due to
* inter-CPU synchronization.
*/
-ncclResult_t ncclGroupStart();
+ncclResult_t ncclGroupStart();
+ncclResult_t pncclGroupStart();
/*
* Group End
@@ -252,7 +253,8 @@ ncclResult_t ncclGroupStart();
* End a group call. Wait for all calls since ncclGroupStart to complete
* before returning.
*/
-ncclResult_t ncclGroupEnd();
+ncclResult_t ncclGroupEnd();
+ncclResult_t pncclGroupEnd();
#ifdef __cplusplus
} // end extern "C"
diff --git a/src/nccl.pc.in b/src/nccl.pc.in
new file mode 100755
index 0000000..0d98494
--- /dev/null
+++ b/src/nccl.pc.in
@@ -0,0 +1,10 @@
+prefix=${nccl:Prefix}
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: nccl
+Description: Optimized primitives for collective multi-GPU communication
+Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
+Libs: -L${libdir} -lnccl
+Cflags: -I${includedir}
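Once this template is filled in at build time and the resulting nccl.pc is installed, consumers can query it the usual way: pkg-config --cflags --libs nccl expands to the -I flag for the include directory above plus -L for the library directory and -lnccl.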
diff --git a/src/transport.cu b/src/transport.cc
index 1436a5b..4059849 100644
--- a/src/transport.cu
+++ b/src/transport.cc
@@ -4,7 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "comm.h"
+#include "info.h"
extern struct ncclTransport p2pTransport;
extern struct ncclTransport shmTransport;
@@ -119,13 +120,13 @@ ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int r
}
if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
// Tree up
- struct ncclTree* tree = &args->channel->tree;
+ struct ncclTree* tree = &args->channel->treeUp;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
}
if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
// Tree down
- struct ncclTree* tree = &args->channel->tree;
+ struct ncclTree* tree = &args->channel->treeDn;
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
}
@@ -157,7 +158,9 @@ void* persistentThread(void *comm_) {
}
} while (op == NULL);
op->idle = 0;
- if (op->state != ncclProxyOpNone) ret = op->progress(op);
+  // Ops with opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't
+  // started yet and might be cancelled before they even start. Hold off on those.
+ if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op);
if (ret != ncclSuccess) {
comm->fatalError = ret;
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
diff --git a/src/transport/net.cu b/src/transport/net.cc
index 06a6e23..87fc9ce 100644
--- a/src/transport/net.cu
+++ b/src/transport/net.cc
@@ -4,39 +4,9 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
-#include "transport.h"
-#include "nvmlwrap.h"
+#include "comm.h"
#include "net.h"
-#include "param.h"
-#include "topo.h"
-#include <cuda_runtime.h>
-#include <assert.h>
-
-#define NET_MAX_IFS 16
-#define NET_MAX_GPUS 32
-
-// Cache GPU-NIC distances to avoid re-computing them
-#define NET_TVALUE_UNKNOWN 0ULL
-static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN };
-static int ncclNetNDev;
-
-// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit)
-#define NET_BITS_PER_IF 3
-#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1)
-static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t");
-static ncclTvalue_t getTvalue(short* distances, int ndev) {
- ncclTvalue_t tvalue = 0;
- for (int d=0; d<ndev; d++) {
- int score = 1 + PATH_SOC - distances[d];
- // Keep 3 bits of score info per dev
- tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
- }
- return tvalue;
-}
-static int getScore(ncclTvalue_t tvalue, int dev) {
- return (tvalue >> (dev*NET_BITS_PER_IF)) & NET_BITS_PER_IF_MASK;
-}
+#include "graph.h"
struct netConnectInfo {
ncclNetHandle_t netHandle;
@@ -53,6 +23,7 @@ struct netSendResources {
int buffSize;
void* mhandle;
void* llMhandle;
+ void* ll128Mhandle;
struct ncclRecvMem* devRecvMem;
uint64_t step;
uint64_t llLastCleaning;
@@ -70,224 +41,61 @@ struct netRecvResources {
int buffSize;
void* mhandle;
void* llMhandle;
+ void* ll128Mhandle;
struct ncclRecvMem* devRecvMem;
uint64_t step;
uint64_t llLastCleaning;
};
-static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
- char* cudaPath = NULL;
- char* nicPath = NULL;
- ncclResult_t err;
- NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
- err = ncclNetPciPath(dev, &nicPath);
- *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
- if (nicPath) free(nicPath);
- if (cudaPath) free(cudaPath);
- return ncclSuccess;
-}
-
-static ncclResult_t netDevices(int* ndev, short** distances) {
- NCCLCHECK(ncclNetDevices(ndev));
- if (*ndev == 0) {
- WARN("Error : Network returned 0 device");
- return ncclSystemError;
- }
- if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS;
-
- *distances = (short*)malloc(*ndev*sizeof(short));
- if (*distances == NULL) return ncclSystemError;
-
- // Find distance with current GPU
- int cudaDev, nvmlDev;
- CUDACHECK(cudaGetDevice(&cudaDev));
- NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
- char line[1024];
- sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName());
- for (int d=0; d<*ndev; d++) {
- NCCLCHECK(netDistance(cudaDev, d, *distances+d));
- sprintf(line+strlen(line), " %s", pathDists[(*distances)[d]]);
- }
- INFO(NCCL_INIT|NCCL_NET, "%s", line);
- return ncclSuccess;
-}
-
-/* Determine if we can communicate with the peer */
-ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
- int cudaDev;
- CUDACHECK(cudaGetDevice(&cudaDev));
- ret[0] = ncclNetTvalues[cudaDev];
- if (ret[0] == NET_TVALUE_UNKNOWN) {
- if (cudaDev >= NET_MAX_GPUS) {
- WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS);
- return ncclInternalError;
- }
- int nDev;
- short* distances;
- NCCLCHECK(netDevices(&nDev, &distances));
- ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev);
- ncclNetNDev = nDev;
- free(distances);
- }
- return ncclSuccess;
-}
-
-static inline int groupBestStart(int nranks, int* groups, int group, ncclTvalue_t* values, int card, int minScore) {
- int bestRank = -1;
- int bestScore = 0;
- for (int rank=0; rank<nranks; rank++) {
- if (groups[rank] != group) continue;
- for (int i=0; i<nranks; i++) {
- ncclTvalue_t netValue = values[rank*nranks+i];
- if (netValue != 0) {
- ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
- if (score >= minScore && score > bestScore) {
- bestScore = score;
- bestRank = rank;
- }
- // All other values should be the same, stop here for this rank
- break;
- }
- }
- }
- return bestRank;
-}
-static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroups, int startSubGroup, int startRank, ncclTvalue_t* values, int card, int minScore) {
- // For the last rank, we don't need the absolute best score, just to be within minScore.
- for (int rank=nranks-1; rank>=0; rank--) {
- if (groups[rank] != group) continue;
- if (startSubGroup != -1 && startSubGroup == subgroups[rank]) continue;
- if (startRank == rank) continue;
- for (int i=0; i<nranks; i++) {
- ncclTvalue_t netValue = values[rank*nranks+i];
- if (netValue != 0) {
- ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK;
- if (score >= minScore) {
- return rank;
- }
- // All other values should be the same, stop here for this rank
- break;
- }
- }
- }
- return -1;
-}
-
-
-ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
- int nGroups = groups[nranks-1] + 1;
- int cardUsed[NET_MAX_IFS*nGroups];
- for (int c=0; c<NET_MAX_IFS*nGroups; c++) cardUsed[c] = 0;
-
- for (int ring = 0; ring<*nringsRet; ring++) {
- int starts[nGroups];
- int ends[nGroups];
- for (int group = 0; group<nGroups; group++) {
- int nranksInGroup = 0;
- int nsubGroups = 0;
- for (int rank=0; rank<nranks; rank++) if (groups[rank] == group) {
- nranksInGroup++;
- nsubGroups = std::max(subgroups[rank], nsubGroups);
- }
- starts[group] = ends[group] = -1;
- // Receive on the rank closest to the NIC
- for (int card=0; card<NET_MAX_IFS; card++) {
- if (cardUsed[group*NET_MAX_IFS+card] == 1) continue;
- int start = groupBestStart(nranks, groups, group, values, card, minScore);
- // Send from any rank, but best on a different subgroup and close to the NIC also.
- int end = (nranksInGroup == 1) ? start
- : groupBestEnd(nranks, groups, group, subgroups, nsubGroups ? subgroups[start] : -1, start, values, card, minScore);
- //printf("Ring %d, Minscore %d, Card %d, group %d, start = %d, end = %d\n", ring, minScore, card, group, start, end);
- if (start != -1 && end != -1) {
- cardUsed[group*NET_MAX_IFS+card] = 1;
- starts[group] = start;
- ends[group] = end;
- break;
- }
- }
- if (starts[group] == -1 || ends[group] == -1) {
- *nringsRet = ring;
- return ncclSuccess;
- }
- }
- // Link groups together
- for (int group = 0; group<nGroups; group++) {
- int nextGroup = (group+1)%nGroups;
- next[ring*nranks+ends[group]] = starts[nextGroup];
- prev[ring*nranks+starts[nextGroup]] = ends[group];
- }
- }
+/* Determine if two peers can communicate with NET */
+ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+ *ret = 1;
return ncclSuccess;
}
-int getDev(int cudaDev, int ringId) {
- ncclTvalue_t tvalues = ncclNetTvalues[cudaDev];
-
- int dev = 0;
- int maxScore = 0;
- for (int d=0; d<ncclNetNDev; d++) if (getScore(tvalues,d) > maxScore) maxScore = getScore(tvalues,d);
- int skip = ringId+1;
- while (skip) {
- for (int d=0; d<ncclNetNDev; d++) {
- if (getScore(tvalues, d) == maxScore) {
- skip--;
- if (skip == 0) { dev = d; goto end; }
- }
- }
- }
-end:
- return dev;
-}
-
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
-static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
+static ncclResult_t netGetGdrSupport(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr) {
*useGdr = 0;
- int cudaDev, nvmlDev;
- CUDACHECK(cudaGetDevice(&cudaDev));
- NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
-
if (read) { // For reads (sends) only enable under certain conditions
int gdrReadParam = ncclParamNetGdrRead();
if (gdrReadParam == 0) return ncclSuccess;
if (gdrReadParam < 0) {
int nvlink;
- NCCLCHECK(ncclNvlinkGpu(&nvlink));
+ NCCLCHECK(ncclTopoHasNvlink(topo, busId, &nvlink));
if (!nvlink) return ncclSuccess;
}
}
// Check if we are close enough that it makes sense to enable GDR
int netGdrLevel = ncclParamNetGdrLevel();
- short distance;
- NCCLCHECK(netDistance(cudaDev, dev, &distance));
+ int distance;
+ NCCLCHECK(ncclTopoNetDistance(topo, busId, netDev, &distance));
if (distance >= netGdrLevel) {
- INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel);
+ INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), busId, netDev, distance, netGdrLevel);
return ncclSuccess;
}
// Finally, check if the NIC supports it
int flags;
- NCCLCHECK(ncclNetPtrSupport(dev, &flags));
+ NCCLCHECK(ncclNetPtrSupport(netDev, &flags));
if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
*useGdr = 1;
- INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read);
+ INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d < %d), read %d", ncclNetName(), busId, netDev, distance, netGdrLevel, read);
return ncclSuccess;
}
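Condensed, netGetGdrSupport enables GPU Direct RDMA only when every test passes: for sends (read=1) the NCCL_NET_GDR_READ parameter must allow it (or NVLink must be present when it is left at its default), the GPU/NIC distance must be below NCCL_NET_GDR_LEVEL, and the NIC must advertise NCCL_PTR_CUDA. A simplified restatement, not the code NCCL actually runs:

/* Illustrative condensation of the checks above; inputs mirror the ones
 * netGetGdrSupport derives from the topology and the NIC. */
static int wouldUseGdr(int read, int gdrReadParam, int hasNvlink,
                       int distance, int netGdrLevel, int nicSupportsCuda) {
  if (read) {
    if (gdrReadParam == 0) return 0;               /* reads explicitly disabled */
    if (gdrReadParam < 0 && !hasNvlink) return 0;  /* default: reads require NVLink */
  }
  if (distance >= netGdrLevel) return 0;           /* GPU and NIC too far apart */
  if (!nicSupportsCuda) return 0;                  /* NIC lacks NCCL_PTR_CUDA support */
  return 1;
}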
/* Determine if we will use this transport for this peer and return connect
* information for this peer */
-ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
struct netSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
- int cudaDev;
- CUDACHECK(cudaGetDevice(&cudaDev));
- resources->netDev = getDev(cudaDev, channelId);
- NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr));
+ NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev));
+ NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
int sendSize = sizeof(struct ncclSendMem);
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -299,20 +107,18 @@ ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
resources->buffSize = buffSize;
- INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev,
+ INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "");
return ncclSuccess;
}
-ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
struct netRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
- int cudaDev;
- CUDACHECK(cudaGetDevice(&cudaDev));
- resources->netDev = getDev(cudaDev, channelId);
- NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr));
+ NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev));
+ NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
int sendSize = sizeof(struct ncclSendMem);
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -324,7 +130,7 @@ ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
resources->buffSize = buffSize;
- INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
+ INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "");
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
@@ -339,6 +145,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
send->conn.buff = recvMem->buff;
send->conn.llBuff = resources->devHostRecvMem->llBuff;
+ send->conn.ll128Buff = recvMem->ll128Buff;
// Head/Tail/Opcount/Fifos are always on host
send->conn.tail = &resources->devHostRecvMem->tail;
@@ -356,6 +163,8 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff,
NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle));
+ NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
+ resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
return ncclSuccess;
}
@@ -369,6 +178,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
recv->conn.buff = recvMem->buff;
recv->conn.llBuff = recvMem->llBuff;
+ recv->conn.ll128Buff = recvMem->ll128Buff;
// Head/Tail/Opcount are always on host
recv->conn.tail = &resources->devHostRecvMem->tail;
@@ -384,6 +194,8 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle));
+ NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
+ resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
return ncclSuccess;
}
@@ -393,6 +205,7 @@ ncclResult_t netSendFree(void* transportResources) {
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle));
NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
+ NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->ll128Mhandle));
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
if (resources->useGdr)
CUDACHECK(cudaFree(resources->devRecvMem));
@@ -406,6 +219,7 @@ ncclResult_t netRecvFree(void* transportResources) {
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle));
NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
+ NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->ll128Mhandle));
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
if (resources->useGdr)
CUDACHECK(cudaFree(resources->devRecvMem));
@@ -432,11 +246,44 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (args->head < args->end) {
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
- if (args->llMode) {
+ volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
+ if (args->protocol == NCCL_PROTO_LL128) {
+ int stepSize = NCCL_LL128_BUFF_SIZE/NCCL_STEPS;
+ if (args->tail < *recvTail) {
+ int buffSlot = args->tail%NCCL_STEPS;
+ if (sizesFifo[buffSlot] != -1) {
+ struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+ char* localBuff = (char*)localMem->ll128Buff;
+ int ready = resources->useGdr;
+ if (!ready) {
+ // When data is in sysmem, we need to wait until all flags are correct since the GPU only
+ // called threadfence()
+ uint64_t flag = args->tail + 1;
+ int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS);
+ volatile uint64_t* lines = (volatile uint64_t*)(localBuff+buffSlot*stepSize);
+ ready = 1;
+ for (int i=0; i<nFifoLines; i++) {
+ if (lines[i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS] != flag) { ready = 0; break; }
+ }
+ }
+ if (ready) {
+ // Send through network
+ NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], resources->ll128Mhandle, args->requests+buffSlot));
+ if (args->requests[buffSlot] != NULL) {
+ sizesFifo[buffSlot] = -1;
+ // Make sure size is reset to zero before we update the head.
+ __sync_synchronize();
+ args->tail += args->sliceSteps;
+ args->idle = 0;
+ }
+ }
+ }
+ }
+ } else if (args->protocol == NCCL_PROTO_LL) {
int buffSlot = args->tail%NCCL_STEPS;
int size = sizesFifo[buffSlot];
if (size != -1) {
- uint32_t flag = args->tail + 1;
+ uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
size = nFifoLines * sizeof(union ncclLLFifoLine);
union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
@@ -457,18 +304,20 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
}
}
}
- } else if (args->tail < resources->hostRecvMem->tail) {
- struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+ } else if (args->tail < *recvTail) {
int stepSize = args->channel->buffSize/NCCL_STEPS;
+ struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
// Send through network
int buffSlot = args->tail%NCCL_STEPS;
- NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
- if (args->requests[buffSlot] != NULL) {
- sizesFifo[buffSlot] = -1;
- // Make sure size is reset to zero before we update the head.
- __sync_synchronize();
- args->tail += args->sliceSteps;
- args->idle = 0;
+ if (sizesFifo[buffSlot] != -1) {
+ NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
+ if (args->requests[buffSlot] != NULL) {
+ sizesFifo[buffSlot] = -1;
+ // Make sure size is reset to zero before we update the head.
+ __sync_synchronize();
+ args->tail += args->sliceSteps;
+ args->idle = 0;
+ }
}
}
}
@@ -486,18 +335,8 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (args->head == args->end) {
resources->step = args->end;
args->idle = 0;
- args->state = ncclProxyOpDone;
- }
- }
- if (args->state == ncclProxyOpDone) {
- union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff;
- if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step;
- resources->step += NCCL_STEPS;
- resources->hostSendMem->head = resources->step;
- resources->llLastCleaning = resources->step;
+ args->state = ncclProxyOpNone;
}
- args->state = ncclProxyOpNone;
}
return ncclSuccess;
}
@@ -517,12 +356,13 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
}
if (args->state == ncclProxyOpProgress) {
args->idle = 1;
- int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
+ int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : args->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
if (args->head < args->end) {
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
- char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
- void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
- if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
+ char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)localMem->llBuff : args->protocol == NCCL_PROTO_LL128 ? (char*)localMem->ll128Buff : localMem->buff;
+ void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : args->protocol == NCCL_PROTO_LL128 ? resources->ll128Mhandle : resources->mhandle;
+ volatile uint64_t* sendHead = &resources->hostSendMem->head;
+ if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
int buffSlot = args->tail%NCCL_STEPS;
int sliceSize = stepSize * args->sliceSteps;
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
@@ -537,7 +377,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
if (done) {
args->head += args->sliceSteps;
- if (args->llMode == 0) {
+ if (args->protocol == NCCL_PROTO_SIMPLE) {
if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
resources->hostRecvMem->tail = args->head;
}
@@ -548,16 +388,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
if (args->head == args->end) {
resources->step = args->end;
args->idle = 0;
- args->state = ncclProxyOpDone;
- }
- }
- if (args->state == ncclProxyOpDone) {
- if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- resources->step += NCCL_STEPS;
- while (resources->hostSendMem->head < resources->step);
- resources->llLastCleaning = resources->step;
+ args->state = ncclProxyOpNone;
}
- args->state = ncclProxyOpNone;
}
return ncclSuccess;
}
@@ -565,7 +397,6 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
struct ncclTransport netTransport = {
"NET",
netCanConnect,
- netGetRings,
{ netSendSetup, netSendConnect, netSendFree, netSendProxy },
{ netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy }
};
diff --git a/src/transport/net_ib.cu b/src/transport/net_ib.cc
index f7c574b..0d5307c 100644
--- a/src/transport/net_ib.cu
+++ b/src/transport/net_ib.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,7 +8,7 @@
#include "core.h"
#include "socket.h"
#include "net.h"
-#include "topo.h"
+#include "graph.h"
#include "utils.h"
#include "param.h"
@@ -107,18 +107,22 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
char* userIbEnv = getenv("NCCL_IB_HCA");
struct netIf userIfs[MAX_IB_DEVS];
bool searchNot = userIbEnv && userIbEnv[0] == '^';
+ if (searchNot) userIbEnv++;
+ bool searchExact = userIbEnv && userIbEnv[0] == '=';
+ if (searchExact) userIbEnv++;
int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS);
if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError;
- for (int d=0; d<nIbDevs; d++) {
+ for (int d=0; d<nIbDevs && ncclNIbDevs<MAX_IB_DEVS; d++) {
struct ibv_context * context;
if (ncclSuccess != wrap_ibv_open_device(&context, devices[d]) || context == NULL) {
WARN("NET/IB : Unable to open device %s", devices[d]->name);
continue;
}
- int found = 0;
+ int nPorts = 0;
struct ibv_device_attr devAttr;
+ memset(&devAttr, 0, sizeof(devAttr));
if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
WARN("NET/IB : Unable to query device %s", devices[d]->name);
if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
@@ -135,7 +139,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
&& portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
// check against user specified HCAs/ports
- if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
+ if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs, searchExact) ^ searchNot)) {
continue;
}
TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
@@ -146,10 +150,10 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
ncclIbDevs[ncclNIbDevs].context = context;
strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
ncclNIbDevs++;
- found++;
+ nPorts++;
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
}
- if (found == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
+ if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
}
if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
}
@@ -197,32 +201,14 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1;
}
if (moduleLoaded == 0) return ncclSystemError;
- ncclResult_t ret = ncclSystemError;
- void* ptr;
- if (cudaMalloc(&ptr, sizeof(int)) == cudaSuccess) {
- struct ibv_mr* mr;
- struct ibv_pd* pd;
- if (wrap_ibv_alloc_pd(&pd, ncclIbDevs[ibDev].context) == ncclSuccess) {
- if ((mr = wrap_direct_ibv_reg_mr(pd, ptr, sizeof(int), IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)) != NULL) {
- ret = ncclSuccess;
- wrap_ibv_dereg_mr(mr);
- }
- wrap_ibv_dealloc_pd(pd);
- }
- cudaFree(ptr);
- }
- return ret;
+ return ncclSuccess;
}
ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
*supportedTypes = NCCL_PTR_HOST;
- int cudaDev, nvmlDev;
- CUDACHECK(cudaGetDevice(&cudaDev));
- NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
-
if (ncclIbGdrSupport(dev) != ncclSuccess) {
- INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d '%s' (no module or not supported by GPU)", cudaDev, nvmlDev, dev, ncclIbDevs[dev].devName);
+ INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
return ncclSuccess;
}
*supportedTypes |= NCCL_PTR_CUDA;
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
new file mode 100644
index 0000000..1b1fc4f
--- /dev/null
+++ b/src/transport/net_socket.cc
@@ -0,0 +1,503 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "core.h"
+#include "socket.h"
+#include "net.h"
+#include "param.h"
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <poll.h>
+#include <limits.h>
+#include <fcntl.h>
+
+/* Init functions */
+static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
+static union socketAddress ncclNetIfAddrs[MAX_IFS];
+static int ncclNetIfs = -1;
+pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
+
+ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
+ if (ncclNetIfs == -1) {
+ pthread_mutex_lock(&ncclSocketLock);
+ if (ncclNetIfs == -1) {
+ ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
+ if (ncclNetIfs <= 0) {
+ WARN("NET/Socket : no interface found");
+ return ncclInternalError;
+ } else {
+ char line[1024];
+ char addrline[1024];
+ line[0] = '\0';
+ for (int i=0; i<ncclNetIfs; i++) {
+ snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE,
+ socketToString(&ncclNetIfAddrs[i].sa, addrline));
+ }
+ line[1023] = '\0';
+ INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
+ }
+ }
+ pthread_mutex_unlock(&ncclSocketLock);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
+ *supportedTypes = NCCL_PTR_HOST;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketDevices(int* ndev) {
+ *ndev = ncclNetIfs;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketPciPath(int dev, char** path) {
+ char devicepath[PATH_MAX];
+ snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+ *path = realpath(devicepath, NULL);
+ if (*path == NULL) {
+ INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
+ if (dev >= ncclNetIfs) return ncclInternalError;
+ memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
+ return ncclSuccess;
+}
+
+/* Communication functions */
+
+#define MAX_SOCKETS 64
+#define MAX_THREADS 16
+#define MAX_REQUESTS 128
+#define MAX_QUEUE_LEN MAX_REQUESTS
+#define MIN_CHUNKSIZE (64*1024)
+
+NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
+NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);
+
+struct ncclSocketHandle {
+ union socketAddress connectAddr;
+ int nSocks;
+ int nThreads;
+};
+
+struct ncclSocketTask {
+ int op;
+ void* data;
+ int size;
+ int fd;
+ int offset;
+ int used;
+ ncclResult_t result;
+};
+
+struct ncclSocketRequest {
+ int op;
+ void* data;
+ int size;
+ int ctrlFd;
+ int offset;
+ int used;
+ struct ncclSocketComm* comm;
+ struct ncclSocketTask* tasks[MAX_SOCKETS];
+ int nSubs;
+};
+
+struct ncclSocketTaskQueue {
+ int next;
+ struct ncclSocketTask* tasks;
+};
+
+enum threadState {start, stop};
+
+struct ncclSocketThreadResources {
+ struct ncclSocketTaskQueue threadTaskQueue;
+ enum threadState state;
+ struct ncclSocketComm* comm;
+ pthread_mutex_t threadLock;
+ pthread_cond_t threadCond;
+};
+
+struct ncclSocketListenComm {
+ int fd;
+ int nSocks;
+ int nThreads;
+};
+
+struct ncclSocketComm {
+ int ctrlFd;
+ int fds[MAX_SOCKETS];
+ int nSocks;
+ int nThreads;
+ int nextFd;
+ struct ncclSocketRequest requests[MAX_REQUESTS];
+ pthread_t helperThread[MAX_THREADS];
+ struct ncclSocketThreadResources threadResources[MAX_THREADS];
+};
+
+void* persistentSocketThread(void *args_) {
+ struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_;
+ struct ncclSocketComm* comm = resource->comm;
+ volatile enum threadState* state = &resource->state;
+ struct ncclSocketTaskQueue* myQueue = &resource->threadTaskQueue;
+ int nSocksPerThread = comm->nSocks / comm->nThreads;
+ while (1) {
+ int idle = 1;
+ int mark = myQueue->next; // mark newest task seen
+ for (int i=0; i<MAX_QUEUE_LEN; i+=nSocksPerThread) {
+ int repeat;
+ do {
+ repeat = 0;
+ for (int j=0; j<nSocksPerThread; j++) {
+ struct ncclSocketTask* r = myQueue->tasks+i+j;
+ if (r != NULL && r->used == 1 && r->offset < r->size) {
+ r->result = socketProgress(r->op, r->fd, r->data, r->size, &r->offset);
+ if (r->result != ncclSuccess) {
+ WARN("NET/Socket : socket progress error");
+ return NULL;
+ }
+ idle = 0;
+ if (r->offset < r->size) repeat = 1;
+ }
+ }
+ } while (repeat);
+ }
+ if (idle) {
+ pthread_mutex_lock(&resource->threadLock);
+ while (mark == myQueue->next && *state != stop) { // no new tasks, wait
+ pthread_cond_wait(&resource->threadCond, &resource->threadLock);
+ }
+ pthread_mutex_unlock(&resource->threadLock);
+ }
+ if (*state == stop) return NULL;
+ }
+}
+
+ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
+ int nSocksPerThread = ncclParamSocketNsocksPerThread();
+ int nThreads = ncclParamSocketNthreads();
+ if (nThreads > MAX_THREADS) {
+ WARN("NET/Socket : NCCL_SOCKET_NTHREADS is greater than the maximum allowed, setting to %d", MAX_THREADS);
+ nThreads = MAX_THREADS;
+ }
+ if (nThreads == -2 || nSocksPerThread == -2) {
+ // Auto-detection
+ int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
+ char vendorPath[PATH_MAX];
+ snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+ char* rPath = realpath(vendorPath, NULL);
+ int fd = open(rPath, O_RDONLY);
+ free(rPath);
+ if (fd == -1) {
+      // Could not find the device vendor. This is handled silently, so
+      // we don't print an INFO message.
+ TRACE(NCCL_NET, "Open of %s failed : %s\n", vendorPath, strerror(errno));
+ goto end;
+ }
+ char vendor[7];
+ strncpy(vendor, "0x0000", 7);
+ int len;
+ SYSCHECKVAL(read(fd, vendor, 6), "read", len);
+ SYSCHECK(close(fd), "close");
+ if (strcmp(vendor, "0x1d0f") == 0) { // AWS
+ autoNt = 2;
+ autoNs = 8;
+ } else if (strcmp(vendor, "0x1ae0") == 0) { // GCP
+ autoNt = 4;
+ autoNs = 1;
+ }
+end:
+ if (nThreads == -2) nThreads = autoNt;
+ if (nSocksPerThread == -2) nSocksPerThread = autoNs;
+ }
+ int nSocks = nSocksPerThread * nThreads;
+ if (nSocks > MAX_SOCKETS) {
+ nSocksPerThread = MAX_SOCKETS/nThreads;
+ WARN("NET/Socket : the total number of sockets is greater than the maximum allowed, setting NCCL_NSOCKS_PERTHREAD to %d", nSocksPerThread);
+ nSocks = nSocksPerThread * nThreads;
+ }
+ *ns = nSocks;
+ *nt = nThreads;
+ if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread);
+ return ncclSuccess;
+}
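The auto-detection above sizes the socket pool from the NIC's PCI vendor: 2 helper threads x 8 sockets on AWS (vendor 0x1d0f), 4 x 1 on GCP (vendor 0x1ae0), and no extra helper threads otherwise; NCCL_SOCKET_NTHREADS and NCCL_NSOCKS_PERTHREAD override either value, and the total is clamped to MAX_SOCKETS. A small sketch of the resulting totals (the +1 is the control socket each connection also opens, see ncclSocketConnect below):

#include <stdio.h>

#define MAX_SOCKETS 64
#define MAX_THREADS 16

/* Reproduces the sizing arithmetic of ncclSocketGetNsockNthread above for
 * the auto-detected defaults; purely illustrative. */
static void sizing(const char* label, int nThreads, int nSocksPerThread) {
  if (nThreads > MAX_THREADS) nThreads = MAX_THREADS;
  int nSocks = nSocksPerThread * nThreads;
  if (nSocks > MAX_SOCKETS) {
    nSocksPerThread = MAX_SOCKETS / nThreads;
    nSocks = nSocksPerThread * nThreads;
  }
  printf("%s: %d threads x %d sockets = %d data sockets (+1 control)\n",
         label, nThreads, nSocksPerThread, nSocks);
}

int main(void) {
  sizing("AWS (vendor 0x1d0f)", 2, 8);   /* -> 16 data sockets */
  sizing("GCP (vendor 0x1ae0)", 4, 1);   /* ->  4 data sockets */
  return 0;
}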
+
+ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) {
+ NCCLCHECK(ncclCalloc(comm, 1));
+ (*comm)->fd = -1;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {
+ NCCLCHECK(ncclCalloc(comm, 1));
+ (*comm)->ctrlFd = -1;
+ for (int i=0; i < MAX_SOCKETS; i++) {
+ (*comm)->fds[i] = -1;
+ }
+ (*comm)->nextFd = 0;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
+ if (dev < 0) { // data transfer sockets must be bound to a specific (non-negative) dev
+ return ncclInternalError;
+ }
+ struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+ static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
+ struct ncclSocketListenComm* comm;
+ NCCLCHECK(ncclSocketNewListenComm(&comm));
+ NCCLCHECK(GetSocketAddr(dev, &handle->connectAddr));
+ NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
+ NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
+ handle->nSocks = comm->nSocks;
+ handle->nThreads = comm->nThreads;
+ *listenComm = comm;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
+ if (dev < 0) { // data transfer sockets must be bound to a specific (non-negative) dev
+ return ncclInternalError;
+ }
+ struct ncclSocketComm* comm;
+ NCCLCHECK(ncclSocketNewComm(&comm));
+ struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+ comm->nSocks = handle->nSocks;
+ comm->nThreads = handle->nThreads;
+ for (int i=0; i<comm->nSocks+1; i++) {
+ int tmpFd, offset=0;
+ NCCLCHECK(connectAddress(&tmpFd, &handle->connectAddr));
+ NCCLCHECK(socketWait(NCCL_SOCKET_SEND, tmpFd, &i, sizeof(int), &offset));
+ if (i == comm->nSocks) comm->ctrlFd = tmpFd;
+ else comm->fds[i] = tmpFd;
+ }
+ *sendComm = comm;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
+ struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm;
+ struct ncclSocketComm* rComm;
+ NCCLCHECK(ncclSocketNewComm(&rComm));
+ rComm->nSocks = lComm->nSocks;
+ rComm->nThreads = lComm->nThreads;
+ for (int i=0; i<rComm->nSocks+1; i++) {
+ int tmpFd, sendSockIdx, offset=0;
+ struct sockaddr_in sockaddr;
+ socklen_t socklen = sizeof(struct sockaddr_in);
+ SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", tmpFd);
+ NCCLCHECK(socketWait(NCCL_SOCKET_RECV, tmpFd, &sendSockIdx, sizeof(int), &offset));
+ if (sendSockIdx == rComm->nSocks) rComm->ctrlFd = tmpFd;
+ else rComm->fds[sendSockIdx] = tmpFd;
+ }
+ *recvComm = rComm;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketRequest** req) {
+ for (int i=0; i<MAX_REQUESTS; i++) {
+ struct ncclSocketRequest* r = comm->requests+i;
+ if (r->used == 0) {
+ r->op = op;
+ r->data = data;
+ r->size = size;
+ r->ctrlFd = comm->ctrlFd;
+ r->used = 1;
+ r->comm = comm;
+ r->nSubs = 0;
+ *req = r;
+ return ncclSuccess;
+ }
+ }
+ WARN("NET/Socket : unable to allocate requests");
+ return ncclInternalError;
+}
+
+ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) {
+ int tid = comm->nextFd % comm->nThreads;
+ struct ncclSocketThreadResources* res = comm->threadResources+tid;
+ struct ncclSocketTaskQueue* queue = &res->threadTaskQueue;
+ // create helper threads and prepare per-thread task queue
+ if (queue->tasks == NULL) {
+ NCCLCHECK(ncclCalloc(&queue->tasks, MAX_QUEUE_LEN));
+ queue->next = 0;
+ res->comm = comm;
+ pthread_mutex_init(&res->threadLock, NULL);
+ pthread_cond_init(&res->threadCond, NULL);
+ pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res);
+ }
+ struct ncclSocketTask* r = queue->tasks+queue->next;
+ if (r->used == 0) {
+ r->op = op;
+ r->data = data;
+ r->size = size;
+ r->fd = comm->fds[comm->nextFd];
+ r->offset = 0;
+ r->result = ncclSuccess;
+ comm->nextFd = (comm->nextFd + 1) % comm->nSocks;
+ r->used = 1;
+ *req = r;
+ pthread_mutex_lock(&res->threadLock);
+ queue->next = (queue->next+1)%MAX_QUEUE_LEN;
+ res->state = start;
+ pthread_cond_signal(&res->threadCond);
+ pthread_mutex_unlock(&res->threadLock);
+ return ncclSuccess;
+ }
+ WARN("NET/Socket : unable to allocate subtasks");
+ return ncclInternalError;
+}
+
+ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
+ *done = 0;
+ struct ncclSocketRequest *r = (struct ncclSocketRequest*)request;
+ if (r == NULL) {
+ WARN("NET/Socket : test called with NULL request");
+ return ncclInternalError;
+ }
+ if (r->used == 1) { /* try to send/recv size */
+ int data = r->size;
+ int offset = 0;
+ NCCLCHECK(socketProgress(r->op, r->ctrlFd, &data, sizeof(int), &offset));
+
+ if (offset == 0) return ncclSuccess; /* Not ready -- retry later */
+
+ // Not sure we could ever receive less than 4 bytes, but just in case ...
+ if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->ctrlFd, &data, sizeof(int), &offset));
+
+ // Check that the size sent by the peer is no larger than the size provided by the user
+ if (r->op == NCCL_SOCKET_RECV && data > r->size) {
+ WARN("NET/Socket : message truncated : receiving %d bytes instead of %d", data, r->size);
+ return ncclInternalError;
+ }
+ r->size = data;
+ r->offset = 0;
+ r->used = 2; // done exchanging size
+ // divide into subtasks
+ int chunkOffset = 0, i = 0;
+ if (r->comm->nSocks > 0) {
+ int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks));
+ while (chunkOffset < r->size) {
+ int chunkSize = std::min(taskSize, r->size-chunkOffset);
+ NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++));
+ chunkOffset += chunkSize;
+ }
+ }
+ r->nSubs = i;
+ }
+ if (r->used == 2) { // already exchanged size
+ if (r->nSubs > 0) {
+ int nCompleted = 0;
+ for (int i=0; i<r->nSubs; i++) {
+ struct ncclSocketTask* sub = r->tasks[i];
+ if (sub->result != ncclSuccess) return sub->result;
+ if (sub->offset == sub->size) nCompleted++;
+ }
+ if (nCompleted == r->nSubs) {
+ if (size) *size = r->size;
+ *done = 1;
+ r->used = 0;
+ for (int i=0; i<r->nSubs; i++) {
+ struct ncclSocketTask* sub = r->tasks[i];
+ sub->used = 0;
+ }
+ }
+ } else { // progress request using main thread
+ if (r->offset < r->size) {
+ NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->data, r->size, &r->offset));
+ }
+ if (r->offset == r->size) {
+ if (size) *size = r->size;
+ *done = 1;
+ r->used = 0;
+ }
+ }
+ }
+ return ncclSuccess;
+}
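
A small worked example of the subtask split above: the request size is divided over the data sockets, but no subtask is made smaller than MIN_CHUNKSIZE (the 64KB value used here is an assumption for the illustration):

#include <stdio.h>

#define DIVUP(x, y) (((x) + (y) - 1) / (y))
#define MIN_CHUNKSIZE (64 * 1024)   // assumed value for this illustration

int main(void) {
  int size = 256 * 1024, nSocks = 16;                      // example request
  int taskSize = DIVUP(size, nSocks);                      // 16KB per socket...
  if (taskSize < MIN_CHUNKSIZE) taskSize = MIN_CHUNKSIZE;  // ...raised to the 64KB floor
  int nSubs = 0;
  for (int off = 0; off < size; off += taskSize) nSubs++;
  // 256KB / 16 sockets = 16KB < 64KB, so only 4 subtasks of 64KB are issued
  printf("taskSize=%d bytes, nSubs=%d\n", taskSize, nSubs);
  return 0;
}
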
+
+ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+ return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
+}
+ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
+
+ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
+ struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
+ NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclSocketRequest**)request));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
+ struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
+ NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data, size, (struct ncclSocketRequest**)request));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) {
+ // We don't support CUDA pointers, so we don't need a flush operation
+ return ncclInternalError;
+}
+
+ncclResult_t ncclSocketCloseListen(void* opaqueComm) {
+ struct ncclSocketListenComm* comm = (struct ncclSocketListenComm*)opaqueComm;
+ if (comm) {
+ if (comm->fd != -1) close(comm->fd);
+ free(comm);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketClose(void* opaqueComm) {
+ struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm;
+ if (comm) {
+ for (int i=0; i<comm->nThreads; i++) {
+ struct ncclSocketThreadResources* res = comm->threadResources+i;
+ if (comm->helperThread[i]) {
+ pthread_mutex_lock(&res->threadLock);
+ res->state = stop;
+ pthread_cond_signal(&res->threadCond);
+ pthread_mutex_unlock(&res->threadLock);
+ pthread_join(comm->helperThread[i], NULL);
+ }
+ free(res->threadTaskQueue.tasks);
+ }
+ if (comm->ctrlFd != -1) close(comm->ctrlFd);
+ for (int i=0; i<comm->nSocks; i++) {
+ if (comm->fds[i] != -1) close(comm->fds[i]);
+ }
+ free(comm);
+ }
+ return ncclSuccess;
+}
+
+ncclNet_t ncclNetSocket = {
+ "Socket",
+ ncclSocketInit,
+ ncclSocketDevices,
+ ncclSocketPciPath,
+ ncclSocketPtrSupport,
+ ncclSocketListen,
+ ncclSocketConnect,
+ ncclSocketAccept,
+ ncclSocketRegMr,
+ ncclSocketDeregMr,
+ ncclSocketIsend,
+ ncclSocketIrecv,
+ ncclSocketFlush,
+ ncclSocketTest,
+ ncclSocketClose,
+ ncclSocketClose,
+ ncclSocketCloseListen
+};
diff --git a/src/transport/net_socket.cu b/src/transport/net_socket.cu
deleted file mode 100644
index 0464b43..0000000
--- a/src/transport/net_socket.cu
+++ /dev/null
@@ -1,267 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "nccl.h"
-#include "core.h"
-#include "socket.h"
-#include "net.h"
-
-#include <assert.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <poll.h>
-#include <limits.h>
-
-/* Init functions */
-static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
-static union socketAddress ncclNetIfAddrs[MAX_IFS];
-static int ncclNetIfs = -1;
-pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
-
-ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
- if (ncclNetIfs == -1) {
- pthread_mutex_lock(&ncclSocketLock);
- if (ncclNetIfs == -1) {
- ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
- if (ncclNetIfs <= 0) {
- WARN("NET/Socket : no interface found");
- return ncclInternalError;
- } else {
- char line[1024];
- char addrline[1024];
- line[0] = '\0';
- for (int i=0; i<ncclNetIfs; i++) {
- snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE,
- socketToString(&ncclNetIfAddrs[i].sa, addrline));
- }
- line[1023] = '\0';
- INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
- }
- }
- pthread_mutex_unlock(&ncclSocketLock);
- }
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
- *supportedTypes = NCCL_PTR_HOST;
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketDevices(int* ndev) {
- *ndev = ncclNetIfs;
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketPciPath(int dev, char** path) {
- char devicepath[PATH_MAX];
- snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
- *path = realpath(devicepath, NULL);
- if (*path == NULL) {
- INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
-static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
- if (dev >= ncclNetIfs) return ncclInternalError;
- memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
- return ncclSuccess;
-}
-
-/* Communication functions */
-
-struct ncclSocketHandle {
- union socketAddress connectAddr;
-};
-
-struct ncclSocketRequest {
- int op;
- void* data;
- int size;
- int fd;
- int offset;
- int used;
-};
-
-struct ncclSocketReqs {
- struct ncclSocketRequest* requests;
-};
-
-struct ncclSocketComm {
- int fd;
- struct ncclSocketReqs reqs;
-};
-
-ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {
- NCCLCHECK(ncclCalloc(comm, 1));
- (*comm)->fd = -1;
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str) {
- struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
- NCCLCHECK(GetSocketAddrFromString(&(handle->connectAddr), str));
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
- struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
- static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
- // if dev >= 0, listen based on dev
- if (dev >= 0) {
- NCCLCHECK(GetSocketAddr(dev, &(handle->connectAddr)));
- } else if (dev == findSubnetIf) {
- // handle stores a remote address
- // need to find a local addr that is in the same network as the remote addr
- union socketAddress localAddr;
- char ifName[MAX_IF_NAME_SIZE];
- if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
- WARN("NET/Socket : No usable listening interface found");
- return ncclSystemError;
- }
- // pass the local address back
- memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr));
- } // Otherwise, handle stores a local address
- struct ncclSocketComm* comm;
- NCCLCHECK(ncclSocketNewComm(&comm));
- NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
- *listenComm = comm;
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
- struct ncclSocketComm* comm;
- NCCLCHECK(ncclSocketNewComm(&comm));
- struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
- NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));
- *sendComm = comm;
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
- struct ncclSocketComm* lComm = (struct ncclSocketComm*)listenComm;
- struct ncclSocketComm* rComm;
- NCCLCHECK(ncclSocketNewComm(&rComm));
- struct sockaddr_in sockaddr;
- socklen_t socklen = sizeof(struct sockaddr_in);
- SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd);
- *recvComm = rComm;
- return ncclSuccess;
-}
-
-#define MAX_REQUESTS 128
-
-ncclResult_t ncclSocketGetRequest(struct ncclSocketReqs* reqs, int op, void* data, int size, int fd, struct ncclSocketRequest** req) {
- if (reqs->requests == NULL) {
- NCCLCHECK(ncclCalloc(&reqs->requests, MAX_REQUESTS));
- }
- for (int i=0; i<MAX_REQUESTS; i++) {
- struct ncclSocketRequest* r = reqs->requests+i;
- if (r->used == 0) {
- r->op = op;
- r->data = data;
- r->size = size;
- r->fd = fd;
- r->offset = -1;
- r->used = 1;
- *req = r;
- return ncclSuccess;
- }
- }
- WARN("Socket : unable to allocate requests");
- return ncclInternalError;
-}
-
-ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
- *done = 0;
- struct ncclSocketRequest *r = (struct ncclSocketRequest*)request;
- if (r == NULL) {
- WARN("NET/Socket : test called with NULL request");
- return ncclInternalError;
- }
- if (r->offset == -1) { /* try to send/recv size */
- int data = r->size;
- int offset = 0;
- NCCLCHECK(socketProgress(r->op, r->fd, &data, sizeof(int), &offset));
-
- if (offset == 0) return ncclSuccess; /* Not ready -- retry later */
-
- // Not sure we could ever receive less than 4 bytes, but just in case ...
- if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->fd, &data, sizeof(int), &offset));
-
- // Check size is less or equal to the size provided by the user
- if (r->op == NCCL_SOCKET_RECV && data > r->size) {
- WARN("NET/Socket : message truncated : receiving %d bytes instead of %d", data, r->size);
- return ncclInternalError;
- }
- r->size = data;
- r->offset = 0;
- }
- if (r->offset < r->size) {
- NCCLCHECK(socketProgress(r->op, r->fd, r->data, r->size, &r->offset));
- }
- if (r->offset == r->size) {
- if (size) *size = r->size;
- *done = 1;
- r->used = 0;
- }
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
- return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
-}
-ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
-
-ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
- struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
- NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_SEND, data, size, comm->fd, (struct ncclSocketRequest**)request));
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
- struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
- NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_RECV, data, size, comm->fd, (struct ncclSocketRequest**)request));
- return ncclSuccess;
-}
-
-ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) {
- // We don't support CUDA pointers, so we don't need a flush operation
- return ncclInternalError;
-}
-
-ncclResult_t ncclSocketClose(void* opaqueComm) {
- struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm;
- if (comm) {
- free(comm->reqs.requests);
- close(comm->fd);
- free(comm);
- }
- return ncclSuccess;
-}
-
-ncclNet_t ncclNetSocket = {
- "Socket",
- ncclSocketInit,
- ncclSocketDevices,
- ncclSocketPciPath,
- ncclSocketPtrSupport,
- ncclSocketListen,
- ncclSocketConnect,
- ncclSocketAccept,
- ncclSocketRegMr,
- ncclSocketDeregMr,
- ncclSocketIsend,
- ncclSocketIrecv,
- ncclSocketFlush,
- ncclSocketTest,
- ncclSocketClose,
- ncclSocketClose,
- ncclSocketClose
-};
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
new file mode 100644
index 0000000..0cc92f3
--- /dev/null
+++ b/src/transport/p2p.cc
@@ -0,0 +1,312 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "graph.h"
+#include "utils.h"
+
+struct p2pConnectInfo {
+ int direct;
+ union {
+ void* directPtr;
+ cudaIpcMemHandle_t devIpc;
+ };
+};
+
+struct p2pSendResources {
+ struct ncclSendMem* devMem;
+ void* ipcPtr;
+};
+
+struct p2pRecvResources {
+ struct ncclRecvMem* devMem;
+ void* ipcPtr;
+};
+
+#include <sys/types.h>
+
+NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
+NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
+
+/* Convert a PCI busId into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
+static int busIdToCudaDev(int64_t busId) {
+ int ndev;
+ if (cudaGetDeviceCount(&ndev) != cudaSuccess)
+ return -1;
+ for (int i = 0; i < ndev; i++) {
+ char devBusIdStr[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ if (cudaDeviceGetPCIBusId(devBusIdStr, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess)
+ return -1;
+ int64_t devBusId;
+ NCCLCHECK(busIdToInt64(devBusIdStr, &devBusId));
+ if (busId == devBusId) return i;
+ }
+ // BusId was not found in our locally visible CUDA devices
+ return -1;
+}
+
+/* Determine if two peers can communicate through p2p */
+ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+ int cpuCount;
+ NCCLCHECK(ncclTopoCpuCount(topo, &cpuCount));
+ // Do not use P2P across CPU sockets by default (provided CUDA permits it).
+ // When we are on a single CPU socket, don't even use P2P through the CPU, as
+ // the CPU should be able to sustain two flows to sysmem faster than PCI P2P.
+ int p2pLevel = cpuCount == 1 ? PATH_PHB : PATH_NODE;
+ if (ncclParamP2pDisable() == 1) p2pLevel = 0;
+ if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
+
+ // Disable P2P
+ *ret = 0;
+
+ if (p2pLevel == 0) return ncclSuccess;
+
+ // Rule out different nodes
+ if (info1->hostHash != info2->hostHash) return ncclSuccess;
+
+ // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+ int cudaDev1 = busIdToCudaDev(info1->busId);
+ int cudaDev2 = busIdToCudaDev(info2->busId);
+ if (cudaDev1 == -1 || cudaDev2 == -1) {
+ // Peer's CUDA device is not visible in this process
+#if CUDART_VERSION >= 10010
+ // But in CUDA 10.1 we can still communicate with 'invisible' devices
+ TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %lx and %lx", info1->busId, info2->busId);
+ // Check for NVLink/NVswitch including P2P access
+ int nvlink;
+ NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
+ if (nvlink > 0) {
+ *ret = 1;
+ return ncclSuccess;
+ }
+#endif
+ return ncclSuccess;
+ }
+
+ TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%lx] and [%d=%lx]", cudaDev1, info1->busId, cudaDev2, info2->busId);
+
+ // Do not detect topology if we're on the same GPU. Note this is not really supported.
+ if (cudaDev1 == cudaDev2) {
+ *ret = 1;
+ return ncclSuccess;
+ }
+
+ // See if CUDA can do P2P
+ int p2p;
+ if (cudaDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != cudaSuccess) {
+ INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)",
+ cudaDev1, info1->busId, cudaDev2, info2->busId);
+ return ncclSuccess;
+ }
+ if (p2p == 0) return ncclSuccess;
+
+ // Check for NVLink/NVswitch
+ int nvlink;
+ NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
+ if (nvlink > 0) {
+ *ret = 1;
+ return ncclSuccess;
+ }
+
+ // Finally compute the PCI distance and compare with the p2pLevel.
+ int distance;
+ NCCLCHECK(ncclTopoGpuDistance(topo, info1->busId, info2->busId, &distance));
+ if (distance < p2pLevel) {
+ *ret = 1;
+ }
+ return ncclSuccess;
+}
+
+#define TRACE_DUMP_IPC(DEVIPC) \
+ do { \
+ unsigned long *devIpc = (unsigned long *) (DEVIPC); \
+ TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[0], devIpc[1], devIpc[2], devIpc[3]); \
+ TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
+ } while (0)
+
+/* Send: Create and return connect structures for this peer to connect to me */
+ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+ struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+
+ struct p2pSendResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ send->transportResources = resources;
+ int sendSize = sizeof(struct ncclSendMem);
+ ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
+ NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
+
+ struct p2pConnectInfo info;
+ if (myInfo->pidHash == peerInfo->pidHash) {
+ info.direct = 1;
+ info.directPtr = resources->devMem;
+ if (myInfo->cudaDev == peerInfo->cudaDev) {
+ INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ return ncclInternalError;
+ } else {
+ // Enable P2P access
+ cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
+ if (err == cudaErrorPeerAccessAlreadyEnabled) {
+ cudaGetLastError();
+ } else if (err != cudaSuccess) {
+ WARN("failed to peer with device %d(=%lx): %d %s",
+ peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
+ return ncclInternalError;
+ }
+ INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer",
+ channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+ }
+ } else {
+ // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+ int peerCudaDev = busIdToCudaDev(peerInfo->busId);
+ info.direct = 0;
+ // Map IPC and enable P2P access
+ cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
+ if (err != cudaSuccess) {
+ WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
+ myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
+ return ncclInternalError;
+ }
+ INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC",
+ channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+ //TRACE_DUMP_IPC(&info.devIpc);
+ }
+ static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
+ memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
+ return ncclSuccess;
+}
+
+/* Create and return connect structures for this peer to connect to me */
+ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+ struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
+
+ struct p2pRecvResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ recv->transportResources = resources;
+ int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+ ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
+ NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
+
+ struct p2pConnectInfo info;
+ if (myInfo->pidHash == peerInfo->pidHash) {
+ info.direct = 1;
+ info.directPtr = resources->devMem;
+ if (myInfo->cudaDev == peerInfo->cudaDev) {
+ TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
+ } else {
+ // Enable P2P access
+ cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
+ if (err == cudaErrorPeerAccessAlreadyEnabled) {
+ cudaGetLastError();
+ } else if (err != cudaSuccess) {
+ WARN("failed to peer with device %d(=%lx): %d %s",
+ peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
+ return ncclInternalError;
+ }
+ TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+ }
+ } else {
+ // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+ int peerCudaDev = busIdToCudaDev(peerInfo->busId);
+ info.direct = 0;
+ // Map IPC and enable P2P access
+ cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
+ if (err != cudaSuccess) {
+ WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s",
+ myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
+ return ncclInternalError;
+ }
+ TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+ //TRACE_DUMP_IPC(&info.devIpc);
+ }
+ static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
+ memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
+ return ncclSuccess;
+}
+
+/* Connect/Send to this peer */
+static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+ struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
+ struct ncclRecvMem* remDevMem;
+ struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+ if (info->direct) {
+ remDevMem = (struct ncclRecvMem*)(info->directPtr);
+ send->conn.direct = 1;
+ } else {
+ //TRACE_DUMP_IPC(&info->devIpc);
+ cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+ remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
+ if (err != cudaSuccess) {
+ WARN("failed to open CUDA IPC handle : %d %s",
+ err, cudaGetErrorString(err));
+ return ncclUnhandledCudaError;
+ }
+ }
+
+ send->conn.buff = remDevMem->buff;
+ send->conn.llBuff = remDevMem->llBuff;
+ send->conn.ll128Buff = remDevMem->ll128Buff;
+ send->conn.tail = &remDevMem->tail;
+ send->conn.opCountRem = &remDevMem->opCount;
+ send->conn.head = &resources->devMem->head;
+ send->conn.ptrExchange = &resources->devMem->ptrExchange;
+ send->conn.opCountLoc = &resources->devMem->opCount;
+ return ncclSuccess;
+}
+
+/* Connect/Recv from this peer */
+ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
+ struct ncclSendMem* remDevMem;
+ struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+ if (info->direct) {
+ remDevMem = (struct ncclSendMem*)(info->directPtr);
+ recv->conn.direct = 1;
+ recv->conn.ptrExchange = &remDevMem->ptrExchange;
+ } else {
+ //TRACE_DUMP_IPC(&info->devIpc);
+ cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+ remDevMem = (struct ncclSendMem*)resources->ipcPtr;
+ if (err != cudaSuccess) {
+ WARN("failed to open CUDA IPC handle : %d %s",
+ err, cudaGetErrorString(err));
+ return ncclUnhandledCudaError;
+ }
+ }
+
+ recv->conn.buff = resources->devMem->buff;
+ recv->conn.llBuff = resources->devMem->llBuff;
+ recv->conn.ll128Buff = resources->devMem->ll128Buff;
+ recv->conn.tail = &resources->devMem->tail;
+ recv->conn.opCountLoc = &resources->devMem->opCount;
+ recv->conn.head = &remDevMem->head;
+ recv->conn.opCountRem = &remDevMem->opCount;
+ return ncclSuccess;
+}
+
+ncclResult_t p2pSendFree(void* resources) {
+ struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
+ if (sendRes->ipcPtr)
+ CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
+ CUDACHECK(cudaFree(sendRes->devMem));
+ free(sendRes);
+ return ncclSuccess;
+}
+
+ncclResult_t p2pRecvFree(void* resources) {
+ struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
+ if (recvRes->ipcPtr)
+ CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
+ CUDACHECK(cudaFree(recvRes->devMem));
+ free(recvRes);
+ return ncclSuccess;
+}
+
+struct ncclTransport p2pTransport = {
+ "P2P",
+ p2pCanConnect,
+ { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
+ { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
+};
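
When the two ranks live in different processes, the setup/connect pair above ships a cudaIpcMemHandle_t through the ncclConnect exchange instead of a raw pointer. A schematic sketch of that export/import flow, with hypothetical helper names and the out-of-band handle exchange left abstract (the two functions must run in different processes):

#include <cuda_runtime.h>
#include <stdio.h>

#define CHECK(cmd) do { cudaError_t e = (cmd); if (e != cudaSuccess) { \
  printf("CUDA error: %s\n", cudaGetErrorString(e)); return -1; } } while (0)

// Exporter process: allocate device memory and produce a handle for the peer.
int exportBuffer(size_t bytes, void** devPtr, cudaIpcMemHandle_t* handle) {
  CHECK(cudaMalloc(devPtr, bytes));
  CHECK(cudaIpcGetMemHandle(handle, *devPtr));
  // send *handle to the peer over any out-of-band channel
  // (NCCL carries it inside the ncclConnect info, cf. p2pSendSetup)
  return 0;
}

// Importer process: map the peer's allocation into this address space.
int importBuffer(cudaIpcMemHandle_t handle, void** remotePtr) {
  CHECK(cudaIpcOpenMemHandle(remotePtr, handle, cudaIpcMemLazyEnablePeerAccess));
  // ... use *remotePtr as the peer's buffer (cf. p2pSendConnect/p2pRecvConnect) ...
  CHECK(cudaIpcCloseMemHandle(*remotePtr));  // unmap when done (cf. p2pSendFree)
  return 0;
}
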
diff --git a/src/transport/p2p.cu b/src/transport/p2p.cu
deleted file mode 100644
index 9f3e0b6..0000000
--- a/src/transport/p2p.cu
+++ /dev/null
@@ -1,620 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "utils.h"
-#include "topo.h"
-#include "transport.h"
-#include "param.h"
-#include <unistd.h>
-#include <cuda_runtime.h>
-#include <ctype.h>
-#include "nvlink.h"
-
-struct p2pConnectInfo {
- int direct;
- union {
- void* directPtr;
- cudaIpcMemHandle_t devIpc;
- };
-};
-
-struct p2pSendResources {
- struct ncclSendMem* devMem;
- void* ipcPtr;
-};
-
-struct p2pRecvResources {
- struct ncclRecvMem* devMem;
- void* ipcPtr;
-};
-
-#include <sys/types.h>
-
-NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
-NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
-
-/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
-static int busIdToCudaDev(const char* busId) {
- int ndev;
- if (cudaGetDeviceCount(&ndev) != cudaSuccess)
- return -1;
- for (int i = 0; i < ndev; i++) {
- char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- if (cudaDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess)
- return -1;
- if (strcmp(busId, devBusId) == 0) {
- return i;
- }
- }
- // BusId was not found in our locally visible CUDA devices
- return -1;
-}
-
-/* Determine if we can communicate with the peer through p2p */
-ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
- // Do not use P2P across root complexes by default (provided CUDA permits it)
- int p2pLevel = PATH_SOC;
- if (ncclParamP2pDisable() == 1) p2pLevel = 0;
- if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
-
- *ret = 0;
-
- if (p2pLevel == 0) return ncclSuccess;
-
- // Rule out different nodes
- if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess;
-
- // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
- int peerCudaDev = busIdToCudaDev(peerInfo->busId);
- if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process
-
- TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
-
- // Do not detect topology if we're on the same GPU. Note this is not really supported.
- if (myInfo->cudaDev == peerCudaDev) {
- *ret = 1 + PATH_SOC;
- return ncclSuccess;
- }
-
- // See if CUDA can do P2P
- int p2p;
- if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != cudaSuccess) {
- INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)",
- myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
- return ncclSuccess;
- }
- if (p2p == 0) return ncclSuccess;
-
- // Check for NVLink/NVswitch
- int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
- if (nvlinkp2p > 0) {
- *ret = nvlinkp2p;
- return ncclSuccess;
- }
-
- // Finally compute the PCI distance and compare with the p2pLevel.
- char* myPath;
- char* peerPath;
- ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath);
- ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath);
- if (err1 == ncclSuccess && err2 == ncclSuccess) {
- int distance = pciDistance(myPath, peerPath);
- if (distance < p2pLevel) {
- *ret = 1 + PATH_SOC - distance;
- }
- }
- if (err1 == ncclSuccess) free(myPath);
- if (err2 == ncclSuccess) free(peerPath);
- return ncclSuccess;
-}
-
-static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
- int nrings = 0;
- ncclTvalue_t* line = matrix+current*n;
- inTheRing[current] = 1;
- int currentStep = (currentRing+1)*n-remaining;
- rings[currentStep-1] = current;
- if (remaining == 0) {
- int looprank = rings[currentRing*n];
- if (line[looprank] > 0) {
- if (currentRing+1 == nRingsMax) {
- nrings = 1;
- } else {
- line[looprank]--;
- for (int i=0; i<n; i++) inTheRing[i] = 0;
- if (connect) {
- // First two slots are already set and we need to respect those constraints
- inTheRing[rings[currentStep]] = 1;
- nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, rings[currentStep+1], n-2, connect);
- } else {
- rings[(currentRing+1)*n] = 0;
- nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, 0, n-1, connect);
- }
- line[looprank]++;
- for (int i=0; i<n; i++) inTheRing[i] = 1;
- }
- }
- } else {
- int ringsSave[nRingsMax*n];
- int maxStep = 0;
- for (int i=0; i<n; i++) {
- if (inTheRing[i] == 0 && line[i] > 0) {
- line[i]--;
- int nr = computeRingsRec(matrix, n, rings, currentRing, nRingsMax, inTheRing, i, remaining-1, connect);
- if (nr > nrings) {
- nrings = nr;
- maxStep = (nr+currentRing)*n;
- ringsSave[currentStep] = i;
- // Save the rest of the rings
- for (int r=currentStep+1; r<maxStep; r++) {
- ringsSave[r] = rings[r];
- }
- if (nrings + currentRing == nRingsMax) {
- // We found an optimal solution. Let's stop there.
- break;
- }
- }
- line[i]++;
- }
- }
- for (int r=currentStep; r<maxStep; r++) {
- rings[r] = ringsSave[r];
- }
- }
- inTheRing[current] = 0;
- return nrings;
-}
-
-static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) {
- if (nrings == 0) return 0;
- // Copy rings by dup times
- if (newNrings > MAXCHANNELS) {
- newNrings = MAXCHANNELS;
- }
- for (int r=nrings; r<newNrings; r++) {
- for (int i=0; i<nranks; i++) rings[r*nranks+i] = rings[(r%nrings)*nranks+i];
- }
- return newNrings;
-}
-
-int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nringsMax, int connect) {
- int* inTheRing = (int*)malloc(sizeof(int)*nranks);
- if (inTheRing == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*nranks); return 0; }
- for (int i=0; i<nranks; i++) inTheRing[i] = 0;
- int nrings;
- if (connect) {
- inTheRing[rings[0]] = 1;
- nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, rings[1], nranks-2, connect);
- } else {
- rings[0] = 0;
- nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, 0, nranks-1, connect);
- }
- free(inTheRing);
- return nrings;
-}
-
-static inline int findConnect(int nranks, int* ranks) {
- for (int i = 0; i<nranks; i++) {
- if (ranks[i] != -1) return i;
- }
- return -1;
-}
-
-int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) {
- if (nrings == 0) return 0;
- if (nrings > MAXCHANNELS) {
- WARN("Max rings reached, limiting to %d", MAXCHANNELS);
- nrings = MAXCHANNELS;
- }
- // Find existing constraints / connections
- int connect = 0;
- for (int r=0; r<nrings; r++) {
- int start = findConnect(nranks, prev+r*nranks);
- int end = findConnect(nranks, next+r*nranks);
- if (start != -1 && end != -1) {
- rings[r*nranks] = end;
- rings[r*nranks+1] = start;
- connect = 1;
- }
- }
-
- // Compute rings
- ncclTvalue_t* matrix = (ncclTvalue_t*)malloc(sizeof(ncclTvalue_t)*nranks*nranks);
- if (matrix == NULL) { WARN("malloc of %ld bytes failed", sizeof(ncclTvalue_t)*nranks*nranks); return 0; }
- for (int i=0; i<nranks; i++) for (int j=0; j<nranks; j++)
- matrix[i*nranks+j] = oversubscribe ? values[i*nranks+j]/CONNECT_NVLINK*2 : values[i*nranks+j]/CONNECT_NVLINK ;
-
- int compNrings = p2pComputeRingsNvLink(matrix, nranks, rings, nrings, connect);
-
- free(matrix);
-
- if (oversubscribe || connect) return compNrings;
-
- if (compNrings && compNrings < nrings && nranks <= 4) {
- // Try to oversubscribe to get a better result
- int *rings2 = (int *)malloc(sizeof(int)*MAXCHANNELS*nranks);
- if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXCHANNELS*nranks); return 0; }
- for (int i=0; i<MAXCHANNELS*nranks; i++) rings2[i] = -1;
- int nThreads = *nthreads;
- int compNrings2 = p2pComputeRingsNvLink(values, nranks, rings2, nrings, prev, next, 1, &nThreads);
- if (compNrings2 > compNrings*2) {
- // Oversubscription worked.
- for (int i=0; i<compNrings2*nranks; i++) rings[i] = rings2[i];
- compNrings = compNrings2;
- }
- free(rings2);
- }
-
- // Duplicate the rings for direct NVLink
- compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
-
- return compNrings;
-}
-
-int p2pComputeRingsSeqConnect(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
- int nrings = nringsStart;
- int connect = 0;
- for (int r=0; r<nrings; r++) {
- int start = findConnect(nranks, prev+r*nranks);
- int end = findConnect(nranks, next+r*nranks);
- if (start != -1 && end != -1) {
- rings[r*nranks] = end;
- rings[r*nranks+1] = start;
- int cur = start;
- for (int i=2; i<nranks; i++) {
- int next = (cur+1) % nranks;
- while (next == end || next == start) next = (next+1) % nranks;
- if (values[cur*nranks+next] < minScore) {
- return 0;
- }
- rings[r*nranks+i] = next;
- cur = next;
- }
- connect = 1;
- } else {
- if (connect == 1 && r > 0) {
- WARN("Connecting rings but did not find start/end for ring %d. Disabling other rings.", r);
- return r;
- } else {
- return 0;
- }
- }
- }
- return nrings;
-}
-
-int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) {
- for (int r=0; r<nringsStart; r++) {
- for (int i=0; i<nranks; i++) {
- rings[r*nranks+i] = i;
- }
- }
- return nringsStart;
-}
-
-static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
- for (int score = PATH_SOC+1; score >= minScore; score--) {
- int best = -1;
- int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end
- for (int n = 0; n < nranks; n++) {
- if (inRing[n]) continue;
- if (values[rank*nranks+n] == score) {
- if (end == -1) return n;
- if (values[end*nranks+n] < worst_end_score) {
- best = n;
- worst_end_score = values[end*nranks+n];
- }
- }
- }
- if (best != -1) return best;
- }
- return -1;
-}
-
-int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int minScore) {
- int connect = 0;
- for (int r=0; r<nrings; r++) {
- int start = findConnect(nranks, prev+r*nranks);
- int end = findConnect(nranks, next+r*nranks);
-
- int inRing[nranks];
- for (int i=0; i<nranks; i++) inRing[i] = 0;
-
- if (start == -1 && end == -1) {
- if (connect == 1 && r > 0) {
- WARN("Connecting ring %d : did not find start/end. Disabling other rings.", r);
- return r;
- }
- end = 0;
- inRing[end] = 1;
- start = findClosestPci(values, inRing, end, -1, nranks, minScore);
- if (start == -1) return r;
- } else if (start == -1 || end == -1) {
- WARN("Connecting ring %d : inconsistent start/end. Disabling other rings.", r);
- return r;
- } else {
- connect = 1;
- }
- rings[r*nranks] = end;
- rings[r*nranks+1] = start;
- inRing[start] = inRing[end] = 1;
- int cur = start;
- for (int i=2; i<nranks; i++) {
- int next = findClosestPci(values, inRing, cur, end, nranks, minScore);
- if (next == -1) return r;
-
- inRing[next] = 1;
- rings[r*nranks+i] = next;
- cur = next;
- }
- // Check the loop is closing
- inRing[end] = 0;
- if (findClosestPci(values, inRing, cur, end, nranks, minScore) != end) return r;
-
- if (connect == 0) return 1;
- }
- return nrings;
-}
-
-ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
- if (*nringsRet == 0) return ncclSuccess;
- int *rings;
- NCCLCHECK(ncclCalloc(&rings, MAXCHANNELS*nranks));
- for (int i=0; i<MAXCHANNELS*nranks; i++) rings[i] = -1;
- int nrings = *nringsRet;
-
- // NVswitch
- int nvswitchLinks = 0;
- int directLinks = 0;
- for (int rank=0; rank<nranks; rank++) {
- for (int j=1; j<nranks; j++) {
- int i = (rank + j) % nranks;
- ncclTvalue_t links = values[rank*nranks+i]/CONNECT_NVSWITCH;
- if (j>1 && links != nvswitchLinks) {
- WARN("Internal error : NVswitch links mismatch");
- return ncclInternalError;
- }
- nvswitchLinks = links;
- }
- }
- if (nvswitchLinks) {
- // NVSwitch : Connect existing rings
- int nringsConnected = p2pComputeRingsSeqConnect(values, nranks, rings, nrings, prev, next, minScore, nthreads);
- if (nringsConnected > 0) {
- nrings = nringsConnected;
- } else {
- nrings = std::min(nrings, nvswitchLinks); // NVSwitch: Limit rings to number of NVLinks
- // Or create new ones
- nrings = p2pComputeRingsSeqNew(values, nranks, rings, nrings, prev, next, minScore, nthreads);
- // And duplicate them
- nrings = copyRings(nranks, rings, nrings, nrings*2);
- }
- goto end;
- }
-
- // point-to-point NVLink
- for (int rank=0; rank<nranks; rank++) {
- int links = 0;
- for (int i=0; i<nranks; i++) {
- ncclTvalue_t val = values[rank*nranks+i];
- if (val >= CONNECT_NVSWITCH) continue;
- links += val/CONNECT_NVLINK;
- }
- if (rank == 0) directLinks = links;
- else directLinks = std::min(directLinks, links);
- }
- if (directLinks > 0) {
- // NVLink : Connect rings or create new ones
- nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
- goto end;
- }
-
- // PCIe or QPI : Connect rings or create new ones
- nrings = p2pComputeRingsPci(values, nranks, rings, *nringsRet, prev, next, minScore);
-
-end:
- *nringsRet = nrings;
- for (int ring = 0; ring<nrings; ring++) {
- for (int index=0; index<nranks; index++) {
- int prevIndex = (index - 1 + nranks) % nranks;
- int nextIndex = (index + 1) % nranks;
- int curRank = rings[ring*nranks+index];
- int prevRank = rings[ring*nranks+prevIndex];
- int nextRank = rings[ring*nranks+nextIndex];
- if (prev[ring*nranks+curRank] == -1) prev[ring*nranks+curRank] = prevRank;
- if (next[ring*nranks+curRank] == -1) next[ring*nranks+curRank] = nextRank;
- }
- }
-
- free(rings);
- return ncclSuccess;
-}
-
-#define TRACE_DUMP_IPC(DEVIPC) \
- do { \
- unsigned long *devIpc = (unsigned long *) (DEVIPC); \
- TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[0], devIpc[1], devIpc[2], devIpc[3]); \
- TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
- } while (0)
-
-/* Send: Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
- struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
-
- struct p2pSendResources* resources;
- NCCLCHECK(ncclCalloc(&resources, 1));
- send->transportResources = resources;
- const int sendSize = sizeof(struct ncclSendMem);
- NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
-
- struct p2pConnectInfo info;
- if (myInfo->pidHash == peerInfo->pidHash) {
- info.direct = 1;
- info.directPtr = resources->devMem;
- if (myInfo->cudaDev == peerInfo->cudaDev) {
- INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank);
- } else {
- // Enable P2P access
- cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
- if (err == cudaErrorPeerAccessAlreadyEnabled) {
- cudaGetLastError();
- } else if (err != cudaSuccess) {
- WARN("failed to peer with device %d(=%d): %d %s",
- peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
- return ncclInternalError;
- }
- INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
- channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
- }
- } else {
- // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
- int peerCudaDev = busIdToCudaDev(peerInfo->busId);
- info.direct = 0;
- // Map IPC and enable P2P access
- cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
- if (err != cudaSuccess) {
- WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
- myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
- return ncclInternalError;
- }
- INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
- channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
- //TRACE_DUMP_IPC(&info.devIpc);
- }
- static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
- memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
- return ncclSuccess;
-}
-
-/* Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
- struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
-
- struct p2pRecvResources* resources;
- NCCLCHECK(ncclCalloc(&resources, 1));
- recv->transportResources = resources;
- const int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
- NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
-
- struct p2pConnectInfo info;
- if (myInfo->pidHash == peerInfo->pidHash) {
- info.direct = 1;
- info.directPtr = resources->devMem;
- if (myInfo->cudaDev == peerInfo->cudaDev) {
- TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
- } else {
- // Enable P2P access
- cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
- if (err == cudaErrorPeerAccessAlreadyEnabled) {
- cudaGetLastError();
- } else if (err != cudaSuccess) {
- WARN("failed to peer with device %d(=%d): %d %s",
- peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
- return ncclInternalError;
- }
- TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
- }
- } else {
- // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
- int peerCudaDev = busIdToCudaDev(peerInfo->busId);
- info.direct = 0;
- // Map IPC and enable P2P access
- cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
- if (err != cudaSuccess) {
- WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
- myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
- return ncclInternalError;
- }
- TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
- //TRACE_DUMP_IPC(&info.devIpc);
- }
- static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
- memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
- return ncclSuccess;
-}
-
-/* Connect/Send to this peer */
-static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
- struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
- struct ncclRecvMem* remDevMem;
- struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
- if (info->direct) {
- remDevMem = (struct ncclRecvMem*)(info->directPtr);
- send->conn.direct = 1;
- } else {
- //TRACE_DUMP_IPC(&info->devIpc);
- cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
- remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
- if (err != cudaSuccess) {
- WARN("failed to open CUDA IPC handle : %d %s",
- err, cudaGetErrorString(err));
- return ncclUnhandledCudaError;
- }
- }
-
- send->conn.buff = remDevMem->buff;
- send->conn.llBuff = remDevMem->llBuff;
- send->conn.tail = &remDevMem->tail;
- send->conn.opCountRem = &remDevMem->opCount;
- send->conn.head = &resources->devMem->head;
- send->conn.ptrExchange = &resources->devMem->ptrExchange;
- send->conn.opCountLoc = &resources->devMem->opCount;
- return ncclSuccess;
-}
-
-/* Connect/Recv from this peer */
-ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
- struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
- struct ncclSendMem* remDevMem;
- struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
- if (info->direct) {
- remDevMem = (struct ncclSendMem*)(info->directPtr);
- recv->conn.direct = 1;
- recv->conn.ptrExchange = &remDevMem->ptrExchange;
- } else {
- //TRACE_DUMP_IPC(&info->devIpc);
- cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
- remDevMem = (struct ncclSendMem*)resources->ipcPtr;
- if (err != cudaSuccess) {
- WARN("failed to open CUDA IPC handle : %d %s",
- err, cudaGetErrorString(err));
- return ncclUnhandledCudaError;
- }
- }
-
- recv->conn.buff = resources->devMem->buff;
- recv->conn.llBuff = resources->devMem->llBuff;
- recv->conn.tail = &resources->devMem->tail;
- recv->conn.opCountLoc = &resources->devMem->opCount;
- recv->conn.head = &remDevMem->head;
- recv->conn.opCountRem = &remDevMem->opCount;
- return ncclSuccess;
-}
-
-ncclResult_t p2pSendFree(void* resources) {
- struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
- if (sendRes->ipcPtr)
- CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
- CUDACHECK(cudaFree(sendRes->devMem));
- return ncclSuccess;
-}
-
-ncclResult_t p2pRecvFree(void* resources) {
- struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
- if (recvRes->ipcPtr)
- CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
- CUDACHECK(cudaFree(recvRes->devMem));
- return ncclSuccess;
-}
-
-struct ncclTransport p2pTransport = {
- "P2P",
- p2pCanConnect,
- p2pGetRings,
- { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
- { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
-};
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
new file mode 100644
index 0000000..60f16c8
--- /dev/null
+++ b/src/transport/shm.cc
@@ -0,0 +1,175 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "shm.h"
+
+struct shmConnectInfo {
+ uint64_t pidHash;
+ int id;
+ int sendRank;
+ int recvRank;
+ int shmSize;
+};
+
+struct shmSendResources {
+ int remShmSize;
+ struct ncclRecvMem* remHostMem;
+ struct ncclRecvMem* devRemHostMem;
+ int shmSize;
+ struct ncclSendMem* hostMem;
+ struct ncclSendMem* devHostMem;
+};
+
+struct shmRecvResources {
+ int remShmSize;
+ struct ncclSendMem* remHostMem;
+ struct ncclSendMem* devRemHostMem;
+ int shmSize;
+ struct ncclRecvMem* hostMem;
+ struct ncclRecvMem* devHostMem;
+};
+
+NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
+
+/* Determine if two peers can communicate with SHM */
+ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+ *ret = 0;
+
+ if (ncclParamShmDisable() == 1) return ncclSuccess;
+
+ // Same host?
+ TRACE(NCCL_INIT|NCCL_SHM, "peer1 hostHash %lx peer2 hostHash %lx", info1->hostHash, info2->hostHash);
+ if (info1->hostHash != info2->hostHash) return ncclSuccess;
+
+ // Common /dev/shm (between containers) ?
+ TRACE(NCCL_INIT|NCCL_SHM, "peer1 shmDev %lx peer2 shmDev %lx", info1->shmDev, info2->shmDev);
+ if (info1->shmDev != info2->shmDev) return ncclSuccess;
+
+ *ret = 1;
+
+ return ncclSuccess;
+}
+
+#define MAX_SHM_NAME_LEN 1024
+
+/* Create and return connect structures for this peer to connect to me */
+ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+
+ struct shmSendResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ send->transportResources = resources;
+
+ struct shmConnectInfo info;
+ info.id = channelId;
+ info.pidHash = myInfo->pidHash;
+ info.sendRank = myInfo->rank;
+ info.recvRank = peerInfo->rank;
+
+ char shmName[MAX_SHM_NAME_LEN];
+ sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
+ info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
+ TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
+ NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+
+ INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+ static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
+ memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
+ return ncclSuccess;
+}
+
+ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+ struct shmRecvResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ recv->transportResources = resources;
+
+ struct shmConnectInfo info;
+ info.id = channelId;
+ info.pidHash = myInfo->pidHash;
+ info.sendRank = peerInfo->rank;
+ info.recvRank = myInfo->rank;
+
+ char shmName[MAX_SHM_NAME_LEN];
+ sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
+ info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+ TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
+ NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+
+ static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
+ memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
+ return ncclSuccess;
+}
+
+/* Connect to this peer */
+ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+ // Setup device pointers
+ struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
+ struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
+
+ char shmName[MAX_SHM_NAME_LEN];
+ sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
+ resources->remShmSize = info->shmSize;
+ TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
+ NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+ // Remove the file to ensure proper clean-up
+ NCCLCHECK(shmUnlink(shmName));
+
+ send->transportResources = resources;
+ send->conn.buff = resources->devRemHostMem->buff;
+ send->conn.llBuff = resources->devRemHostMem->llBuff;
+ send->conn.ll128Buff = resources->devRemHostMem->ll128Buff;
+ send->conn.tail = &resources->devRemHostMem->tail;
+ send->conn.opCountRem = &resources->devRemHostMem->opCount;
+
+ send->conn.head = &resources->devHostMem->head;
+ send->conn.opCountLoc = &resources->devHostMem->opCount;
+ return ncclSuccess;
+}
+
+ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ // Setup device pointers
+ struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
+ struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
+
+ char shmName[MAX_SHM_NAME_LEN];
+ sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
+ resources->remShmSize = info->shmSize;
+ TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
+ NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+ NCCLCHECK(shmUnlink(shmName));
+ recv->conn.head = &resources->devRemHostMem->head;
+ recv->conn.opCountRem = &resources->devRemHostMem->opCount;
+
+ recv->conn.buff = resources->devHostMem->buff;
+ recv->conn.llBuff = resources->devHostMem->llBuff;
+ recv->conn.ll128Buff = resources->devHostMem->ll128Buff;
+ recv->conn.tail = &resources->devHostMem->tail;
+ recv->conn.opCountLoc = &resources->devHostMem->opCount;
+ return ncclSuccess;
+}
+
+ncclResult_t shmSendFree(void* transportResources) {
+ struct shmSendResources* resources = (struct shmSendResources*)transportResources;
+ NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
+ NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
+ free(resources);
+ return ncclSuccess;
+}
+
+ncclResult_t shmRecvFree(void* transportResources) {
+ struct shmRecvResources* resources = (struct shmRecvResources*)transportResources;
+ NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
+ NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
+ free(resources);
+ return ncclSuccess;
+}
+
+struct ncclTransport shmTransport = {
+ "SHM",
+ shmCanConnect,
+ { shmSendSetup, shmSendConnect, shmSendFree, NULL },
+ { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL }
+};
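The added shm.cc above replaces the two per-direction structs of the old file (shmSendConnectInfo / shmRecvConnectInfo) with a single shmConnectInfo carrying pidHash, channel id, sendRank and recvRank, so each side can rebuild exactly the segment name its peer created; it also drops the shmGetRings entry from ncclTransport and passes ncclTopoSystem/ncclTopoGraph arguments into the setup calls. What follows is a minimal, self-contained sketch (not part of the patch) of the naming convention those sprintf calls implement; buildShmNames and the example values are made up for illustration.

/*
 * Illustrative sketch only: shows how the unified shmConnectInfo tuple
 * (pidHash, channelId, sendRank, recvRank) lets both peers derive each
 * other's segment names. The real code formats directly into
 * MAX_SHM_NAME_LEN buffers inside shmSendSetup/shmRecvSetup/shm*Connect.
 */
#include <stdio.h>
#include <stdint.h>

static void buildShmNames(uint64_t pidHash, int channelId, int sendRank, int recvRank,
                          char* sendName, char* recvName, size_t len) {
  /* Segment created by the send side (holds ncclSendMem: head/opCount). */
  snprintf(sendName, len, "nccl-shm-send-%lx-%d-%d-%d",
           (unsigned long)pidHash, channelId, sendRank, recvRank);
  /* Segment created by the recv side (holds ncclRecvMem: buffers/tail). */
  snprintf(recvName, len, "nccl-shm-recv-%lx-%d-%d-%d",
           (unsigned long)pidHash, channelId, sendRank, recvRank);
}

int main(void) {
  char sendName[1024], recvName[1024];
  /* Example tuple: channel 0, rank 0 sending to rank 1 within one pidHash. */
  buildShmNames(0xabcdefUL, 0, 0, 1, sendName, recvName, sizeof(sendName));
  printf("send-side segment: %s\nrecv-side segment: %s\n", sendName, recvName);
  return 0;
}

Because both names are built from the same tuple, shmSendConnect can open the recv-side segment and map its buffers as the destination for sends, while shmRecvConnect opens the send-side segment for head/opCount, matching the cross-open pattern visible in the code above.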
diff --git a/src/transport/shm.cu b/src/transport/shm.cu
deleted file mode 100644
index 56e0242..0000000
--- a/src/transport/shm.cu
+++ /dev/null
@@ -1,245 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "core.h"
-#include "utils.h"
-#include "transport.h"
-#include "param.h"
-#include "shm.h"
-#include <unistd.h>
-#include <cuda_runtime.h>
-
-struct shmSendConnectInfo {
- uint64_t pidHash;
- int id;
- int rank;
- int shmSize;
-};
-
-struct shmRecvConnectInfo {
- uint64_t pidHash;
- int id;
- int rank;
- int shmSize;
-};
-
-struct shmSendResources {
- int remShmSize;
- struct ncclRecvMem* remHostMem;
- struct ncclRecvMem* devRemHostMem;
- int shmSize;
- struct ncclSendMem* hostMem;
- struct ncclSendMem* devHostMem;
-};
-
-struct shmRecvResources {
- int remShmSize;
- struct ncclSendMem* remHostMem;
- struct ncclSendMem* devRemHostMem;
- int shmSize;
- struct ncclRecvMem* hostMem;
- struct ncclRecvMem* devHostMem;
-};
-
-NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
-
-/* Determine if we can communicate with the peer */
-ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
- *ret = ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) ? 0 : 1;
- return ncclSuccess;
-}
-
-static inline int groupFirst(int nranks, int* groups, int group, int rankToAvoid) {
- for (int rank = 0; rank<nranks; rank++) {
- if ((groups[rank] == group) && (rank != rankToAvoid)) return rank;
- }
- return -1;
-}
-
-static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid) {
- for (int rank = nranks-1; rank>=0; rank--) {
- if ((groups[rank] == group) && (rank != rankToAvoid)) return rank;
- }
- return -1;
-}
-
-ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
- if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
- int nGroups = groups[nranks-1] + 1;
- int starts[nGroups];
- int ends[nGroups];
- for (int ring = 0; ring<*nringsRet; ring++) {
- int startGroup = -1, endGroup = -1;
- for (int group = 0; group<nGroups; group++) {
- int start = -1;
- int end = -1;
- int nranksInGroup = 0;
- for (int rank=0; rank<nranks; rank++) {
- if (groups[rank] != group) continue;
- nranksInGroup++;
- if (prev[ring*nranks+rank] != -1) {
- if (start != -1) {
- WARN("Multiple starts found in group");
- }
- start = rank;
- startGroup = group;
- }
- if (next[ring*nranks+rank] != -1) {
- if (end != -1) {
- WARN("Multiple ends found in group");
- }
- end = rank;
- endGroup = group;
- }
- }
- if (nranksInGroup == 1) {
- start = end = groupFirst(nranks, groups, group, -1);
- } else {
- if (start == -1)
- start = groupFirst(nranks, groups, group, end);
- if (end == -1)
- end = groupLast(nranks, groups, group, start);
- }
- if (start == -1 || end == -1) {
- *nringsRet = ring;
- return ncclSuccess;
- }
- starts[group] = start;
- ends[group] = end;
- }
- if (endGroup == -1 || startGroup == -1) {
- startGroup = 0;
- endGroup = nGroups-1;
- // Close the loop
- next[ring*nranks+ends[endGroup]] = starts[startGroup];
- prev[ring*nranks+starts[startGroup]] = ends[endGroup];
- }
- int group = startGroup;
- for (int i=0; i<nGroups-2; i++) {
- int nextGroup = (group+1)%nGroups;
- if (nextGroup == endGroup) nextGroup = (nextGroup+1)%nGroups;
- next[ring*nranks+ends[group]] = starts[nextGroup];
- prev[ring*nranks+starts[nextGroup]] = ends[group];
- group = nextGroup;
- }
- // Connect with the last
- next[ring*nranks+ends[group]] = starts[endGroup];
- prev[ring*nranks+starts[endGroup]] = ends[group];
- }
- return ncclSuccess;
-}
-
-#define MAX_SHM_NAME_LEN 1024
-
-/* Create and return connect structures for this peer to connect to me */
-ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
-
- struct shmSendResources* resources;
- NCCLCHECK(ncclCalloc(&resources, 1));
- send->transportResources = resources;
-
- struct shmRecvConnectInfo info;
- char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
- info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
- TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
- NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
-
- INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
- info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
- static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
- memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
- return ncclSuccess;
-}
-
-ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
- struct shmRecvResources* resources;
- NCCLCHECK(ncclCalloc(&resources, 1));
- recv->transportResources = resources;
-
- struct shmSendConnectInfo info;
-
- char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
- info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize;
- TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
- NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
-
- info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
- static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
- memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo));
- return ncclSuccess;
-}
-
-/* Connect to this peer */
-ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
- // Setup device pointers
- struct shmSendConnectInfo* info = (struct shmSendConnectInfo*)connectInfo;
- struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
-
- char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank);
- resources->remShmSize = info->shmSize;
- TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
- NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
- // Remove the file to ensure proper clean-up
- NCCLCHECK(shmUnlink(shmName));
-
- send->transportResources = resources;
- send->conn.buff = resources->devRemHostMem->buff;
- send->conn.llBuff = resources->devRemHostMem->llBuff;
- send->conn.tail = &resources->devRemHostMem->tail;
- send->conn.opCountRem = &resources->devRemHostMem->opCount;
-
- send->conn.head = &resources->devHostMem->head;
- send->conn.opCountLoc = &resources->devHostMem->opCount;
- return ncclSuccess;
-}
-
-ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
- // Setup device pointers
- struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
- struct shmRecvConnectInfo* info = (struct shmRecvConnectInfo*)connectInfo;
-
- char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank);
- resources->remShmSize = info->shmSize;
- TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
- NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
- NCCLCHECK(shmUnlink(shmName));
- recv->conn.head = &resources->devRemHostMem->head;
- recv->conn.opCountRem = &resources->devRemHostMem->opCount;
-
- recv->conn.buff = resources->devHostMem->buff;
- recv->conn.llBuff = resources->devHostMem->llBuff;
- recv->conn.tail = &resources->devHostMem->tail;
- recv->conn.opCountLoc = &resources->devHostMem->opCount;
- return ncclSuccess;
-}
-
-ncclResult_t shmSendFree(void* transportResources) {
- struct shmSendResources* resources = (struct shmSendResources*)transportResources;
- NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
- NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
- free(resources);
- return ncclSuccess;
-}
-
-ncclResult_t shmRecvFree(void* transportResources) {
- struct shmRecvResources* resources = (struct shmRecvResources*)transportResources;
- NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
- NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
- free(resources);
- return ncclSuccess;
-}
-
-struct ncclTransport shmTransport = {
- "SHM",
- shmCanConnect,
- shmGetRings,
- { shmSendSetup, shmSendConnect, shmSendFree, NULL },
- { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL }
-};
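Compared with its replacement above, the deleted shm.cu named each segment after the creating rank only ("nccl-shm-send-%lx-%d-%d" / "nccl-shm-recv-%lx-%d-%d"), exchanged separate shmSendConnectInfo and shmRecvConnectInfo structs, exposed a shmGetRings callback in ncclTransport, and did not publish an ll128Buff pointer. Presumably the old per-creator naming allowed at most one shared-memory segment per rank and channel in each direction, whereas keying the name on the (sendRank, recvRank) pair lets a rank hold SHM connections to several peers on the same channel; that reading is inferred from the format strings rather than stated anywhere in the commit itself.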