Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Addison <daddison@nvidia.com>2019-03-15 05:39:20 +0300
committerDavid Addison <daddison@nvidia.com>2019-04-05 23:05:45 +0300
commitf40ce73e8987d2990e4b9ef6c75f4b3423acce78 (patch)
tree8df24e6ebc127a82a6562eb60fc6e80590bb3c55
parent14e0cf644b9ba2214f2b6d2e299e8218f6145d32 (diff)
NCCL 2.4.6-1
Added detection of the IBM/Power NVLink bridge device. Added NUMA support to PCI distance calculations. Added the NCCL_IGNORE_CPU_AFFINITY env var. Fixed memory leaks (GithubIssue#180). Fixed a compiler warning (GithubIssue#178). Replaced non-standard variable length arrays (GithubIssue#171). Fixed a Tree+Shared Memory crash (GithubPR#185). Fixed an LL cleanup hang during long running DL jobs. Fixed NCCL_RINGS environment variable handling. Added extra checks to catch repeat calls to ncclCommDestroy() (GithubIssue#191). Improved bootstrap socket connection reliability at scale. Fixed a hostname hashing issue (GithubIssue#187). Code cleanup to rename all non-device files from *.cu to *.cc.
-rw-r--r--LICENSE.txt2
-rw-r--r--Makefile2
-rw-r--r--README.md2
-rw-r--r--ext-net/dummy/Makefile2
-rw-r--r--ext-net/dummy/plugin.c2
-rw-r--r--makefiles/common.mk8
-rw-r--r--makefiles/formatting.mk2
-rw-r--r--makefiles/version.mk2
-rw-r--r--pkg/Makefile2
-rw-r--r--pkg/debian/Makefile2
-rw-r--r--pkg/redhat/Makefile2
-rw-r--r--pkg/srctxz/Makefile2
-rw-r--r--pkg/srctxz/create_srctxz.sh.in2
-rw-r--r--pkg/txz/Makefile2
-rw-r--r--pkg/txz/create_txz.sh.in2
-rw-r--r--src/Makefile20
-rw-r--r--src/bootstrap.cc (renamed from src/bootstrap.cu)2
-rw-r--r--src/channel.cc (renamed from src/channel.cu)7
-rw-r--r--src/collectives/all_gather.cc (renamed from src/collectives/all_gather.cu)2
-rw-r--r--src/collectives/all_reduce.cc (renamed from src/collectives/all_reduce.cu)2
-rw-r--r--src/collectives/broadcast.cc (renamed from src/collectives/broadcast.cu)2
-rw-r--r--src/collectives/collectives.h2
-rw-r--r--src/collectives/device/Makefile2
-rw-r--r--src/collectives/device/all_gather.cu2
-rw-r--r--src/collectives/device/all_gather.h8
-rw-r--r--src/collectives/device/all_reduce.cu2
-rw-r--r--src/collectives/device/all_reduce.h12
-rw-r--r--src/collectives/device/broadcast.cu2
-rw-r--r--src/collectives/device/broadcast.h8
-rw-r--r--src/collectives/device/common.h6
-rw-r--r--src/collectives/device/common_kernel.h4
-rw-r--r--src/collectives/device/functions.cu4
-rwxr-xr-xsrc/collectives/device/gen_rules.sh2
-rw-r--r--src/collectives/device/primitives.h54
-rw-r--r--src/collectives/device/reduce.cu2
-rw-r--r--src/collectives/device/reduce.h8
-rw-r--r--src/collectives/device/reduce_scatter.cu2
-rw-r--r--src/collectives/device/reduce_scatter.h8
-rw-r--r--src/collectives/reduce.cc (renamed from src/collectives/reduce.cu)2
-rw-r--r--src/collectives/reduce_scatter.cc (renamed from src/collectives/reduce_scatter.cu)2
-rw-r--r--src/enqueue.cc (renamed from src/enqueue.cu)8
-rw-r--r--src/include/alloc.h51
-rw-r--r--src/include/argcheck.h15
-rw-r--r--src/include/bootstrap.h2
-rw-r--r--src/include/channel.h2
-rw-r--r--src/include/checks.h71
-rw-r--r--src/include/comm.h127
-rw-r--r--src/include/core.h423
-rw-r--r--src/include/cpuset.h2
-rw-r--r--src/include/debug.h6
-rw-r--r--src/include/devcomm.h194
-rw-r--r--src/include/enqueue.h8
-rw-r--r--src/include/ibvwrap.h2
-rw-r--r--src/include/info.h45
-rw-r--r--src/include/nccl_net.h7
-rw-r--r--src/include/net.h2
-rw-r--r--src/include/nvlink.h14
-rw-r--r--src/include/nvmlwrap.h2
-rw-r--r--src/include/param.h3
-rw-r--r--src/include/rings.h2
-rw-r--r--src/include/shm.h2
-rw-r--r--src/include/socket.h21
-rw-r--r--src/include/topo.h64
-rw-r--r--src/include/transport.h7
-rw-r--r--src/include/trees.h2
-rw-r--r--src/include/utils.h4
-rw-r--r--src/init.cc (renamed from src/init.cu)109
-rw-r--r--src/misc/argcheck.cc (renamed from src/misc/checks.cu)4
-rw-r--r--src/misc/group.cc (renamed from src/misc/group.cu)4
-rw-r--r--src/misc/ibvwrap.cc (renamed from src/misc/ibvwrap.cu)2
-rw-r--r--src/misc/nvmlwrap.cc (renamed from src/misc/nvmlwrap.cu)2
-rw-r--r--src/misc/rings.cc (renamed from src/misc/rings.cu)6
-rw-r--r--src/misc/topo.cc51
-rw-r--r--src/misc/trees.cc (renamed from src/misc/trees.cu)2
-rw-r--r--src/misc/utils.cc (renamed from src/misc/utils.cu)12
-rw-r--r--src/transport.cc (renamed from src/transport.cu)0
-rw-r--r--src/transport/net.cc (renamed from src/transport/net.cu)52
-rw-r--r--src/transport/net_ib.cc (renamed from src/transport/net_ib.cu)3
-rw-r--r--src/transport/net_socket.cc (renamed from src/transport/net_socket.cu)2
-rw-r--r--src/transport/p2p.cc (renamed from src/transport/p2p.cu)40
-rw-r--r--src/transport/shm.cc (renamed from src/transport/shm.cu)8
81 files changed, 893 insertions, 693 deletions
diff --git a/LICENSE.txt b/LICENSE.txt
index 3593a7a..e318c66 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,5 +1,5 @@
- Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
diff --git a/Makefile b/Makefile
index 605e3bf..caed3d4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/README.md b/README.md
index fa51453..abfd1cd 100644
--- a/README.md
+++ b/README.md
@@ -89,4 +89,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus>
## Copyright
-All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
diff --git a/ext-net/dummy/Makefile b/ext-net/dummy/Makefile
index d1eb4c5..efa841c 100644
--- a/ext-net/dummy/Makefile
+++ b/ext-net/dummy/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/ext-net/dummy/plugin.c b/ext-net/dummy/plugin.c
index f11b365..67d7d88 100644
--- a/ext-net/dummy/plugin.c
+++ b/ext-net/dummy/plugin.c
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/makefiles/common.mk b/makefiles/common.mk
index d0e2ca8..2ad5c73 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -15,6 +15,7 @@ PROFAPI ?= 0
NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_LIB ?= $(CUDA_HOME)/lib64
+CUDA_INC ?= $(CUDA_HOME)/include
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
@@ -43,7 +44,8 @@ endif
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
-CXXFLAGS += -Wall -Wno-sign-compare
+CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
+CXXFLAGS += -I $(CUDA_INC)
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
# Use addprefix so that we can specify more than one path
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
@@ -67,7 +69,7 @@ CXXFLAGS += -O0 -g -ggdb3
endif
ifneq ($(VERBOSE), 0)
-NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra
+NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
CXXFLAGS += -Wall -Wextra
else
.SILENT:
diff --git a/makefiles/formatting.mk b/makefiles/formatting.mk
index 4a4ab88..a543131 100644
--- a/makefiles/formatting.mk
+++ b/makefiles/formatting.mk
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/makefiles/version.mk b/makefiles/version.mk
index a8c6e3a..7abaaaf 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 4
-NCCL_PATCH := 2
+NCCL_PATCH := 6
NCCL_SUFFIX :=
PKG_REVISION := 1
diff --git a/pkg/Makefile b/pkg/Makefile
index 04b23da..ab6487b 100644
--- a/pkg/Makefile
+++ b/pkg/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/pkg/debian/Makefile b/pkg/debian/Makefile
index 439635f..7884cf2 100644
--- a/pkg/debian/Makefile
+++ b/pkg/debian/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/pkg/redhat/Makefile b/pkg/redhat/Makefile
index ffcc973..0808478 100644
--- a/pkg/redhat/Makefile
+++ b/pkg/redhat/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/pkg/srctxz/Makefile b/pkg/srctxz/Makefile
index ed677fe..01cab95 100644
--- a/pkg/srctxz/Makefile
+++ b/pkg/srctxz/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/pkg/srctxz/create_srctxz.sh.in b/pkg/srctxz/create_srctxz.sh.in
index ae7d01f..11bdd52 100644
--- a/pkg/srctxz/create_srctxz.sh.in
+++ b/pkg/srctxz/create_srctxz.sh.in
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/pkg/txz/Makefile b/pkg/txz/Makefile
index fa587ef..b7d9aa5 100644
--- a/pkg/txz/Makefile
+++ b/pkg/txz/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/pkg/txz/create_txz.sh.in b/pkg/txz/create_txz.sh.in
index 73922e0..deae854 100644
--- a/pkg/txz/create_txz.sh.in
+++ b/pkg/txz/create_txz.sh.in
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/src/Makefile b/src/Makefile
index fe60b11..2d32dca 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -9,10 +9,10 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
-LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \
- misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \
- transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
- collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \
+ misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \
+ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
+ collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc
##### lib files
LIBNAME := libnccl.so
@@ -27,7 +27,7 @@ INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%)
LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR))
LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
STATICLIBTARGET := $(STATICLIBNAME)
-LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
+LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o)
DEPFILES := $(LIBOBJ:%.o=%.d)
LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
@@ -87,11 +87,11 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h
mkdir -p $(INCDIR)
cp -f $< $@
-$(OBJDIR)/%.o : %.cu
+$(OBJDIR)/%.o : %.cc
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
- $(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@
- @$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp)
+ $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@
+ @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp)
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
@@ -107,7 +107,7 @@ install : lib
cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/
cp -v $(BUILDDIR)/include/* $(PREFIX)/include/
-FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
+FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h')
# Note that formatting.mk defines a new target so in order to not overwrite the default target,
# it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well
# as the BUILDDIR variable.
diff --git a/src/bootstrap.cu b/src/bootstrap.cc
index 6b1d573..9df38e4 100644
--- a/src/bootstrap.cu
+++ b/src/bootstrap.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/channel.cu b/src/channel.cc
index 937e84e..b053e5b 100644
--- a/src/channel.cu
+++ b/src/channel.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -47,5 +47,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
}
+
+ // Free the peer structures.
+ CUDACHECK(cudaFree(channel->devPeers));
+ free(channel->peers);
+
return ncclSuccess;
}
diff --git a/src/collectives/all_gather.cu b/src/collectives/all_gather.cc
index db21dee..348c176 100644
--- a/src/collectives/all_gather.cu
+++ b/src/collectives/all_gather.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/all_reduce.cu b/src/collectives/all_reduce.cc
index 1492c90..921f2de 100644
--- a/src/collectives/all_reduce.cu
+++ b/src/collectives/all_reduce.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/broadcast.cu b/src/collectives/broadcast.cc
index 6a3d0a8..042301b 100644
--- a/src/collectives/broadcast.cu
+++ b/src/collectives/broadcast.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/collectives.h b/src/collectives/collectives.h
index e6b19cb..73fe7d5 100644
--- a/src/collectives/collectives.h
+++ b/src/collectives/collectives.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
index 8e92596..0ee587b 100644
--- a/src/collectives/device/Makefile
+++ b/src/collectives/device/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu
index 530bf14..109c341 100644
--- a/src/collectives/device/all_gather.cu
+++ b/src/collectives/device/all_gather.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index 36809c9..8e78730 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@@ -13,7 +13,7 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
index aaa96b4..85d007e 100644
--- a/src/collectives/device/all_reduce.cu
+++ b/src/collectives/device/all_reduce.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index ea89a71..9b058cc 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@@ -13,7 +13,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@@ -87,7 +87,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->tree;
const ssize_t size = args->N;
@@ -139,7 +139,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
@@ -214,7 +214,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* tree = &channel->tree;
const ssize_t size = args->N;
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu
index b83ee70..8c8dbb6 100644
--- a/src/collectives/device/broadcast.cu
+++ b/src/collectives/device/broadcast.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index fb18312..ae8667f 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@@ -13,7 +13,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index e4aecbd..8c336bf 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,7 +8,7 @@
#define NCCL_DEVICE_COMMON_H_
#include "../collectives.h"
-#include "core.h"
+#include "devcomm.h"
#include "nccl.h"
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
@@ -57,7 +57,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
int bid = blockIdx.x; \
__shared__ struct ncclColl localColl; \
\
- struct ncclComm* comm = firstColl.args.comm; \
+ struct ncclDevComm* comm = firstColl.args.comm; \
struct ncclChannel* channel = comm->channels+bid; \
struct ncclColl* c; \
if (bid == 0) { \
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index e1fb096..435a598 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,7 +7,7 @@
#ifndef NCCL_COMMON_KERNEL_H_
#define NCCL_COMMON_KERNEL_H_
-#include "core.h"
+#include "devcomm.h"
#include <cstdio>
#include <cstdint>
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index ea06b68..010c454 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "collectives.h"
#include "common.h"
diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh
index 3942c8c..4413213 100755
--- a/src/collectives/device/gen_rules.sh
+++ b/src/collectives/device/gen_rules.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index c5aaf54..7beeaf4 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -50,7 +50,7 @@ class ncclPrimitives {
T* sendDirectBuff[NSEND];
const T* recvBuff[NRECV];
T* sendBuff[NSEND];
- struct ncclComm* comm;
+ struct ncclDevComm* comm;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
@@ -239,7 +239,7 @@ class ncclPrimitives {
public:
__device__ __forceinline__
- ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
+ ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
// Make sure step is updated before we read it
__syncthreads();
@@ -329,14 +329,14 @@ class ncclLLPrimitives {
uint64_t sendConnHead;
union ncclLLFifoLine* recvBuff[NRECV];
union ncclLLFifoLine* sendBuff[NSEND];
- struct ncclComm* comm;
+ struct ncclDevComm* comm;
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
- inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; }
- inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; }
+ inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
+ inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
// Exit If Abort Barrier : make sure all threads exit consistently
// Each thread sets a predicate to true if val == 1
@@ -393,7 +393,10 @@ class ncclLLPrimitives {
sendConnHead = *waitPtr;
if (checkAbort(sendConn[i]->opCountRem)) break;
}
- if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes;
+ if (fifoPtr) {
+ int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
+ fifoPtr[sendStep[i]%NCCL_STEPS] = size;
+ }
}
}
@@ -402,7 +405,12 @@ class ncclLLPrimitives {
if (tid == i) *postPtr = recvStep[i];
}
- inline __device__ void postSend(int i) {
+ inline __device__ void postSend(int i, int offset) {
+ // LL Cleanup : write all flags in the slice to make sure we don't have
+ // data corruption when flag loops over.
+ if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
+ for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
+ }
sendStep[i]++;
}
@@ -443,9 +451,10 @@ class ncclLLPrimitives {
uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
uint64_t* srcPack = (uint64_t*)srcPtr;
uint64_t* dstPack = (uint64_t*)dstPtr;
+ int offset = tid;
// Do multiples of 64 bits
#pragma unroll 2
- for (int offset=tid; offset<npack; offset+=nthreads) {
+ for (; offset<npack; offset+=nthreads) {
// Recv : local, then intra-node, then inter-node
uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
if (RECV) {
@@ -471,7 +480,7 @@ class ncclLLPrimitives {
}
exitIfAbortLocalBarrier();
FOR_RECV(postRecv);
- FOR_SEND(postSend);
+ FOR_SEND(postSend, offset);
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
@@ -514,32 +523,9 @@ class ncclLLPrimitives {
}
}
- __device__ __forceinline__ void llSendCleaning(int i) {
- if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- /* Reset all flags */
- static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS");
- static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS");
- for (int s=0; s<NCCL_STEPS; s++) {
- waitSend(i, 0);
- for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) {
- const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) };
- sendPtr(i)[o].i4 = resetLine.i4;
- }
- }
- if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i];
- }
- }
-
- __device__ __forceinline__ void llRecvCleaning(int i) {
- if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- recvStep[i] += NCCL_STEPS;
- if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i];
- }
- }
-
public:
__device__ __forceinline__
- ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
+ ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
: comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
// Make sure step is updated before we read it.
barrier();
@@ -577,8 +563,6 @@ class ncclLLPrimitives {
}
__device__ __forceinline__ ~ncclLLPrimitives() {
- for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i);
- for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i);
// Save steps for the next operation
for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu
index 1ef66d4..a2caac5 100644
--- a/src/collectives/device/reduce.cu
+++ b/src/collectives/device/reduce.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index 302d053..d2d5d3b 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@@ -13,7 +13,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@@ -55,7 +55,7 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu
index 10857ed..8b45299 100644
--- a/src/collectives/device/reduce_scatter.cu
+++ b/src/collectives/device/reduce_scatter.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index c70c845..09ba56e 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
@@ -13,7 +13,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
@@ -69,7 +69,7 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nthreads = args->nThreads;
- struct ncclComm* comm = args->comm;
+ struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
diff --git a/src/collectives/reduce.cu b/src/collectives/reduce.cc
index 302d4bc..67f2fae 100644
--- a/src/collectives/reduce.cu
+++ b/src/collectives/reduce.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/collectives/reduce_scatter.cu b/src/collectives/reduce_scatter.cc
index 4ee77ef..5ad7f5f 100644
--- a/src/collectives/reduce_scatter.cu
+++ b/src/collectives/reduce_scatter.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/enqueue.cu b/src/enqueue.cc
index d283223..b485634 100644
--- a/src/enqueue.cu
+++ b/src/enqueue.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -87,7 +87,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par
}
ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
- params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels);
+ params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
// Set active = 2 for the last operation
for (int r=0; r<params->gridDim.x; r++) {
@@ -266,7 +266,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
// Compute thresholds and limits that users can override
- int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD);
+ ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD);
int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
// First compute nThreads
@@ -365,7 +365,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
NCCLCHECK(computeColl(info, &coll, &proxyArgs));
- info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads);
+ info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
if (info->comm->userStreamSet == false) {
info->comm->userStream = info->stream;
info->comm->userStreamSet = true;
diff --git a/src/include/alloc.h b/src/include/alloc.h
new file mode 100644
index 0000000..bcdbd18
--- /dev/null
+++ b/src/include/alloc.h
@@ -0,0 +1,51 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ALLOC_H_
+#define NCCL_ALLOC_H_
+
+#include "nccl.h"
+#include "checks.h"
+#include <sys/mman.h>
+
+static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
+ CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
+ memset(*ptr, 0, size);
+ *devPtr = *ptr;
+ return ncclSuccess;
+}
+
+static inline ncclResult_t ncclCudaHostFree(void* ptr) {
+ CUDACHECK(cudaFreeHost(ptr));
+ return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
+ void* p = malloc(nelem*sizeof(T));
+ if (p == NULL) {
+ WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
+ return ncclSystemError;
+ }
+ memset(p, 0, nelem*sizeof(T));
+ *ptr = (T*)p;
+ return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
+ CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
+ CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
+ return ncclSuccess;
+}
+
+template <typename T>
+static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
+ CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
+ return ncclSuccess;
+}
+
+#endif
diff --git a/src/include/argcheck.h b/src/include/argcheck.h
new file mode 100644
index 0000000..0d6cca7
--- /dev/null
+++ b/src/include/argcheck.h
@@ -0,0 +1,15 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ARGCHECK_H_
+#define NCCL_ARGCHECK_H_
+
+#include "core.h"
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
+ncclResult_t ArgsCheck(struct ncclInfo* info);
+
+#endif
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index a1aaf50..dd7de2c 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/include/channel.h b/src/include/channel.h
index 76c5e8a..c01d942 100644
--- a/src/include/channel.h
+++ b/src/include/channel.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/include/checks.h b/src/include/checks.h
index bf7750e..50737b0 100644
--- a/src/include/checks.h
+++ b/src/include/checks.h
@@ -1,10 +1,73 @@
/*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
+#ifndef NCCL_CHECKS_H_
+#define NCCL_CHECKS_H_
-ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
-ncclResult_t ArgsCheck(struct ncclInfo* info);
+#include "debug.h"
+
+// Check CUDA calls
+#define CUDACHECK(cmd) do { \
+ cudaError_t e = cmd; \
+ if( e != cudaSuccess ) { \
+ WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
+ return ncclUnhandledCudaError; \
+ } \
+} while(false)
+
+#define CUDACHECKGOTO(cmd, res, label) do { \
+ cudaError_t e = cmd; \
+ if( e != cudaSuccess ) { \
+ WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
+ res = ncclUnhandledCudaError; \
+ goto label; \
+ } \
+} while(false)
+
+#include <errno.h>
+// Check system calls
+#define SYSCHECK(call, name) do { \
+ int retval; \
+ SYSCHECKVAL(call, name, retval); \
+} while (false)
+
+#define SYSCHECKVAL(call, name, retval) do { \
+ SYSCHECKSYNC(call, name, retval); \
+ if (retval == -1) { \
+ WARN("Call to " name " failed : %s", strerror(errno)); \
+ return ncclSystemError; \
+ } \
+} while (false)
+
+#define SYSCHECKSYNC(call, name, retval) do { \
+ retval = call; \
+ if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
+ INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
+ } else { \
+ break; \
+ } \
+} while(true)
+
+// Propagate errors up
+#define NCCLCHECK(call) do { \
+ ncclResult_t res = call; \
+ if (res != ncclSuccess) { \
+ /* Print the back trace*/ \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ return res; \
+ } \
+} while (0);
+
+#define NCCLCHECKGOTO(call, res, label) do { \
+ res = call; \
+ if (res != ncclSuccess) { \
+ /* Print the back trace*/ \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ goto label; \
+ } \
+} while (0);
+
+#endif
diff --git a/src/include/comm.h b/src/include/comm.h
new file mode 100644
index 0000000..132eb39
--- /dev/null
+++ b/src/include/comm.h
@@ -0,0 +1,127 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_COMM_H_
+#define NCCL_COMM_H_
+
+#if CUDART_VERSION < 9000
+struct cudaLaunchParams {
+ void *func;
+ dim3 gridDim;
+ dim3 blockDim;
+ void **args;
+ size_t sharedMem;
+ cudaStream_t stream;
+};
+#endif
+
+#define MAXCHANNELS 16
+#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
+
+#define CACHE_LINE_SIZE 128
+#define MEM_ALIGN 4096
+#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
+
+struct ncclSendMem {
+ union {
+ struct {
+ uint64_t head;
+ char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+ void* ptrExchange;
+ char pad2[CACHE_LINE_SIZE-sizeof(void*)];
+ uint64_t opCount;
+ };
+ char pad3[MEM_ALIGN];
+ };
+};
+
+struct ncclRecvMem {
+ union {
+ struct {
+ uint64_t tail;
+ char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
+ uint64_t opCount;
+ char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
+ int sizesFifo[NCCL_STEPS];
+ };
+ char pad4[MEM_ALIGN];
+ };
+ ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
+ char buff[1]; // Actually larger than that
+};
+
+struct ncclComm {
+ struct ncclChannel channels[MAXCHANNELS];
+
+ struct ncclPeerInfo* peerInfo;
+
+ void* bootstrap;
+
+ int rank; // my rank in the communicator
+ int nRanks; // number of GPUs in communicator
+ int cudaDev; // my cuda device index
+ int nvmlDev; // my NVML device number
+
+ enum { GROUP, PARALLEL } launchMode;
+ cudaStream_t userStream;
+ bool userStreamSet;
+ cudaEvent_t doneEvent;
+ bool checkPointers;
+
+ // Counter to make sure collectives match (needed for bcast/reduce
+ // where syncs are not symmetric).
+ uint64_t opCount;
+
+ // Channels for collectives
+ int nChannels;
+ int nThreads;
+
+ // Low-latency algorithm threshold
+ ssize_t llThreshold;
+ ssize_t threadThreshold;
+
+ // Tree algorithm threshold
+ ssize_t treeThreshold;
+
+ // An internal CUDA stream for NCCL kernel CGMD launches
+ int groupCudaStream;
+ cudaStream_t groupStream;
+
+ // Whether there has been a fatal error in this communicator.
+ ncclResult_t fatalError;
+
+ // Error reported by GPU
+ volatile ncclDevError_t* fatalDevError;
+
+ // Flag to ask NCCL kernels to abort
+ volatile uint32_t *abortFlag;
+
+ // Device side of the communicator
+ struct ncclDevComm *devComm;
+ // Host copy of the devComm (to free CUDA allocs)
+ struct ncclDevComm hostDevComm;
+
+ // Intra-process sync
+ int intraRank;
+ int intraRanks;
+ int* intraBarrier;
+ int intraPhase;
+
+ // Storage for deferred intra-process launch
+ struct cudaLaunchParams * intraParams;
+ struct cudaLaunchParams *myParams;
+ int* intraCudaDevs;
+ int* intraCGMode; // Whether we can use CUDA9 CGMD or not
+ int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
+ struct ncclColl args;
+ void* argsptr;
+
+ // Global proxy thread
+ pthread_t proxyThread;
+ struct ncclProxyState proxyState;
+};
+
+#endif
diff --git a/src/include/core.h b/src/include/core.h
index d57d271..8a08b91 100644
--- a/src/include/core.h
+++ b/src/include/core.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,385 +7,20 @@
#ifndef NCCL_CORE_H_
#define NCCL_CORE_H_
-#define NCCL_MAX_OPS 2048
-#define NCCL_STEPS 8
-
+#include <pthread.h>
+#include <algorithm>
#include "nccl.h"
-#include "transport.h"
#include "debug.h"
+#include "checks.h"
+#include "alloc.h"
+#include "transport.h"
+#include "devcomm.h"
+#include "comm.h"
+#include "info.h"
+#include "argcheck.h"
#include <cstdio>
-#include <algorithm> // std::min/std::max
#include <unistd.h>
#include <stdlib.h>
-#include <cuda_runtime.h>
-
-#if CUDART_VERSION < 9000
-struct cudaLaunchParams {
- void *func;
- dim3 gridDim;
- dim3 blockDim;
- void **args;
- size_t sharedMem;
- cudaStream_t stream;
-};
-#endif
-
-#define MAXCHANNELS 16
-#define MAXTHREADS 256
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-
-// Channels / LL tuning
-#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
-#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
-#define NCCL_LL_MAX_NTHREADS MAXTHREADS
-#define NCCL_LL_MIN_NTHREADS 64
-
-#define DIVUP(x, y) \
- (((x)+(y)-1)/(y))
-#define ROUNDUP(x, y) \
- (DIVUP((x), (y))*(y))
-
-#define ALIGN_SIZE(size, align) \
- size = ((size + (align) - 1) / (align)) * (align);
-
-union ncclLLFifoLine {
- /* Flags have to be *after* data, because otherwise, an incomplete receive
- from the network may receive the flag but not the data.
- Note this is assuming that either we receive contiguous chunks of data
- (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
- struct {
- uint32_t data1;
- uint32_t flag1;
- uint32_t data2;
- uint32_t flag2;
- };
- uint64_t v[2];
- int4 i4;
-};
-
-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
-
-typedef enum {
- ncclPatternRing,
- ncclPatternRingTwice,
- ncclPatternPipelineFrom,
- ncclPatternPipelineTo,
- ncclPatternTreeUp,
- ncclPatternTreeDown,
- ncclPatternTreeUpDown
-} ncclPattern_t;
-
-typedef enum {
- ncclDevSuccess,
- ncclDevAssertedMismatch,
- ncclDevSuspectedMismatch
-} ncclDevError_t;
-
-// Used to pass NCCL call information between functions
-struct ncclInfo {
- ncclColl_t coll;
- const char* opName;
- // NCCL Coll Args
- const void* sendbuff;
- void* recvbuff;
- size_t count;
- ncclDataType_t datatype;
- ncclRedOp_t op;
- int root;
- ncclComm_t comm;
- cudaStream_t stream;
- // Algorithm details
- int chunkSteps;
- int sliceSteps;
- // Computed later
- ncclPattern_t pattern;
- size_t nBytes;
- int nstepsPerLoop;
- int nchunksPerLoop;
-};
-
-struct ncclConnInfo {
- // Regular comm mechanism
- char *buff; // Local for recv, remote for send
- uint64_t *tail; // Local for recv, remote for send
- uint64_t *head; // Local for send, remote for recv
- uint64_t *opCountLoc; // opCount of local rank
- uint64_t *opCountRem; // opCount of remote rank
-
- int direct; // Direct communication
- void **ptrExchange; // Pointer exchange for direct communication
-
- int *fifo; // Size fifo for proxy
-
- uint64_t step; // Keep where we are
-
- // Low latency mechanism
- union ncclLLFifoLine *llBuff; // Local for recv, remote for send
- uint64_t llLastCleaning;
-};
-
-struct ncclConnector {
- int connected;
- struct ncclProxyArgs *proxyAppend;
- struct ncclTransportComm* transportComm;
- void* transportResources; // Host-side resources
- struct ncclConnInfo conn;
- struct ncclComm *comm;
-};
-
-#define CACHE_LINE_SIZE 128
-#define MEM_ALIGN 4096
-#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
-
-#define NUM_LINES_PER_THREAD 8
-#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
-#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
-#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
-#define NCCL_LL_CLEAN_FREQ 0x10000000
-
-struct ncclSendMem {
- union {
- struct {
- uint64_t head;
- char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
- void* ptrExchange;
- char pad2[CACHE_LINE_SIZE-sizeof(void*)];
- uint64_t opCount;
- };
- char pad3[MEM_ALIGN];
- };
-};
-
-struct ncclRecvMem {
- union {
- struct {
- uint64_t tail;
- char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
- uint64_t opCount;
- char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
- int sizesFifo[NCCL_STEPS];
- };
- char pad4[MEM_ALIGN];
- };
- ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
- char buff[1]; // Actually larger than that
-};
-
-struct ncclRing {
- // Shortcuts for userRanks[1] and userRanks[n-1]
- int prev;
- int next;
-
- // Maps an internal nccl index to user-specified rank order. This is necessary
- // since we need to know how the user expects data to be ordered across
- // devices. Ordered from current device.
- int* userRanks;
- int* devUserRanks;
-};
-
-#define NCCL_MAX_TREE_ARITY 3
-struct ncclTree {
- int depth;
- int up;
- int down[NCCL_MAX_TREE_ARITY];
-};
-
-struct ncclPeer {
- struct ncclConnector send;
- struct ncclConnector recv;
-};
-
-struct ncclChannel {
- union {
- struct {
- struct ncclRing ring;
- struct ncclTree tree;
-
- int id;
- int nthreads;
- int buffSize;
-
- // Communication structures
- struct ncclPeer* peers;
- struct ncclPeer* devPeers;
-
- // Operation list for aggregation
- struct ncclColl* collectives;
- struct ncclColl* devCollectives;
- int collStart;
- int collCount;
- int collFifoHead; // Only used by GPU
- int collFifoTail; // Only used by CPU
- };
- int data[0x80];
- };
-};
-static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
-
-/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
-/* to make sure reads to host from the CUDA kernel are aligned. */
-/* Make sure to adjust padding at the end of ncclColl. */
-struct CollectiveArgs {
- struct ncclComm* comm;
- uint64_t opCount;
-
- // local and remote input, output, and buffer
- const void * ThisInput;
- void * ThisOutput;
-
- // general parameters
- size_t N;
- uint32_t root;
- uint8_t bid;
- uint8_t nChannels;
- uint16_t nThreads;
-
- int lastChunkSize;
-};
-struct ncclColl {
- union {
- struct {
- struct CollectiveArgs args;
- uint16_t funcIndex;
- uint16_t nextIndex;
- uint8_t active;
- };
- int data[0x10];
- };
-};
-static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
-
-struct ncclComm {
- struct ncclChannel channels[MAXCHANNELS];
-
- struct ncclPeerInfo* peerInfo;
-
- void* bootstrap;
-
- int rank; // my rank in the communicator
- int nRanks; // number of GPUs in communicator
- int cudaDev; // my cuda device index
- int nvmlDev; // my NVML device number
-
- enum { GROUP, PARALLEL } launchMode;
- cudaStream_t userStream;
- bool userStreamSet;
- cudaEvent_t doneEvent;
- bool checkPointers;
-
- // Counter to make sure collectives match (needed for bcast/reduce
- // where syncs are not symmetric).
- uint64_t opCount;
-
- // Channels for collectives
- int nChannels;
- int nThreads;
-
- // Low-latency algorithm threshold
- ssize_t llThreshold;
- ssize_t threadThreshold;
-
- // Tree algorithm threshold
- ssize_t treeThreshold;
-
- // An internal CUDA stream for NCCL kernel CGMD launches
- int groupCudaStream;
- cudaStream_t groupStream;
-
- // Whether there has been a fatal error in this communicator.
- ncclResult_t fatalError;
-
- // Error reported by GPU
- volatile ncclDevError_t* fatalDevError;
-
- // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped)
- // On device: this pointer has been obtained from cudaHostGetDevicePointer()
- volatile uint32_t *abortFlag;
-
- // Device copy of the communicator
- struct ncclComm *devComm;
-
- // Intra-process sync
- int intraRank;
- int intraRanks;
- int* intraBarrier;
- int intraPhase;
-
- // Storage for deferred intra-process launch
- struct cudaLaunchParams * intraParams;
- struct cudaLaunchParams *myParams;
- int* intraCudaDevs;
- int* intraCGMode; // Whether we can use CUDA9 CGMD or not
- int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
- struct ncclColl args;
- void* argsptr;
-
- // Global proxy thread
- pthread_t proxyThread;
- struct ncclProxyState proxyState;
-};
-
-// Check CUDA calls
-#define CUDACHECK(cmd) do { \
- cudaError_t e = cmd; \
- if( e != cudaSuccess ) { \
- WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
- return ncclUnhandledCudaError; \
- } \
-} while(false)
-
-#define CUDACHECKGOTO(cmd, res, label) do { \
- cudaError_t e = cmd; \
- if( e != cudaSuccess ) { \
- WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
- res = ncclUnhandledCudaError; \
- goto label; \
- } \
-} while(false)
-
-#include <errno.h>
-// Check system calls
-#define SYSCHECK(call, name) do { \
- int retval; \
- SYSCHECKVAL(call, name, retval); \
-} while (false)
-
-#define SYSCHECKVAL(call, name, retval) do { \
- SYSCHECKSYNC(call, name, retval); \
- if (retval == -1) { \
- WARN("Call to " name " failed : %s", strerror(errno)); \
- return ncclSystemError; \
- } \
-} while (false)
-
-#define SYSCHECKSYNC(call, name, retval) do { \
- retval = call; \
- if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
- INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
- } else { \
- break; \
- } \
-} while(true)
-
-// Propagate errors up
-#define NCCLCHECK(call) do { \
- ncclResult_t res = call; \
- if (res != ncclSuccess) { \
- /* Print the back trace*/ \
- INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
- return res; \
- } \
-} while (0);
-
-#define NCCLCHECKGOTO(call, res, label) do { \
- res = call; \
- if (res != ncclSuccess) { \
- /* Print the back trace*/ \
- INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
- goto label; \
- } \
-} while (0);
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
@@ -427,42 +62,4 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
}
}
-#include <sys/mman.h>
-static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
- CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
- memset(*ptr, 0, size);
- *devPtr = *ptr;
- return ncclSuccess;
-}
-
-static inline ncclResult_t ncclCudaHostFree(void* ptr) {
- CUDACHECK(cudaFreeHost(ptr));
- return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
- void* p = malloc(nelem*sizeof(T));
- if (p == NULL) {
- WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
- return ncclSystemError;
- }
- memset(p, 0, nelem*sizeof(T));
- *ptr = (T*)p;
- return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
- CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
- CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
- return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
- CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
- return ncclSuccess;
-}
-
#endif // end include guard
diff --git a/src/include/cpuset.h b/src/include/cpuset.h
index f70d1d8..98b93de 100644
--- a/src/include/cpuset.h
+++ b/src/include/cpuset.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/include/debug.h b/src/include/debug.h
index 3acdf8c..c3e8fa0 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -24,7 +24,7 @@ extern int ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile;
-extern ncclResult_t getHostName(char* hostname, int maxlen);
+extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
@@ -108,7 +108,7 @@ static inline void initDebug() {
break;
case 'h': // %h = hostname
char hostname[1024];
- getHostName(hostname, 1024);
+ getHostName(hostname, 1024, '.');
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
break;
case 'p': // %p = pid
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
new file mode 100644
index 0000000..0a2ef96
--- /dev/null
+++ b/src/include/devcomm.h
@@ -0,0 +1,194 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_DEVICE_H_
+#define NCCL_DEVICE_H_
+
+#include "nccl.h"
+#include <stdint.h>
+
+#define NCCL_MAX_OPS 2048
+#define NCCL_STEPS 8
+
+typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
+
+#define DIVUP(x, y) \
+ (((x)+(y)-1)/(y))
+#define ROUNDUP(x, y) \
+ (DIVUP((x), (y))*(y))
+
+#define ALIGN_SIZE(size, align) \
+ size = ((size + (align) - 1) / (align)) * (align);
+
+union ncclLLFifoLine {
+ /* Flags have to be *after* data, because otherwise, an incomplete receive
+ from the network may receive the flag but not the data.
+ Note this is assuming that either we receive contiguous chunks of data
+ (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
+ struct {
+ uint32_t data1;
+ uint32_t flag1;
+ uint32_t data2;
+ uint32_t flag2;
+ };
+ uint64_t v[2];
+ int4 i4;
+};
+
+#define MAXTHREADS 256
+#define NCCL_LL_MAX_NTHREADS MAXTHREADS
+#define NUM_LINES_PER_THREAD 8
+#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
+#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
+#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
+#ifdef DEBUG_LL
+#define NCCL_LL_CLEAN_MASK 0x00000ff8
+#define NCCL_LL_FLAG_MAX 0x00001000
+#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX))
+#else
+#define NCCL_LL_CLEAN_MASK 0x7ffffff8
+#define NCCL_LL_FLAG(a) ((uint32_t)(a))
+#endif
+// Make sure the clean mask will last for at least NCCL_NSTEPS
+static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
+
+struct ncclConnInfo {
+ // Regular comm mechanism
+ char *buff; // Local for recv, remote for send
+ uint64_t *tail; // Local for recv, remote for send
+ uint64_t *head; // Local for send, remote for recv
+ uint64_t *opCountLoc; // opCount of local rank
+ uint64_t *opCountRem; // opCount of remote rank
+
+ int direct; // Direct communication
+ void **ptrExchange; // Pointer exchange for direct communication
+
+ int *fifo; // Size fifo for proxy
+
+ uint64_t step; // Keep where we are
+
+ // Low latency mechanism
+ union ncclLLFifoLine *llBuff; // Local for recv, remote for send
+ uint64_t llLastCleaning;
+};
+
+struct ncclConnector {
+ int connected;
+ struct ncclProxyArgs *proxyAppend;
+ struct ncclTransportComm* transportComm;
+ void* transportResources; // Host-side resources
+ struct ncclConnInfo conn;
+ struct ncclComm *comm;
+};
+
+struct ncclRing {
+ // Shortcuts for userRanks[1] and userRanks[n-1]
+ int prev;
+ int next;
+
+ // Maps an internal nccl index to user-specified rank order. This is necessary
+ // since we need to know how the user expects data to be ordered across
+ // devices. Ordered from current device.
+ int* userRanks;
+ int* devUserRanks;
+};
+
+
+#define NCCL_MAX_TREE_ARITY 3
+struct ncclTree {
+ int depth;
+ int up;
+ int down[NCCL_MAX_TREE_ARITY];
+};
+
+struct ncclPeer {
+ struct ncclConnector send;
+ struct ncclConnector recv;
+};
+
+struct ncclDevComm;
+
+/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
+/* to make sure reads to host from the CUDA kernel are aligned. */
+/* Make sure to adjust padding at the end of ncclColl. */
+struct CollectiveArgs {
+ struct ncclDevComm* comm;
+ uint64_t opCount;
+
+ // local and remote input, output, and buffer
+ const void * ThisInput;
+ void * ThisOutput;
+
+ // general parameters
+ size_t N;
+ uint32_t root;
+ uint8_t bid;
+ uint8_t nChannels;
+ uint16_t nThreads;
+
+ int lastChunkSize;
+};
+struct ncclColl {
+ union {
+ struct {
+ struct CollectiveArgs args;
+ uint16_t funcIndex;
+ uint16_t nextIndex;
+ uint8_t active;
+ };
+ int data[0x10];
+ };
+};
+static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
+
+struct ncclChannel {
+ union {
+ struct {
+ struct ncclRing ring;
+ struct ncclTree tree;
+
+ int id;
+ int nthreads;
+ int buffSize;
+
+ // Communication structures
+ struct ncclPeer* peers;
+ struct ncclPeer* devPeers;
+
+ // Operation list for aggregation
+ struct ncclColl* collectives;
+ struct ncclColl* devCollectives;
+ int collStart;
+ int collCount;
+ int collFifoHead; // Only used by GPU
+ int collFifoTail; // Only used by CPU
+ };
+ int data[0x80];
+ };
+};
+static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
+
+#define MAXCHANNELS 16
+
+typedef enum {
+ ncclDevSuccess,
+ ncclDevAssertedMismatch,
+ ncclDevSuspectedMismatch
+} ncclDevError_t;
+
+struct ncclDevComm {
+ int rank;
+ int nRanks;
+
+ // Flag to ask NCCL kernels to abort
+ volatile uint32_t *abortFlag;
+ volatile ncclDevError_t *fatalDevError;
+
+ // Channels, device side
+ struct ncclChannel* channels;
+};
+
+#endif
diff --git a/src/include/enqueue.h b/src/include/enqueue.h
index 4db7094..3b7a18c 100644
--- a/src/include/enqueue.h
+++ b/src/include/enqueue.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,6 +10,12 @@
#include "core.h"
#include "group.h"
+// Channels / LL tuning
+#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
+#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
+#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
+#define NCCL_LL_MIN_NTHREADS 64
+
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h
index 4f3e831..0943f99 100644
--- a/src/include/ibvwrap.h
+++ b/src/include/ibvwrap.h
@@ -4,7 +4,7 @@
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
*
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/include/info.h b/src/include/info.h
new file mode 100644
index 0000000..401298a
--- /dev/null
+++ b/src/include/info.h
@@ -0,0 +1,45 @@
+/*************************************************************************
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_INFO_H_
+#define NCCL_INFO_H_
+
+#include "nccl.h"
+
+typedef enum {
+ ncclPatternRing,
+ ncclPatternRingTwice,
+ ncclPatternPipelineFrom,
+ ncclPatternPipelineTo,
+ ncclPatternTreeUp,
+ ncclPatternTreeDown,
+ ncclPatternTreeUpDown
+} ncclPattern_t;
+
+// Used to pass NCCL call information between functions
+struct ncclInfo {
+ ncclColl_t coll;
+ const char* opName;
+ // NCCL Coll Args
+ const void* sendbuff;
+ void* recvbuff;
+ size_t count;
+ ncclDataType_t datatype;
+ ncclRedOp_t op;
+ int root;
+ ncclComm_t comm;
+ cudaStream_t stream;
+ // Algorithm details
+ int chunkSteps;
+ int sliceSteps;
+ // Computed later
+ ncclPattern_t pattern;
+ size_t nBytes;
+ int nstepsPerLoop;
+ int nchunksPerLoop;
+};
+
+#endif
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index 89edbf5..797c759 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -80,12 +80,13 @@ typedef struct {
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+ // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
- // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ // Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
- // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ // Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
diff --git a/src/include/net.h b/src/include/net.h
index e75e6bb..da3ecea 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/include/nvlink.h b/src/include/nvlink.h
index 1baf9e5..8a0f99e 100644
--- a/src/include/nvlink.h
+++ b/src/include/nvlink.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -18,6 +18,7 @@
enum ncclNvLinkDeviceType {
ncclNvLinkDeviceGpu,
ncclNvLinkDeviceSwitch,
+ ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
@@ -25,7 +26,13 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
char* rPath = realpath(classPath, NULL);
int fd;
- SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd);
+ if ((fd = open(rPath, O_RDONLY)) == -1) {
+ // Could not find device. It might be because we're in a VM and
+ // we don't see the whole machine. This is handled silently so
+ // we don't want to print an INFO error.
+ TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
+ return ncclSystemError;
+ }
free(rPath);
char pciClass[9];
strncpy(pciClass, "0x000000", 9);
@@ -35,6 +42,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType*
if (strcmp(pciClass, "0x068000") == 0) {
// PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
*type = ncclNvLinkDeviceSwitch;
+ } else if (strcmp(pciClass, "0x068001") == 0) {
+ // PCI device is of type "Bridge: IBM Device 04ea"
+ *type = ncclNvLinkDeviceBridge;
} else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
|| strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
*type = ncclNvLinkDeviceGpu;
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
index 0b6198a..f658279 100644
--- a/src/include/nvmlwrap.h
+++ b/src/include/nvmlwrap.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/include/param.h b/src/include/param.h
index dd5f697..5431757 100644
--- a/src/include/param.h
+++ b/src/include/param.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) {
s++;
strncpy(envValue, line+s, 1024);
setenv(envVar, envValue, 0);
- char *str = getenv(envVar);
}
if (line) free(line);
fclose(file);
diff --git a/src/include/rings.h b/src/include/rings.h
index 43fc595..9701f84 100644
--- a/src/include/rings.h
+++ b/src/include/rings.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/include/shm.h b/src/include/shm.h
index 4fb49cb..9cd9d05 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/include/socket.h b/src/include/socket.h
index fb5cfc0..739c0c4 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -18,8 +18,9 @@
#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
-#define SLEEP_INT 1000 // sleep interval in usec
-#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec)
+#define SLEEP_INT 1000 // connection retry sleep interval in usec
+#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec)
+#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s)
/* Common socket address storage structure for IPv4/IPv6 */
union socketAddress {
@@ -370,14 +371,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
#endif
int ret;
- int retries = 0;
+ int timedout_retries = 0;
+ int refused_retries = 0;
retry:
SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
if (ret == 0) return ncclSuccess;
- if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) {
- INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \
- usleep(SLEEP_INT);
- goto retry;
+ if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
+ if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
+ (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
+ INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
+ usleep(SLEEP_INT);
+ goto retry;
+ }
}
WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno));
return ncclSystemError;
diff --git a/src/include/topo.h b/src/include/topo.h
index e824a81..69cd100 100644
--- a/src/include/topo.h
+++ b/src/include/topo.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,49 +11,35 @@
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
+#include <stdio.h>
-#define BUSID_SIZE (sizeof("0000:00:00.0"))
-#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
-
-static ncclResult_t getCudaPath(int cudaDev, char** path) {
- char busId[BUSID_SIZE];
- CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
- for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
- char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
- memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
- memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
- *path = realpath(busPath, NULL);
- if (*path == NULL) {
- WARN("Could not find real path of %s", busPath);
- return ncclSystemError;
- }
- return ncclSuccess;
+ncclResult_t getCudaPath(int cudaDev, char** path);
+
+static int getNumaId(char *path) {
+ char npath[PATH_MAX];
+ snprintf(npath, PATH_MAX, "%s/numa_node", path);
+ npath[PATH_MAX-1] = '\0';
+
+ int numaId = -1;
+ FILE *file = fopen(npath, "r");
+ if (file == NULL) return -1;
+ if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
+ fclose(file);
+
+ return numaId;
}
enum ncclPathDist {
- PATH_PIX = 0,
- PATH_PXB = 1,
- PATH_PHB = 2,
- PATH_SOC = 3
+ PATH_PIX = 0,
+ PATH_PXB = 1,
+ PATH_PHB = 2,
+ PATH_NODE = 3,
+ PATH_SYS = 4,
+ PATH_ARRAY_SIZE = 5
};
-static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" };
-
-static int pciDistance(char* path1, char* path2) {
- int score = 0;
- int depth = 0;
- int same = 1;
- for (int i=0; i<strlen(path1); i++) {
- if (path1[i] != path2[i]) same = 0;
- if (path1[i] == '/') {
- depth++;
- if (same == 1) score++;
- }
- }
- if (score <= 3) return PATH_SOC;
- if (score == 4) return PATH_PHB;
- if (score == depth-1) return PATH_PIX;
- return PATH_PXB;
-}
+extern const char* pathDists[PATH_ARRAY_SIZE];
+
+int pciDistance(char* path1, char* path2);
#endif
diff --git a/src/include/transport.h b/src/include/transport.h
index 6231a71..91628f6 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -8,6 +8,7 @@
#define NCCL_TRANSPORT_H_
#include "nccl.h"
+#include "devcomm.h"
#include <stdint.h>
#include "nvmlwrap.h"
@@ -37,7 +38,7 @@ struct ncclConnect {
char data[CONNECT_SIZE];
};
-enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone };
+enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
struct ncclProxyArgs;
typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
@@ -117,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) {
}
}
-inline void transportProxyIdle(int idle) {
- sched_yield();
-}
-
#endif
diff --git a/src/include/trees.h b/src/include/trees.h
index 1a151d1..7eadd85 100644
--- a/src/include/trees.h
+++ b/src/include/trees.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/include/utils.h b/src/include/utils.h
index 5a6a588..29b72ad 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,7 +10,7 @@
#include "nccl.h"
#include <stdint.h>
-ncclResult_t getHostName(char* hostname, int maxlen);
+ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
uint64_t getHostHash();
uint64_t getPidHash();
diff --git a/src/init.cu b/src/init.cc
index 75822e6..80af287 100644
--- a/src/init.cu
+++ b/src/init.cc
@@ -47,7 +47,7 @@ FILE *ncclDebugFile = stdout;
std::chrono::high_resolution_clock::time_point ncclEpoch;
#endif
-#if CUDART_VERSION >= 9200
+#if CUDART_VERSION >= 9020
#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
#else
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
@@ -182,6 +182,11 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
return bootstrapGetUniqueId(out);
}
+// Prevent compiler from optimizing out these operations
+void __attribute__((optimize("O0"))) commPoison(ncclComm_t comm) {
+ comm->rank = comm->cudaDev = comm->nvmlDev = comm->nRanks = -1;
+}
+
static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
@@ -191,6 +196,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
if (comm->bootstrap)
NCCLCHECK(bootstrapClose(comm->bootstrap));
+ CUDACHECK(cudaFree(comm->hostDevComm.channels));
CUDACHECK(cudaFree(comm->devComm));
for (int channel=0; channel<comm->nChannels; channel++)
@@ -216,6 +222,9 @@ static ncclResult_t commFree(ncclComm_t comm) {
CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
+ // Poison comm to try and catch a double free
+ commPoison(comm);
+
free(comm);
return ncclSuccess;
}
@@ -238,17 +247,17 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
struct ncclComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
- comm->rank = rank;
- comm->nRanks = ndev;
+ comm->rank = comm->hostDevComm.rank =rank;
+ comm->nRanks = comm->hostDevComm.nRanks = ndev;
cudaGetDevice(&comm->cudaDev);
getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
- INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
+ TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
comm->doneEvent = doneEvent;
comm->llThreshold = ncclParamLlThreshold();
comm->treeThreshold = ncclParamTreeThreshold();
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
-#if CUDART_VERSION >= 9200
+#if CUDART_VERSION >= 9020
comm->groupCudaStream = ncclParamGroupCudaStream();
#else
// Don't allow the user to overload the default setting in older CUDA builds
@@ -256,10 +265,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
#endif
comm->fatalError = ncclSuccess;
- CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped));
+ NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t)));
*comm->fatalDevError = ncclDevSuccess;
- CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped));
+ NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
*comm->abortFlag = 0;
comm->argsptr = &comm->args;
@@ -269,23 +278,19 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
}
static ncclResult_t devCommSetup(ncclComm_t comm) {
- // Fully duplicate the comm on the device
- NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
- // Copy the comm on the device
- NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
- // Copy userRanks
+ // Duplicate the channels on the device
+ NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels));
+ NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels));
+
+ // Copy userRanks and peers
for (int r=0; r<comm->nChannels; r++) {
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
}
- // Copy the device-accessible pointer to comm->abortFlag
- void *devAbortFlag;
- CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0));
- CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice));
- // Copy the device-accessible pointer to comm->fatalDevError
- void *devFatalError;
- CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0));
- CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice));
+
+ // Duplicate the dev comm on the device
+ NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1));
+ NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1));
return ncclSuccess;
}
@@ -423,7 +428,8 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
}
}
- int ranks[nMasters];
+ int* ranks;
+ NCCLCHECK(ncclCalloc(&ranks, nMasters));
int i = 0, masterIndex = -1;
// Build binary tree
for (int r=0; r<nranks; r++) {
@@ -455,6 +461,7 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
tree->up = prev;
if (treeMasters[next] == 0) tree->down[0] = next;
}
+ free(ranks);
}
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
@@ -638,6 +645,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
if (peer == -1) continue;
conn = &channel->peers[peer].recv;
if (conn->connected) { ++nSkippedRecv; continue; }
+ memset(&connect, 0, sizeof(connect));
NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
}
@@ -646,6 +654,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
if (peer == -1) continue;
conn = &channel->peers[peer].send;
if (conn->connected) { ++nSkippedSend; continue; }
+ memset(&connect, 0, sizeof(connect));
NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
}
@@ -654,6 +663,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
if (peer == -1) continue;
conn = &channel->peers[peer].send;
if (conn->connected) {++nSkippedSend; continue; }
+ memset(&connect, 0, sizeof(connect));
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
NCCLCHECK(conn->transportComm->connect(&connect, conn));
conn->connected = 1;
@@ -663,6 +673,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel,
if (peer == -1) continue;
conn = &channel->peers[peer].recv;
if (conn->connected) {++nSkippedRecv; continue; }
+ memset(&connect, 0, sizeof(connect));
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
NCCLCHECK(conn->transportComm->connect(&connect, conn));
conn->connected = 1;
@@ -877,18 +888,42 @@ static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
return ncclSuccess;
}
+NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
+
static ncclResult_t setCpuAffinity(int cudaDev) {
- // Work within the enveloppe we were provided
+ // Query the CPU affinity set we were provided
cpu_set_t mask;
SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
- // Find the subpart that is local to our GPU
+#ifdef ENABLE_TRACE
+ {
+ char affinityStr[sizeof(cpu_set_t)*2];
+ NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
+ TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr);
+ }
+#endif
+
+ // Find the CPUs that are local to the supplied GPU
cpu_set_t gpuMask;
NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
+
+#ifdef ENABLE_TRACE
+ {
+ char affinityStr[sizeof(cpu_set_t)*2];
+ NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr));
+ TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr);
+ }
+#endif
+
cpu_set_t finalMask;
- CPU_AND(&finalMask, &mask, &gpuMask);
+ if (ncclParamIgnoreCpuAffinity())
+ // Ignore the CPU affinity set and use the GPU one instead
+ finalMask = gpuMask;
+ else
+ // Use a subset of the GPU affinity set
+ CPU_AND(&finalMask, &mask, &gpuMask);
- // If those are not disjoint, try to stay local
+ // If there is a non empty set, use it to set affinity
if (CPU_COUNT(&finalMask)) {
char affinityStr[sizeof(cpu_set_t)*2];
NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
@@ -1018,8 +1053,9 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
comms[rank]->threadThreshold = threadThreshold;
}
+ struct ncclConnect* connect;
+ NCCLCHECK(ncclCalloc(&connect, 2*nranks));
for (int r=0; r<nrings; r++) {
- struct ncclConnect connect[2*nranks];
int* ringRanks = rings+r*nranks;
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
@@ -1045,6 +1081,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
}
}
+ free(connect);
free(allInfo);
free(rings);
free(treeIn);
@@ -1072,12 +1109,13 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
int savedDevice;
int rank, cudaDev;
ncclComm_t comm = NULL;
- int ncclDevList[ndev];
+ int* ncclDevList = NULL;
+ NCCLCHECK(ncclCalloc(&ncclDevList, ndev));
for (int i=0; i<ndev; i++) {
ncclDevList[i] = devlist ? devlist[i] : i;
}
- cudaGetDevice(&savedDevice);
+ CUDACHECKGOTO(cudaGetDevice(&savedDevice), res, cleanup);
for(rank=0; rank<ndev; ++rank)
comms[rank] = NULL;
@@ -1118,6 +1156,7 @@ cleanup:
}
final:
+ free(ncclDevList);
if(wrapNvmlShutdown() != ncclSuccess)
INFO(NCCL_INIT,"NCCL did not shutdown nvml properly");
cudaSetDevice(savedDevice);
@@ -1128,9 +1167,11 @@ final:
static ncclResult_t commDestroy(ncclComm_t comm) {
int savedDevice;
+#ifdef ENABLE_TRACE
+ int rank = comm->rank;
+#endif
CUDACHECK(cudaGetDevice(&savedDevice));
int commDevice = comm->cudaDev;
- int rank = comm->rank;
if (savedDevice != commDevice) {
CUDACHECK(cudaSetDevice(commDevice));
@@ -1145,7 +1186,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
if (savedDevice != commDevice)
CUDACHECK(cudaSetDevice(savedDevice));
- INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
+ TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
return ncclSuccess;
}
@@ -1155,6 +1196,14 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
+ TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d nvmlDev %d", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev);
+
+ // Try and prevent a double free of the comm struct (user error)
+ if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->nvmlDev == -1) {
+ WARN("comm %p has already been destroyed", comm);
+ return ncclInvalidArgument;
+ }
+
return commDestroy(comm);
}
diff --git a/src/misc/checks.cu b/src/misc/argcheck.cc
index a07e577..364f041 100644
--- a/src/misc/checks.cu
+++ b/src/misc/argcheck.cc
@@ -1,10 +1,10 @@
/*************************************************************************
- * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-#include "checks.h"
+#include "argcheck.h"
static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
cudaPointerAttributes attr;
diff --git a/src/misc/group.cu b/src/misc/group.cc
index c428a22..7bc64cd 100644
--- a/src/misc/group.cu
+++ b/src/misc/group.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -118,7 +118,7 @@ ncclResult_t ncclGroupEnd() {
int savedDev;
CUDACHECK(cudaGetDevice(&savedDev));
int done = ncclGroupIndex;
- int doneArray[ncclGroupIndex];
+ int doneArray[MAX_ASYNC_OPS];
for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
ncclResult_t ret = ncclGroupError;
diff --git a/src/misc/ibvwrap.cu b/src/misc/ibvwrap.cc
index 7ac3431..f47c141 100644
--- a/src/misc/ibvwrap.cu
+++ b/src/misc/ibvwrap.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/misc/nvmlwrap.cu b/src/misc/nvmlwrap.cc
index 635f332..fbe481f 100644
--- a/src/misc/nvmlwrap.cu
+++ b/src/misc/nvmlwrap.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/misc/rings.cu b/src/misc/rings.cc
index a7b122c..27ca9b6 100644
--- a/src/misc/rings.cu
+++ b/src/misc/rings.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -208,8 +208,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
NCCLCHECK(getEnvThreads(nthreads));
for (int r = 0; r<*nrings; r++) {
for (int i = 0; i<nranks; i++) {
- if (transports[i*nranks+prev[i]] == 2) treeIn[i] = 1;
- if (transports[i*nranks+next[i]] == 2) treeOut[i] = 1;
+ if (transports[i*nranks+prev[r*nranks+i]] == 2) treeIn[r*nranks+i] = 1;
+ if (transports[i*nranks+next[r*nranks+i]] == 2) treeOut[r*nranks+i] = 1;
}
}
return ncclSuccess;
diff --git a/src/misc/topo.cc b/src/misc/topo.cc
new file mode 100644
index 0000000..6364978
--- /dev/null
+++ b/src/misc/topo.cc
@@ -0,0 +1,51 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "topo.h"
+
+#define BUSID_SIZE (sizeof("0000:00:00.0"))
+#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+
+ncclResult_t getCudaPath(int cudaDev, char** path) {
+ char busId[BUSID_SIZE];
+ CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
+ for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
+ char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
+ memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
+ memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
+ *path = realpath(busPath, NULL);
+ if (*path == NULL) {
+ WARN("Could not find real path of %s", busPath);
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
+
+int pciDistance(char* path1, char* path2) {
+ int score = 0;
+ int depth = 0;
+ int same = 1;
+ for (int i=0; i<strlen(path1); i++) {
+ if (path1[i] != path2[i]) same = 0;
+ if (path1[i] == '/') {
+ depth++;
+ if (same == 1) score++;
+ }
+ }
+ if (score <= 3) {
+ /* Split the former PATH_SOC distance into PATH_NODE and PATH_SYS based on numaId */
+ int numaId1 = getNumaId(path1);
+ int numaId2 = getNumaId(path2);
+ TRACE(NCCL_INIT, "depth %d score %d path1 %s numaId %d path2 %s numaId %d", depth, score, path1, numaId1, path2, numaId2);
+ return ((numaId1 == numaId2) ? PATH_NODE : PATH_SYS);
+ }
+ if (score == 4) return PATH_PHB;
+ if (score == depth-1) return PATH_PIX;
+ return PATH_PXB;
+}
diff --git a/src/misc/trees.cu b/src/misc/trees.cc
index e53ea0b..f672abe 100644
--- a/src/misc/trees.cu
+++ b/src/misc/trees.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/misc/utils.cu b/src/misc/utils.cc
index c618e71..5e884ae 100644
--- a/src/misc/utils.cu
+++ b/src/misc/utils.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -29,13 +29,13 @@ ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
return ncclSuccess;
}
-ncclResult_t getHostName(char* hostname, int maxlen) {
+ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
if (gethostname(hostname, maxlen) != 0) {
strncpy(hostname, "unknown", maxlen);
return ncclSystemError;
}
int i = 0;
- while ((hostname[i] != '.') && (hostname[i] != '\0') && (i < maxlen-1)) i++;
+ while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++;
hostname[i] = '\0';
return ncclSuccess;
}
@@ -48,7 +48,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
if (ncclDebugLevel <= NCCL_LOG_NONE) return;
char hostname[1024];
- getHostName(hostname, 1024);
+ getHostName(hostname, 1024, '.');
int cudaDev;
cudaGetDevice(&cudaDev);
@@ -104,8 +104,8 @@ uint64_t getHash(const char* string) {
*/
uint64_t getHostHash(void) {
char uname[1024];
- // Start off with the hostname
- (void) getHostName(uname, sizeof(uname));
+ // Start off with the full hostname
+ (void) getHostName(uname, sizeof(uname), '\0');
int offset = strlen(uname);
int len;
// $(readlink /proc/self/ns/uts)
diff --git a/src/transport.cu b/src/transport.cc
index 1436a5b..1436a5b 100644
--- a/src/transport.cu
+++ b/src/transport.cc
diff --git a/src/transport/net.cu b/src/transport/net.cc
index 06a6e23..823caf1 100644
--- a/src/transport/net.cu
+++ b/src/transport/net.cc
@@ -28,7 +28,7 @@ static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IF
static ncclTvalue_t getTvalue(short* distances, int ndev) {
ncclTvalue_t tvalue = 0;
for (int d=0; d<ndev; d++) {
- int score = 1 + PATH_SOC - distances[d];
+ int score = 1 + PATH_SYS - distances[d];
// Keep 3 bits of score info per dev
tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d));
}
@@ -81,7 +81,7 @@ static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
ncclResult_t err;
NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
err = ncclNetPciPath(dev, &nicPath);
- *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
+ *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SYS : pciDistance(nicPath, cudaPath);
if (nicPath) free(nicPath);
if (cudaPath) free(cudaPath);
return ncclSuccess;
@@ -173,19 +173,19 @@ static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroup
return -1;
}
-
ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
int nGroups = groups[nranks-1] + 1;
- int cardUsed[NET_MAX_IFS*nGroups];
- for (int c=0; c<NET_MAX_IFS*nGroups; c++) cardUsed[c] = 0;
+ int *cardUsed, *starts, *ends;
+ NCCLCHECK(ncclCalloc(&cardUsed, NET_MAX_IFS*nGroups));
+ NCCLCHECK(ncclCalloc(&starts, nGroups));
+ NCCLCHECK(ncclCalloc(&ends, nGroups));
for (int ring = 0; ring<*nringsRet; ring++) {
- int starts[nGroups];
- int ends[nGroups];
for (int group = 0; group<nGroups; group++) {
int nranksInGroup = 0;
int nsubGroups = 0;
- for (int rank=0; rank<nranks; rank++) if (groups[rank] == group) {
+ for (int rank=0; rank<nranks; rank++)
+ if (groups[rank] == group) {
nranksInGroup++;
nsubGroups = std::max(subgroups[rank], nsubGroups);
}
@@ -207,7 +207,7 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
}
if (starts[group] == -1 || ends[group] == -1) {
*nringsRet = ring;
- return ncclSuccess;
+ goto done;
}
}
// Link groups together
@@ -217,6 +217,10 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
prev[ring*nranks+starts[nextGroup]] = ends[group];
}
}
+done:
+ free(cardUsed);
+ free(starts);
+ free(ends);
return ncclSuccess;
}
@@ -432,11 +436,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (args->head < args->end) {
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
+ volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
if (args->llMode) {
int buffSlot = args->tail%NCCL_STEPS;
int size = sizesFifo[buffSlot];
if (size != -1) {
- uint32_t flag = args->tail + 1;
+ uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
size = nFifoLines * sizeof(union ncclLLFifoLine);
union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
@@ -457,7 +462,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
}
}
}
- } else if (args->tail < resources->hostRecvMem->tail) {
+ } else if (args->tail < *recvTail) {
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
int stepSize = args->channel->buffSize/NCCL_STEPS;
// Send through network
@@ -486,18 +491,8 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (args->head == args->end) {
resources->step = args->end;
args->idle = 0;
- args->state = ncclProxyOpDone;
- }
- }
- if (args->state == ncclProxyOpDone) {
- union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff;
- if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step;
- resources->step += NCCL_STEPS;
- resources->hostSendMem->head = resources->step;
- resources->llLastCleaning = resources->step;
+ args->state = ncclProxyOpNone;
}
- args->state = ncclProxyOpNone;
}
return ncclSuccess;
}
@@ -522,7 +517,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
- if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
+ volatile uint64_t* sendHead = &resources->hostSendMem->head;
+ if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
int buffSlot = args->tail%NCCL_STEPS;
int sliceSize = stepSize * args->sliceSteps;
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
@@ -548,16 +544,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
if (args->head == args->end) {
resources->step = args->end;
args->idle = 0;
- args->state = ncclProxyOpDone;
- }
- }
- if (args->state == ncclProxyOpDone) {
- if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- resources->step += NCCL_STEPS;
- while (resources->hostSendMem->head < resources->step);
- resources->llLastCleaning = resources->step;
+ args->state = ncclProxyOpNone;
}
- args->state = ncclProxyOpNone;
}
return ncclSuccess;
}
diff --git a/src/transport/net_ib.cu b/src/transport/net_ib.cc
index f7c574b..de72f89 100644
--- a/src/transport/net_ib.cu
+++ b/src/transport/net_ib.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -119,6 +119,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
}
int found = 0;
struct ibv_device_attr devAttr;
+ memset(&devAttr, 0, sizeof(devAttr));
if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
WARN("NET/IB : Unable to query device %s", devices[d]->name);
if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
diff --git a/src/transport/net_socket.cu b/src/transport/net_socket.cc
index 0464b43..9958936 100644
--- a/src/transport/net_socket.cu
+++ b/src/transport/net_socket.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
diff --git a/src/transport/p2p.cu b/src/transport/p2p.cc
index 9f3e0b6..42b549e 100644
--- a/src/transport/p2p.cu
+++ b/src/transport/p2p.cc
@@ -57,7 +57,7 @@ static int busIdToCudaDev(const char* busId) {
/* Determine if we can communicate with the peer through p2p */
ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
// Do not use P2P across root complexes by default (provided CUDA permits it)
- int p2pLevel = PATH_SOC;
+ int p2pLevel = PATH_NODE;
if (ncclParamP2pDisable() == 1) p2pLevel = 0;
if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
@@ -70,13 +70,26 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
int peerCudaDev = busIdToCudaDev(peerInfo->busId);
- if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process
+ if (peerCudaDev == -1) {
+ // Peer's CUDA device is not visible in this process
+#if CUDART_VERSION >= 10010
+ // But in CUDA 10.1 we can still communicate with 'invisible' devices
+ TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId);
+ // Check for NVLink/NVswitch including P2P access
+ int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId);
+ if (nvlinkp2p > 0) {
+ *ret = nvlinkp2p;
+ return ncclSuccess;
+ }
+#endif
+ return ncclSuccess;
+ }
TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
// Do not detect topology if we're on the same GPU. Note this is not really supported.
if (myInfo->cudaDev == peerCudaDev) {
- *ret = 1 + PATH_SOC;
+ *ret = 1 + PATH_SYS;
return ncclSuccess;
}
@@ -104,7 +117,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
if (err1 == ncclSuccess && err2 == ncclSuccess) {
int distance = pciDistance(myPath, peerPath);
if (distance < p2pLevel) {
- *ret = 1 + PATH_SOC - distance;
+ *ret = 1 + PATH_SYS - distance;
}
}
if (err1 == ncclSuccess) free(myPath);
@@ -112,6 +125,9 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
return ncclSuccess;
}
+#define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway
+#define MAXGPUS_PCI 64
+
static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) {
int nrings = 0;
ncclTvalue_t* line = matrix+current*n;
@@ -139,7 +155,7 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR
}
}
} else {
- int ringsSave[nRingsMax*n];
+ int ringsSave[MAXCHANNELS*MAXGPUS_NVLINKP2P];
int maxStep = 0;
for (int i=0; i<n; i++) {
if (inTheRing[i] == 0 && line[i] > 0) {
@@ -297,9 +313,9 @@ int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nrin
}
static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) {
- for (int score = PATH_SOC+1; score >= minScore; score--) {
+ for (int score = PATH_SYS+1; score >= minScore; score--) {
int best = -1;
- int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end
+ int worst_end_score = PATH_SYS+2; // find the closest to rank, farthest from end
for (int n = 0; n < nranks; n++) {
if (inRing[n]) continue;
if (values[rank*nranks+n] == score) {
@@ -321,7 +337,7 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings,
int start = findConnect(nranks, prev+r*nranks);
int end = findConnect(nranks, next+r*nranks);
- int inRing[nranks];
+ int inRing[MAXGPUS_PCI];
for (int i=0; i<nranks; i++) inRing[i] = 0;
if (start == -1 && end == -1) {
@@ -405,10 +421,14 @@ ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
links += val/CONNECT_NVLINK;
}
if (rank == 0) directLinks = links;
- else directLinks = std::min(directLinks, links);
+ else directLinks = std::min(directLinks, links);
}
if (directLinks > 0) {
// NVLink : Connect rings or create new ones
+ if (nranks > MAXGPUS_NVLINKP2P) {
+ WARN("Recursive P2P computation cannot work for >8 GPUs");
+ return ncclInternalError;
+ }
nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads);
goto end;
}
@@ -600,6 +620,7 @@ ncclResult_t p2pSendFree(void* resources) {
if (sendRes->ipcPtr)
CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
CUDACHECK(cudaFree(sendRes->devMem));
+ free(sendRes);
return ncclSuccess;
}
@@ -608,6 +629,7 @@ ncclResult_t p2pRecvFree(void* resources) {
if (recvRes->ipcPtr)
CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
CUDACHECK(cudaFree(recvRes->devMem));
+ free(recvRes);
return ncclSuccess;
}
diff --git a/src/transport/shm.cu b/src/transport/shm.cc
index 83cc9d1..2ec5f23 100644
--- a/src/transport/shm.cu
+++ b/src/transport/shm.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -60,11 +60,13 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid)
return -1;
}
+#define MAXGROUPS 16
+
ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
int nGroups = groups[nranks-1] + 1;
- int starts[nGroups];
- int ends[nGroups];
+ int starts[MAXGROUPS];
+ int ends[MAXGROUPS];
for (int ring = 0; ring<*nringsRet; ring++) {
int startGroup = -1, endGroup = -1;
for (int group = 0; group<nGroups; group++) {