
github.com/marian-nmt/nccl.git
author    Sylvain Jeaugey <sjeaugey@nvidia.com>  2020-05-13 00:40:18 +0300
committer Sylvain Jeaugey <sjeaugey@nvidia.com>  2020-06-08 19:31:44 +0300
commit    5949d96f36d050e59d05872f8bbffd2549318e95 (patch)
tree      e56476c71668bbd1ce4ddbc189b1be7d037b065c
parent    f36540f55a15683a121b6c330657af442b85c796 (diff)
2.7.3-1
Add support for A100 GPU and related platforms. Add support for CUDA 11. Add support for send/receive operations (beta).
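
As a quick illustration of the new (beta) send/receive support, the sketch below shows one way the point-to-point calls added by this change could be used from application code. It assumes the public entry points ncclSend/ncclRecv declared in the updated src/nccl.h.in, together with the existing ncclGroupStart/ncclGroupEnd grouping calls; the neighbor-exchange pattern, the function and variable names, and the omitted error handling are illustrative only, not part of this patch.

    /* Minimal sketch: each rank sends one buffer to its next neighbor and
     * receives one from its previous neighbor. comm, stream, sendbuf and
     * recvbuf are assumed to be already set up on the current device;
     * error checking is omitted for brevity. */
    #include <nccl.h>
    #include <cuda_runtime.h>

    void exchangeWithNeighbors(const float* sendbuf, float* recvbuf, size_t count,
                               int rank, int nranks,
                               ncclComm_t comm, cudaStream_t stream) {
      int next = (rank + 1) % nranks;
      int prev = (rank - 1 + nranks) % nranks;
      /* Group the send and the receive so they progress concurrently and the
       * pair cannot deadlock against the matching pair on the peer rank. */
      ncclGroupStart();
      ncclSend(sendbuf, count, ncclFloat, next, comm, stream);
      ncclRecv(recvbuf, count, ncclFloat, prev, comm, stream);
      ncclGroupEnd();
      /* The operations are enqueued on `stream`; synchronize the stream (or
       * use events) before reading recvbuf on the host. */
    }

The device-side implementation of these calls is added by the new src/collectives/device/sendrecv.* and src/collectives/sendrecv.cc files listed in the diffstat below.
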
-rw-r--r--  makefiles/common.mk                     |   9
-rw-r--r--  makefiles/version.mk                    |   4
-rw-r--r--  src/Makefile                            |   4
-rw-r--r--  src/bootstrap.cc                        |   3
-rw-r--r--  src/channel.cc                          |  19
-rw-r--r--  src/collectives/device/Makefile         |   4
-rw-r--r--  src/collectives/device/all_gather.h     |  74
-rw-r--r--  src/collectives/device/all_reduce.h     | 223
-rw-r--r--  src/collectives/device/broadcast.h      |  76
-rw-r--r--  src/collectives/device/common.h         |   8
-rw-r--r--  src/collectives/device/functions.cu     |   6
-rwxr-xr-x  src/collectives/device/gen_rules.sh     |   4
-rw-r--r--  src/collectives/device/primitives.h     |  38
-rw-r--r--  src/collectives/device/prims_ll.h       |  23
-rw-r--r--  src/collectives/device/prims_ll128.h    |  19
-rw-r--r--  src/collectives/device/reduce.h         |  82
-rw-r--r--  src/collectives/device/reduce_scatter.h |  74
-rw-r--r--  src/collectives/device/sendrecv.cu      |  14
-rw-r--r--  src/collectives/device/sendrecv.h       |  81
-rw-r--r--  src/collectives/sendrecv.cc             |  37
-rw-r--r--  src/debug.cc                            |  28
-rw-r--r--  src/enqueue.cc                          | 211
-rw-r--r--  src/graph/paths.cc                      | 102
-rw-r--r--  src/graph/search.cc                     |  52
-rw-r--r--  src/graph/topo.cc                       |  24
-rw-r--r--  src/graph/topo.h                        |  15
-rw-r--r--  src/graph/tuning.cc                     |  76
-rw-r--r--  src/graph/xml.cc                        |   2
-rw-r--r--  src/group.cc                            | 165
-rw-r--r--  src/include/alloc.h                     |  10
-rw-r--r--  src/include/checks.h                    |  14
-rw-r--r--  src/include/collectives.h               |  10
-rw-r--r--  src/include/comm.h                      |  13
-rw-r--r--  src/include/core.h                      |  15
-rw-r--r--  src/include/devcomm.h                   |  71
-rw-r--r--  src/include/enqueue.h                   |  15
-rw-r--r--  src/include/graph.h                     |   7
-rw-r--r--  src/include/info.h                      |   8
-rw-r--r--  src/include/nccl_net.h                  |   2
-rw-r--r--  src/include/p2p.h                       |  32
-rw-r--r--  src/include/proxy.h                     |  77
-rw-r--r--  src/include/socket.h                    |   6
-rw-r--r--  src/include/transport.h                 |  72
-rw-r--r--  src/init.cc                             | 152
-rw-r--r--  src/misc/argcheck.cc                    |  25
-rw-r--r--  src/misc/utils.cc                       |   1
-rw-r--r--  src/nccl.h.in                           |  52
-rw-r--r--  src/proxy.cc                            | 283
-rw-r--r--  src/transport.cc                        | 291
-rw-r--r--  src/transport/coll_net.cc               | 291
-rw-r--r--  src/transport/net.cc                    | 260
-rw-r--r--  src/transport/net_ib.cc                 |   2
-rw-r--r--  src/transport/p2p.cc                    |  83
-rw-r--r--  src/transport/shm.cc                    |  28
54 files changed, 2047 insertions(+), 1250 deletions(-)
diff --git a/makefiles/common.mk b/makefiles/common.mk
index ece18c7..8e91a45 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -23,19 +23,24 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
#$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR})
-# Better define NVCC_GENCODE in your environment to the minimal set
+# You should define NVCC_GENCODE in your environment to the minimal set
# of archs to reduce compile time.
CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
+CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
+CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
+# Include Ampere support if we're using CUDA11 or above
+ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+ NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) $(CUDA11_GENCODE) $(CUDA11_PTX)
# Include Volta support if we're using CUDA9 or above
-ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0)
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0)
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
else
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
diff --git a/makefiles/version.mk b/makefiles/version.mk
index 883e625..4a82cb9 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
-NCCL_MINOR := 6
-NCCL_PATCH := 4
+NCCL_MINOR := 7
+NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1
diff --git a/src/Makefile b/src/Makefile
index db1698a..d065888 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -9,10 +9,10 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
-LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc \
misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
- collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
+ collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
##### lib files
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index 11ffc35..e90dd66 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -240,6 +240,7 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
char* env = getenv("NCCL_COMM_ID");
if (env) {
+ INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
if (bootstrapNetCreateHandle(netHandle, env) != 0) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
diff --git a/src/channel.cc b/src/channel.cc
index 0a43e17..d22ea63 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -1,29 +1,17 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "channel.h"
#include "param.h"
-#include "graph.h"
-
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-#define DEFAULT_BUFFER_SIZE_BYTES_ARM (1LL << 20) /* 1MiB */
-
-NCCL_PARAM(Buffsize, "BUFFSIZE", -2);
ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
struct ncclChannel* channel = comm->channels+channelid;
+ if (channel->id != -1) return ncclSuccess;
channel->id = channelid;
- // Setup intermediate buffering
- int buffSize = ncclParamBuffsize();
- int cpuArch, cpuVendor, cpuModel;
- NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
- channel->buffSize = buffSize != -2 ? buffSize :
- cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES;
-
// Ring index to user rank table.
NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
@@ -37,11 +25,12 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
}
// Per-channel operation list.
- NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
+ NCCLCHECK(ncclCudaHostCalloc(&channel->collectives, NCCL_MAX_OPS));
return ncclSuccess;
}
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
+ if (channel->id == -1) return ncclSuccess;
// Operation list
NCCLCHECK(ncclCudaHostFree(channel->collectives));
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
index 001059c..3796fb1 100644
--- a/src/collectives/device/Makefile
+++ b/src/collectives/device/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -10,7 +10,7 @@ include ../../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../../build)
OBJDIR := $(BUILDDIR)/obj/collectives/device
-LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
+LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu
LIBSRCFILES += functions.cu
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index 059092c..724b1aa 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,26 +11,27 @@
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads-WARP_SIZE;
- const int bid = args->bid;
+ const int nthreads = args->coll.nThreads-WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
- const ssize_t size = args->N;
- const int nranks = comm->nRanks;
- const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
- const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+ const ssize_t size = args->coll.count;
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
- ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
@@ -75,27 +76,27 @@ __device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
+ const int nthreads = args->coll.nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
-
- ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- //const int rank = comm->rank;
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
- chunkSize = args->lastChunkSize;
+ chunkSize = args->coll.lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
@@ -140,29 +141,28 @@ __device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
+ const int nthreads = args->coll.nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
-
- ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- //const int rank = comm->rank;
- const int nranks = comm->nRanks;
- ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+ const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+ ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
- const ssize_t loopSize = args->nChannels*chunkSize;
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
+ chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t chunkOffset = gridOffset + bid*chunkSize;
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index 4e04f88..6891ac0 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,26 +11,27 @@
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads-WARP_SIZE;
- const int bid = args->bid;
+ const int nthreads = args->coll.nThreads-WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
- const ssize_t size = args->N;
- const int nranks = comm->nRanks;
- const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
- const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+ const ssize_t size = args->coll.count;
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
- ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
- int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
+ ssize_t realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
@@ -85,28 +86,29 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads-WARP_SIZE;
- const int bid = args->bid;
+ const int nthreads = args->coll.nThreads-WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
- const ssize_t size = args->N;
- const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
- int chunkSize = args->lastChunkSize;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+ int chunkSize = args->coll.lastChunkSize;
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
if (loopSize > size) {
- chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
do {
struct ncclTree* tree = &channel->treeUp;
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
- ncclPrimitives<UNROLL/2, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL/2, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, 0, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
@@ -124,17 +126,17 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
do {
struct ncclTree* tree = &channel->treeDn;
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
- ncclPrimitives<UNROLL/2, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL/2, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, &tree->up, tree->down, thisOutput, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
- prims.send(thisOutput+offset, nelem);
+ prims.directSend(thisOutput+offset, offset, nelem);
} else if (tree->down[0] == -1) {
- prims.recv(thisOutput+offset, nelem);
+ prims.directRecv(thisOutput+offset, offset, nelem);
} else {
- prims.recvCopySend(thisOutput+offset, nelem);
+ prims.directRecvCopySend(thisOutput+offset, offset, nelem);
}
}
} while(0);
@@ -143,27 +145,28 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads-WARP_SIZE;
- const int bid = args->bid;
+ const int nthreads = args->coll.nThreads-WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
- const ssize_t size = args->N;
- const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
- int chunkSize = args->lastChunkSize;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+ int chunkSize = args->coll.lastChunkSize;
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
if (loopSize > size) {
- chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
- if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
+ if (blockIdx.x < nChannels) { // first half of the channels do reduce
struct ncclTree* tree = &channel->collTreeUp;
- ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
@@ -178,9 +181,9 @@ __device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
}
}
- if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
+ if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
struct ncclTree* tree = &channel->collTreeDn;
- ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
@@ -199,28 +202,27 @@ __device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
+ const int nthreads = args->coll.nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
-
- ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- //const int rank = comm->rank;
- const int nranks = comm->nRanks;
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*nranks*chunkSize;
+ const ssize_t size = args->coll.count;
- const ssize_t loopSize = args->nChannels*nranks*chunkSize;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
+ chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
@@ -229,7 +231,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// step 0: push data to next GPU
chunk = ring->devUserRanks[nranks-1];
- offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.send(thisInput+offset, nelem);
@@ -237,7 +239,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
chunk = ring->devUserRanks[nranks-j];
- offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceSend(thisInput+offset, nelem);
@@ -246,7 +248,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
chunk = ring->devUserRanks[0];
- offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
@@ -254,7 +256,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
chunk = ring->devUserRanks[nranks-j];
- offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvCopySend(thisOutput+offset, nelem);
@@ -262,7 +264,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
// Make final copy from buffer to dest.
chunk = ring->devUserRanks[1];
- offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
@@ -273,27 +275,29 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->bid;
+ const int nthreads = args->coll.nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
- const ssize_t size = args->N;
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
if (loopSize > size) {
- chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
do {
struct ncclTree* tree = &channel->treeUp;
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
- ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
+ ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
@@ -311,7 +315,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
do {
struct ncclTree* tree = &channel->treeDn;
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
- ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
+ ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
@@ -330,26 +334,28 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->bid;
+ const int nthreads = args->coll.nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
- const ssize_t size = args->N;
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
if (loopSize > size) {
- chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
- if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
+ if (blockIdx.x < nChannels) { // first half of the channels do reduce
struct ncclTree* tree = &channel->collTreeUp;
- ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
@@ -364,9 +370,9 @@ __device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
}
}
- if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
+ if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
struct ncclTree* tree = &channel->collTreeDn;
- ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
@@ -386,29 +392,28 @@ __device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
+ const int nthreads = args->coll.nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
-
- ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- //const int rank = comm->rank;
- const int nranks = comm->nRanks;
- ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+ const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+ ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*nranks*chunkSize;
+ const ssize_t size = args->coll.count;
- const ssize_t loopSize = args->nChannels*nranks*chunkSize;
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
+ chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
@@ -417,7 +422,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
// step 0: push data to next GPU
chunk = ring->devUserRanks[nranks-1];
- offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.send(thisInput+offset, nelem);
@@ -425,7 +430,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
chunk = ring->devUserRanks[nranks-j];
- offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceSend(thisInput+offset, nelem);
@@ -434,7 +439,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
chunk = ring->devUserRanks[0];
- offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
@@ -442,7 +447,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
chunk = ring->devUserRanks[nranks-j];
- offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
LLprims.recvCopySend(thisOutput+offset, nelem);
@@ -450,7 +455,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
// Make final copy from buffer to dest.
chunk = ring->devUserRanks[1];
- offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
@@ -461,29 +466,31 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->bid;
+ const int nthreads = args->coll.nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclTree* treeUp = &channel->treeUp;
struct ncclTree* treeDn = &channel->treeDn;
- const ssize_t size = args->N;
- ssize_t chunkSize = args->lastChunkSize;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+ ssize_t chunkSize = args->coll.lastChunkSize;
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
- const ssize_t loopSize = args->nChannels*chunkSize;
+ const ssize_t loopSize = nChannels*chunkSize;
int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
+ const ssize_t size = args->coll.count;
if (loopSize > size) {
- chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
if (treeUp->up == -1) {
// ReduceAndBroadcast : max number of recv is 3, max number of send is 3
- ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, channel, comm, args->opCount);
+ ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -492,7 +499,7 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
} else {
if (tid < nthreadsSplit) {
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
- ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, channel, comm, args->opCount);
+ ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
@@ -505,7 +512,7 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
}
} else {
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
- ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, channel, comm, args->opCount);
+ ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index 5146682..b141a5d 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,28 +11,29 @@
template<int UNROLL, class FUNC, typename T>
__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads-WARP_SIZE;
- const int bid = args->bid;
+ const int nthreads = args->coll.nThreads-WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
- const ssize_t size = args->N;
- const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
- const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
+ const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+ const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
- const int root = args->root;
+ const int root = args->coll.root;
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
- ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
@@ -60,29 +61,29 @@ __device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
+ const int nthreads = args->coll.nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
-
- ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
- const int root = args->root;
+ const int root = args->coll.root;
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
- chunkSize = args->lastChunkSize;
+ chunkSize = args->coll.lastChunkSize;
}
ssize_t offset = gridOffset + bid*chunkSize;
@@ -111,30 +112,29 @@ __device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
+ const int nthreads = args->coll.nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
-
- ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+ ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
- const int root = args->root;
-
- ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
- const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+ const int root = args->coll.root;
- const ssize_t loopSize = args->nChannels*chunkSize;
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
+ chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index 6e06369..a76f4e8 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -67,10 +67,10 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
c = &firstColl; \
} else { \
c = &localColl; \
- load_coll(c, channel->devCollectives+channel->collFifoHead, tid, comm); \
+ load_coll(c, channel->collectives+channel->collFifoHead, tid, comm); \
} \
while (1) { \
- if (tid < c->args.nThreads) { \
+ if (tid < c->args.common.nThreads) { \
if (c->funcIndex == fIndex) { \
coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
} else { \
@@ -86,7 +86,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
\
/* Load next collective operation*/ \
c = &localColl; /* for bid 0 */ \
- load_coll(c, channel->devCollectives+nextIndex, tid, comm); \
+ load_coll(c, channel->collectives+nextIndex, tid, comm); \
} \
}
#else
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index d10f11e..119cd36 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -56,6 +56,7 @@ __device__ volatile uint64_t* ncclShmem;
// Must be consistent with ncclFunc_t
#define NCCL_FUNCS() { \
+ NCCL_COLL_NAME(ncclSendRecv, copy, i8),\
NCCL_FUNCS2B(ncclBroadcast), \
NCCL_FUNCS2A(ncclReduce), \
NCCL_FUNCS2B(ncclAllGather), \
@@ -63,11 +64,12 @@ __device__ volatile uint64_t* ncclShmem;
NCCL_FUNCS2A(ncclAllReduce) }
// Must be consistent with the ncclFuncSet enum
-__device__ ncclKern_t ncclFuncs[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
+__device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
#if __CUDA_ARCH__
+ NCCL_COLL_NAME(ncclSendRecv, copy, i8),
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh
index 4413213..97dc0ae 100755
--- a/src/collectives/device/gen_rules.sh
+++ b/src/collectives/device/gen_rules.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -9,7 +9,7 @@ dir=$1
targets="GENOBJS := \\\\\n"
-for base in all_reduce all_gather broadcast reduce reduce_scatter; do
+for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
opn=0
for op in sum prod min max; do
dtn=0
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index c1067bf..bbbde25 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -32,7 +32,7 @@
} while (0)
// Implementation of primitive types
-template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, class FUNC>
+template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, int DIRECT, class FUNC>
class ncclPrimitives {
private:
const int tid;
@@ -70,10 +70,18 @@ class ncclPrimitives {
inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
inline __device__ void barrier() {
- asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+ if (NSEND>NRECV) {
+ asm volatile ("bar.sync 1, %0;" :: "r"(nthreads+WARP_SIZE));
+ } else {
+ asm volatile ("bar.sync 2, %0;" :: "r"(nthreads+WARP_SIZE));
+ }
}
inline __device__ void subBarrier() {
- asm volatile ("bar.sync 2, %0;" :: "r"(nthreads-WARP_SIZE));
+ if (NSEND>NRECV) {
+ asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
+ } else {
+ asm volatile ("bar.sync 4, %0;" :: "r"(nthreads));
+ }
}
uint32_t mismatch = 0;
@@ -183,7 +191,7 @@ class ncclPrimitives {
for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
}
- bool syncThread = tid >= nthreads-WARP_SIZE;
+ bool syncThread = tid >= nthreads;
#pragma unroll
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
@@ -196,10 +204,10 @@ class ncclPrimitives {
if (DIRECTRECV && recvDirectBuff[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (SEND) {
- ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads-WARP_SIZE, 1, srcs, nsend, dsts+1, realSize);
+ ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
}
} else {
- ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads-WARP_SIZE, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
+ ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
}
}
}
@@ -223,11 +231,11 @@ class ncclPrimitives {
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
- recvBuff[i] = (const T*)conn->buff;
+ recvBuff[i] = (const T*)conn->buffs[NCCL_PROTO_SIMPLE];
recvStep[i] = conn->step;
recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
recvDirectBuff[i] = NULL;
- if (directBuff && (conn->direct & NCCL_DIRECT_GPU)) {
+ if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
recvDirectBuff[i] = directBuff;
if (tid == 0) *conn->ptrExchange = directBuff;
}
@@ -240,7 +248,7 @@ class ncclPrimitives {
recvConnTailPtr = recvConn->tail;
recvConnTailCache = *recvConnTailPtr;
}
- if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+ if (tid >= nthreads && wid < nrecv) {
recvConnHeadPtr = recvConn->head;
// Return credits in case we rounded up.
*recvConnHeadPtr = recvConnHead;
@@ -249,12 +257,12 @@ class ncclPrimitives {
}
}
- __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
- sendBuff[i] = (T*)conn->buff;
+ __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
+ sendBuff[i] = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
sendStep[i] = conn->step;
sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
sendDirectBuff[i] = NULL;
- if (directBuff && (conn->direct & NCCL_DIRECT_GPU)) {
+ if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
void* volatile* ptr = conn->ptrExchange;
while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
barrier();
@@ -271,13 +279,13 @@ class ncclPrimitives {
sendConnFifoPtr = sendConn->fifo;
*(sendConn->opCountLoc) = opCount;
}
- if (tid >= nthreads-WARP_SIZE && wid<nsend) {
+ if (tid >= nthreads && wid<nsend) {
sendConnTailPtr = sendConn->tail;
}
}
__device__ __forceinline__ void saveRecvSync() {
- if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+ if (tid >= nthreads && wid < nrecv) {
recvConn->step = recvConnHead;
*(recvConn->opCountLoc) = opCount+1;
__threadfence_system();
@@ -300,7 +308,7 @@ class ncclPrimitives {
barrier();
for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff);
- for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff);
+ for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
loadRecvSync();
loadSendSync();
}
diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h
index f919493..5518061 100644
--- a/src/collectives/device/prims_ll.h
+++ b/src/collectives/device/prims_ll.h
@@ -1,9 +1,16 @@
+/*************************************************************************
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
template <typename T, class FUNC, int NRECV, int NSEND>
class ncclLLPrimitives {
private:
const int tid;
const int nthreads;
const int wid;
+ const int stepLines;
int nrecv = 0;
int nsend = 0;
struct ncclConnInfo* recvConn = NULL;
@@ -22,8 +29,8 @@ class ncclLLPrimitives {
union ncclLLFifoLine* sendBuff[NSEND];
struct ncclDevComm* comm;
- inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
- inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+ inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepLines; }
+ inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepLines; }
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
@@ -68,7 +75,7 @@ class ncclLLPrimitives {
if (checkAbort(wid, 1)) break;
}
if (sendConnFifoPtr) {
- int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
+ int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes;
sendConnFifoPtr[sendConnHead%NCCL_STEPS] = size;
}
sendConnHead += 1;
@@ -88,7 +95,7 @@ class ncclLLPrimitives {
// LL Cleanup : write all flags in the slice to make sure we don't have
// data corruption when flag loops over.
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
- for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
+ for (int o = offset; o<stepLines; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
}
sendStep[i]++;
}
@@ -164,7 +171,7 @@ class ncclLLPrimitives {
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
- recvBuff[i] = conn->llBuff;
+ recvBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL];
recvStep[i] = conn->step;
if (wid == i) recvConn = conn;
nrecv++;
@@ -179,7 +186,7 @@ class ncclLLPrimitives {
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
- sendBuff[i] = conn->llBuff;
+ sendBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL];
sendStep[i] = conn->step;
if (wid == i) sendConn = conn;
nsend++;
@@ -212,8 +219,8 @@ class ncclLLPrimitives {
public:
__device__ __forceinline__
- ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
- : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), opCount(opCount) {
+ ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepLines, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
+ : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepLines(stepLines), opCount(opCount) {
// Make sure step is updated before we read it.
barrier();
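
With the new stepLines constructor argument, the LL fifo geometry is no longer a compile-time constant; it is derived from the communicator's per-protocol buffer size. A sketch of the caller side, taken from the LL kernels later in this diff:

  // One step holds stepLines fifo lines; NCCL_STEPS steps share the LL buffer.
  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] /
                        (sizeof(union ncclLLFifoLine) * NCCL_STEPS);
  ncclLLPrimitives<T, FUNC, 1, 1>
    LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);
  // recvOffset()/sendOffset() then advance by stepLines per step instead of
  // the former NCCL_LL_SLICE_LINES constant.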
diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h
index 40a8cff..f445e0d 100644
--- a/src/collectives/device/prims_ll128.h
+++ b/src/collectives/device/prims_ll128.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -14,6 +14,7 @@ class ncclLL128Primitives {
const int tid;
const int nthreads;
const int wid;
+ const int stepSize;
const int warp;
const bool flagThread;
int nrecv = 0;
@@ -38,8 +39,8 @@ class ncclLL128Primitives {
volatile uint64_t* shmem;
- inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; }
- inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; }
+ inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
+ inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; }
@@ -47,9 +48,9 @@ class ncclLL128Primitives {
inline __device__ void barrier() {
if (NSEND>NRECV) {
- asm volatile ("bar.sync 2, %0;" :: "r"(nthreads));
+ asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
} else {
- asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
+ asm volatile ("bar.sync 2, %0;" :: "r"(nthreads));
}
}
@@ -309,7 +310,7 @@ class ncclLL128Primitives {
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
- recvBuff[i] = conn->ll128Buff;
+ recvBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128];
recvStep[i] = conn->step;
if (wid == i) recvConn = conn;
nrecv++;
@@ -324,7 +325,7 @@ class ncclLL128Primitives {
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
- sendBuff[i] = conn->ll128Buff;
+ sendBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128];
sendStep[i] = conn->step;
if (wid == i) sendConn = conn;
nsend++;
@@ -363,8 +364,8 @@ class ncclLL128Primitives {
public:
__device__ __forceinline__
- ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
- : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), opCount(opCount), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
+ ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
+ : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), opCount(opCount), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
// Make sure step is updated before we read it.
barrier();
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index e36613f..19b090e 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,29 +11,30 @@
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads-WARP_SIZE;
- const int bid = args->bid;
+ const int nthreads = args->coll.nThreads-WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
- const ssize_t size = args->N;
- const int nranks = comm->nRanks;
- const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
- const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+ const ssize_t size = args->coll.count;
const int rank = ring->devUserRanks[0];
const int prevRank = ring->devUserRanks[nranks-1];
- const int root = args->root;
+ const int root = args->coll.root;
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
- ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
@@ -56,30 +57,30 @@ __device__ void ncclReduceCollNetKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
+ const int nthreads = args->coll.nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
-
- ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- const int rank = comm->rank;
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+ const int rank = comm->rank;
const int prevRank = ring->devUserRanks[nranks-1];
- const int root = args->root;
+ const int root = args->coll.root;
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
- chunkSize = args->lastChunkSize;
+ chunkSize = args->coll.lastChunkSize;
}
ssize_t offset = gridOffset + bid*chunkSize;
@@ -104,31 +105,30 @@ __device__ void ncclReduceCollNetLLKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
+ const int nthreads = args->coll.nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
-
- ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- const int rank = comm->rank;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+ ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+ const int rank = comm->rank;
const int prevRank = ring->devUserRanks[nranks-1];
- const int root = args->root;
-
- ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
- const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+ const int root = args->coll.root;
- const ssize_t loopSize = args->nChannels*chunkSize;
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
+ chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
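
The LL128 chunk size above converts a step counted in 64-bit buffer elements into a count of T elements, keeping only the data portion of each 128-byte line. A worked example, assuming the usual values NCCL_LL128_LINEELEMS = 16 and NCCL_LL128_DATAELEMS = 15 (these constants are not shown in this diff):

  // chunkSize = stepSize * DATAELEMS * sizeof(uint64_t) / (LINEELEMS * sizeof(T))
  // e.g. stepSize = 4096 uint64_t per step and T = float:
  //   4096 * 15 * 8 / (16 * 4) = 491520 / 64 = 7680 floats per chunk,
  // i.e. 15/16 of the raw step carries data, the remaining 1/16 holds LL128 flags.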
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index 0b0ae81..a0a9cc0 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,26 +11,27 @@
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads-WARP_SIZE;
- const int bid = args->bid;
+ const int nthreads = args->coll.nThreads-WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
- const ssize_t size = args->N;
- const int nranks = comm->nRanks;
- const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
- const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+ const ssize_t size = args->coll.count;
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
- ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
ssize_t chunkOffset = gridOffset + bid*realChunkSize;
@@ -70,27 +71,27 @@ __device__ void ncclReduceScatterCollNetKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
+ const int nthreads = args->coll.nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
-
- ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- //const int rank = comm->rank;
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
const int nranks = comm->nRanks;
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
- chunkSize = args->lastChunkSize;
+ chunkSize = args->coll.lastChunkSize;
}
ssize_t chunkOffset = gridOffset + bid*chunkSize;
@@ -132,29 +133,28 @@ __device__ void ncclReduceScatterCollNetLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
+ const int nthreads = args->coll.nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
struct ncclRing* ring = &channel->ring;
-
- ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- //const int rank = comm->rank;
- const int nranks = comm->nRanks;
- ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+ const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+ ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
- const ssize_t loopSize = args->nChannels*chunkSize;
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);
// Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
+ chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
ssize_t chunkOffset = gridOffset + bid*chunkSize;
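
The simple-protocol loop above splits the remaining elements evenly across channels, then aligns the per-channel chunk so that every thread moves whole 8-byte words. A small worked illustration with made-up sizes, assuming chunkSize does not clip the result and that ALIGN_SIZE rounds up to a multiple of its second argument, as elsewhere in NCCL:

  // realChunkSize = min(chunkSize, DIVUP(size - gridOffset, nChannels));
  // ALIGN_SIZE(realChunkSize, nthreads * sizeof(uint64_t) / sizeof(T));
  //
  // size - gridOffset = 1,000,000 floats, nChannels = 4, nthreads = 256:
  //   DIVUP(1,000,000, 4)        = 250,000
  //   alignment = 256 * 8 / 4    = 512 elements
  //   rounded up to 512          = 250,368
  // The later min(realChunkSize, size - chunkOffset) trims the final partial chunk.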
diff --git a/src/collectives/device/sendrecv.cu b/src/collectives/device/sendrecv.cu
new file mode 100644
index 0000000..34e7adf
--- /dev/null
+++ b/src/collectives/device/sendrecv.cu
@@ -0,0 +1,14 @@
+/*************************************************************************
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "sendrecv.h"
+#include "common.h"
+#include "collectives.h"
+
+#if NCCL_OP == 0 && NCCL_TYPE == 0
+IMPL_COLL_FUNC(ncclSendRecv, copy, FuncSum, i8, int8_t);
+IMPL_COLL_KERN(ncclSendRecv, copy, FuncSum, i8, int8_t, 0);
+#endif
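
Send/receive only moves bytes, so unlike the typed collectives a single device function is instantiated (the i8 "copy" variant above); the enqueue.cc hunk further down places it at the head of the ncclKerns table and reaches it through FUNC_INDEX_P2P. A short note in code form, drawn from those two hunks (the indexing detail is an inference, not spelled out in this diff):

  // Only one kernel body exists for all send/recv operations:
  //   IMPL_COLL_FUNC(ncclSendRecv, copy, FuncSum, i8, int8_t)
  // ncclKerns[0] = NCCL_KERN_NAME(ncclSendRecv, copy, i8), and p2p entries set
  // coll->funcIndex = FUNC_INDEX_P2P, so datatype and reduction op do not
  // affect the launch; sizes travel as bytes in args->p2p.sendCount/recvCount.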
diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h
new file mode 100644
index 0000000..2fc64af
--- /dev/null
+++ b/src/collectives/device/sendrecv.h
@@ -0,0 +1,81 @@
+/*************************************************************************
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "devcomm.h"
+#include "primitives.h"
+#include "collectives.h"
+
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclSendRecvKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->p2p.nThreads-2*WARP_SIZE;
+
+ // Compute pointers
+ const T* sendbuff = (const T*)args->sendbuff;
+ T* recvbuff = (T*)args->recvbuff;
+
+ if (args->p2p.delta < 0 ) return; // No-op
+
+ if (args->p2p.delta == 0) {
+ if (tid < nthreads && sendbuff != recvbuff) {
+ // local copy : ReduceOrCopyMulti takes an int as number of elements,
+ // so we split it in blocks of 1G elements.
+ int blockSize = 1<<30;
+ for (size_t offset=0; offset<args->p2p.sendCount; offset += blockSize) {
+ size_t remaining = args->p2p.sendCount - offset;
+ if (remaining < blockSize) blockSize = remaining;
+ ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, blockSize);
+ sendbuff += blockSize; recvbuff += blockSize;
+ }
+ }
+ return;
+ }
+
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS)/SENDRECV_SLICEFACTOR;
+
+ int nthreadsSplit = nthreads/2;
+ // We set NRECV or NSEND to 2 to use different barriers in primitives for the send threads and
+ // receive threads, but then we define all peers to -1 since sender threads don't receive and
+ // receive threads don't send.
+ int peerNone[2] = {-1,-1};
+
+ if (tid < nthreadsSplit + WARP_SIZE ) {
+ const ssize_t sendSize = args->p2p.sendCount;
+ if (sendSize < 0) return;
+
+ int peer = (comm->rank+(int)args->p2p.delta)%comm->nRanks;
+ ncclPrimitives<UNROLL, 1, 1, T, 2, 1, 1, FUNC>
+ prims(tid, nthreadsSplit, peerNone, &peer, recvbuff, stepSize*4, channel, comm, args->opCount);
+
+ if (sendSize == 0) {
+ prims.send(sendbuff, 0);
+ } else for (ssize_t offset = 0; offset < sendSize; offset += stepSize) {
+ int realChunkSize = min(stepSize, sendSize-offset);
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ int nelem = min(realChunkSize, sendSize-offset);
+ prims.directSend(sendbuff+offset, offset, nelem);
+ }
+ } else {
+ const ssize_t recvSize = args->p2p.recvCount;
+ if (recvSize < 0) return;
+
+ int peer = (comm->rank-(int)args->p2p.delta+comm->nRanks)%comm->nRanks;
+ ncclPrimitives<UNROLL, 1, 1, T, 1, 2, 1, FUNC>
+ prims(tid-nthreadsSplit-WARP_SIZE, nthreads-nthreadsSplit, &peer, peerNone, recvbuff, stepSize*4, channel, comm, args->opCount);
+
+ if (recvSize == 0) {
+ prims.recv(recvbuff, 0);
+ } else for (ssize_t offset = 0; offset < recvSize; offset += stepSize) {
+ int realChunkSize = min(stepSize, recvSize-offset);
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ int nelem = min(realChunkSize, recvSize-offset);
+ prims.directRecv(recvbuff+offset, offset, nelem);
+ }
+ }
+}
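
The kernel above gives the first nthreadsSplit+WARP_SIZE threads to the send side and the remaining threads to the receive side, each with its own ncclPrimitives object; delta selects the peer symmetrically. A sketch of the peer arithmetic, taken directly from the hunk:

  // The send half targets rank + delta, the receive half rank - delta (mod nRanks),
  // so the two ends of each pairwise exchange agree on the same delta.
  int sendPeer = (comm->rank + (int)args->p2p.delta) % comm->nRanks;
  int recvPeer = (comm->rank - (int)args->p2p.delta + comm->nRanks) % comm->nRanks;
  // delta == 0 : pure local copy, handled before the split
  // delta <  0 : no-op entry injected for idle channels (see setupLaunch in enqueue.cc)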
diff --git a/src/collectives/sendrecv.cc b/src/collectives/sendrecv.cc
new file mode 100644
index 0000000..2e32875
--- /dev/null
+++ b/src/collectives/sendrecv.cc
@@ -0,0 +1,37 @@
+/*************************************************************************
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "collectives.h"
+#include "argcheck.h" // Need some checks here since we access comm
+
+NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream) {
+ struct ncclInfo info = { ncclCollSendRecv, "Send",
+ sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */
+ 1, 1 };
+ ncclResult_t ret;
+ NCCLCHECK(ncclGroupStart());
+ ret = ncclEnqueueCheck(&info);
+ NCCLCHECK(ncclGroupEnd());
+ return ret;
+}
+
+NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream) {
+ struct ncclInfo info = { ncclCollSendRecv, "Recv",
+ NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
+ 1, 1 };
+ ncclResult_t ret;
+ NCCLCHECK(ncclGroupStart());
+ ret = ncclEnqueueCheck(&info);
+ NCCLCHECK(ncclGroupEnd());
+ return ret;
+}
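
Each API call above opens and closes its own group so that a lone ncclSend or ncclRecv is valid; to pair a send with a receive, callers wrap both in one outer group so NCCL can post them together rather than have one block on the other. A usage sketch (buffers, peer and stream are placeholders, not part of this diff):

  // Exchange one buffer with a peer; grouping lets NCCL see both operations
  // before launching anything.
  ncclGroupStart();
  ncclSend(sendbuf, count, ncclFloat, peer, comm, stream);
  ncclRecv(recvbuf, count, ncclFloat, peer, comm, stream);
  ncclGroupEnd();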
diff --git a/src/debug.cc b/src/debug.cc
index b2fc03c..3b99201 100644
--- a/src/debug.cc
+++ b/src/debug.cc
@@ -17,7 +17,7 @@ pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
void ncclDebugInit() {
pthread_mutex_lock(&ncclDebugLock);
- if (ncclDebugLevel != -1) return;
+ if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
const char* nccl_debug = getenv("NCCL_DEBUG");
if (nccl_debug == NULL) {
ncclDebugLevel = NCCL_LOG_NONE;
@@ -60,6 +60,8 @@ void ncclDebugInit() {
mask = NCCL_GRAPH;
} else if (strcasecmp(subsys, "TUNING") == 0) {
mask = NCCL_TUNING;
+ } else if (strcasecmp(subsys, "ENV") == 0) {
+ mask = NCCL_ENV;
} else if (strcasecmp(subsys, "ALL") == 0) {
mask = NCCL_ALL;
}
@@ -125,27 +127,32 @@ void ncclDebugInit() {
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
if (ncclDebugLevel == -1) ncclDebugInit();
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
+ if (ncclDebugLevel < level) return;
+ // Gather the rank information. This can take > 1us so we want to make sure
+ // we only do it when needed.
char hostname[1024];
getHostName(hostname, 1024, '.');
int cudaDev;
cudaGetDevice(&cudaDev);
+ int pid = getpid();
+ int tid = gettid();
char buffer[1024];
size_t len = 0;
pthread_mutex_lock(&ncclDebugLock);
- if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
+ if (level == NCCL_LOG_WARN)
len = snprintf(buffer, sizeof(buffer),
- "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
- else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask))
+ "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line);
+ else if (level == NCCL_LOG_INFO && (flags & ncclDebugMask))
len = snprintf(buffer, sizeof(buffer),
- "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev);
+ "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
#ifdef ENABLE_TRACE
- else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
+ else if (level == NCCL_LOG_TRACE && (flags & ncclDebugMask)) {
auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch;
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
len = snprintf(buffer, sizeof(buffer),
- "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line);
+ "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, pid, tid, cudaDev, timestamp, filefunc, line);
}
#endif
if (len) {
@@ -157,11 +164,4 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
fflush(ncclDebugFile);
}
pthread_mutex_unlock(&ncclDebugLock);
-
- // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort()
- if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) {
- fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n",
- hostname, getpid(), gettid(), cudaDev, filefunc, line);
- abort();
- }
}
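
The first hunk above also fixes an early return that previously left ncclDebugLock held, and the new level check skips the per-message hostname/pid/tid/device lookups when the message would be filtered anyway. In sketch form, the fast path now reads:

  // Filter as early as possible: expensive per-message work only runs when
  // the message will actually be printed.
  if (ncclDebugLevel == -1) ncclDebugInit();   // lazy one-time init
  if (ncclDebugLevel < level) return;          // cheap reject before any lookups
  // ... gather hostname / pid / tid / cudaDev, format, print under the lock ...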
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 92f3467..2aeaf65 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -54,7 +54,8 @@
NCCL_FUNCS3B(coll, copy)
// Must be consistent with the ncclFuncSet enum
-static void* const ncclKerns[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
+static void* const ncclKerns[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
+ (void*)NCCL_KERN_NAME(ncclSendRecv, copy, i8),
NCCL_FUNCS2B(ncclBroadcast),
NCCL_FUNCS2A(ncclReduce),
NCCL_FUNCS2B(ncclAllGather),
@@ -87,11 +88,29 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par
}
ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
- params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels);
+ // Only launch blocks where we have work to do.
+ for (int c=0; c<comm->p2pnChannels; c++) {
+ if (comm->channels[c].collCount) params->gridDim.x = c+1;
+ }
- // Set active = 2 for the last operation
- for (int r=0; r<params->gridDim.x; r++) {
- struct ncclChannel* channel = comm->channels+r;
+ // Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
+ for (int c=0; c<params->gridDim.x; c++) {
+ struct ncclChannel* channel = comm->channels+c;
+ if (channel->collCount == 0) {
+ int opIndex = channel->collFifoTail;
+ struct ncclColl* c = channel->collectives+opIndex;
+ volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
+ while (activePtr[0] != 0) sched_yield();
+
+ c->args.p2p.delta = -1; // no-op
+ c->funcIndex = FUNC_INDEX_P2P;
+ c->args.comm = comm->devComm;
+ c->active = 1;
+ opIndex = (opIndex+1)%NCCL_MAX_OPS;
+ c->nextIndex = opIndex;
+ channel->collFifoTail = opIndex;
+ channel->collCount++;
+ }
channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active = 2;
}
@@ -146,8 +165,8 @@ ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
}
ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
- if (comm->nRanks == 1) return ncclSuccess;
struct cudaLaunchParams* params = comm->myParams;
+ if (params->gridDim.x == 0) return ncclSuccess;
NCCLCHECK(setupLaunch(comm, params));
@@ -166,21 +185,22 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
params->stream = comm->userStream;
}
- int isLast = 0;
- NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
-
- if (isLast) {
- if (comm->launchMode == ncclComm::GROUP) {
+ if (comm->launchMode == ncclComm::GROUP) {
+ int isLast = 0;
+ NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
+ if (isLast) {
// I'm the last. Launch all operations.
NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
+ NCCLCHECK(ncclCpuBarrierLast(comm));
}
- NCCLCHECK(ncclCpuBarrierLast(comm));
}
return ncclSuccess;
}
ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
- if (comm->nRanks == 1) return ncclSuccess;
+ struct cudaLaunchParams *params = comm->myParams;
+ if (params->gridDim.x == 0) return ncclSuccess;
+
// We can't print the CG mode before the first barrier happened.
if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
*comm->intraCGMode ^= 0x10;
@@ -190,15 +210,16 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
(comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
}
- NCCLCHECK(ncclCpuBarrierOut(comm));
- struct cudaLaunchParams *params = comm->myParams;
if (comm->launchMode == ncclComm::PARALLEL) {
CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
+ } else {
+ NCCLCHECK(ncclCpuBarrierOut(comm));
}
+
// Start the network proxies as soon as the kernel has been launched. We can't
// perform any CUDA call between the two or having a cudaFree between the CUDA
- // launch and the transportStartProxy call could cause a deadlock.
+ // launch and the ncclProxyStart call could cause a deadlock.
// Also, starting the proxies after the CUDA launch seems to be better for
// performance (latency).
for (int r=0; r<params->gridDim.x; r++) {
@@ -208,7 +229,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
}
params->gridDim.x = params->blockDim.x = 0;
comm->lastOpCount = comm->opCount;
- NCCLCHECK(transportStartProxy(comm));
+ NCCLCHECK(ncclProxyStart(comm));
return ncclSuccess;
}
@@ -313,23 +334,32 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
}
static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
+ coll->args.sendbuff = info->sendbuff;
+ coll->args.recvbuff = info->recvbuff;
+ coll->args.comm = info->comm->devComm;
+ coll->args.opCount = info->comm->opCount;
+
+ if (info->coll == ncclCollSendRecv) {
+ coll->args.p2p.sendCount = info->sendbytes;
+ coll->args.p2p.recvCount = info->recvbytes;
+ coll->args.p2p.delta = info->delta;
+ coll->funcIndex = FUNC_INDEX_P2P;
+ coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]+2*WARP_SIZE;
+ return ncclSuccess;
+ }
// Set nstepsPerLoop and nchunksPerLoop
NCCLCHECK(getAlgoInfo(info));
NCCLCHECK(getPatternInfo(info));
NCCLCHECK(getLoopInfo(info));
- coll->args.root = info->root;
- coll->args.N = info->count;
- coll->args.ThisInput = info->sendbuff;
- coll->args.ThisOutput = info->recvbuff;
- coll->args.comm = info->comm->devComm;
- coll->args.opCount = info->comm->opCount;
- coll->args.nChannels = info->nChannels;
- coll->args.nThreads = info->nThreads;
+ coll->args.coll.root = info->root;
+ coll->args.coll.count = info->count;
+ coll->args.coll.nChannels = info->nChannels;
+ coll->args.coll.nThreads = info->nThreads;
coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
- int stepSize = (info->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : info->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
+ int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1;
int chunkSize = stepSize*chunkSteps;
@@ -343,25 +373,28 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2;
}
// Use lastChunkSize as chunkSize
- coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+ coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
// Optimize chunkSize / nSteps
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*16 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth && chunkSize > 32768) chunkSize /= 2;
// Use lastChunkSize as chunkSize
- coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+ coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->protocol == NCCL_PROTO_LL) {
- int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
+ const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
- coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
- ALIGN_SIZE(coll->args.lastChunkSize, info->nThreads*sizeof(uint64_t));
- coll->args.lastChunkSize /= ncclTypeSize(info->datatype);
+ coll->args.coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
+ ALIGN_SIZE(coll->args.coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
+ coll->args.coll.lastChunkSize /= ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
- int nstepsInter = 1+log2i(info->comm->nNodes);
- while (info->nBytes / (info->nChannels*chunkSize) < nstepsInter*4 && chunkSize > 32768) chunkSize /= 2;
+ int nNodes = info->comm->nNodes;
+ float ppn = info->comm->nRanks / (float)nNodes;
+ float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn;
+ while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
+ while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
// Use lastChunkSize as chunkSize
- coll->args.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
+ coll->args.coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
}
// Compute nSteps for proxies
@@ -383,8 +416,19 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
return ncclSuccess;
}
-static ncclResult_t saveKernel(struct ncclInfo* info) {
- if (info->comm->nRanks == 1) {
+static ncclResult_t checkSetStream(struct ncclInfo* info) {
+ if (info->comm->userStreamSet == false) {
+ info->comm->userStream = info->stream;
+ info->comm->userStreamSet = true;
+ } else if (info->stream != info->comm->userStream) {
+ WARN("Error : mixing different streams within a group call is not supported.");
+ return ncclInvalidUsage;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
+ if (info->comm->nRanks == 1 && info->coll != ncclCollSendRecv) {
if (info->sendbuff != info->recvbuff)
CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream));
return ncclSuccess;
@@ -395,22 +439,18 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
NCCLCHECK(computeColl(info, &coll, &proxyArgs));
- info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads);
- if (info->comm->userStreamSet == false) {
- info->comm->userStream = info->stream;
- info->comm->userStreamSet = true;
- } else if (info->stream != info->comm->userStream) {
- WARN("Error : mixing different streams within a group call is not supported.");
- return ncclInvalidUsage;
- }
+ info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
+ int nChannels = info->coll == ncclCollSendRecv ? 1 : coll.args.coll.nChannels;
int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
- for (int bid=0; bid<coll.args.nChannels*nSubChannels; bid++) {
- int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
+
+ for (int bid=0; bid<nChannels*nSubChannels; bid++) {
+ int channelId = (info->coll == ncclCollSendRecv) ? info->channelId :
+ info->comm->myParams->gridDim.x % info->comm->nChannels;
struct ncclChannel* channel = info->comm->channels+channelId;
if (channel->collCount == NCCL_MAX_OPS) {
- WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
+ WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS);
return ncclInvalidUsage;
}
@@ -420,18 +460,22 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
if (nSubChannels == 2) {
info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown;
}
- NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
+ if (info->coll == ncclCollSendRecv) {
+ info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1);
+ NCCLCHECK(ncclProxySaveP2p(info, channel));
+ } else {
+ NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
+ }
info->comm->myParams->gridDim.x++;
-
int opIndex = channel->collFifoTail;
struct ncclColl* c = channel->collectives+opIndex;
volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
while (activePtr[0] != 0) sched_yield();
memcpy(c, &coll, sizeof(struct ncclColl));
+ if (info->coll != ncclCollSendRecv) c->args.coll.bid = bid % coll.args.coll.nChannels;
- c->args.bid = bid % coll.args.nChannels;
c->active = 1;
opIndex = (opIndex+1)%NCCL_MAX_OPS;
c->nextIndex = opIndex;
@@ -442,35 +486,82 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
return ncclSuccess;
}
+// Save p2p operations in comm->p2plist. Operations will be posted to channels
+// during ncclGroupEnd()
+ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
+ struct ncclComm* comm = info->comm;
+ struct ncclP2Plist* p2plist = &comm->p2plist;
+ int peer = info->root;
+ p2plist->count++;
+ ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
+ if (info->recvbuff == NULL) {
+ if (peer != comm->rank) {
+ int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
+ for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+ int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
+ if (comm->channels[channelId].peers[peer].send.connected == 0) {
+ p2plist->connect.send[channelId*comm->nRanks+p2plist->connect.nsend[channelId]++] = peer;
+ }
+ }
+ }
+ p2plist->peerlist[info->root].sendbytes = nBytes;
+ p2plist->peerlist[info->root].sendbuff = info->sendbuff;
+ } else {
+ if (peer != comm->rank) {
+ int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
+ for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+ int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
+ if (comm->channels[channelId].peers[peer].recv.connected == 0) {
+ p2plist->connect.recv[channelId*comm->nRanks+p2plist->connect.nrecv[channelId]++] = peer;
+ }
+ }
+ }
+ p2plist->peerlist[info->root].recvbytes = nBytes;
+ p2plist->peerlist[info->root].recvbuff = info->recvbuff;
+ }
+ return ncclSuccess;
+}
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
- if (info->comm == NULL) return ncclInvalidArgument;
-
- INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
- info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
- info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
-
// Launch asynchronously if needed
if (ncclAsyncMode()) {
ncclResult_t ret = ncclSuccess;
int savedDev = -1;
+ // Check arguments
+ NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
if (info->comm->checkPointers) {
CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end);
}
- // Check arguments
NCCLCHECKGOTO(ArgsCheck(info), ret, end);
// Always register comm even in case of error to make sure ncclGroupEnd
// cleans it up.
NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
- NCCLCHECKGOTO(saveKernel(info), ret, end);
+ NCCLCHECKGOTO(checkSetStream(info), ret, end);
+
+ INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
+ info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
+ info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
+
+ if (info->coll == ncclCollSendRecv) { //p2p stored separately
+ NCCLCHECKGOTO(ncclSaveP2p(info), ret, end);
+ } else {
+ NCCLCHECKGOTO(ncclSaveKernel(info), ret, end);
+ }
end:
if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
ncclAsyncErrCheck(ret);
return ret;
} else {
+ NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
NCCLCHECK(ArgsCheck(info));
- NCCLCHECK(saveKernel(info));
+ NCCLCHECK(checkSetStream(info));
+
+ INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
+ info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
+ info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
+
+ NCCLCHECK(ncclSaveKernel(info));
NCCLCHECK(ncclBarrierEnqueue(info->comm));
NCCLCHECK(ncclBarrierEnqueueWait(info->comm));
NCCLCHECK(ncclEnqueueEvents(info->comm));
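
ncclSaveP2p above records which channels still need a send or receive connection for a peer; the channel index is derived from the rank delta, and the send-side and receive-side formulas are mirror images so both ends pick the same channels. A worked example with nRanks = 8:

  // Rank 2 sends to peer 5:  delta = (8 - (2 - 5)) % 8 = 11 % 8 = 3
  // Rank 5 receives from 2:  delta = (8 + (5 - 2)) % 8 = 11 % 8 = 3
  // Both sides then connect on
  //   channelId = (3 + comm->p2pChannels[c]) % comm->p2pnChannels
  // for c in [0, p2pnChannelsPerPeer), so the matching send and recv end up
  // on the same channels.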
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 0872ae7..b711874 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -9,6 +9,7 @@
#include "topo.h"
#include "comm.h"
#include "net.h"
+#include "channel.h"
// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths
@@ -231,15 +232,16 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
}
}
}
- if (l >= 0) INFO(NCCL_GRAPH, "%s set from environment to %s", levelEnv, topoPathTypeStr[l]);
+ if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]);
*level = l >= 0 ? l : -2;
}
return ncclSuccess;
}
int ncclTopoUserP2pLevel = -1;
-ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p) {
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read) {
*p2p = 0;
+ *read = 0;
// Get GPUs from topology
int g1, g2;
@@ -254,21 +256,33 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
// In general, use P2P whenever we can.
int p2pLevel = PATH_SYS;
+ // User override
+ if (ncclTopoUserP2pLevel == -1)
+ NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL"));
+ if (ncclTopoUserP2pLevel != -2) {
+ p2pLevel = ncclTopoUserP2pLevel;
+ goto compare;
+ }
+
// Don't use P2P through ARM CPUs
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
- if (arch == NCCL_TOPO_CPU_ARCH_X86 &&
- vendor == NCCL_TOPO_CPU_VENDOR_INTEL &&
- model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
-
- // User override
- NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL"));
- if (ncclTopoUserP2pLevel != -2) p2pLevel = ncclTopoUserP2pLevel;
+ if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+ if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
+ else p2pLevel = PATH_PHB;
+ }
+compare:
// Compute the PCI distance and compare with the p2pLevel.
if (path->type <= p2pLevel) *p2p = 1;
+ if (path->type == PATH_NVL) {
+ struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
+ // Enable P2P Read for Ampere/NVLink only
+ if ((gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
+ }
+
return ncclSuccess;
}
@@ -341,8 +355,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
// Update path when we don't want to / can't use GPU Direct P2P
for (int p=0; p<system->nodes[GPU].count; p++) {
- int p2p;
- NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p));
+ int p2p, read;
+ NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, &read));
if (p2p == 0) {
// Divert all traffic through the CPU
int cpu;
@@ -437,3 +451,69 @@ void ncclTopoFree(struct ncclTopoSystem* system) {
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
free(system);
}
+
+static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
+ int peer;
+ struct ncclTopoLinkList* path = NULL;
+ if (ncclTopoRankToIndex(system, peerRank, &peer) == ncclSuccess) {
+ // Same rank
+ if (g == peer) {
+ *nChannels = -1;
+ return ncclSuccess;
+ }
+ // Local rank
+ path = system->nodes[GPU].nodes[peer].paths[GPU]+g;
+ if (path->type == PATH_NVL) {
+ int sm = system->nodes[GPU].nodes[g].gpu.cudaCompCap;
+ double nvlWidth = sm < 70 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
+ *nChannels = 2*std::max(1, (int)(path->width / nvlWidth));
+ } else {
+ *nChannels = 2;
+ }
+ } else {
+ // Remote rank, use network
+ *nChannels = 1;
+ }
+ return ncclSuccess;
+}
+
+NCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 1);
+NCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS);
+
+static int nextPow2(int v) {
+ int pow2 = 1;
+ while (pow2 < v) pow2 <<= 1;
+ return pow2;
+}
+
+ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
+ comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels());
+ comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels());
+ int minChannels = comm->p2pnChannels;
+ // We need to loop through all local GPUs to have a global picture
+ for (int g=0; g<comm->topo->nodes[GPU].count; g++) {
+ for (int r=0; r<comm->nRanks; r++) {
+ int nChannels;
+ NCCLCHECK(ncclTopoGetNchannels(comm->topo, g, r, &nChannels));
+ if (nChannels >= 0) minChannels = std::min(minChannels, nChannels);
+ }
+ }
+
+ // Round to next pow2 nChannelsPerPeer and nChannels
+ comm->p2pnChannelsPerPeer = nextPow2(minChannels);
+ comm->p2pnChannels = nextPow2(comm->p2pnChannels);
+
+ // Init channels that weren't used so far
+ for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c));
+
+ // We want to spread channels used when there aren't many and progressively
+ // fill the whole space of nChannels. To do so we mirror the bits in the
+ // nChannels space.
+ for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
+ int mirror = 0;
+ for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb;
+ comm->p2pChannels[c] = mirror;
+ }
+ INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer);
+ return ncclSuccess;
+}
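
ncclTopoComputeP2pChannels spreads a small number of per-peer channels across the whole p2p channel space by mirroring the bits of the channel index. A worked example, assuming p2pnChannels = 8 and p2pnChannelsPerPeer = 4:

  // mirror reverses c within log2(p2pnChannels) = 3 bit positions:
  //   c = 0 (000) -> 000 = 0
  //   c = 1 (001) -> 100 = 4
  //   c = 2 (010) -> 010 = 2
  //   c = 3 (011) -> 110 = 6
  // p2pChannels = {0, 4, 2, 6}: the four per-peer channels are spread evenly
  // over the 8 available channels instead of being packed into 0..3.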
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 1bbb7d3..42e1bb9 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -13,13 +13,11 @@
// Initialize system->maxWidth. This is the per-channel (i.e. per-SM)
// max speed.
static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) {
- float nvLinkWidth = gpu->gpu.cudaCompCap > 60 ? VOLTA_NVLINK_WIDTH : PASCAL_NVLINK_WIDTH;
float maxWidth = 0.0;
for (int i=0; i<system->nodes[type].count; i++) {
struct ncclTopoLinkList* path = gpu->paths[type]+i;
float width = path->width;
if (path->count == 0) continue;
- if (path->type == PATH_NVL) width = std::min(nvLinkWidth, width);
maxWidth = std::max(maxWidth, width);
}
return maxWidth;
@@ -73,7 +71,7 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod
struct ncclTopoLink* revLink = NULL;
float fwSpeed = link->type == LINK_PCI ? pciSpeed : speed;
float revSpeed = 0;
- if (link->remNode->type == GPU && start->type != GPU) {
+ if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) {
if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
revSpeed += fwSpeed/8;
}
@@ -326,6 +324,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
for (int n=0; n<system->nodes[NET].count; n++) {
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+ if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
if (net) {
@@ -394,8 +393,10 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
}
if (graph->nChannels == 0 || graph->sameChannels == 0) {
if (graph->nChannels == 0) {
- // Always try the PCI order first to set a reference
- NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, NET, n, 0));
+ // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long
+ int t = 1 << 10;
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0));
+ if (t == -1) *time = -1;
}
// Then try the most local GPUs
@@ -528,7 +529,7 @@ ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, st
}
return ncclSuccess;
}
-ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels) {
int id;
NCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &id));
if (graph->id != id) return ncclSuccess;
@@ -551,11 +552,12 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc
for (int s=0; s<xmlGraph->nSubs; s++) {
NCCLCHECK(ncclTopoGetChannelFromXml(xmlGraph->subs[s], s, system, graph));
}
+ *nChannels = xmlGraph->nSubs;
return ncclSuccess;
}
-ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels) {
for (int s=0; s<xmlGraphs->nSubs; s++) {
- NCCLCHECK(ncclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph));
+ NCCLCHECK(ncclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph, nChannels));
}
return ncclSuccess;
}
@@ -621,7 +623,7 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
return ncclSuccess;
}
-float speedArray[] = { 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+float speedArray[] = { 42.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
#define NSPEEDS (sizeof(speedArray)/sizeof(float))
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
@@ -636,10 +638,13 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
char* str = getenv("NCCL_GRAPH_FILE");
if (str) {
+ INFO(NCCL_ENV, "NCCL_GRAPH_FILE set by environment to %s", str);
struct ncclXml* xml;
NCCLCHECK(ncclCalloc(&xml, 1));
NCCLCHECK(ncclTopoGetXmlGraphFromFile(str, xml));
- NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph));
+ int nChannels;
+ NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels));
+ INFO(NCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels);
free(xml);
if (graph->nChannels > 0) return ncclSuccess;
}
@@ -764,6 +769,15 @@ done:
graph->typeIntra = graph->typeInter = PATH_SYS;
graph->nChannels = 1;
}
+
+ if (graph->speedIntra >= 25.0) {
+ int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
+ memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
+ memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
+ graph->speedIntra /= 2;
+ graph->speedInter /= 2;
+ graph->nChannels = dupChannels;
+ }
return ncclSuccess;
}
@@ -795,6 +809,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) {
char* str = getenv("NCCL_GRAPH_DUMP_FILE");
if (str) {
+ INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str);
struct ncclXml* xml;
NCCLCHECK(ncclCalloc(&xml, 1));
NCCLCHECK(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml));
@@ -804,10 +819,17 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
return ncclSuccess;
}
-ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int rank, int channelId, int* dev) {
- int channel = channelId%graph->nChannels;
- int ngpus = system->nodes[GPU].count;
- int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
- *dev = graph->inter[channel*2+index];
+ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* dev) {
+ if (graph) {
+ // Honor the net device in the graph
+ int channel = channelId%graph->nChannels;
+ int ngpus = system->nodes[GPU].count;
+ int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
+ *dev = graph->inter[channel*2+index];
+ } else {
+ int64_t id;
+ NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, channelId));
+ *dev = id;
+ }
return ncclSuccess;
}
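
ncclTopoGetNetDev now takes the graph as an optional argument: when a graph is present, the NIC comes from the graph's inter array, indexed by whether the rank is the GPU that opens the channel's intra-node chain; otherwise the new ncclTopoGetLocalNet round-robin is used. A small self-contained sketch of the graph-based lookup, with made-up channel contents:

#include <stdio.h>

// For a given channel, use inter[channel*2+0] when the rank is the first GPU
// listed for that channel, and inter[channel*2+1] otherwise. Sample arrays
// below are illustrative only.
static int pickNetDev(const int* intra, const int* inter, int nChannels, int ngpus,
                      int rank, int channelId) {
  int channel = channelId % nChannels;
  int index = (intra[channel*ngpus] == rank) ? 0 : 1;
  return inter[channel*2 + index];
}

int main() {
  int intra[] = {0,1,2,3, 3,2,1,0};  // 2 channels, 4 GPUs each
  int inter[] = {0,1, 1,0};          // two NICs per channel
  printf("netDev=%d\n", pickNetDev(intra, inter, 2, 4, /*rank*/3, /*channelId*/1));  // netDev=1
  return 0;
}
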
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index ac6b111..ed79e09 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -504,6 +504,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(ncclCalloc(&xml, 1));
char* xmlTopoFile = getenv("NCCL_TOPO_FILE");
if (xmlTopoFile) {
+ INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml));
}
if (xml->maxIndex == 0) {
@@ -562,6 +563,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
+ INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile);
NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
}
@@ -570,6 +572,28 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
return ncclSuccess;
}
+ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr) {
+ int g;
+ NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
+ int minType = PATH_SYS;
+ float maxWidth = 0;
+ int count = 0;
+ int* nets;
+ NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
+ for (int n=0; n<system->nodes[NET].count; n++) {
+ struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g;
+ if (path->width > maxWidth || (path->width == maxWidth && path->type < minType)) {
+ maxWidth = path->width;
+ minType = path->type;
+ count = 0;
+ }
+ if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
+ }
+ *id = nets[rr % count];
+ free(nets);
+ return ncclSuccess;
+}
+
/****************************/
/* External query functions */
/****************************/
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 848fc03..950cff8 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -126,8 +126,10 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
+ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr);
+
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
-ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
+ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);
static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
@@ -141,4 +143,15 @@ static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, i
return ncclInternalError;
}
+static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index) {
+ *index = -1;
+ for (int i=0; i<system->nodes[GPU].count; i++) {
+ if (system->nodes[GPU].nodes[i].gpu.rank == rank) {
+ *index = i;
+ return ncclSuccess;
+ }
+ }
+ return ncclInternalError;
+}
+
#endif
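
The helper added above selects, among all NICs, those whose path to the GPU has the widest bandwidth (breaking ties by the better path type), then round-robins over the candidates with the rr argument. A simplified standalone sketch of the same selection, using a hypothetical PathSketch struct and sample widths:

#include <vector>
#include <stdio.h>

// Keep the NICs with the widest path (and, among those, the lowest path type),
// then pick one by round-robin. Values are made up for illustration.
struct PathSketch { float width; int type; int netId; };

static int localNet(const std::vector<PathSketch>& paths, int rr) {
  float maxWidth = 0;
  int minType = 1<<30;
  std::vector<int> nets;
  for (size_t n=0; n<paths.size(); n++) {
    const PathSketch& p = paths[n];
    if (p.width > maxWidth || (p.width == maxWidth && p.type < minType)) {
      maxWidth = p.width; minType = p.type; nets.clear();
    }
    if (p.width == maxWidth && p.type == minType) nets.push_back(p.netId);
  }
  return nets[rr % nets.size()];
}

int main() {
  std::vector<PathSketch> paths = { {12.0f, 2, 0}, {12.0f, 2, 1}, {6.0f, 3, 2} };
  printf("%d %d\n", localNet(paths, 0), localNet(paths, 1));  // 0 1 : round-robin over NICs 0 and 1
  return 0;
}
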
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index 8a0b4cd..29424b0 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -51,13 +51,9 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
return ncclSuccess;
}
-static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
-static const char* ncclAlgoStr[] = { "Tree", "Ring", "CollNet" };
-static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" };
-
// Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
-static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 3.6, 8.4 }, { 4.4, 4.4, 0 } };
+static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 10.0, 8.4 }, { 4.4, 4.4, 0 } };
// NVLink, PCI, Network
#define NCCL_HW_NVLINK 0
@@ -66,17 +62,18 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4,
// Tree/Simple is the latency of a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network).
static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
{ /* NVLINK */
- { /* Tree (LL/LL128/Simple)*/ { .5, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { .4, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { .5, 1.9, 4.0 } },
+ { /* Tree (LL/LL128/Simple)*/ { .52, 1.2, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 }, /* CollNet (LL/LL128/Simple)*/ { .5, 1.2, 4.0 } },
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 1.0, 1.9, 5.5 } },
/* NET */
- { /* Tree (LL/LL128/Simple)*/ { 5.0, 7.5, 50 }, /* Ring (LL/LL128/Simple)*/ { .9, 2.5, 6.6 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } }
+ { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 50 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } }
};
-// LL128 max BW for the different collectives
-static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 };
+// LL128 max BW (per channel) for the different collectives
+// ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce
+static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.7 };
-ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
+ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
@@ -89,6 +86,8 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
if (comm->nRanks <= 1) return ncclSuccess;
+ int compCap80 = minCompCap == 80 && maxCompCap == 80 ? 1 : 0;
+ float ppn = (float)comm->nRanks / comm->nNodes; // if ppn < 2, we are sending and receiving on the same GPU through the NIC, so apply some bandwidth discount
struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph };
int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
@@ -98,6 +97,9 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) :
coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 :
comm->nRanks;
+ int nInterSteps = coll == ncclCollAllReduce ? 2*(comm->nNodes-1) :
+ coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nNodes-1 :
+ comm->nNodes;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
if (coll != ncclCollAllReduce && a != NCCL_ALGO_RING) continue;
@@ -105,13 +107,17 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
float busBw = graphs[a]->nChannels * speed;
+ if (compCap80) busBw *= 0.92;
// Various model refinements
- if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/4.0;
- if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw*120.0/128.0, ll128MaxBw[coll]);
- if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, comm->nNodes > 2 ? 80.0 : 110.0);
- if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.0;
- if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0;
+ if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= (comm->nNodes > 1 || coll == ncclCollAllReduce || coll == ncclCollReduce) ? 1.0/4.0 : 1.0/3.0;
+ if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
+ double maxTreeBw = comm->nNodes > 2 ?
+ compCap80 && p == NCCL_PROTO_LL128 ? 105.0 : 80.0 :
+ compCap80 && p == NCCL_PROTO_LL128 ? 130.0 : 110.0;
+ if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, maxTreeBw);
+ if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.8;
+ if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (comm->nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels*7.0/9.0);
if (a == NCCL_ALGO_COLLNET) busBw *= .9;
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128
@@ -121,6 +127,9 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
comm->bandwidths[coll][a][p] = busBw * ratio;
comm->latencies[coll][a][p] = baseLat[a][p];
+ float intraLat = hwLat[intraHw[a]][a][p];
+ float interLat = hwLat[NCCL_HW_NET][a][p];
+ if (comm->nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
if (a == NCCL_ALGO_RING) {
float lat = hwLat[hw[a]][a][p];
if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) {
@@ -131,16 +140,12 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
comm->latencies[coll][a][p] += nsteps*lat;
}
} else {
- comm->latencies[coll][a][p] += nsteps*lat;
+ comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
}
} else if (a == NCCL_ALGO_TREE) {
- float intraLat = hwLat[intraHw[a]][a][p];
- float interLat = hwLat[NCCL_HW_NET][a][p];
comm->latencies[coll][a][p] +=
2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
} else {
- float intraLat = hwLat[intraHw[a]][a][p];
- float interLat = hwLat[NCCL_HW_NET][a][p];
comm->latencies[coll][a][p] +=
2 * (comm->nRanks/comm->nNodes-1) * intraLat + interLat;
}
@@ -154,17 +159,26 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1 };
const char *protoStr = getenv("NCCL_PROTO");
- if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
+ if (protoStr) {
+ INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr);
+ NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
+ }
const char *algoStr = getenv("NCCL_ALGO");
- if (algoStr) NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
+ if (algoStr) {
+ INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
+ NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
+ }
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
int pEnable = protoEnable[p];
if (pEnable == 2 && p == NCCL_PROTO_LL128) {
- // Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption.
- pEnable = (graphs[a]->typeInter <= LINK_PCI) && graphs[a]->typeIntra == LINK_NVL && minCompCap == 70 && maxCompCap == 70 ? 1 : 0;
+ // Enable LL128 by default only on Volta/Ampere+NVLink. Other cases are not tested and may cause silent data corruption.
+ pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
+ ((minCompCap == 70 && maxCompCap == 70) || (minCompCap == 80 && maxCompCap == 80)) ? 1 : 0;
}
- if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
+ if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
+ // Only disable the algorithm for AllReduce, since the other collectives only have one
+ if (c == ncclCollAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
}
if (comm->rank == 0) {
@@ -205,6 +219,7 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
// Override defaults with user env
char* str = getenv("NCCL_THREAD_THRESHOLDS");
if (str) {
+ INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2}};
sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
@@ -228,20 +243,23 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma
}
// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
-// factor is not ideal but works quite well. Powers of two, 64 B to 1 GB.
+// factor is not ideal but works quite well. Powers of two, 64 B to 128MB.
static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
- { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .5, .5, .6, .7, .8, .9, .9, 1.0, 1.0, 1.0 },
- { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .8, .7, .7, .7, .6, .6, .7, .7, .8, .8, .9, .9, 1.0 },
+ { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .4, .4, .5, .6, .7, .8, .9, 1.0, 1.0, 1.0 },
+ { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .7, .6, .6, .6, .5, .6, .6, .7, .7, .8, .9, .9, 1.0 },
{ .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .5, .5, .6, .6, .7, .8, .9 }
};
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time) {
float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
+ float lat = info->comm->latencies[info->coll][algorithm][protocol];
if (bw == 0) {
*time = -1.0; return ncclSuccess;
}
int logSize = log2i(info->nBytes>>6);
if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize];
- *time = info->comm->latencies[info->coll][algorithm][protocol] + (info->nBytes) / (1000 * bw);
+ if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1
+ && info->coll == ncclCollAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring
+ *time = lat + (info->nBytes) / (1000 * bw);
return ncclSuccess;
}
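
The tuning changes above feed ncclTopoGetAlgoTime, which models the cost of an (algorithm, protocol) pair as latency plus bytes over bandwidth, applying a tree correction factor for medium sizes and now a 1.9x latency penalty for large multi-node ring AllReduce. A rough worked example of that formula, with illustrative numbers rather than the tuned tables:

#include <stdio.h>

// Predicted time in microseconds: bw is in GB/s, so nBytes/(1000*bw) yields us.
// The correction factor and plateau penalty mirror the structure above; the
// sample numbers are not NCCL's tuned values.
static float algoTimeUs(float latUs, float bwGBs, double nBytes,
                        float treeCorrection, bool ringPlateau) {
  if (bwGBs == 0) return -1.0f;                      // algorithm/protocol disabled
  float bw = bwGBs * treeCorrection;                 // 1.0 when not using Tree
  float lat = ringPlateau ? latUs * 1.9f : latUs;    // large multi-node ring AllReduce
  return lat + (float)(nBytes / (1000.0 * bw));
}

int main() {
  // 64 MB AllReduce at 40 GB/s effective bus bandwidth and 20 us base latency.
  printf("%.1f us\n", algoTimeUs(20.0f, 40.0f, 64e6, 1.0f, false));  // 1620.0 us
  return 0;
}
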
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index f138d0b..2885787 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -590,7 +590,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
NCCLCHECK(xmlGetSub(pciNode, "nvlink", &nvlNode));
if (nvlNode == NULL) {
// NVML NVLink detection
- int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : 6;
+ int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12;
if (maxNvLinks > 0 && nvmlDev == NULL) {
WARN("No NVML device handle. Skipping nvlink detection.\n");
diff --git a/src/group.cc b/src/group.cc
index 9bf8ac9..549a4fd 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,6 +7,7 @@
#include "group.h"
#include "debug.h"
#include "enqueue.h"
+#include "transport.h"
#define MAX_ASYNC_OPS 128
thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS];
@@ -33,6 +34,7 @@ struct ncclInitArgs {
};
struct ncclCollArgs {
ncclComm_t comm;
+ int connect;
};
enum ncclAsyncFuncType {
@@ -51,16 +53,24 @@ struct ncclAsyncArgs {
thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
-#define CHECK(a) do { \
+#define NCCLCHECKTHREAD(a) do { \
if ((args->ret = (a)) != ncclSuccess) { \
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
return args; \
} \
} while(0)
+#define CUDACHECKTHREAD(a) do { \
+ if ((a) != cudaSuccess) { \
+ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
+ args->ret = ncclUnhandledCudaError; \
+ return args; \
+ } \
+} while(0)
+
void* ncclAsyncThreadMain(void* args_) {
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
- CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev));
+ NCCLCHECKTHREAD(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev));
return args;
}
@@ -99,20 +109,50 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) {
NCCL_API(ncclResult_t, ncclGroupStart);
ncclResult_t ncclGroupStart() {
+ if (ncclGroupMode == 0) {
+ memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS);
+ }
ncclGroupMode++;
return ncclSuccess;
}
+static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) {
+ struct ncclInfo info = { ncclCollSendRecv, "SendRecv",
+ sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
+ 1, 1 };
+ info.delta = delta;
+ info.channelId = channelId;
+ info.sendbytes = sendbytes;
+ info.recvbytes = recvbytes;
+ if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
+ NCCLCHECK(ncclSaveKernel(&info));
+ return ncclSuccess;
+}
+
+void* ncclAsyncThreadPreconnect(void* args_) {
+ struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
+ CUDACHECKTHREAD(cudaSetDevice(args->coll.comm->cudaDev));
+ for (int c=0; c<args->coll.comm->p2pnChannels; c++) {
+ struct ncclComm* comm = args->coll.comm;
+ struct ncclChannel* channel = comm->channels+c;
+ struct ncclP2PConnect* connect = &comm->p2plist.connect;
+ NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, channel, connect->nrecv[c], connect->recv+c*comm->nRanks, connect->nsend[c], connect->send+c*comm->nRanks));
+ connect->nrecv[c] = 0;
+ connect->nsend[c] = 0;
+ }
+ return args;
+}
+
NCCL_API(ncclResult_t, ncclGroupEnd);
ncclResult_t ncclGroupEnd() {
+ if (ncclGroupMode == 0) return ncclInvalidUsage;
ncclGroupMode--;
if (ncclGroupMode > 0) return ncclSuccess;
int savedDev;
CUDACHECK(cudaGetDevice(&savedDev));
- int done = ncclGroupIndex;
+ int activeThreads = 0;
int doneArray[MAX_ASYNC_OPS];
- for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0;
-
+ for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 1;
ncclResult_t ret = ncclGroupError;
if (ret != ncclSuccess) goto group_cleanup;
@@ -121,6 +161,97 @@ ncclResult_t ncclGroupEnd() {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_INIT) {
pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args);
+ activeThreads++;
+ doneArray[i] = 0;
+ }
+ }
+ /* For init, since we use threads, we just wait for threads to complete */
+ while (activeThreads) {
+ for (int i=0; i<ncclGroupIndex; i++) {
+ struct ncclAsyncArgs* args = ncclGroupArgs+i;
+ if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
+ int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
+ if (err == EBUSY) continue;
+ if (err != 0) ret = ncclSystemError;
+ if (args->ret != ncclSuccess) ret = args->ret;
+ doneArray[i] = 1;
+ activeThreads--;
+ }
+ }
+ }
+
+ for (int i=0; i<ncclGroupIndex; i++) {
+ struct ncclAsyncArgs* args = ncclGroupArgs+i;
+ if (args->funcType == ASYNC_FUNC_COLL) {
+ struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
+ if (p2plist->count != 0) {
+ struct ncclComm* comm = args->coll.comm;
+ args->coll.connect = 0;
+ for (int c=0; c<comm->p2pnChannels; c++)
+ args->coll.connect += comm->p2plist.connect.nsend[c] + comm->p2plist.connect.nrecv[c];
+ if (args->coll.connect) {
+ pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args);
+ }
+ }
+ }
+ }
+
+ for (int i=0; i<ncclGroupIndex; i++) {
+ struct ncclAsyncArgs* args = ncclGroupArgs+i;
+ if (args->funcType == ASYNC_FUNC_COLL && (args->coll.connect)) {
+ int err = pthread_join(ncclGroupThreads[i], NULL);
+ if (err != 0) {
+ WARN("Error waiting for pthread_join : %s\n", strerror(errno));
+ return ncclSystemError;
+ }
+ NCCLCHECKGOTO(args->ret, ret, end);
+ }
+ }
+
+ for (int i=0; i<ncclGroupIndex; i++) {
+ struct ncclAsyncArgs* args = ncclGroupArgs+i;
+ if (args->funcType == ASYNC_FUNC_COLL) {
+ struct ncclComm* comm = args->coll.comm;
+ int rank = comm->rank;
+ int nRanks = comm->nRanks;
+ struct ncclP2Plist* p2plist = &args->coll.comm->p2plist;
+ if (p2plist->count) {
+ for (int delta=0; delta<nRanks; delta++) {
+ uint32_t from = (rank+nRanks-delta)%nRanks;
+ uint32_t to = (rank+delta)%nRanks;
+
+ // Compute how much to split operations
+ // Natural step size matching buffer steps.
+ ssize_t stepSize = 4*comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS;
+ // Split each operation on p2pnChannelsPerPeer max.
+ ssize_t recvChunkSize = DIVUP(p2plist->peerlist[from].recvbytes, comm->p2pnChannelsPerPeer);
+ ssize_t sendChunkSize = DIVUP(p2plist->peerlist[to].sendbytes, comm->p2pnChannelsPerPeer);
+ recvChunkSize = std::max((ssize_t)1, DIVUP(recvChunkSize, stepSize)) * stepSize;
+ sendChunkSize = std::max((ssize_t)1, DIVUP(sendChunkSize, stepSize)) * stepSize;
+
+ ssize_t sendOffset = 0;
+ ssize_t recvOffset = 0;
+ int remaining = 1;
+ int chunk = 0;
+ while (remaining) {
+ int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
+ remaining = 0;
+ ssize_t recvbytes = p2plist->peerlist[from].recvbytes-recvOffset;
+ ssize_t sendbytes = p2plist->peerlist[to].sendbytes-sendOffset;
+ if (recvbytes > recvChunkSize) { remaining = 1; recvbytes = recvChunkSize; } else p2plist->peerlist[from].recvbytes = -1;
+ if (sendbytes > sendChunkSize) { remaining = 1; sendbytes = sendChunkSize; } else p2plist->peerlist[to].sendbytes = -1;
+ if (sendbytes >= 0 || recvbytes >= 0) {
+ NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
+ recvbytes, ((char*)(p2plist->peerlist[from].recvbuff)) + recvOffset,
+ sendbytes, ((const char*)(p2plist->peerlist[to].sendbuff)) + sendOffset), ret, end);
+ }
+ recvOffset += recvChunkSize;
+ sendOffset += sendChunkSize;
+ chunk++;
+ }
+ }
+ p2plist->count = 0;
+ }
}
}
@@ -154,25 +285,9 @@ ncclResult_t ncclGroupEnd() {
if (args->coll.comm->userStream == NULL)
CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end);
NCCLCHECKGOTO(ncclEnqueueEvents(args->coll.comm), ret, end);
- doneArray[i] = 1;
- done--;
}
}
- /* For init, since we use threads, we just wait for threads to complete */
- while (done) {
- for (int i=0; i<ncclGroupIndex; i++) {
- struct ncclAsyncArgs* args = ncclGroupArgs+i;
- if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
- int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL);
- if (err == EBUSY) continue;
- if (err != 0) ret = ncclSystemError;
- if (args->ret != ncclSuccess) ret = args->ret;
- doneArray[i] = 1;
- done--;
- }
- }
- }
goto end;
group_cleanup:
if (ret != ncclSuccess) {
@@ -180,12 +295,12 @@ group_cleanup:
// an atomic operation, we need to cancel all operations.
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
- if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) {
- if (args->init.newcomm) NCCLCHECK(ncclCommDestroy(*args->init.newcomm));
+ if (args->funcType == ASYNC_FUNC_INIT) {
+ if (args->init.newcomm) ncclCommDestroy(*args->init.newcomm);
*args->init.newcomm = NULL;
} else {
struct ncclComm* comm = args->coll.comm;
- for (int c=0; c<comm->nChannels; c++) {
+ for (int c=0; c<comm->p2pnChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
for (int i=0; i<channel->collCount; i++) {
channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
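
The new ncclGroupEnd path above turns queued p2p operations into SendRecv work: each peer's byte count is split into at most p2pnChannelsPerPeer chunks, each rounded up to a multiple of the Simple-protocol step size (4*buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS), and the chunks are scheduled on successive p2p channels. A simplified sketch of that splitting with hypothetical sizes:

#include <algorithm>
#include <stdio.h>

static long long divUp(long long x, long long y) { return (x + y - 1) / y; }

// Split sendbytes into at most channelsPerPeer chunks aligned to stepSize and
// print where each chunk would go. Sizes below are made up for illustration.
static void splitSend(long long sendbytes, long long stepSize, int channelsPerPeer) {
  long long chunk = divUp(sendbytes, channelsPerPeer);
  chunk = std::max(1LL, divUp(chunk, stepSize)) * stepSize;
  long long offset = 0;
  int c = 0;
  while (offset < sendbytes) {
    long long bytes = std::min(chunk, sendbytes - offset);
    printf("channel %d : offset %lld bytes %lld\n", c % channelsPerPeer, offset, bytes);
    offset += chunk;
    c++;
  }
}

int main() {
  splitSend(/*sendbytes*/ 5LL<<20, /*stepSize*/ 1LL<<19, /*channelsPerPeer*/ 2);
  // -> 2.5 MiB on channel 0, then 2.5 MiB on channel 1
  return 0;
}
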
diff --git a/src/include/alloc.h b/src/include/alloc.h
index 27e206f..cc652ce 100644
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,10 +12,10 @@
#include "align.h"
#include <sys/mman.h>
-static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
- CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
- memset(*ptr, 0, size);
- *devPtr = *ptr;
+template <typename T>
+static ncclResult_t ncclCudaHostCalloc(T** ptr, size_t nelem) {
+ CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped));
+ memset(*ptr, 0, nelem*sizeof(T));
return ncclSuccess;
}
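
ncclCudaHostAlloc is replaced above by a typed ncclCudaHostCalloc taking an element count. A minimal sketch of the same idea as a hypothetical hostCalloc helper (needs the CUDA runtime to build):

#include <cuda_runtime.h>
#include <string.h>
#include <stdio.h>

// Allocate nelem*sizeof(T) of mapped, zero-initialized pinned host memory,
// so callers pass an element count instead of a raw byte size.
template <typename T>
static cudaError_t hostCalloc(T** ptr, size_t nelem) {
  cudaError_t err = cudaHostAlloc((void**)ptr, nelem*sizeof(T), cudaHostAllocMapped);
  if (err != cudaSuccess) return err;
  memset(*ptr, 0, nelem*sizeof(T));
  return cudaSuccess;
}

int main() {
  unsigned int* flag = NULL;
  if (hostCalloc(&flag, 1) == cudaSuccess) {   // e.g. a per-comm abort flag
    printf("flag=%u\n", *flag);                // zero-initialized
    cudaFreeHost(flag);
  }
  return 0;
}
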
diff --git a/src/include/checks.h b/src/include/checks.h
index 257e9ca..ce81312 100644
--- a/src/include/checks.h
+++ b/src/include/checks.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,17 +11,17 @@
// Check CUDA calls
#define CUDACHECK(cmd) do { \
- cudaError_t e = cmd; \
- if( e != cudaSuccess ) { \
- WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
+ cudaError_t err = cmd; \
+ if( err != cudaSuccess ) { \
+ WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUDACHECKGOTO(cmd, res, label) do { \
- cudaError_t e = cmd; \
- if( e != cudaSuccess ) { \
- WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
+ cudaError_t err = cmd; \
+ if( err != cudaSuccess ) { \
+ WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
res = ncclUnhandledCudaError; \
goto label; \
} \
diff --git a/src/include/collectives.h b/src/include/collectives.h
index bd64106..f854364 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,10 +7,8 @@
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
-#include "core.h"
-#include "info.h"
-
-#define FUNC_INDEX(coll, redop, dtype, al, pr) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
+#define FUNC_INDEX_P2P 0
+#define FUNC_INDEX(coll, redop, dtype, al, pr) (1+(((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
#define NCCL_COLL_NAME(coll, op, dtype) \
coll##_##op##_##dtype
@@ -56,6 +54,7 @@
DECL_COLL2(ncclAllGather, copy) \
DECL_COLL(ncclReduceScatter) \
DECL_COLL(ncclAllReduce) \
+ DECL_COLL5(ncclSendRecv,copy,i8) \
DECL_ALL_COLLS
@@ -70,5 +69,6 @@ DECL_ALL_COLLS
#define BROADCAST_CHUNKSTEPS 1
#define REDUCE_SLICESTEPS 1
#define REDUCE_CHUNKSTEPS 1
+#define SENDRECV_SLICEFACTOR 4
#endif
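
FUNC_INDEX now reserves slot 0 of the device function table for the SendRecv kernel (FUNC_INDEX_P2P) and shifts every collective kernel by one. A tiny sketch of the indexing arithmetic; the op and type counts below are assumptions used only for illustration:

#include <stdio.h>

static const int NUM_OPS = 4, NUM_TYPES = 9, NUM_ALGOS = 3, NUM_PROTOS = 3;

// Same shape as the macro above, with slot 0 kept for the p2p kernel.
static int funcIndex(int coll, int redop, int dtype, int algo, int proto) {
  return 1 + ((((coll*NUM_OPS + redop)*NUM_TYPES + dtype)*NUM_ALGOS + algo)*NUM_PROTOS + proto);
}

int main() {
  printf("p2p slot   = 0\n");
  printf("first coll = %d\n", funcIndex(0, 0, 0, 0, 0));  // 1
  printf("next proto = %d\n", funcIndex(0, 0, 0, 0, 1));  // 2
  return 0;
}
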
diff --git a/src/include/comm.h b/src/include/comm.h
index cc87a42..40143f4 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -8,6 +8,7 @@
#define NCCL_COMM_H_
#include "transport.h"
+#include "p2p.h"
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
@@ -40,6 +41,7 @@ struct ncclSendMem {
};
char pad3[MEM_ALIGN];
};
+ char buff[1]; // Actually larger than that
};
struct ncclRecvMem {
@@ -53,8 +55,6 @@ struct ncclRecvMem {
};
char pad4[MEM_ALIGN];
};
- ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
- uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS];
char buff[1]; // Actually larger than that
};
@@ -88,6 +88,13 @@ struct ncclComm {
// Channels for collectives
int nChannels;
+ // Channels (per peer) for p2p
+ int p2pnChannels;
+ int p2pnChannelsPerPeer;
+ int p2pChannels[MAXCHANNELS];
+
+ // Buffer sizes
+ int buffSizes[NCCL_NUM_PROTOCOLS];
// Algorithm/Protocols thresholds
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
@@ -134,6 +141,8 @@ struct ncclComm {
// Whether this communicator uses collNet
int collNetSupport;
+ // list of async p2p operations queued with group semantics
+ struct ncclP2Plist p2plist;
};
#endif
diff --git a/src/include/core.h b/src/include/core.h
index ac5fa85..0435d9b 100644
--- a/src/include/core.h
+++ b/src/include/core.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -50,19 +50,6 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
}
}
-#define NCCL_NUM_FUNCTIONS 5
-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t;
-
-#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
-#define NCCL_ALGO_TREE 0
-#define NCCL_ALGO_RING 1
-#define NCCL_ALGO_COLLNET 2
-
-#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
-#define NCCL_PROTO_LL 0
-#define NCCL_PROTO_LL128 1
-#define NCCL_PROTO_SIMPLE 2
-
#include "debug.h"
#include "checks.h"
#include "alloc.h"
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
index 96c69ba..f00e6d6 100644
--- a/src/include/devcomm.h
+++ b/src/include/devcomm.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,6 +11,22 @@
#include "align.h"
#include <stdint.h>
+#define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
+typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollSendRecv} ncclFunc_t;
+extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
+
+#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
+#define NCCL_ALGO_TREE 0
+#define NCCL_ALGO_RING 1
+#define NCCL_ALGO_COLLNET 2
+extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
+
+#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
+#define NCCL_PROTO_LL 0
+#define NCCL_PROTO_LL128 1
+#define NCCL_PROTO_SIMPLE 2
+extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
+
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
@@ -34,9 +50,6 @@ union ncclLLFifoLine {
#define NCCL_MAX_NTHREADS 512
#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
#define NCCL_LL_LINES_PER_THREAD 8
-#define NCCL_LL_SLICE_LINES (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
-#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
-#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
#ifdef TEST_LL_CLEANUP
#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
#define NCCL_LL_FLAG_MAX 0x100
@@ -59,10 +72,6 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
// to 3 dests. Use 70% for reduce and 30% for bcast.
#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32)
-#define NCCL_LL128_SLICE_ELEMS (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
-#define NCCL_LL128_BUFF_ELEMS (NCCL_LL128_SLICE_ELEMS*NCCL_STEPS)
-#define NCCL_LL128_BUFF_SIZE (NCCL_LL128_BUFF_ELEMS*sizeof(uint64_t))
-
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
@@ -71,7 +80,7 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
struct ncclConnInfo {
// Regular comm mechanism
- char *buff; // Local for recv, remote for send
+ char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
uint64_t *opCountLoc; // opCount of local rank
@@ -83,13 +92,7 @@ struct ncclConnInfo {
int *fifo; // Size fifo for proxy
uint64_t step; // Keep where we are
-
- // Low latency mechanism
- union ncclLLFifoLine *llBuff; // Local for recv, remote for send
uint64_t llLastCleaning;
-
- // High bandwidth, low latency protocol
- uint64_t* ll128Buff; // Local for recv, remote for send
};
struct ncclConnector {
@@ -136,17 +139,31 @@ struct CollectiveArgs {
uint64_t opCount;
// local and remote input, output, and buffer
- const void * ThisInput;
- void * ThisOutput;
+ const void * sendbuff;
+ void * recvbuff;
- // general parameters
- size_t N;
- uint32_t root;
- uint8_t bid;
- uint8_t nChannels;
- uint16_t nThreads;
-
- int lastChunkSize;
+ // Op-specific fields. Make sure the common part stays the
+ // same on all structs of the union
+ union {
+ struct {
+ uint16_t nThreads;
+ } common;
+ struct {
+ uint16_t nThreads;
+ uint8_t bid;
+ uint8_t nChannels;
+ uint32_t root;
+ size_t count;
+ size_t lastChunkSize;
+ } coll;
+ struct {
+ uint16_t nThreads;
+ uint16_t unused;
+ int32_t delta;
+ size_t sendCount;
+ size_t recvCount;
+ } p2p;
+ };
};
struct ncclColl {
union {
@@ -171,8 +188,6 @@ struct ncclChannel {
struct ncclTree collTreeDn;
int id;
- int nthreads;
- int buffSize;
// Communication structures
struct ncclPeer* peers;
@@ -180,7 +195,6 @@ struct ncclChannel {
// Operation list for aggregation
struct ncclColl* collectives;
- struct ncclColl* devCollectives;
int collStart;
int collCount;
int collFifoHead; // Only used by GPU
@@ -200,6 +214,7 @@ typedef enum {
struct ncclDevComm {
int rank;
int nRanks;
+ int buffSizes[NCCL_NUM_PROTOCOLS];
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
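
CollectiveArgs is reorganized above into a union whose variants (common, coll, p2p) all start with nThreads, so the launch path can read the thread count without knowing which variant was filled in. A standalone sketch of that layout using a hypothetical ArgsSketch struct:

#include <stdint.h>
#include <stdio.h>

struct ArgsSketch {
  uint64_t opCount;
  const void* sendbuff;
  void* recvbuff;
  // All variants share the same leading field (common initial sequence).
  union {
    struct { uint16_t nThreads; } common;
    struct { uint16_t nThreads; uint8_t bid; uint8_t nChannels; size_t count; } coll;
    struct { uint16_t nThreads; int32_t delta; size_t sendCount; size_t recvCount; } p2p;
  };
};

int main() {
  struct ArgsSketch a = {};
  a.p2p.nThreads = 256;                             // written as a p2p op
  a.p2p.delta = 1;
  printf("nThreads=%d\n", (int)a.common.nThreads);  // read through the common view
  return 0;
}
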
diff --git a/src/include/enqueue.h b/src/include/enqueue.h
index cea486e..a7e6e50 100644
--- a/src/include/enqueue.h
+++ b/src/include/enqueue.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,11 +12,12 @@
#include "collectives.h"
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
-ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
-ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
-ncclResult_t ncclCpuBarrierOut(ncclComm_t comm);
-ncclResult_t ncclBarrierEnqueue(ncclComm_t comm);
-ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm);
-ncclResult_t ncclEnqueueEvents(ncclComm_t comm);
+ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast);
+ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm);
+ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm);
+ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm);
+ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm);
+ncclResult_t ncclEnqueueEvents(struct ncclComm* comm);
+ncclResult_t ncclSaveKernel(struct ncclInfo* info);
#endif // End include guard
diff --git a/src/include/graph.h b/src/include/graph.h
index 1814440..70117d5 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -25,10 +25,11 @@ ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
void ncclTopoFree(struct ncclTopoSystem* system);
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
+ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
// Query topology
-ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int rank, int channelId, int* net);
-ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p);
+ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net);
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
// Set CPU affinity
@@ -96,7 +97,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
-ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
+ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
#include "info.h"
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time);
diff --git a/src/include/info.h b/src/include/info.h
index 46b9795..8f125e1 100644
--- a/src/include/info.h
+++ b/src/include/info.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,7 +8,7 @@
#define NCCL_INFO_H_
#include "nccl.h"
-#include "core.h"
+#include "devcomm.h"
typedef enum {
ncclPatternRing,
@@ -47,6 +47,10 @@ struct ncclInfo {
size_t nBytes;
int nstepsPerLoop;
int nchunksPerLoop;
+ ssize_t sendbytes;
+ ssize_t recvbytes;
+ uint32_t delta;
+ int channelId;
};
#endif
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index 95dce5b..fd19f81 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -16,7 +16,7 @@
#define NCCL_PTR_CUDA 0x2
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
-typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ALL=~0} ncclDebugLogSubSys;
+typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys;
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
diff --git a/src/include/p2p.h b/src/include/p2p.h
new file mode 100644
index 0000000..9d3730e
--- /dev/null
+++ b/src/include/p2p.h
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <stdlib.h>
+
+#ifndef NCCL_P2P_H_
+#define NCCL_P2P_H_
+
+struct ncclP2Pinfo {
+ const void* sendbuff;
+ void* recvbuff;
+ ssize_t sendbytes;
+ ssize_t recvbytes;
+};
+
+struct ncclP2PConnect {
+ int nrecv[MAXCHANNELS];
+ int nsend[MAXCHANNELS];
+ int* recv;
+ int* send;
+};
+
+struct ncclP2Plist {
+ struct ncclP2Pinfo *peerlist;
+ int count;
+ struct ncclP2PConnect connect;
+};
+
+#endif
diff --git a/src/include/proxy.h b/src/include/proxy.h
new file mode 100644
index 0000000..04daa84
--- /dev/null
+++ b/src/include/proxy.h
@@ -0,0 +1,77 @@
+/*************************************************************************
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PROXY_H_
+#define NCCL_PROXY_H_
+
+#include <pthread.h>
+
+enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
+
+struct ncclProxyArgs;
+typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
+
+struct ncclProxyArgs {
+ proxyProgressFunc_t progress;
+ struct ncclChannel* channel;
+ struct ncclConnector* connector;
+ int sliceSteps;
+ int chunkSteps;
+ int nsteps;
+ uint64_t opCount;
+ int protocol;
+ ncclDataType_t dtype;
+ ncclRedOp_t redOp;
+ int state; // add component before this line -- it is left out during initialization
+
+ // Internal state
+ uint64_t head;
+ uint64_t tail;
+ uint64_t end;
+ void* requests[NCCL_STEPS];
+ int idle;
+
+ // Element linking
+ pthread_mutex_t mutex;
+ struct ncclProxyArgs* next;
+ struct ncclProxyArgs* nextPeer;
+};
+
+struct ncclProxyPool;
+struct ncclProxyState {
+ pthread_cond_t cond;
+ pthread_mutex_t mutex;
+ bool stop;
+ struct ncclProxyArgs* ops;
+ struct ncclProxyArgs* pool;
+ struct ncclProxyPool* pools;
+};
+
+typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
+
+enum proxyMode {
+ proxyRing = 0,
+ proxyFrom = 1,
+ proxyTo = 2
+};
+
+ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks);
+ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel);
+ncclResult_t ncclProxyStart(struct ncclComm* comm);
+ncclResult_t ncclProxyCreate(struct ncclComm* comm);
+ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
+
+#include <unistd.h>
+
+// Spin wait until func evaluates to true
+template<typename FUNC>
+inline void transportProxyWait(const FUNC& func) {
+ while (!func()) {
+ sched_yield();
+ }
+}
+
+#endif
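
proxy.h keeps the transportProxyWait spin-wait helper that moved out of transport.h. A small usage sketch, with a made-up atomic flag and worker thread standing in for real proxy state:

#include <atomic>
#include <thread>
#include <sched.h>
#include <stdio.h>

// Spin, yielding the CPU, until the predicate evaluates to true.
template<typename FUNC>
static inline void proxyWait(const FUNC& func) {
  while (!func()) sched_yield();
}

int main() {
  std::atomic<bool> ready(false);
  std::thread worker([&]() { ready = true; });   // pretend this posts a proxy op
  proxyWait([&]() { return ready.load(); });
  printf("proxy op is ready\n");
  worker.join();
  return 0;
}
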
diff --git a/src/include/socket.h b/src/include/socket.h
index 9376062..46b204d 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -53,6 +53,8 @@ static inline int envSocketFamily(void) {
if (env == NULL)
return family;
+ INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env);
+
if (strcmp(env, "AF_INET") == 0)
family = AF_INET; // IPv4
else if (strcmp(env, "AF_INET6") == 0)
@@ -290,6 +292,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam
// User specified interface
char* env = getenv("NCCL_SOCKET_IFNAME");
if (env && strlen(env) > 1) {
+ INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
// Specified by user : find or fail
if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
@@ -301,7 +304,8 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam
if (nIfs == 0) {
char* commId = getenv("NCCL_COMM_ID");
if (commId && strlen(commId) > 1) {
- // Try to find interface that is in the same subnet as the IP in comm id
+ INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
+ // Try to find interface that is in the same subnet as the IP in comm id
union socketAddress idAddr;
GetSocketAddrFromString(&idAddr, commId);
nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
diff --git a/src/include/transport.h b/src/include/transport.h
index e25132f..5a85688 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,6 +11,7 @@
#include "graph.h"
#include "nvmlwrap.h"
#include "core.h"
+#include "proxy.h"
#define NTRANSPORTS 3
#define TRANSPORT_P2P 0
@@ -39,49 +40,8 @@ struct ncclConnect {
char data[CONNECT_SIZE];
};
-enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
-
-struct ncclProxyArgs;
-typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
-
-struct ncclProxyArgs {
- proxyProgressFunc_t progress;
- struct ncclChannel* channel;
- struct ncclConnector* connector;
- int sliceSteps;
- int chunkSteps;
- int nsteps;
- uint64_t opCount;
- int protocol;
- ncclDataType_t dtype;
- ncclRedOp_t redOp;
- int state; // add component before this line -- it is left out during initialization
-
- // Internal state
- uint64_t head;
- uint64_t tail;
- uint64_t end;
- void* requests[NCCL_STEPS];
- int idle;
-
- // Element linking
- pthread_mutex_t mutex;
- struct ncclProxyArgs* next;
- struct ncclProxyArgs* nextPeer;
-};
-
-struct ncclProxyPool;
-struct ncclProxyState {
- pthread_cond_t cond;
- pthread_mutex_t mutex;
- bool stop;
- struct ncclProxyArgs* ops;
- struct ncclProxyArgs* pool;
- struct ncclProxyPool* pools;
-};
-
struct ncclTransportComm {
- ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
+ ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId);
ncclResult_t (*connect)(struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
@@ -94,30 +54,6 @@ struct ncclTransport {
struct ncclTransportComm recv;
};
-#include <pthread.h>
-
-typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
-
-enum proxyMode {
- proxyRing = 0,
- proxyFrom = 1,
- proxyTo = 2
-};
-
-ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr);
-ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks);
-ncclResult_t transportStartProxy(struct ncclComm* comm);
-ncclResult_t transportCreateProxy(struct ncclComm* comm);
-ncclResult_t transportDestroyProxy(struct ncclComm* comm);
-
-#include <unistd.h>
-
-// Spin wait until func evaluates to true
-template<typename FUNC>
-inline void transportProxyWait(const FUNC& func) {
- while (!func()) {
- sched_yield();
- }
-}
+ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend);
#endif
diff --git a/src/init.cc b/src/init.cc
index 0a02760..2be994d 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -37,6 +37,10 @@ std::chrono::high_resolution_clock::time_point ncclEpoch;
#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
#endif
+const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
+const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" };
+const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
+
NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
@@ -116,7 +120,7 @@ static ncclResult_t ncclInit() {
pthread_mutex_lock(&initLock);
if (!initialized) {
initEnv();
- initNet();
+ NCCLCHECK(initNet());
INFO(NCCL_INIT, "Using network %s", ncclNetName());
initialized = true;
}
@@ -154,6 +158,9 @@ void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) {
static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
+ free(comm->p2plist.peerlist);
+ free(comm->p2plist.connect.recv);
+ free(comm->p2plist.connect.send);
free(comm->peerInfo);
ncclTopoFree(comm->topo);
@@ -164,7 +171,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
CUDACHECK(cudaFree(comm->hostDevComm.channels));
CUDACHECK(cudaFree(comm->devComm));
- for (int channel=0; channel<comm->nChannels; channel++)
+ for (int channel=0; channel<MAXCHANNELS; channel++)
NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
if (comm->doneEvent != NULL)
@@ -228,14 +235,24 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
#endif
comm->fatalError = ncclSuccess;
- NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t)));
+ NCCLCHECK(ncclCudaHostCalloc((ncclDevError_t**)&comm->fatalDevError, 1));
+ comm->hostDevComm.fatalDevError = comm->fatalDevError;
*comm->fatalDevError = ncclDevSuccess;
- NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t)));
+ NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1));
+ comm->hostDevComm.abortFlag = comm->abortFlag;
*comm->abortFlag = 0;
comm->argsptr = &comm->args;
comm->collNetSupport = 0;
+ comm->p2plist.count=0;
+ NCCLCHECK(ncclCalloc(&comm->p2plist.peerlist, comm->nRanks));
+ for (int r=0; r<comm->nRanks; r++) comm->p2plist.peerlist[r].sendbytes = comm->p2plist.peerlist[r].recvbytes = -1;
+ NCCLCHECK(ncclCalloc(&comm->p2plist.connect.recv, MAXCHANNELS*comm->nRanks));
+ NCCLCHECK(ncclCalloc(&comm->p2plist.connect.send, MAXCHANNELS*comm->nRanks));
+
+ // Mark channels as not initialized.
+ for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1;
*comret = comm;
return ncclSuccess;
@@ -243,13 +260,12 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
static ncclResult_t devCommSetup(ncclComm_t comm) {
// Duplicate the channels on the device
- NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels));
- NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels));
+ NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels));
+ NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels));
// Copy userRanks and peers
- for (int r=0; r<comm->nChannels; r++) {
+ for (int r=0; r<comm->p2pnChannels; r++) {
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
- NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks+1));
}
// Duplicate the dev comm on the device
@@ -290,23 +306,6 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
return ncclSuccess;
}
-template <int type>
-static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
- for (int t=0; t<NTRANSPORTS; t++) {
- struct ncclTransport *transport = ncclTransports+t;
- struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
- int ret = 0;
- NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo));
- if (ret) {
- connector->transportComm = transportComm;
- NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId));
- return ncclSuccess;
- }
- }
- WARN("No transport found !");
- return ncclInternalError;
-}
-
static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) {
TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
NCCLCHECK(initChannel(comm, channelId));
@@ -379,6 +378,7 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
// Set CG Mode
comm->launchMode = ncclComm::GROUP;
char* str = getenv("NCCL_LAUNCH_MODE");
+ if (str) INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", str);
if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) {
comm->launchMode = ncclComm::PARALLEL;
}
@@ -399,50 +399,26 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
return ncclSuccess;
}
-static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
- TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
- uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
- struct ncclConnect connect;
- struct ncclConnector* conn;
- for (int i=0; i<nrecv; i++) {
- int peer = peerRecv[i];
- if (peer == -1 || peer >= comm->nRanks) continue;
- conn = &channel->peers[peer].recv;
- if (conn->connected) { ++nSkippedRecv; continue; }
- memset(&connect, 0, sizeof(connect));
- NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
- NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
- }
- for (int i=0; i<nsend; i++) {
- int peer = peerSend[i];
- if (peer == -1 || peer >= comm->nRanks) continue;
- conn = &channel->peers[peer].send;
- if (conn->connected) { ++nSkippedSend; continue; }
- memset(&connect, 0, sizeof(connect));
- NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
- NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
- }
- for (int i=0; i<nsend; i++) {
- int peer = peerSend[i];
- if (peer == -1 || peer >= comm->nRanks) continue;
- conn = &channel->peers[peer].send;
- if (conn->connected) {++nSkippedSend; continue; }
- memset(&connect, 0, sizeof(connect));
- NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
- NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
- conn->connected = 1;
- }
- for (int i=0; i<nrecv; i++) {
- int peer = peerRecv[i];
- if (peer == -1 || peer >= comm->nRanks) continue;
- conn = &channel->peers[peer].recv;
- if (conn->connected) {++nSkippedRecv; continue; }
- memset(&connect, 0, sizeof(connect));
- NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
- NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
- conn->connected = 1;
+#define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine))
+#define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t))
+#define DEFAULT_BUFFSIZE (1LL << 22) /* 4MiB */
+#define DEFAULT_BUFFSIZE_ARM (1LL << 20) /* 1MiB */
+NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
+NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
+NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
+
+static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
+ int cpuArch, cpuVendor, cpuModel;
+ NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
+
+ int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() };
+ int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE };
+
+ if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM;
+
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ comm->buffSizes[p] = comm->hostDevComm.buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
}
- TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
return ncclSuccess;
}
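
computeBuffSizes above picks one buffer size per protocol: LL and LL128 defaults derived from their per-thread line and element counts, 4 MiB for Simple (1 MiB on ARM CPUs), each overridable through NCCL_LL_BUFFSIZE, NCCL_LL128_BUFFSIZE and NCCL_BUFFSIZE. A simplified sketch of that precedence; the -2 sentinel and NCCL_PARAM caching are omitted, and the 16-byte LL line size is an assumption:

#include <stdlib.h>
#include <stdio.h>

// Return the environment override if present, otherwise the default.
static long long pickBuffSize(const char* envName, long long defaultBytes) {
  const char* v = getenv(envName);
  return v ? atoll(v) : defaultBytes;
}

int main() {
  long long simple = pickBuffSize("NCCL_BUFFSIZE", 1LL<<22);          // 4 MiB default
  long long ll     = pickBuffSize("NCCL_LL_BUFFSIZE", 8LL*512*8*16);  // lines/thread * threads * steps * 16B line (assumed)
  printf("Simple=%lld LL=%lld\n", simple, ll);
  return 0;
}
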
@@ -451,7 +427,8 @@ extern struct ncclTransport collNetTransport;
// All ranks must participate in collNetSetup call
// type: 0 for send, 1 for recv
// return: 0 - unsupported, 1 - supported
-static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int collNetChannels, int rank, int nranks, int masterRank, int masterPeer, int nMasters, int type) {
+// We do not NCCLCHECK this call because we fall back to the P2P network if CollNet setup fails
+static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int rank, int nranks, int masterRank, int masterPeer, int nMasters, int type) {
int rankInCollNet = -1;
int supported = 0;
int isMaster = (rank == masterRank) ? 1 : 0;
@@ -483,7 +460,7 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
// setup
struct ncclConnect myConnect;
if (isMaster && ret > 0) {
- NCCLCHECK(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->buffSize, channel->id));
+ NCCLCHECK(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id));
}
// prepare connect handles
ncclResult_t res;
@@ -514,12 +491,15 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap
// connect
if (isMaster && ret > 0) {
NCCLCHECKGOTO(transportComm->connect(masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
+ struct ncclPeer* devRoot = channel->devPeers+nranks;
+ struct ncclConnector* devConn = (type == 1) ? &devRoot->recv : &devRoot->send;
+ CUDACHECKGOTO(cudaMemcpy(devConn, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice), res, cleanup);
}
// recv side sends connect info to send side
if (isMaster && type == 1) {
sendrecvExchange.collNetRank = rankInCollNet;
memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect));
- NCCLCHECK(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)));
+ NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup);
INFO(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
}
if (ret > 0) {
@@ -746,7 +726,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
if (comm->nNodes > 1 &&
ncclParamCollNetEnable() == 1 &&
- collNetSupport()) {
+ collNetSupport() && collNetGraph.nChannels) {
NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank));
}
@@ -758,7 +738,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
- NCCLCHECK(ncclTopoSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
+ NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
char line[1024];
line[0]='\0';
@@ -779,6 +759,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(ncclTopoSetAffinity(comm->topo, comm->rank));
ncclResult_t ret;
+ NCCLCHECK(computeBuffSizes(comm));
+
// Connect with prev/next for each ring
struct ncclConnect *connect;
NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore);
@@ -786,15 +768,15 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
struct ncclChannel* channel = comm->channels+c;
NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
if (comm->nRanks == 1) continue;
- NCCLCHECKGOTO(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
- NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore);
- NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore);
+ NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
+ NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore);
+ NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore);
}
// Check if we can setup CollNet
if (comm->nNodes > 1 &&
ncclParamCollNetEnable() == 1 &&
- collNetSupport()) {
+ collNetSupport() && collNetGraph.nChannels) {
int logicChannels = comm->nChannels/2;
int collNetSetupFail = 0;
const int recvIndex = 0; // recv GPU index is always 0
@@ -802,13 +784,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
for (int c=0; c<logicChannels; c++) {
struct ncclChannel* channelRecv = comm->channels+logicChannels+c;
struct ncclChannel* channelSend = comm->channels+c;
- NCCLCHECK(p2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down));
- NCCLCHECK(p2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up));
+ NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down));
+ NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up));
const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex];
const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex];
- if (collNetSetup(comm, &collNetGraph, channelRecv, logicChannels, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
+ if (collNetSetup(comm, &collNetGraph, channelRecv, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
collNetSetupFail = 1;
- if (collNetSetup(comm, &collNetGraph, channelSend, logicChannels, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
+ else if (collNetSetup(comm, &collNetGraph, channelSend, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
collNetSetupFail = 1;
}
// Verify CollNet setup across ranks
@@ -818,6 +800,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
free(connect);
free(rings);
+ // Compute nChannels per peer for p2p
+ NCCLCHECK(ncclTopoComputeP2pChannels(comm));
+
// We should have allocated all buffers, collective fifos, ... we can
// restore the affinity.
affinity_restore:
@@ -846,7 +831,7 @@ affinity_restore:
// Done with AllGather1 data
free(allGather1Data);
- if (comm->nNodes) NCCLCHECK(transportCreateProxy(comm));
+ if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
return ncclSuccess;
@@ -873,6 +858,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni
ncclResult_t res;
char* env = getenv("NCCL_COMM_ID");
if (env && myrank == 0) {
+ INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
NCCLCHECKGOTO(bootstrapCreateRoot(&commId, true), res, end);
}
@@ -941,7 +927,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError);
CUDACHECK(cudaStreamSynchronize(comm->groupStream));
- NCCLCHECK(transportDestroyProxy(comm));
+ NCCLCHECK(ncclProxyDestroy(comm));
NCCLCHECK(commFree(comm));
if (savedDevice != commDevice)
diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc
index 67931f8..27623b2 100644
--- a/src/misc/argcheck.cc
+++ b/src/misc/argcheck.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -34,7 +34,6 @@ ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
}
ncclResult_t ArgsCheck(struct ncclInfo* info) {
- NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
// First, the easy ones
if (info->root < 0 || info->root >= info->comm->nRanks) {
WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks);
@@ -44,7 +43,7 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
WARN("%s : invalid type %d", info->opName, info->datatype);
return ncclInvalidArgument;
}
- // Type is OK, compute nbytes. Convert Allgather/Broadcast calls to chars.
+ // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars.
info->nBytes = info->count * ncclTypeSize(info->datatype);
if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) {
info->count = info->nBytes;
@@ -58,12 +57,20 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
}
if (info->comm->checkPointers) {
- // Check CUDA device pointers
- if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
- NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
- }
- if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
- NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
+ if (info->coll == ncclCollSendRecv) {
+ if (strcmp(info->opName, "Send") == 0) {
+ NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", "Send"));
+ } else {
+ NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", "Recv"));
+ }
+ } else {
+ // Check CUDA device pointers
+ if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
+ NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
+ }
+ if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
+ NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
+ }
}
}
return ncclSuccess;
diff --git a/src/misc/utils.cc b/src/misc/utils.cc
index 782e9c0..b231eb1 100644
--- a/src/misc/utils.cc
+++ b/src/misc/utils.cc
@@ -93,6 +93,7 @@ uint64_t getHostHash(void) {
int offset = strlen(hostHash);
if ((hostId = getenv("NCCL_HOSTID")) != NULL) {
+ INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId);
strncpy(hostHash, hostId, sizeof(hostHash));
} else {
FILE *file = fopen(HOSTID_FILE, "r");
diff --git a/src/nccl.h.in b/src/nccl.h.in
index f07e0a4..b4f34ef 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -221,6 +221,40 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
/*
+ * Send
+ *
+ * Send data from sendbuff to rank peer.
+ *
+ * Rank peer needs to call ncclRecv with the same datatype and the same count from this
+ * rank.
+ *
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+ * ncclGroupEnd section.
+ */
+ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Receive
+ *
+ * Receive data from rank peer into recvbuff.
+ *
+ * Rank peer needs to call ncclSend with the same datatype and the same count to this
+ * rank.
+ *
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+ * ncclGroupEnd section.
+ */
+ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
+
+/*
* Group semantics
*
* When managing multiple GPUs from a single thread, and since NCCL collective
@@ -235,14 +269,19 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou
* the operation is effectively done.
*
* Both collective communication and ncclCommInitRank can be used in conjunction
- * of ncclGroupStart/ncclGroupEnd.
+ * with ncclGroupStart/ncclGroupEnd, but not together.
+ *
+ * Group semantics also allow fusing multiple operations on the same device
+ * to improve performance (as with aggregated collective calls), or to permit
+ * concurrent progress of multiple send/receive operations.
*/
/*
* Group Start
*
- * Start a group call. All subsequent calls to NCCL may not block due to
- * inter-CPU synchronization.
+ * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
+ * a single NCCL operation. Nothing will be started on the CUDA stream until
+ * ncclGroupEnd.
*/
ncclResult_t ncclGroupStart();
ncclResult_t pncclGroupStart();
@@ -250,8 +289,9 @@ ncclResult_t pncclGroupStart();
/*
* Group End
*
- * End a group call. Wait for all calls since ncclGroupStart to complete
- * before returning.
+ * End a group call. Start a fused NCCL operation consisting of all calls since
+ * ncclGroupStart. Operations on the CUDA stream that depend on the NCCL operations
+ * must be issued after ncclGroupEnd.
*/
ncclResult_t ncclGroupEnd();
ncclResult_t pncclGroupEnd();
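
A minimal usage sketch for the ncclSend/ncclRecv declarations and the group semantics documented above: each rank sends to the next rank and receives from the previous one, and the two calls are fused inside ncclGroupStart/ncclGroupEnd so they can progress concurrently. The function name ringShift and the float payload are illustrative; only the NCCL calls come from the header above.

    /* Each rank sends `count` floats to the next rank and receives from the
     * previous one. Fusing both calls in a group lets them progress
     * concurrently; nothing is enqueued on `stream` before ncclGroupEnd(). */
    #include "nccl.h"

    void ringShift(ncclComm_t comm, cudaStream_t stream,
                   const float* sendbuf, float* recvbuf, size_t count,
                   int rank, int nranks) {
      int next = (rank + 1) % nranks;
      int prev = (rank - 1 + nranks) % nranks;
      ncclGroupStart();
      ncclSend(sendbuf, count, ncclFloat, next, comm, stream);
      ncclRecv(recvbuf, count, ncclFloat, prev, comm, stream);
      ncclGroupEnd();
      /* cudaStreamSynchronize(stream) here if completion must be awaited. */
    }
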
diff --git a/src/proxy.cc b/src/proxy.cc
new file mode 100644
index 0000000..19dbced
--- /dev/null
+++ b/src/proxy.cc
@@ -0,0 +1,283 @@
+/*************************************************************************
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "info.h"
+#include "collectives.h"
+
+#define RECV 0
+#define SEND 1
+
+static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
+ if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
+
+ /* In chains, one rank does not need a proxy. Let's figure out which one it is */
+ // Which index in the reorganized rings should we compare root against?
+ const int myrank = 0, nextrank = 1, prevrank = nranks-1;
+ int index = pattern == ncclPatternPipelineFrom ?
+ /* no recv / no send if root = */
+ /* bcast */ (type == RECV ? myrank : nextrank ):
+ /* reduce */ (type == RECV ? prevrank : myrank );
+ int rank = ring->userRanks[index];
+ return (root != rank);
+}
+
+enum { proxyRecv=0, proxySend=1 };
+
+#define PROXYARGS_ALLOCATE_SIZE 32
+struct ncclProxyPool {
+ struct ncclProxyPool *next;
+ struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
+};
+
+static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
+ struct ncclProxyState* state = &comm->proxyState;
+ struct ncclProxyArgs* elem;
+ pthread_mutex_lock(&state->mutex);
+ if (state->pool == NULL) {
+ // Allocate a new pool of elements
+ struct ncclProxyPool* newPool;
+ NCCLCHECK(ncclCalloc(&newPool, 1));
+ struct ncclProxyArgs* newElems = newPool->elems;
+ // Chain newly allocated elements
+ for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
+ if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
+ }
+ // Add them all to the pool list
+ state->pool = newElems;
+ // Save the pool memory block for later resource release
+ newPool->next = state->pools;
+ state->pools = newPool;
+ }
+ elem = state->pool;
+ state->pool = state->pool->next;
+ pthread_mutex_unlock(&state->mutex);
+ elem->next = elem->nextPeer = NULL;
+ *argsptr = elem;
+ return ncclSuccess;
+}
+
+static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) {
+ struct ncclComm* comm = connector->comm;
+ struct ncclProxyState* state = &comm->proxyState;
+ pthread_mutex_lock(&state->mutex);
+ if (connector->proxyAppend == NULL) {
+ // Nothing running for that peer. Add to the circular list
+ if (state->ops == NULL) {
+ // Create the list
+ args->next = args;
+ state->ops = args;
+ } else {
+ // Insert element in the list
+ args->next = state->ops->next;
+ state->ops->next = args;
+ }
+ connector->proxyAppend = args;
+ } else {
+ // There is an active operation already for that peer.
+ // Add it to the per-peer list
+ connector->proxyAppend->nextPeer = args;
+ connector->proxyAppend = args;
+ }
+ pthread_mutex_unlock(&state->mutex);
+}
+
+template <int type>
+static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
+ if (peer < 0) return ncclSuccess;
+
+ struct ncclPeer* peerComm = args->channel->peers+peer;
+ struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
+ if (connector->transportComm == NULL) {
+ WARN("[%d] Error no transport for %s peer %d on channel %d\n", connector->comm->rank,
+ type == proxyRecv ? "recv" : "send", peer, args->channel->id);
+ return ncclInternalError;
+ }
+ if (connector->transportComm->proxy == NULL) return ncclSuccess;
+
+ struct ncclProxyArgs* op;
+ NCCLCHECK(allocateArgs(connector->comm, &op));
+ memcpy(op, args, sizeof(struct ncclProxyArgs));
+ op->connector = connector;
+ op->progress = connector->transportComm->proxy;
+ op->state = ncclProxyOpReady;
+ ProxyAppend(connector, op);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
+ if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
+ struct ncclRing* ring = &args->channel->ring;
+ if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args));
+ if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxySend>(ring->next, args));
+ }
+ if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
+ // Tree up
+ struct ncclTree* tree = &args->channel->treeUp;
+ for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
+ NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+ }
+ if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
+ // Tree down
+ struct ncclTree* tree = &args->channel->treeDn;
+ for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
+ NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
+ }
+ if (pattern == ncclPatternCollTreeUp) {
+ // CollTree up
+ struct ncclTree* tree = &args->channel->collTreeUp;
+ NCCLCHECK(SaveProxy<proxyRecv>(tree->down[0], args));
+ NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+ }
+ if (pattern == ncclPatternCollTreeDown) {
+ // CollTree down
+ struct ncclTree* tree = &args->channel->collTreeDn;
+ NCCLCHECK(SaveProxy<proxySend>(tree->down[0], args));
+ NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel) {
+ struct ncclProxyArgs args;
+ memset(&args, 0, sizeof(struct ncclProxyArgs));
+ args.channel = channel;
+ args.sliceSteps = 1;
+ args.chunkSteps = 1;
+ args.protocol = NCCL_PROTO_SIMPLE;
+ args.opCount = info->comm->opCount;
+ args.dtype = info->datatype;
+ if (info->delta > 0 && info->sendbytes >= 0) {
+ int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
+ args.nsteps = DIVUP(info->sendbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
+ if (args.nsteps == 0) args.nsteps = 1;
+ NCCLCHECK(SaveProxy<proxySend>(peersend, &args));
+ }
+ if (info->delta > 0 && info->recvbytes >= 0) {
+ int peerrecv = (info->comm->nRanks+info->comm->rank-info->delta)%info->comm->nRanks;
+ args.nsteps = DIVUP(info->recvbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR);
+ if (args.nsteps == 0) args.nsteps = 1;
+ NCCLCHECK(SaveProxy<proxyRecv>(peerrecv, &args));
+ }
+ return ncclSuccess;
+}
+
+void* persistentThread(void *comm_) {
+ struct ncclComm* comm = (struct ncclComm*)comm_;
+ struct ncclProxyState* state = &comm->proxyState;
+ struct ncclProxyArgs* op = NULL;
+ ncclResult_t ret = ncclSuccess;
+ int idle = 1;
+ int idleSpin = 0;
+ while (1) {
+ do {
+ if (*comm->abortFlag) return NULL;
+ if (op == NULL) {
+ pthread_mutex_lock(&state->mutex);
+ op = state->ops;
+ if (op == NULL) {
+ if (state->stop) {
+ // No more commands to process and proxy has been requested to stop
+ pthread_mutex_unlock(&state->mutex);
+ return NULL;
+ }
+ pthread_cond_wait(&state->cond, &state->mutex);
+ }
+ pthread_mutex_unlock(&state->mutex);
+ }
+ } while (op == NULL);
+ op->idle = 0;
+ // Ops with opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
+ // yet and might be cancelled before they even start. Hold off on those.
+ if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op);
+ if (ret != ncclSuccess) {
+ comm->fatalError = ret;
+ INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
+ return NULL;
+ }
+ idle &= op->idle;
+ pthread_mutex_lock(&state->mutex);
+ if (!idle) idleSpin = 0;
+ struct ncclProxyArgs *next = op->next;
+ if (next->state == ncclProxyOpNone) {
+ struct ncclProxyArgs *freeOp = next;
+ if (next->nextPeer) {
+ // Replace next by its next per-peer element.
+ next = next->nextPeer;
+ if (op != freeOp) {
+ next->next = freeOp->next;
+ op->next = next;
+ } else {
+ next->next = next;
+ }
+ } else {
+ // Remove next from circular list
+ next->connector->proxyAppend = NULL;
+ if (op != freeOp) {
+ next = next->next;
+ op->next = next;
+ } else {
+ next = NULL;
+ }
+ }
+ if (freeOp == state->ops) state->ops = next;
+ freeOp->next = state->pool;
+ state->pool = freeOp;
+ }
+ op = next;
+ if (op == state->ops) {
+ if (idle == 1) {
+ if (++idleSpin == 10) {
+ sched_yield();
+ idleSpin = 0;
+ }
+ }
+ idle = 1;
+ }
+ pthread_mutex_unlock(&state->mutex);
+ }
+}
+
+ncclResult_t ncclProxyStart(struct ncclComm* comm) {
+ pthread_mutex_lock(&comm->proxyState.mutex);
+ if (comm->proxyState.ops != NULL)
+ pthread_cond_signal(&comm->proxyState.cond);
+ pthread_mutex_unlock(&comm->proxyState.mutex);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
+ if (!comm->proxyThread) {
+ comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
+ comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER;
+ comm->proxyState.ops = NULL;
+ pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
+ struct ncclProxyState* state = &comm->proxyState;
+
+ // Request the proxy to stop and then wake it
+ pthread_mutex_lock(&state->mutex);
+ state->stop = true;
+ pthread_cond_signal(&state->cond);
+ pthread_mutex_unlock(&state->mutex);
+ if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
+
+ // Free off any memory allocated for the proxy arg pools
+ pthread_mutex_lock(&state->mutex);
+ struct ncclProxyState* proxyState = &comm->proxyState;
+ while (proxyState->pools != NULL) {
+ struct ncclProxyPool *next = proxyState->pools->next;
+ free(proxyState->pools);
+ proxyState->pools = next;
+ }
+ pthread_mutex_unlock(&state->mutex);
+
+ return ncclSuccess;
+}
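
ncclProxySaveP2p() above sizes a send or receive in network steps: the message is cut into slots of buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR bytes, rounded up, with a minimum of one step. A standalone worked sketch of that arithmetic; the constants below (4 MiB buffer, NCCL_STEPS == 8, SENDRECV_SLICEFACTOR == 4) are assumptions for illustration, not values taken from this patch:

    /* Worked example of the step computation in ncclProxySaveP2p() above.
     * All constants are illustrative assumptions. */
    #include <stdio.h>

    #define DIVUP(x, y) (((x) + (y) - 1) / (y))

    int main(void) {
      long long buffSize    = 1LL << 22;  /* assumed simple-protocol buffer: 4 MiB */
      long long steps       = 8;          /* assumed NCCL_STEPS */
      long long sliceFactor = 4;          /* assumed SENDRECV_SLICEFACTOR */
      long long slot = buffSize / steps / sliceFactor;  /* 131072 bytes per step */
      long long sendbytes = 1000000;                    /* example message size */
      long long nsteps = DIVUP(sendbytes, slot);        /* -> 8 steps */
      if (nsteps == 0) nsteps = 1;        /* zero-byte messages still take one step */
      printf("slot=%lld bytes, nsteps=%lld\n", slot, nsteps);
      return 0;
    }
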
diff --git a/src/transport.cc b/src/transport.cc
index cc8d5d1..7219ea3 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -1,11 +1,12 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "comm.h"
#include "info.h"
+#include "bootstrap.h"
extern struct ncclTransport p2pTransport;
extern struct ncclTransport shmTransport;
@@ -17,248 +18,70 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = {
netTransport,
};
-#define RECV 0
-#define SEND 1
-
-static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
- if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
-
- /* In chains, one rank does not need a proxy. Let's figure out which one it is */
- // Which index in the reorganized rings should we compare root against */
- const int myrank = 0, nextrank = 1, prevrank = nranks-1;
- int index = pattern == ncclPatternPipelineFrom ?
- /* no recv / no send if root = */
- /* bcast */ (type == RECV ? myrank : nextrank ):
- /* reduce */ (type == RECV ? prevrank : myrank );
- int rank = ring->userRanks[index];
- return (root != rank);
-}
-
-enum { proxyRecv=0, proxySend=1 };
-
-#define PROXYARGS_ALLOCATE_SIZE 32
-struct ncclProxyPool {
- struct ncclProxyPool *next;
- struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
-};
-
-ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
- struct ncclProxyState* state = &comm->proxyState;
- struct ncclProxyArgs* elem;
- pthread_mutex_lock(&state->mutex);
- if (state->pool == NULL) {
- // Allocate a new pool of elements
- struct ncclProxyPool* newPool;
- NCCLCHECK(ncclCalloc(&newPool, 1));
- struct ncclProxyArgs* newElems = newPool->elems;
- // Chain newly allocated elements
- for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
- if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
- }
- // Add them all to the pool list
- state->pool = newElems;
- // Save the pool memory block for later resource release
- newPool->next = state->pools;
- state->pools = newPool;
- }
- elem = state->pool;
- state->pool = state->pool->next;
- pthread_mutex_unlock(&state->mutex);
- elem->next = elem->nextPeer = NULL;
- *argsptr = elem;
- return ncclSuccess;
-}
-
-static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) {
- struct ncclComm* comm = connector->comm;
- struct ncclProxyState* state = &comm->proxyState;
- pthread_mutex_lock(&state->mutex);
- if (connector->proxyAppend == NULL) {
- // Nothing running for that peer. Add to the circular list
- if (state->ops == NULL) {
- // Create the list
- args->next = args;
- state->ops = args;
- } else {
- // Insert element in the list
- args->next = state->ops->next;
- state->ops->next = args;
+template <int type>
+static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) {
+ for (int t=0; t<NTRANSPORTS; t++) {
+ struct ncclTransport *transport = ncclTransports+t;
+ struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
+ int ret = 0;
+ NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo));
+ if (ret) {
+ connector->transportComm = transportComm;
+ NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, channelId));
+ return ncclSuccess;
}
- connector->proxyAppend = args;
- } else {
- // There is an active operation already for that peer.
- // Add it to the per-peer list
- connector->proxyAppend->nextPeer = args;
- connector->proxyAppend = args;
}
- pthread_mutex_unlock(&state->mutex);
-}
-
-template <int type>
-static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
- if (peer < 0) return ncclSuccess;
-
- struct ncclPeer* peerComm = args->channel->peers+peer;
- struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
- if (connector->transportComm == NULL) return ncclInternalError;
- if (connector->transportComm->proxy == NULL) return ncclSuccess;
-
- struct ncclProxyArgs* op;
- NCCLCHECK(transportAllocateProxyArgs(connector->comm, &op));
- memcpy(op, args, sizeof(struct ncclProxyArgs));
- op->connector = connector;
- op->progress = connector->transportComm->proxy;
- op->state = ncclProxyOpReady;
- ProxyAppend(connector, op);
- return ncclSuccess;
+ WARN("No transport found !");
+ return ncclInternalError;
}
-ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
- if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
- struct ncclRing* ring = &args->channel->ring;
- if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args));
- if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxySend>(ring->next, args));
+ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
+ TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
+ uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
+ struct ncclConnect connect;
+ struct ncclConnector* conn;
+ for (int i=0; i<nrecv; i++) {
+ int peer = peerRecv[i];
+ if (peer == -1 || peer >= comm->nRanks) continue;
+ conn = &channel->peers[peer].recv;
+ if (conn->connected) { ++nSkippedRecv; continue; }
+ memset(&connect, 0, sizeof(connect));
+ NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id));
+ NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
}
- if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
- // Tree up
- struct ncclTree* tree = &args->channel->treeUp;
- for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
- NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+ for (int i=0; i<nsend; i++) {
+ int peer = peerSend[i];
+ if (peer == -1 || peer >= comm->nRanks) continue;
+ conn = &channel->peers[peer].send;
+ if (conn->connected) { ++nSkippedSend; continue; }
+ memset(&connect, 0, sizeof(connect));
+ NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id));
+ NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
}
- if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
- // Tree down
- struct ncclTree* tree = &args->channel->treeDn;
- for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
- NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
+ for (int i=0; i<nsend; i++) {
+ int peer = peerSend[i];
+ if (peer == -1 || peer >= comm->nRanks) continue;
+ conn = &channel->peers[peer].send;
+ if (conn->connected) { ++nSkippedSend; continue; }
+ memset(&connect, 0, sizeof(connect));
+ NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
+ conn->connected = 1;
+ CUDACHECK(cudaMemcpy(&channel->devPeers[peer].send, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice));
}
- if (pattern == ncclPatternCollTreeUp) {
- // CollTree up
- struct ncclTree* tree = &args->channel->collTreeUp;
- NCCLCHECK(SaveProxy<proxyRecv>(tree->down[0], args));
- NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
- }
- if (pattern == ncclPatternCollTreeDown) {
- // CollTree down
- struct ncclTree* tree = &args->channel->collTreeDn;
- NCCLCHECK(SaveProxy<proxySend>(tree->down[0], args));
- NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
+ for (int i=0; i<nrecv; i++) {
+ int peer = peerRecv[i];
+ if (peer == -1 || peer >= comm->nRanks) continue;
+ conn = &channel->peers[peer].recv;
+ if (conn->connected) { ++nSkippedRecv; continue; }
+ memset(&connect, 0, sizeof(connect));
+ NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
+ conn->connected = 1;
+ CUDACHECK(cudaMemcpy(&channel->devPeers[peer].recv, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice));
}
+ TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
return ncclSuccess;
}
-void* persistentThread(void *comm_) {
- struct ncclComm* comm = (struct ncclComm*)comm_;
- struct ncclProxyState* state = &comm->proxyState;
- struct ncclProxyArgs* op = NULL;
- ncclResult_t ret = ncclSuccess;
- int idle = 1;
- int idleSpin = 0;
- while (1) {
- do {
- if (*comm->abortFlag) return NULL;
- if (op == NULL) {
- pthread_mutex_lock(&state->mutex);
- op = state->ops;
- if (op == NULL) {
- if (state->stop) {
- // No more commands to process and proxy has been requested to stop
- pthread_mutex_unlock(&state->mutex);
- return NULL;
- }
- pthread_cond_wait(&state->cond, &state->mutex);
- }
- pthread_mutex_unlock(&state->mutex);
- }
- } while (op == NULL);
- op->idle = 0;
- // opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started
- // yet and might be cancelled before they even start. Hold on on those.
- if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op);
- if (ret != ncclSuccess) {
- comm->fatalError = ret;
- INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
- return NULL;
- }
- idle &= op->idle;
- pthread_mutex_lock(&state->mutex);
- if (!idle) idleSpin = 0;
- struct ncclProxyArgs *next = op->next;
- if (next->state == ncclProxyOpNone) {
- struct ncclProxyArgs *freeOp = next;
- if (next->nextPeer) {
- // Replace next by its next per-peer element.
- next = next->nextPeer;
- if (op != freeOp) {
- next->next = freeOp->next;
- op->next = next;
- } else {
- next->next = next;
- }
- } else {
- // Remove next from circular list
- next->connector->proxyAppend = NULL;
- if (op != freeOp) {
- next = next->next;
- op->next = next;
- } else {
- next = NULL;
- }
- }
- if (freeOp == state->ops) state->ops = next;
- freeOp->next = state->pool;
- state->pool = freeOp;
- }
- op = next;
- if (op == state->ops) {
- if (idle == 1) {
- if (++idleSpin == 10) {
- sched_yield();
- idleSpin = 0;
- }
- }
- idle = 1;
- }
- pthread_mutex_unlock(&state->mutex);
- }
-}
-
-ncclResult_t transportStartProxy(struct ncclComm* comm) {
- pthread_mutex_lock(&comm->proxyState.mutex);
- if (comm->proxyState.ops != NULL)
- pthread_cond_signal(&comm->proxyState.cond);
- pthread_mutex_unlock(&comm->proxyState.mutex);
- return ncclSuccess;
-}
-ncclResult_t transportCreateProxy(struct ncclComm* comm) {
- if (!comm->proxyThread) {
- comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
- comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER;
- comm->proxyState.ops = NULL;
- pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
- }
- return ncclSuccess;
-}
-
-ncclResult_t transportDestroyProxy(struct ncclComm* comm) {
- struct ncclProxyState* state = &comm->proxyState;
-
- // Request the proxy to stop and then wake it
- pthread_mutex_lock(&state->mutex);
- state->stop = true;
- pthread_cond_signal(&state->cond);
- pthread_mutex_unlock(&state->mutex);
- if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
-
- // Free off any memory allocated for the proxy arg pools
- pthread_mutex_lock(&state->mutex);
- struct ncclProxyState* proxyState = &comm->proxyState;
- while (proxyState->pools != NULL) {
- struct ncclProxyPool *next = proxyState->pools->next;
- free(proxyState->pools);
- proxyState->pools = next;
- }
- pthread_mutex_unlock(&state->mutex);
-
- return ncclSuccess;
-}
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
index 435c88d..a11f8be 100644
--- a/src/transport/coll_net.cc
+++ b/src/transport/coll_net.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -15,17 +15,10 @@ struct collNetRecvConnectInfo {
struct collNetSendConnectInfo {
void* collNetComm;
- void* mhandle;
- void* llMhandle;
+ void* mhandles[NCCL_NUM_PROTOCOLS];
struct reqSlot* reqFifo;
};
-struct ncclLLDataLine {
- uint32_t data1;
- uint32_t data2;
-};
-static_assert(sizeof(struct ncclLLDataLine) == sizeof(union ncclLLFifoLine)>>1, "ncclLLDataLine is not half size of ncclLLFifoLine");
-
struct reqSlot {
volatile void* recvBuff;
volatile int size;
@@ -37,14 +30,11 @@ struct collNetSendResources {
struct ncclRecvMem* hostRecvMem;
struct ncclSendMem* devHostSendMem;
struct ncclRecvMem* devHostRecvMem;
- struct ncclLLDataLine* llData;
+ uint32_t* llData;
int netDev;
int useGdr;
- int buffSize;
- void* sendMhandle;
- void* llSendMhandle;
- void* recvMhandle;
- void* llRecvMhandle;
+ void* sendMhandles[NCCL_NUM_PROTOCOLS];
+ void* recvMhandles[NCCL_NUM_PROTOCOLS];
struct ncclRecvMem* devRecvMem;
uint64_t step;
uint64_t llLastCleaning;
@@ -59,12 +49,10 @@ struct collNetRecvResources {
struct ncclRecvMem* hostRecvMem;
struct ncclSendMem* devHostSendMem;
struct ncclRecvMem* devHostRecvMem;
- struct ncclLLDataLine* llData;
+ uint32_t* llData;
int netDev;
int useGdr;
- int buffSize;
- void* mhandle;
- void* llMhandle;
+ void* mhandles[NCCL_NUM_PROTOCOLS];
struct ncclRecvMem* devRecvMem;
uint64_t step;
uint64_t llLastCleaning;
@@ -79,112 +67,120 @@ ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncc
}
/* Setup send connector, and return connect information for others in the coll communicator to connect to me */
-ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
- struct collNetSendResources* sendResources;
- NCCLCHECK(ncclCalloc(&sendResources, 1));
- send->transportResources = sendResources;
+ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
+ struct collNetSendResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ send->transportResources = resources;
- NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &sendResources->netDev));
- NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, sendResources->netDev, 1, &sendResources->useGdr));
+ NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
+ NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
- int sendSize = sizeof(struct ncclSendMem);
- NCCLCHECK(ncclCudaHostAlloc((void**)&sendResources->hostSendMem, (void**)&sendResources->devHostSendMem, sendSize));
+ NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1));
+ resources->devHostSendMem = resources->hostSendMem;
- int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
- if (sendResources->useGdr) {
- NCCLCHECK(ncclCudaCalloc((char**)(&sendResources->devRecvMem), recvSize));
- }
- NCCLCHECK(ncclCudaHostAlloc((void**)&sendResources->hostRecvMem, (void**)&sendResources->devHostRecvMem, recvSize));
- NCCLCHECK(ncclIbMalloc((void**)&(sendResources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine)));
- sendResources->buffSize = buffSize;
+ int recvSize = offsetof(struct ncclRecvMem, buff);
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += send->comm->buffSizes[p];
- INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), sendResources->netDev,
- sendResources->useGdr ? "/GDRDMA" : "");
+ if (resources->useGdr) {
+ NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+ }
+ NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize));
+ resources->devHostRecvMem = resources->hostRecvMem;
+ NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), send->comm->buffSizes[NCCL_PROTO_LL]/2));
+ INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
+ resources->useGdr ? "/GDRDMA" : "");
return ncclSuccess;
}
/* Setup recv connector */
-ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
- struct collNetRecvResources* recvResources;
- NCCLCHECK(ncclCalloc(&recvResources, 1));
- recv->transportResources = recvResources;
+ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
+ struct collNetRecvResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ recv->transportResources = resources;
- NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &recvResources->netDev));
- NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, recvResources->netDev, 0, &recvResources->useGdr));
+ NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
+ NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
- int sendSize = sizeof(struct ncclSendMem);
- NCCLCHECK(ncclCudaHostAlloc((void**)&recvResources->hostSendMem, (void**)&recvResources->devHostSendMem, sendSize));
+ NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1));
+ resources->devHostSendMem = resources->hostSendMem;
- int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
- if (recvResources->useGdr) {
- NCCLCHECK(ncclCudaCalloc((char**)(&recvResources->devRecvMem), recvSize));
+ int recvSize = offsetof(struct ncclRecvMem, buff);
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += recv->comm->buffSizes[p];
+
+ if (resources->useGdr) {
+ NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
}
- NCCLCHECK(ncclCudaHostAlloc((void**)&recvResources->hostRecvMem, (void**)&recvResources->devHostRecvMem, recvSize));
- NCCLCHECK(ncclIbMalloc((void**)&(recvResources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine)));
- recvResources->buffSize = buffSize;
+ NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize));
+ resources->devHostRecvMem = resources->hostRecvMem;
- INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), recvResources->netDev,
- recvResources->useGdr ? "/GDRDMA" : "");
+ NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), recv->comm->buffSizes[NCCL_PROTO_LL]/2));
+ INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
+ resources->useGdr ? "/GDRDMA" : "");
struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
- NCCLCHECK(collNetListen(recvResources->netDev, &info->collNetHandle, &recvResources->netListenComm));
-
+ NCCLCHECK(collNetListen(resources->netDev, &info->collNetHandle, &resources->netListenComm));
return ncclSuccess;
}
ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers
- struct collNetSendResources* sendResources = (struct collNetSendResources*)send->transportResources;
- sendResources->collNetRank = rank;
-
- // Get info from recv side
- struct collNetSendConnectInfo* sInfo = (struct collNetSendConnectInfo*)(connectInfos+rank);
- sendResources->reqFifo = sInfo->reqFifo;
- sendResources->collNetSendComm = sInfo->collNetComm;
- sendResources->recvMhandle = sInfo->mhandle;
- sendResources->llRecvMhandle = sInfo->llMhandle;
+ struct collNetSendResources* resources = (struct collNetSendResources*)send->transportResources;
+ struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
// Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
- struct ncclRecvMem* sRecvMem = sendResources->useGdr ? sendResources->devRecvMem : sendResources->devHostRecvMem;
- // Register buffers
- NCCLCHECK(collNetRegMr(sendResources->collNetSendComm, sRecvMem->buff, sendResources->buffSize,
- sendResources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &sendResources->sendMhandle));
- NCCLCHECK(collNetRegMr(sendResources->collNetSendComm, sendResources->llData,
- NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &sendResources->llSendMhandle));
-
- send->conn.buff = sRecvMem->buff;
- send->conn.llBuff = sendResources->devHostRecvMem->llBuff;
- send->conn.direct |= sendResources->useGdr ? NCCL_DIRECT_NIC : 0;
+ struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+ int offset = 0;
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ send->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset;
+ offset += send->comm->buffSizes[p];
+ }
+ send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
// Head/Tail/Opcount/Fifos are always on host
- send->conn.tail = &sendResources->devHostRecvMem->tail;
- send->conn.opCountRem = &sendResources->devHostRecvMem->opCount;
- send->conn.fifo = sendResources->devHostRecvMem->sizesFifo;
- send->conn.head = &sendResources->devHostSendMem->head;
- send->conn.opCountLoc = &sendResources->devHostSendMem->opCount;
+ send->conn.tail = &resources->devHostRecvMem->tail;
+ send->conn.opCountRem = &resources->devHostRecvMem->opCount;
+ send->conn.fifo = resources->devHostRecvMem->sizesFifo;
+ send->conn.head = &resources->devHostSendMem->head;
+ send->conn.opCountLoc = &resources->devHostSendMem->opCount;
for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
+ // Get info from recv side
+ resources->collNetRank = rank;
+ resources->reqFifo = info->reqFifo;
+ resources->collNetSendComm = info->collNetComm;
+
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
+ resources->recvMhandles[p] = info->mhandles[p];
+
+ // Register buffers
+ NCCLCHECK(collNetRegMr(resources->collNetSendComm, send->conn.buffs[NCCL_PROTO_SIMPLE], send->comm->buffSizes[NCCL_PROTO_SIMPLE],
+ resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_SIMPLE]));
+ NCCLCHECK(collNetRegMr(resources->collNetSendComm, resources->llData, send->comm->buffSizes[NCCL_PROTO_LL]/2,
+ NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_LL]));
return ncclSuccess;
}
ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
// Setup device pointers
- struct collNetRecvResources* recvResources = (struct collNetRecvResources*)recv->transportResources;
- struct collNetSendConnectInfo* sInfo = (struct collNetSendConnectInfo*)(connectInfos+rank);
- recvResources->collNetRank = rank;
+ struct collNetRecvResources* resources = (struct collNetRecvResources*)recv->transportResources;
+ struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
+ resources->collNetRank = rank;
// Intermediate buffering on GPU for GPU Direct RDMA
- struct ncclRecvMem* rRecvMem = recvResources->useGdr ? recvResources->devRecvMem : recvResources->devHostRecvMem;
- recv->conn.buff = rRecvMem->buff;
- recv->conn.llBuff = recvResources->devHostRecvMem->llBuff; // recv LL buff always on host
- recv->conn.direct |= recvResources->useGdr ? NCCL_DIRECT_NIC : 0;
+ struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+ int offset = 0;
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset;
+ offset += recv->comm->buffSizes[p];
+ }
+ recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
// Head/Tail/Opcount are always on host
- recv->conn.tail = &recvResources->devHostRecvMem->tail;
- recv->conn.opCountLoc = &recvResources->devHostRecvMem->opCount;
- recv->conn.head = &recvResources->devHostSendMem->head;
- recv->conn.opCountRem = &recvResources->devHostSendMem->opCount;
+ recv->conn.tail = &resources->devHostRecvMem->tail;
+ recv->conn.opCountLoc = &resources->devHostRecvMem->opCount;
+ recv->conn.head = &resources->devHostSendMem->head;
+ recv->conn.opCountRem = &resources->devHostSendMem->opCount;
// Connect to coll comm
collNetHandle_t** handlePtrs = NULL;
@@ -194,64 +190,64 @@ ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, in
handlePtrs[i] = &(info->collNetHandle);
}
ncclResult_t res;
- NCCLCHECKGOTO(collNetConnect((void**)handlePtrs, nranks, rank, recvResources->netListenComm, &recvResources->collNetRecvComm), res, cleanup);
+ NCCLCHECKGOTO(collNetConnect((void**)handlePtrs, nranks, rank, resources->netListenComm, &resources->collNetRecvComm), res, cleanup);
// Register buffers
- NCCLCHECK(collNetRegMr(recvResources->collNetRecvComm, rRecvMem->buff, recvResources->buffSize,
- recvResources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &recvResources->mhandle));
- NCCLCHECK(collNetRegMr(recvResources->collNetRecvComm, recvResources->llData,
- NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &recvResources->llMhandle));
+ NCCLCHECK(collNetRegMr(resources->collNetRecvComm, recv->conn.buffs[NCCL_PROTO_SIMPLE], recv->comm->buffSizes[NCCL_PROTO_SIMPLE],
+ resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_SIMPLE]));
+ NCCLCHECK(collNetRegMr(resources->collNetRecvComm, resources->llData, recv->comm->buffSizes[NCCL_PROTO_LL]/2,
+ NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_LL]));
// Create shared info between send and recv proxies
- NCCLCHECK(ncclCalloc(&(recvResources->reqFifo), NCCL_STEPS));
+ NCCLCHECK(ncclCalloc(&(resources->reqFifo), NCCL_STEPS));
// Pass info to send side
- sInfo->reqFifo = recvResources->reqFifo;
- sInfo->collNetComm = recvResources->collNetRecvComm;
- sInfo->mhandle = recvResources->mhandle;
- sInfo->llMhandle = recvResources->llMhandle;
+ info->reqFifo = resources->reqFifo;
+ info->collNetComm = resources->collNetRecvComm;
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
+ info->mhandles[p] = resources->mhandles[p];
cleanup:
if (handlePtrs != NULL) free(handlePtrs);
// Close listen comm
- NCCLCHECK(collNetCloseListen(recvResources->netListenComm));
+ NCCLCHECK(collNetCloseListen(resources->netListenComm));
return res;
}
ncclResult_t collNetSendFree(void* sendTransportResources) {
- struct collNetSendResources* sendResources = (struct collNetSendResources*)sendTransportResources;
- NCCLCHECK(ncclCudaHostFree(sendResources->hostSendMem));
- NCCLCHECK(ncclCudaHostFree(sendResources->hostRecvMem));
- if (sendResources->collNetSendComm) {
- NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->sendMhandle));
- NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->llSendMhandle));
+ struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources;
+ NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+ NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+ if (resources->collNetSendComm) {
+ NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_LL]));
+ NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_SIMPLE]));
}
- if (sendResources->useGdr)
- CUDACHECK(cudaFree(sendResources->devRecvMem));
- free(sendResources->llData);
- free(sendResources);
+ if (resources->useGdr)
+ CUDACHECK(cudaFree(resources->devRecvMem));
+ free(resources->llData);
+ free(resources);
return ncclSuccess;
}
ncclResult_t collNetRecvFree(void* recvTransportResources) {
- struct collNetRecvResources* recvResources = (struct collNetRecvResources*)recvTransportResources;
- NCCLCHECK(ncclCudaHostFree(recvResources->hostSendMem));
- if (recvResources->collNetRecvComm) {
- NCCLCHECK(collNetDeregMr(recvResources->collNetRecvComm, recvResources->mhandle));
- NCCLCHECK(collNetDeregMr(recvResources->collNetRecvComm, recvResources->llMhandle));
+ struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources;
+ NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+ if (resources->collNetRecvComm) {
+ NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_LL]));
+ NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_SIMPLE]));
}
- NCCLCHECK(ncclCudaHostFree(recvResources->hostRecvMem));
- if (recvResources->useGdr)
- CUDACHECK(cudaFree(recvResources->devRecvMem));
- free(recvResources->llData);
- free(recvResources->reqFifo);
+ NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+ if (resources->useGdr)
+ CUDACHECK(cudaFree(resources->devRecvMem));
+ free(resources->llData);
+ free(resources->reqFifo);
// Make sure SendFree is called before RecvFree
- if (recvResources->collNetRecvComm) {
- NCCLCHECK(collNetCloseColl(recvResources->collNetRecvComm));
+ if (resources->collNetRecvComm) {
+ NCCLCHECK(collNetCloseColl(resources->collNetRecvComm));
}
- free(recvResources);
+ free(resources);
return ncclSuccess;
}
@@ -273,6 +269,11 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
args->state = ncclProxyOpProgress;
}
if (args->state == ncclProxyOpProgress) {
+ int p = args->protocol;
+ int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
+ char* localBuff = args->connector->conn.buffs[p];
+ void* sendMhandle = resources->sendMhandles[p];
+ void* recvMhandle = resources->recvMhandles[p];
args->idle = 1;
struct reqSlot* reqFifo = resources->reqFifo;
if (args->head < args->end) {
@@ -286,7 +287,7 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
if (size != -1) {
uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
- union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
+ union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
int ready = 1;
for (int i=0; i<nFifoLines; i++) {
volatile uint32_t *f1 = &lines[i].flag1;
@@ -294,16 +295,17 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
}
if (ready) {
+ int stepLines = stepSize / sizeof(union ncclLLFifoLine);
+ // Separate data from flag
- struct ncclLLDataLine* sendBuff = resources->llData+buffSlot*NCCL_LL_SLICE_LINES;
+ uint32_t* sendBuff = resources->llData+buffSlot*2*stepLines; // each line has two data elements
for (int i=0; i<nFifoLines; i++) {
volatile uint32_t *d1 = &lines[i].data1;
volatile uint32_t *d2 = &lines[i].data2;
- sendBuff[i].data1 = d1[0];
- sendBuff[i].data2 = d2[0];
+ sendBuff[2*i] = d1[0];
+ sendBuff[2*i+1] = d2[0];
}
- int count = nFifoLines*sizeof(struct ncclLLDataLine) / ncclTypeSize(args->dtype);
- NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->llSendMhandle, resources->llRecvMhandle, args->requests+buffSlot));
+ int count = nFifoLines*2*sizeof(uint32_t) / ncclTypeSize(args->dtype);
+ NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce (LL) posted, req %p", args->head, buffSlot, args->requests[buffSlot]);
sizesFifo[buffSlot] = -1;
@@ -315,12 +317,10 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
}
}
} else if (args->tail < *recvTail) {
- int stepSize = args->channel->buffSize/NCCL_STEPS;
- struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
// Send through network
if (sizesFifo[buffSlot] != -1) {
int count = sizesFifo[buffSlot]/ncclTypeSize(args->dtype);
- NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localMem->buff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->sendMhandle, resources->recvMhandle, args->requests+buffSlot));
+ NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localBuff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p count %d", args->head, buffSlot, args->requests[buffSlot], count);
sizesFifo[buffSlot] = -1;
@@ -377,16 +377,18 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
}
if (args->state == ncclProxyOpProgress) {
args->idle = 1;
- int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine) : args->channel->buffSize ) / NCCL_STEPS;
+ int p = args->protocol;
+ int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
+ char* localBuff = args->connector->conn.buffs[p];
+ void* mhandle = resources->mhandles[p];
struct reqSlot* reqFifo = resources->reqFifo;
if (args->head < args->end) {
- struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
- char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)resources->llData : localMem->buff;
- void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : resources->mhandle;
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
int buffSlot = args->tail%NCCL_STEPS;
- reqFifo[buffSlot].recvBuff = localBuff+buffSlot*stepSize;
- TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, localBuff+buffSlot*stepSize);
+ char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
+ int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
+ reqFifo[buffSlot].recvBuff = recvBuff+buffSlot*recvStepSize;
+ TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, reqFifo[buffSlot].recvBuff);
args->tail += args->sliceSteps;
args->idle = 0;
}
@@ -398,15 +400,16 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
if (args->protocol == NCCL_PROTO_LL) { // ll
// re-attach flag
uint32_t flag = args->head;
- union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(resources->hostRecvMem->llBuff)+buffSlot*NCCL_LL_SLICE_LINES;
- struct ncclLLDataLine* recvData = resources->llData+buffSlot*NCCL_LL_SLICE_LINES;
- int nFifoLines = DIVUP(reqFifo[buffSlot].size, sizeof(struct ncclLLDataLine));
+ int stepLines = stepSize / sizeof(union ncclLLFifoLine);
+ union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
+ uint32_t* recvData = resources->llData+buffSlot*2*stepLines;
+ int nFifoLines = DIVUP(reqFifo[buffSlot].size, 2*sizeof(uint32_t));
for (int i=0; i<nFifoLines; i++) {
- lines[i].v[0] = ((uint64_t)flag << 32) + recvData[i].data1;
- lines[i].v[1] = ((uint64_t)flag << 32) + recvData[i].data2;
+ lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
+ lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
}
} else if (args->protocol == NCCL_PROTO_SIMPLE) {
- if (resources->useGdr) collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle);
+ if (resources->useGdr) NCCLCHECK(collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle));
resources->hostRecvMem->tail = args->head;
}
args->idle = 0;
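
The loop above rebuilds full LL fifo lines from the compacted data delivered over the network: the recv proxy keeps only the two 32-bit data words per line, then re-attaches the current step counter as the flag before the GPU polls the line. A minimal standalone sketch of that repacking, assuming a little-endian layout and illustrative names (llLine and llRepackLine are not NCCL identifiers):

#include <stdint.h>

/* One LL fifo line: two 32-bit data words, each paired with a 32-bit flag. */
union llLine {
  struct { uint32_t data1, flag1, data2, flag2; };
  uint64_t v[2];
};

/* Re-attach the step flag to two received data words with single 64-bit stores,
 * so a poller never observes a flag without its matching data word. */
static inline void llRepackLine(union llLine* line, uint32_t d1, uint32_t d2, uint32_t flag) {
  line->v[0] = ((uint64_t)flag << 32) + d1;  /* low half = data1, high half = flag1 */
  line->v[1] = ((uint64_t)flag << 32) + d2;  /* low half = data2, high half = flag2 */
}
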
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 288ad92..7cf4d09 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,19 +12,20 @@ struct netConnectInfo {
ncclNetHandle_t netHandle;
};
+#define LOC_HOSTMEM 0
+#define LOC_DEVMEM 1
+#define LOC_COUNT 2
+
struct netSendResources {
void* netSendComm;
- struct ncclSendMem* hostSendMem;
- struct ncclRecvMem* hostRecvMem;
- struct ncclSendMem* devHostSendMem;
- struct ncclRecvMem* devHostRecvMem;
+ struct ncclSendMem* sendMem;
+ struct ncclRecvMem* recvMem;
int netDev;
int useGdr;
- int buffSize;
- void* mhandle;
- void* llMhandle;
- void* ll128Mhandle;
- struct ncclRecvMem* devRecvMem;
+ char* buffers[LOC_COUNT];
+ int buffSizes[LOC_COUNT];
+ void* mhandles[LOC_COUNT];
+ void** mhandlesProto[NCCL_NUM_PROTOCOLS];
uint64_t step;
uint64_t llLastCleaning;
};
@@ -32,17 +33,14 @@ struct netSendResources {
struct netRecvResources {
void* netListenComm;
void* netRecvComm;
- struct ncclSendMem* hostSendMem;
- struct ncclRecvMem* hostRecvMem;
- struct ncclSendMem* devHostSendMem;
- struct ncclRecvMem* devHostRecvMem;
+ struct ncclSendMem* sendMem;
+ struct ncclRecvMem* recvMem;
int netDev;
int useGdr;
- int buffSize;
- void* mhandle;
- void* llMhandle;
- void* ll128Mhandle;
- struct ncclRecvMem* devRecvMem;
+ char* buffers[LOC_COUNT];
+ int buffSizes[LOC_COUNT];
+ void* mhandles[LOC_COUNT];
+ void** mhandlesProto[NCCL_NUM_PROTOCOLS];
uint64_t step;
uint64_t llLastCleaning;
};
@@ -55,84 +53,123 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
/* Determine if we will use this transport for this peer and return connect
* information for this peer */
-ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
struct netSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
- NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &resources->netDev));
+ NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
- int sendSize = sizeof(struct ncclSendMem);
- NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
+ NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
+ NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
+
+ send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
+ send->conn.tail = &resources->recvMem->tail;
+ send->conn.opCountRem = &resources->recvMem->opCount;
+ send->conn.fifo = resources->recvMem->sizesFifo;
+ send->conn.head = &resources->sendMem->head;
+ send->conn.opCountLoc = &resources->sendMem->opCount;
+ for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
+
+ int protoLoc[NCCL_NUM_PROTOCOLS];
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+ }
- int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
- if (resources->useGdr) {
- NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+ int buffSizes[NCCL_NUM_PROTOCOLS];
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    // For p2p connections (graph == NULL), only allocate the SIMPLE protocol buffer
+ buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : send->comm->buffSizes[p];
+ resources->buffSizes[protoLoc[p]] += buffSizes[p];
}
- NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
- resources->buffSize = buffSize;
- INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
+ if (resources->buffSizes[LOC_DEVMEM]) {
+ NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM]));
+ }
+ if (resources->buffSizes[LOC_HOSTMEM]) {
+ NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
+ }
+
+ int offsets[LOC_COUNT];
+ offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
+ send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
+ offsets[protoLoc[p]] += buffSizes[p];
+ }
+
+ INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "");
return ncclSuccess;
}
-ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
struct netRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
- NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &resources->netDev));
+ NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
- int sendSize = sizeof(struct ncclSendMem);
- NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
+ NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
+ NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
+
+ recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
+ recv->conn.tail = &resources->recvMem->tail;
+ recv->conn.opCountLoc = &resources->recvMem->opCount;
+ recv->conn.head = &resources->sendMem->head;
+ recv->conn.opCountRem = &resources->sendMem->opCount;
+
+ int protoLoc[NCCL_NUM_PROTOCOLS];
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+ }
+
+ int buffSizes[NCCL_NUM_PROTOCOLS];
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    // For p2p connections (graph == NULL), only allocate the SIMPLE protocol buffer
+ buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : recv->comm->buffSizes[p];
+ resources->buffSizes[protoLoc[p]] += buffSizes[p];
+ }
+
+ if (resources->buffSizes[LOC_DEVMEM]) {
+ NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM]));
+ }
+ if (resources->buffSizes[LOC_HOSTMEM]) {
+ NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
+ }
- int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
- if (resources->useGdr) {
- NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+ int offsets[LOC_COUNT];
+ offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
+ recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
+ offsets[protoLoc[p]] += buffSizes[p];
}
- NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
- resources->buffSize = buffSize;
- INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
+ INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
resources->useGdr ? "/GDRDMA" : "");
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
+
return ncclSuccess;
}
ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers
struct netSendResources* resources = (struct netSendResources*)send->transportResources;
-
- // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
- struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
- send->conn.buff = recvMem->buff;
- send->conn.llBuff = resources->devHostRecvMem->llBuff;
- send->conn.ll128Buff = recvMem->ll128Buff;
- send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
-
- // Head/Tail/Opcount/Fifos are always on host
- send->conn.tail = &resources->devHostRecvMem->tail;
- send->conn.opCountRem = &resources->devHostRecvMem->opCount;
- send->conn.fifo = resources->devHostRecvMem->sizesFifo;
- send->conn.head = &resources->devHostSendMem->head;
- send->conn.opCountLoc = &resources->devHostSendMem->opCount;
- for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
+ struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
// Connect to remote peer
- struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
- NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->buff, resources->buffSize,
- resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
- NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff,
- NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle));
- NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
- resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
-
+ if (resources->buffSizes[LOC_DEVMEM]) {
+ NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
+ }
+ if (resources->buffSizes[LOC_HOSTMEM]) {
+ NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
+ }
return ncclSuccess;
}
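
netSendSetup, netRecvSetup and netSendConnect above now work per memory location rather than per protocol: each protocol is mapped to host or device memory (on the send side LL stays in host memory, the others move to device memory when GDR is usable), one allocation and at most one ncclNetRegMr call are made per populated location, and every protocol's buffer is carved out of its location's allocation at a running offset. A hedged sketch of that layout step with hypothetical names (layoutBuffers and the enums are illustrative, not NCCL identifiers):

/* Assign each protocol a slice of its location's single allocation. */
enum { HOSTMEM = 0, DEVMEM = 1, NLOCS = 2 };
enum { PROTO_LL = 0, PROTO_LL128 = 1, PROTO_SIMPLE = 2, NPROTOS = 3 };

static void layoutBuffers(const int sizes[NPROTOS], const int loc[NPROTOS],
                          char* const base[NLOCS], char* out[NPROTOS]) {
  int offset[NLOCS] = {0, 0};
  for (int p = 0; p < NPROTOS; p++) {
    out[p] = base[loc[p]] + offset[loc[p]];  /* protocol p's slice of its location */
    offset[loc[p]] += sizes[p];              /* advance within that location */
  }
}
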
@@ -141,42 +178,29 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
// Setup device pointers
struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
- // Intermediate buffering on GPU for GPU Direct RDMA
- struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
- recv->conn.buff = recvMem->buff;
- recv->conn.llBuff = recvMem->llBuff;
- recv->conn.ll128Buff = recvMem->ll128Buff;
- recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
-
- // Head/Tail/Opcount are always on host
- recv->conn.tail = &resources->devHostRecvMem->tail;
- recv->conn.opCountLoc = &resources->devHostRecvMem->opCount;
- recv->conn.head = &resources->devHostSendMem->head;
- recv->conn.opCountRem = &resources->devHostSendMem->opCount;
-
// Finish connection establishment from remote peer
NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
- NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->buff, resources->buffSize,
- resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
- NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE,
- resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle));
- NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
- resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
-
+ if (resources->buffSizes[LOC_DEVMEM]) {
+ NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
+ }
+ if (resources->buffSizes[LOC_HOSTMEM]) {
+ NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
+ }
return ncclSuccess;
}
ncclResult_t netSendFree(void* transportResources) {
struct netSendResources* resources = (struct netSendResources*)transportResources;
- NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
- NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle));
- NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
- NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->ll128Mhandle));
- NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
- if (resources->useGdr)
- CUDACHECK(cudaFree(resources->devRecvMem));
+ NCCLCHECK(ncclCudaHostFree(resources->sendMem));
+ NCCLCHECK(ncclCudaHostFree(resources->recvMem));
+ for (int l=0; l<LOC_COUNT; l++) {
+ if (resources->buffers[l])
+ NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l]));
+ }
+ NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
+ CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM]));
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
free(resources);
return ncclSuccess;
@@ -184,13 +208,14 @@ ncclResult_t netSendFree(void* transportResources) {
ncclResult_t netRecvFree(void* transportResources) {
struct netRecvResources* resources = (struct netRecvResources*)transportResources;
- NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
- NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle));
- NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
- NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->ll128Mhandle));
- NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
- if (resources->useGdr)
- CUDACHECK(cudaFree(resources->devRecvMem));
+ NCCLCHECK(ncclCudaHostFree(resources->sendMem));
+ NCCLCHECK(ncclCudaHostFree(resources->recvMem));
+ for (int l=0; l<LOC_COUNT; l++) {
+ if (resources->buffers[l])
+ NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[l]));
+ }
+ NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
+ CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM]));
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
free(resources);
return ncclSuccess;
@@ -200,7 +225,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources);
if (args->state == ncclProxyOpReady) {
// Update opCount
- resources->hostRecvMem->opCount = args->opCount;
+ resources->recvMem->opCount = args->opCount;
// Round to next multiple of sliceSteps
resources->step = ROUNDUP(resources->step, args->chunkSteps);
@@ -210,18 +235,19 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
args->state = ncclProxyOpProgress;
}
if (args->state == ncclProxyOpProgress) {
+ int p = args->protocol;
+ int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
+ char* localBuff = args->connector->conn.buffs[p];
+ void* mhandle = *(resources->mhandlesProto[p]);
args->idle = 1;
if (args->head < args->end) {
+ int buffSlot = args->tail%NCCL_STEPS;
if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
- volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
- volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
+ volatile int* sizesFifo = resources->recvMem->sizesFifo;
+ volatile uint64_t* recvTail = &resources->recvMem->tail;
if (args->protocol == NCCL_PROTO_LL128) {
- int stepSize = NCCL_LL128_BUFF_SIZE/NCCL_STEPS;
if (args->tail < *recvTail) {
- int buffSlot = args->tail%NCCL_STEPS;
if (sizesFifo[buffSlot] != -1) {
- struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
- char* localBuff = (char*)localMem->ll128Buff;
int ready = resources->useGdr;
if (!ready) {
// When data is in sysmem, we need to wait until all flags are correct since the GPU only
@@ -236,7 +262,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
}
if (ready) {
// Send through network
- NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], resources->ll128Mhandle, args->requests+buffSlot));
+ NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
sizesFifo[buffSlot] = -1;
// Make sure size is reset to zero before we update the head.
@@ -248,13 +274,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
}
}
} else if (args->protocol == NCCL_PROTO_LL) {
- int buffSlot = args->tail%NCCL_STEPS;
int size = sizesFifo[buffSlot];
if (size != -1) {
uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
size = nFifoLines * sizeof(union ncclLLFifoLine);
- union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
+ union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
int ready = 1;
for (int i=0; i<nFifoLines; i++) {
volatile uint32_t *f1 = &lines[i].flag1;
@@ -262,7 +287,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
}
if (ready) {
- NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, resources->llMhandle, args->requests+buffSlot));
+ NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
sizesFifo[buffSlot] = -1;
// Make sure size is reset to zero before we update the head.
@@ -273,12 +298,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
}
}
} else if (args->tail < *recvTail) {
- int stepSize = args->channel->buffSize/NCCL_STEPS;
- struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
// Send through network
- int buffSlot = args->tail%NCCL_STEPS;
if (sizesFifo[buffSlot] != -1) {
- NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
+ NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
sizesFifo[buffSlot] = -1;
// Make sure size is reset to zero before we update the head.
@@ -295,7 +317,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
if (done) {
args->head += args->sliceSteps;
- resources->hostSendMem->head = args->head;
+ resources->sendMem->head = args->head;
args->idle = 0;
}
}
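
Every proxy progress loop in this file indexes its connection buffer the same way: the per-protocol buffer is split into NCCL_STEPS equal slots, and a step counter selects its slot modulo the slot count. A small sketch of that indexing, with STEPS standing in for NCCL_STEPS and slotPtr being an illustrative helper rather than an NCCL function:

#include <stdint.h>

#define STEPS 8  /* stand-in for NCCL_STEPS */

/* Return the address of the buffer slot used by the given step counter. */
static inline char* slotPtr(char* localBuff, int buffSize, uint64_t step) {
  int stepSize = buffSize / STEPS;      /* bytes per slot */
  int buffSlot = (int)(step % STEPS);   /* slot is reused every STEPS steps */
  return localBuff + buffSlot * stepSize;
}
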
@@ -313,7 +335,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
struct netRecvResources* resources = (struct netRecvResources*) (args->connector->transportResources);
if (args->state == ncclProxyOpReady) {
// Update opCount
- resources->hostSendMem->opCount = args->opCount;
+ resources->sendMem->opCount = args->opCount;
// Round to next multiple of sliceSteps
resources->step = ROUNDUP(resources->step, args->chunkSteps);
@@ -324,12 +346,12 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
}
if (args->state == ncclProxyOpProgress) {
args->idle = 1;
- int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : args->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
+ int p = args->protocol;
+ int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
+ char* localBuff = args->connector->conn.buffs[p];
+ void* mhandle = *(resources->mhandlesProto[p]);
if (args->head < args->end) {
- struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
- char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)localMem->llBuff : args->protocol == NCCL_PROTO_LL128 ? (char*)localMem->ll128Buff : localMem->buff;
- void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : args->protocol == NCCL_PROTO_LL128 ? resources->ll128Mhandle : resources->mhandle;
- volatile uint64_t* sendHead = &resources->hostSendMem->head;
+ volatile uint64_t* sendHead = &resources->sendMem->head;
if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
int buffSlot = args->tail%NCCL_STEPS;
int sliceSize = stepSize * args->sliceSteps;
@@ -347,7 +369,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
args->head += args->sliceSteps;
if (args->protocol == NCCL_PROTO_SIMPLE) {
if (resources->useGdr) NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle));
- resources->hostRecvMem->tail = args->head;
+ resources->recvMem->tail = args->head;
}
args->idle = 0;
}
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index 1a832f2..97eca9f 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -112,6 +112,7 @@ static int ncclIbSpeed(int speed) {
}
ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
+ static int shownIbHcaEnv = 0;
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
if (ncclParamIbDisable()) return ncclInternalError;
@@ -131,6 +132,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
// Check if user defined which IB device:port to use
char* userIbEnv = getenv("NCCL_IB_HCA");
+ if (userIbEnv != NULL && shownIbHcaEnv++ == 0) INFO(NCCL_NET|NCCL_ENV, "NCCL_IB_HCA set to %s", userIbEnv);
struct netIf userIfs[MAX_IB_DEVS];
bool searchNot = userIbEnv && userIbEnv[0] == '^';
if (searchNot) userIbEnv++;
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 6586ce7..15816ce 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -10,6 +10,7 @@
struct p2pConnectInfo {
int direct;
+ int read;
union {
void* directPtr;
cudaIpcMemHandle_t devIpc;
@@ -54,7 +55,8 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
}
// Check topology / p2p level.
- NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret));
+ int read;
+ NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, &read));
if (*ret == 0) return ncclSuccess;
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
@@ -95,23 +97,44 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
} while (0)
+// Setting this to a non-zero value causes P2P to use Reads rather than Writes
+NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
+
+static int p2pUseRead(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+ int readEnable = ncclParamP2pReadEnable();
+ if (readEnable != -2) return readEnable;
+
+ int p2p, read;
+  // Query the topology to check whether the GPUs are Ampere and
+  // connected via NVLink; if so, enable P2P Read by default
+ NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, &read));
+
+ return read;
+}
+
/* Send: Create and return connect structures for this peer to connect to me */
ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
- struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+ struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
struct p2pSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
+ int useRead = p2pUseRead(topo, myInfo, peerInfo);
int sendSize = sizeof(struct ncclSendMem);
+  // For P2P Read, the SIMPLE buffer is tacked onto the end of the ncclSendMem structure
+ if (useRead) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
struct p2pConnectInfo info;
+ info.read = useRead;
+ const char* useReadStr = info.read ? "/read" : "";
if (myInfo->pidHash == peerInfo->pidHash) {
info.direct = 1;
info.directPtr = resources->devMem;
if (myInfo->cudaDev == peerInfo->cudaDev) {
- INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%d] -> %d[%d] via P2P/common device%s",
+ channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, useReadStr);
return ncclInternalError;
} else {
// Enable P2P access
@@ -123,8 +146,8 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
return ncclInternalError;
}
- INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer",
- channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+ INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
+ channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
}
} else {
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
@@ -137,8 +160,8 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
return ncclInternalError;
}
- INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC",
- channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+ INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
+ channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
//TRACE_DUMP_IPC(&info.devIpc);
}
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
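
p2pUseRead above layers an explicit override on top of the topology decision: the NCCL_P2P_READ_ENABLE knob declared via NCCL_PARAM defaults to -2, meaning "decide automatically", in which case the Ampere/NVLink answer from ncclTopoCheckP2p is used. A rough, hedged equivalent of that pattern (readEnableFromEnv and useRead are illustrative helpers; the real NCCL_PARAM macro also caches the parsed value):

#include <stdlib.h>

static int readEnableFromEnv(void) {
  const char* env = getenv("NCCL_P2P_READ_ENABLE");
  return env ? atoi(env) : -2;                 /* -2 = decide automatically */
}

static int useRead(int topoSaysRead) {
  int v = readEnableFromEnv();
  return (v != -2) ? (v != 0) : topoSaysRead;  /* explicit setting overrides topology */
}
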
@@ -148,16 +171,20 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
/* Create and return connect structures for this peer to connect to me */
ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
- struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
+ struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId) {
struct p2pRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
- int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+ int useRead = p2pUseRead(topo, myInfo, peerInfo);
+ int recvSize = offsetof(struct ncclRecvMem, buff);
+  // For P2P Read, the SIMPLE buffer is tacked onto the end of the sender's ncclSendMem structure, so skip it here
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(useRead && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
struct p2pConnectInfo info;
+ info.read = useRead;
if (myInfo->pidHash == peerInfo->pidHash) {
info.direct = 1;
info.directPtr = resources->devMem;
@@ -173,7 +200,7 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
return ncclInternalError;
}
- TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+ TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
}
} else {
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
@@ -186,7 +213,7 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
return ncclInternalError;
}
- TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+ TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
//TRACE_DUMP_IPC(&info.devIpc);
}
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -201,7 +228,7 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks,
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
if (info->direct) {
remDevMem = (struct ncclRecvMem*)(info->directPtr);
- send->conn.direct |= NCCL_DIRECT_GPU;
+ if (info->read == 0) send->conn.direct |= NCCL_DIRECT_GPU;
} else {
//TRACE_DUMP_IPC(&info->devIpc);
cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
@@ -213,9 +240,16 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks,
}
}
- send->conn.buff = remDevMem->buff;
- send->conn.llBuff = remDevMem->llBuff;
- send->conn.ll128Buff = remDevMem->ll128Buff;
+ int offset = 0;
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ if (info->read && p == NCCL_PROTO_SIMPLE) {
+ /* For P2P Read the SIMPLE buffer is local (ncclSendMem) */
+ send->conn.buffs[p] = resources->devMem->buff;
+ } else {
+ send->conn.buffs[p] = remDevMem->buff + offset;
+ offset += send->comm->buffSizes[p];
+ }
+ }
send->conn.tail = &remDevMem->tail;
send->conn.opCountRem = &remDevMem->opCount;
send->conn.head = &resources->devMem->head;
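
The loop above encodes the read/write asymmetry: with P2P writes every protocol buffer lives in the remote (receive-side) ncclRecvMem, while with P2P reads the SIMPLE buffer lives in the sender's own ncclSendMem and the receiver pulls from it, so the two peers pick opposite pointers for that one protocol. A simplified sketch under assumed types (conn, pickSendBuffs and the hard-coded protocol count are illustrative only):

struct conn { char* buffs[3]; };

/* Send-side buffer selection: SIMPLE stays local when reads are enabled. */
static void pickSendBuffs(struct conn* c, char* localSimpleBuff, char* remoteBase,
                          const int sizes[3], int useRead) {
  int offset = 0;
  for (int p = 0; p < 3; p++) {
    if (useRead && p == 2 /* SIMPLE */) {
      c->buffs[p] = localSimpleBuff;           /* peer will read from our memory */
    } else {
      c->buffs[p] = remoteBase + offset;       /* we write into the peer's memory */
      offset += sizes[p];
    }
  }
}
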
@@ -231,8 +265,10 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
if (info->direct) {
remDevMem = (struct ncclSendMem*)(info->directPtr);
- recv->conn.direct |= NCCL_DIRECT_GPU;
- recv->conn.ptrExchange = &remDevMem->ptrExchange;
+ if (info->read == 0) {
+ recv->conn.direct |= NCCL_DIRECT_GPU;
+ recv->conn.ptrExchange = &remDevMem->ptrExchange;
+ }
} else {
//TRACE_DUMP_IPC(&info->devIpc);
cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
@@ -244,9 +280,16 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
}
}
- recv->conn.buff = resources->devMem->buff;
- recv->conn.llBuff = resources->devMem->llBuff;
- recv->conn.ll128Buff = resources->devMem->ll128Buff;
+ int offset = 0;
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ if (info->read && p == NCCL_PROTO_SIMPLE) {
+ /* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */
+ recv->conn.buffs[p] = remDevMem->buff;
+ } else {
+ recv->conn.buffs[p] = resources->devMem->buff + offset;
+ offset += recv->comm->buffSizes[p];
+ }
+ }
recv->conn.tail = &resources->devMem->tail;
recv->conn.opCountLoc = &resources->devMem->opCount;
recv->conn.head = &remDevMem->head;
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index 0b1d8ee..caac3f6 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -57,7 +57,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
#define MAX_SHM_NAME_LEN 1024
/* Create and return connect structures for this peer to connect to me */
-ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
struct shmSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
@@ -75,13 +75,13 @@ ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
- INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+ INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
return ncclSuccess;
}
-ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
struct shmRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
@@ -94,7 +94,9 @@ ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
char shmName[MAX_SHM_NAME_LEN];
sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
- info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+ int shmSize = offsetof(struct ncclRecvMem, buff);
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
+ info.shmSize = resources->shmSize = shmSize;
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
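
The shared-memory segment is now sized from the communicator's per-protocol buffer sizes instead of a single buffSize argument: the header portion of ncclRecvMem (everything up to its buff member) plus one buffer region per protocol. A minimal sketch of that computation with a stand-in header type (recvMemHdr is illustrative, not NCCL's ncclRecvMem):

#include <stddef.h>

struct recvMemHdr { unsigned long long tail, opCount; int sizesFifo[8]; char buff[1]; };

static size_t shmSegmentSize(const int buffSizes[], int nProtos) {
  size_t size = offsetof(struct recvMemHdr, buff);         /* header up to buff[] */
  for (int p = 0; p < nProtos; p++) size += buffSizes[p];  /* one region per protocol */
  return size;
}
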
@@ -118,9 +120,11 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
NCCLCHECK(shmUnlink(shmName));
send->transportResources = resources;
- send->conn.buff = resources->devRemHostMem->buff;
- send->conn.llBuff = resources->devRemHostMem->llBuff;
- send->conn.ll128Buff = resources->devRemHostMem->ll128Buff;
+ int offset = 0;
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ send->conn.buffs[p] = resources->devRemHostMem->buff + offset;
+ offset += send->comm->buffSizes[p];
+ }
send->conn.tail = &resources->devRemHostMem->tail;
send->conn.opCountRem = &resources->devRemHostMem->opCount;
@@ -143,9 +147,11 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
recv->conn.head = &resources->devRemHostMem->head;
recv->conn.opCountRem = &resources->devRemHostMem->opCount;
- recv->conn.buff = resources->devHostMem->buff;
- recv->conn.llBuff = resources->devHostMem->llBuff;
- recv->conn.ll128Buff = resources->devHostMem->ll128Buff;
+ int offset = 0;
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ recv->conn.buffs[p] = resources->devHostMem->buff + offset;
+ offset += recv->comm->buffSizes[p];
+ }
recv->conn.tail = &resources->devHostMem->tail;
recv->conn.opCountLoc = &resources->devHostMem->opCount;
return ncclSuccess;