github.com/marian-nmt/nccl.git
Diffstat (limited to 'src/collectives')
 -rw-r--r--  src/collectives/all_gather.cc            |    5
 -rw-r--r--  src/collectives/all_reduce.cc            |    5
 -rw-r--r--  src/collectives/broadcast.cc             |    5
 -rw-r--r--  src/collectives/device/Makefile          |    4
 -rw-r--r--  src/collectives/device/all_gather.cu     |    4
 -rw-r--r--  src/collectives/device/all_gather.h      |  369
 -rw-r--r--  src/collectives/device/all_reduce.cu     |    4
 -rw-r--r--  src/collectives/device/all_reduce.h      |  870
 -rw-r--r--  src/collectives/device/broadcast.cu      |    4
 -rw-r--r--  src/collectives/device/broadcast.h       |  278
 -rw-r--r--  src/collectives/device/common.h          |  204
 -rw-r--r--  src/collectives/device/common_kernel.h   |  174
 -rw-r--r--  src/collectives/device/functions.cu      |  103
 -rwxr-xr-x  src/collectives/device/gen_rules.sh      |    4
 -rw-r--r--  src/collectives/device/primitives.h      |  360
 -rw-r--r--  src/collectives/device/prims_ll.h        |   46
 -rw-r--r--  src/collectives/device/prims_ll128.h     |   47
 -rw-r--r--  src/collectives/device/reduce.cu         |    4
 -rw-r--r--  src/collectives/device/reduce.h          |  264
 -rw-r--r--  src/collectives/device/reduce_scatter.cu |    4
 -rw-r--r--  src/collectives/device/reduce_scatter.h  |  352
 -rw-r--r--  src/collectives/device/sendrecv.cu       |   11
 -rw-r--r--  src/collectives/device/sendrecv.h        |   92
 -rw-r--r--  src/collectives/reduce.cc                |    5
 -rw-r--r--  src/collectives/reduce_scatter.cc        |    5
 -rw-r--r--  src/collectives/sendrecv.cc              |   39
 26 files changed, 1817 insertions(+), 1445 deletions(-)
diff --git a/src/collectives/all_gather.cc b/src/collectives/all_gather.cc
index 348c176..266fd5a 100644
--- a/src/collectives/all_gather.cc
+++ b/src/collectives/all_gather.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,7 +11,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
- struct ncclInfo info = { ncclCollAllGather, "AllGather",
+ NVTX3_FUNC_RANGE_IN(nccl_domain);
+ struct ncclInfo info = { ncclFuncAllGather, "AllGather",
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
return ncclEnqueueCheck(&info);
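
The hunk above (and the matching ones in all_reduce.cc and broadcast.cc below) adds an NVTX range at the top of each public collective so profilers such as Nsight Systems can attribute host time to the call. A minimal sketch of the same pattern with the NVTX v3 C++ header, using an illustrative domain and function rather than NCCL's actual nccl_domain:

#include "nvtx3.hpp"   // NVTX v3 C++ header; install path may vary by toolkit

// Illustrative domain tag; NCCL defines its own nccl_domain elsewhere.
struct my_domain { static constexpr char const* name = "my_collectives"; };

void myAllGatherLaunch() {
  NVTX3_FUNC_RANGE_IN(my_domain);  // opens a range named after this function in the
                                   // "my_collectives" domain; closes at scope exit
  // ... fill in an info struct and enqueue device work on a CUDA stream ...
}
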
diff --git a/src/collectives/all_reduce.cc b/src/collectives/all_reduce.cc
index 7796d5b..b67f3be 100644
--- a/src/collectives/all_reduce.cc
+++ b/src/collectives/all_reduce.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,7 +10,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
- struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
+ NVTX3_FUNC_RANGE_IN(nccl_domain);
+ struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
return ncclEnqueueCheck(&info);
diff --git a/src/collectives/broadcast.cc b/src/collectives/broadcast.cc
index 042301b..db0fb49 100644
--- a/src/collectives/broadcast.cc
+++ b/src/collectives/broadcast.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,7 +11,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
- struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
+ NVTX3_FUNC_RANGE_IN(nccl_domain);
+ struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
return ncclEnqueueCheck(&info);
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
index 001059c..3796fb1 100644
--- a/src/collectives/device/Makefile
+++ b/src/collectives/device/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -10,7 +10,7 @@ include ../../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../../build)
OBJDIR := $(BUILDDIR)/obj/collectives/device
-LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
+LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu
LIBSRCFILES += functions.cu
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu
index 109c341..4022e2e 100644
--- a/src/collectives/device/all_gather.cu
+++ b/src/collectives/device/all_gather.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
-IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
+IMPL_COLL_C(AllGather);
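
The .cu files now pass only the collective name: IMPL_COLL_C(AllGather) derives the kernel instantiations from that single token instead of spelling out both the function and the collective enum. Purely as an illustration of the token-pasting pattern (the real macro expands to the full set of device kernels NCCL needs; every name below is hypothetical):

#include <cstddef>

struct myWorkElem { const void* sendbuff; void* recvbuff; size_t count; };

// Hypothetical, simplified stand-in for a name-based instantiation macro.
#define MY_IMPL_COLL(coll)                                                \
  __global__ void my##coll##RingSimpleKernel(struct myWorkElem args) {}   \
  __global__ void my##coll##RingLLKernel(struct myWorkElem args) {}

MY_IMPL_COLL(AllGather)  // defines myAllGatherRingSimpleKernel and myAllGatherRingLLKernel
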
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index 0ad5ba9..e057dc8 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,188 +8,201 @@
#include "primitives.h"
#include "collectives.h"
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int nthreads = args->nThreads-WARP_SIZE;
- const int bid = args->bid;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
- const ssize_t size = args->N;
- const int nranks = comm->nRanks;
- const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
- const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
- const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
- ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t chunkOffset = gridOffset + bid*realChunkSize;
-
- /////////////// begin AllGather steps ///////////////
- ssize_t offset;
- int nelem = min(realChunkSize, size-chunkOffset);
- int rankDest;
-
- // step 0: push data to next GPU
- rankDest = ring->devUserRanks[0];
- offset = chunkOffset + rankDest * size;
-
- if (thisInput + chunkOffset == thisOutput + offset) { // In place
- prims.directSend(thisInput+chunkOffset, offset, nelem);
- } else {
- prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads-WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+ const ssize_t size = args->coll.count;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t chunkOffset = gridOffset + bid*realChunkSize;
+
+ /////////////// begin AllGather steps ///////////////
+ ssize_t offset;
+ int nelem = min(realChunkSize, size-chunkOffset);
+ int rankDest;
+
+ // step 0: push data to next GPU
+ rankDest = ring->devUserRanks[0];
+ offset = chunkOffset + rankDest * size;
+
+ if (thisInput + chunkOffset == thisOutput + offset) { // In place
+ prims.directSend(thisInput+chunkOffset, offset, nelem);
+ } else {
+ prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
+ }
+
+ // k-2 steps: copy to next GPU
+ for (int j=1; j<nranks-1; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + rankDest * size;
+
+ prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+ }
+
+ // Make final copy from buffer to dest.
+ rankDest = ring->devUserRanks[1];
+ offset = chunkOffset + rankDest * size;
+
+ // Final wait/copy.
+ prims.directRecv(thisOutput+offset, offset, nelem);
+ }
}
-
- // k-2 steps: copy to next GPU
- for (int j=1; j<nranks-1; ++j) {
- rankDest = ring->devUserRanks[nranks-j];
- offset = chunkOffset + rankDest * size;
-
- prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ if (size-gridOffset < loopSize) {
+ chunkSize = args->coll.lastChunkSize;
+ }
+ ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+ /////////////// begin AllGather steps ///////////////
+ ssize_t offset;
+ int nelem = min(chunkSize, size-chunkOffset);
+ int rankDest;
+
+ // step 0: push data to next GPU
+ rankDest = ring->devUserRanks[0];
+ offset = chunkOffset + rankDest * size;
+
+ if (thisInput + chunkOffset == thisOutput + offset) { // In place
+ LLprims.send(thisInput+chunkOffset, nelem);
+ } else {
+ LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
+ }
+
+ // k-2 steps: copy to next GPU
+ for (int j=1; j<nranks-1; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
+
+ // step k-1: final store
+ rankDest = ring->devUserRanks[1];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.recv(thisOutput+offset, nelem);
+ }
}
-
- // Make final copy from buffer to dest.
- rankDest = ring->devUserRanks[1];
- offset = chunkOffset + rankDest * size;
-
- // Final wait/copy.
- prims.directRecv(thisOutput+offset, offset, nelem);
- }
-}
-
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
-
- ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- //const int rank = comm->rank;
- const int nranks = comm->nRanks;
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- if (size-gridOffset < loopSize) {
- chunkSize = args->lastChunkSize;
- }
- ssize_t chunkOffset = gridOffset + bid*chunkSize;
-
- /////////////// begin AllGather steps ///////////////
- ssize_t offset;
- int nelem = min(chunkSize, size-chunkOffset);
- int rankDest;
-
- // step 0: push data to next GPU
- rankDest = ring->devUserRanks[0];
- offset = chunkOffset + rankDest * size;
-
- if (thisInput + chunkOffset == thisOutput + offset) { // In place
- LLprims.send(thisInput+chunkOffset, nelem);
- } else {
- LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
- }
-
- // k-2 steps: copy to next GPU
- for (int j=1; j<nranks-1; ++j) {
- rankDest = ring->devUserRanks[nranks-j];
- offset = chunkOffset + rankDest * size;
-
- LLprims.recvCopySend(thisOutput+offset, nelem);
- }
-
- // step k-1: final store
- rankDest = ring->devUserRanks[1];
- offset = chunkOffset + rankDest * size;
-
- LLprims.recv(thisOutput+offset, nelem);
- }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
+};
#include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
-
- ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- //const int rank = comm->rank;
- const int nranks = comm->nRanks;
- ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
- // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
- const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
-
- const ssize_t loopSize = args->nChannels*chunkSize;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
-
- ssize_t chunkOffset = gridOffset + bid*chunkSize;
-
- /////////////// begin AllGather steps ///////////////
- ssize_t offset;
- int nelem = min(chunkSize, size-chunkOffset);
- int rankDest;
-
- // step 0: push data to next GPU
- rankDest = ring->devUserRanks[0];
- offset = chunkOffset + rankDest * size;
-
- if (thisInput + chunkOffset == thisOutput + offset) { // In place
- LLprims.send(thisInput+chunkOffset, nelem);
- } else {
- LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
- }
-
- // k-2 steps: copy to next GPU
- for (int j=1; j<nranks-1; ++j) {
- rankDest = ring->devUserRanks[nranks-j];
- offset = chunkOffset + rankDest * size;
-
- LLprims.recvCopySend(thisOutput+offset, nelem);
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+ ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+ // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
+
+ ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+ /////////////// begin AllGather steps ///////////////
+ ssize_t offset;
+ int nelem = min(chunkSize, size-chunkOffset);
+ int rankDest;
+
+ // step 0: push data to next GPU
+ rankDest = ring->devUserRanks[0];
+ offset = chunkOffset + rankDest * size;
+
+ if (thisInput + chunkOffset == thisOutput + offset) { // In place
+ LLprims.send(thisInput+chunkOffset, nelem);
+ } else {
+ LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
+ }
+
+ // k-2 steps: copy to next GPU
+ for (int j=1; j<nranks-1; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
+
+ // step k-1: final store
+ rankDest = ring->devUserRanks[1];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.recv(thisOutput+offset, nelem);
+ }
}
+};
- // step k-1: final store
- rankDest = ring->devUserRanks[1];
- offset = chunkOffset + rankDest * size;
+template<int PROTO, class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_TREE, PROTO, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {}
+};
- LLprims.recv(thisOutput+offset, nelem);
- }
-}
+template<int PROTO, class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllGather, NCCL_ALGO_COLLNET, PROTO, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {}
+};
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
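
Across all three protocols the ring AllGather follows the dataflow spelled out in the comments above: step 0 pushes the local chunk to the next GPU, k-2 steps receive a chunk, store it, and forward it, and the last step only receives. A plain C++ host-side sketch of that dataflow (names and buffer layout are illustrative, not NCCL's):

#include <algorithm>
#include <cstddef>
#include <vector>

void ringAllGather(std::vector<std::vector<float>>& out,        // out[r]: nranks*chunk floats
                   const std::vector<std::vector<float>>& in,   // in[r]:  chunk floats
                   size_t chunk) {
  const int nranks = (int)in.size();
  // Step 0 equivalent: every rank places its own chunk in slot 'rank' (the in-place case above).
  for (int r = 0; r < nranks; ++r)
    std::copy(in[r].begin(), in[r].end(), out[r].begin() + r*chunk);
  // k-1 forwarding steps: at step j, rank r picks up from its ring predecessor the chunk
  // that originated j+1 ranks upstream.
  for (int j = 0; j < nranks-1; ++j) {
    for (int r = 0; r < nranks; ++r) {
      int src    = (r - 1 + nranks) % nranks;    // previous rank in the ring
      int origin = (src - j + nranks) % nranks;  // whose chunk is being forwarded
      std::copy(out[src].begin() + origin*chunk,
                out[src].begin() + (origin+1)*chunk,
                out[r].begin() + origin*chunk);
    }
  }
}

Within a step no rank reads the slot its predecessor writes in that same step, which is what lets the device code keep every rank's send and receive in flight simultaneously.
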
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
index 85d007e..e7c3c28 100644
--- a/src/collectives/device/all_reduce.cu
+++ b/src/collectives/device/all_reduce.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
-IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
+IMPL_COLL_R(AllReduce);
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index 2449c2b..fe2e6fc 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,398 +8,442 @@
#include "primitives.h"
#include "collectives.h"
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int nthreads = args->nThreads-WARP_SIZE;
- const int bid = args->bid;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
- const ssize_t size = args->N;
- const int nranks = comm->nRanks;
- const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
- const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
- const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
- int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
- ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
-
- /////////////// begin AllReduce steps ///////////////
- ssize_t offset;
- int nelem;
- int slice;
-
- // step 0: push data to next GPU
- slice = ring->devUserRanks[nranks-1];
- offset = chunkOffset + slice * realChunkSize;
- nelem = min(realChunkSize, size-offset);
-
- prims.send(thisInput+offset, nelem);
-
- // k-2 steps: reduce and copy to next GPU
- for (int j=2; j<nranks; ++j) {
- slice = ring->devUserRanks[nranks-j];
- offset = chunkOffset + slice * realChunkSize;
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads-WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+ const ssize_t size = args->coll.count;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
+ ssize_t realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
+
+ /////////////// begin AllReduce steps ///////////////
+ ssize_t offset;
+ int nelem;
+ int chunk;
+
+ // step 0: push data to next GPU
+ chunk = ring->devUserRanks[nranks-1];
+ offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
- prims.recvReduceSend(thisInput+offset, nelem);
- }
+ prims.send(thisInput+offset, nelem);
- // step k-1: reduce this buffer and data, which will produce the final
- // result that we store in this data and push to the next GPU
- slice = ring->devUserRanks[0];
- offset = chunkOffset + slice * realChunkSize;
- nelem = min(realChunkSize, size-offset);
+ // k-2 steps: reduce and copy to next GPU
+ for (int j=2; j<nranks; ++j) {
+ chunk = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + chunk * realChunkSize;
+ nelem = min(realChunkSize, size-offset);
- prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
+ prims.recvReduceSend(thisInput+offset, nelem);
+ }
- // k-2 steps: copy to next GPU
- for (int j=1; j<nranks-1; ++j) {
- slice = ring->devUserRanks[nranks-j];
- offset = chunkOffset + slice * realChunkSize;
+ // step k-1: reduce this buffer and data, which will produce the final
+ // result that we store in this data and push to the next GPU
+ chunk = ring->devUserRanks[0];
+ offset = chunkOffset + chunk * realChunkSize;
nelem = min(realChunkSize, size-offset);
- prims.directRecvCopySend(thisOutput+offset, offset, nelem);
- }
+ prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
- // Make final copy from buffer to dest.
- slice = ring->devUserRanks[1];
- offset = chunkOffset + slice * realChunkSize;
- nelem = min(realChunkSize, size-offset);
+ // k-2 steps: copy to next GPU
+ for (int j=1; j<nranks-1; ++j) {
+ chunk = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + chunk * realChunkSize;
+ nelem = min(realChunkSize, size-offset);
- // Final wait/copy.
- prims.directRecv(thisOutput+offset, offset, nelem);
- }
-}
-
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int nthreads = args->nThreads-WARP_SIZE;
- const int bid = args->bid;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- const ssize_t size = args->N;
- const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
- int chunkSize = args->lastChunkSize;
- const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
-
- if (loopSize > size) {
- chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+ }
+
+ // Make final copy from buffer to dest.
+ chunk = ring->devUserRanks[1];
+ offset = chunkOffset + chunk * realChunkSize;
+ nelem = min(realChunkSize, size-offset);
+
+ // Final wait/copy.
+ prims.directRecv(thisOutput+offset, offset, nelem);
+ }
}
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads-2*WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclTree* tree = &channel->tree;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+ int chunkSize = args->coll.lastChunkSize;
+ const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
+ }
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
- do {
- struct ncclTree* tree = &channel->treeUp;
- // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
- ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- // Up
- ssize_t offset = gridOffset + bid*chunkSize;
- int nelem = min(chunkSize, size-offset);
- if (tree->up == -1) {
- prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
- } else if (tree->down[0] == -1) {
- prims.send(thisInput+offset, nelem);
- } else {
- prims.recvReduceSend(thisInput+offset, nelem);
+#if 1
+ if (tid < nthreads+WARP_SIZE) {
+ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+ ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DEV_ARITY, 1, 0, FUNC>
+ prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.send(thisInput+offset, nelem);
+ } else {
+ prims.recvReduceSend(thisInput+offset, nelem);
+ }
}
}
- } while(0);
- do {
- struct ncclTree* tree = &channel->treeDn;
- // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
- ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- // Down
- ssize_t offset = gridOffset + bid*chunkSize;
- int nelem = min(chunkSize, size-offset);
- if (tree->up == -1) {
- prims.send(thisOutput+offset, nelem);
- } else if (tree->down[0] == -1) {
- prims.recv(thisOutput+offset, nelem);
+ if (tid < nthreads+WARP_SIZE) {
+ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+ ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_DEV_ARITY, 1, FUNC>
+ prims(tid, nthreads, &tree->up, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.directSend(thisOutput+offset, offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.directRecv(thisOutput+offset, offset, nelem);
+ } else {
+ prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+ }
+ }
+ }
+#else
+ int nthreadsSplit = nthreads/2;
+ if (nthreadsSplit == 256) nthreadsSplit += 64;
+ if (tree->up == -1) {
+ if (tid < nthreads+WARP_SIZE) {
+ // ReduceAndBroadcast : max number of recv is 3, max number of send is 3
+ ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DEV_ARITY, NCCL_MAX_DEV_ARITY, 1, FUNC>
+ prims(tid, nthreads, tree->down, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
+ }
+ }
+ } else {
+ if (tid < nthreadsSplit + WARP_SIZE) {
+ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+ ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_DEV_ARITY, 1, 0, FUNC>
+ prims(tid, nthreadsSplit, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->down[0] == -1) {
+ prims.send(thisInput+offset, nelem);
+ } else {
+ prims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
} else {
- prims.recvCopySend(thisOutput+offset, nelem);
+ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+ ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_DEV_ARITY, 1, FUNC>
+ prims(tid-nthreadsSplit-WARP_SIZE, nthreads-nthreadsSplit, &tree->up, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 2);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->down[0] == -1) {
+ prims.directRecv(thisOutput+offset, offset, nelem);
+ } else {
+ prims.directRecvCopySend(thisOutput+offset, offset, nelem);
+ }
+ }
}
}
- } while(0);
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
-
- ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- //const int rank = comm->rank;
- const int nranks = comm->nRanks;
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
-
- const ssize_t loopSize = args->nChannels*nranks*chunkSize;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
-
- /////////////// begin AllReduce steps ///////////////
- ssize_t offset;
- int nelem;
- int slice;
-
- // step 0: push data to next GPU
- slice = ring->devUserRanks[nranks-1];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
- nelem = min(chunkSize, size-offset);
-
- LLprims.send(thisInput+offset, nelem);
-
- // k-2 steps: reduce and copy to next GPU
- for (int j=2; j<nranks; ++j) {
- slice = ring->devUserRanks[nranks-j];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
- nelem = min(chunkSize, size-offset);
-
- LLprims.recvReduceSend(thisInput+offset, nelem);
+#endif
+ }
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads-WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclTree* tree = &channel->collTree;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+ int chunkSize = args->coll.lastChunkSize;
+ const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
}
- // step k-1: reduce this buffer and data, which will produce the final
- // result that we store in this data and push to the next GPU
- slice = ring->devUserRanks[0];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
- nelem = min(chunkSize, size-offset);
-
- LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
- // k-2 steps: copy to next GPU
- for (int j=1; j<nranks-1; ++j) {
- slice = ring->devUserRanks[nranks-j];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
- nelem = min(chunkSize, size-offset);
+ if (blockIdx.x < nChannels) { // first half of the channels do reduce
+ ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC>
+ prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.send(thisInput+offset, nelem);
+ } else {
+ prims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ }
- LLprims.recvCopySend(thisOutput+offset, nelem);
+ if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
+ ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC>
+ prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.send(thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.recv(thisOutput+offset, nelem);
+ } else {
+ prims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
}
+ }
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+ const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*nranks*chunkSize;
+ const ssize_t size = args->coll.count;
+
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
- // Make final copy from buffer to dest.
- slice = ring->devUserRanks[1];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
- nelem = min(chunkSize, size-offset);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
- // Here we need to copy from buffer to this output.
- LLprims.recv(thisOutput+offset, nelem);
- }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->bid;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- const ssize_t size = args->N;
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
-
- if (loopSize > size) {
- chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
- }
+ /////////////// begin AllReduce steps ///////////////
+ ssize_t offset;
+ int nelem;
+ int chunk;
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ // step 0: push data to next GPU
+ chunk = ring->devUserRanks[nranks-1];
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+ nelem = min(chunkSize, size-offset);
- do {
- struct ncclTree* tree = &channel->treeUp;
- // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
- ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- // Up
- ssize_t offset = gridOffset + bid*chunkSize;
- int nelem = min(chunkSize, size-offset);
- if (tree->up == -1) {
- LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
- } else if (tree->down[0] == -1) {
- LLprims.send(thisInput+offset, nelem);
- } else {
- LLprims.recvReduceSend(thisInput+offset, nelem);
- }
- }
- } while(0);
+ LLprims.send(thisInput+offset, nelem);
- do {
- struct ncclTree* tree = &channel->treeDn;
- // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
- ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- // Down
- ssize_t offset = gridOffset + bid*chunkSize;
- int nelem = min(chunkSize, size-offset);
- if (tree->up == -1) {
- LLprims.send(thisOutput+offset, nelem);
- } else if (tree->down[0] == -1) {
- LLprims.recv(thisOutput+offset, nelem);
- } else {
- LLprims.recvCopySend(thisOutput+offset, nelem);
+ // k-2 steps: reduce and copy to next GPU
+ for (int j=2; j<nranks; ++j) {
+ chunk = ring->devUserRanks[nranks-j];
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+ nelem = min(chunkSize, size-offset);
+
+ LLprims.recvReduceSend(thisInput+offset, nelem);
}
- }
- } while(0);
-}
-#include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
-
- ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- //const int rank = comm->rank;
- const int nranks = comm->nRanks;
- ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
- // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
- const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
-
- const ssize_t loopSize = args->nChannels*nranks*chunkSize;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
-
- /////////////// begin AllReduce steps ///////////////
- ssize_t offset;
- int nelem;
- int slice;
-
- // step 0: push data to next GPU
- slice = ring->devUserRanks[nranks-1];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
- nelem = min(chunkSize, size-offset);
-
- LLprims.send(thisInput+offset, nelem);
-
- // k-2 steps: reduce and copy to next GPU
- for (int j=2; j<nranks; ++j) {
- slice = ring->devUserRanks[nranks-j];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ // step k-1: reduce this buffer and data, which will produce the final
+ // result that we store in this data and push to the next GPU
+ chunk = ring->devUserRanks[0];
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
- LLprims.recvReduceSend(thisInput+offset, nelem);
- }
+ LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
- // step k-1: reduce this buffer and data, which will produce the final
- // result that we store in this data and push to the next GPU
- slice = ring->devUserRanks[0];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
- nelem = min(chunkSize, size-offset);
+ // k-2 steps: copy to next GPU
+ for (int j=1; j<nranks-1; ++j) {
+ chunk = ring->devUserRanks[nranks-j];
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+ nelem = min(chunkSize, size-offset);
- LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
- // k-2 steps: copy to next GPU
- for (int j=1; j<nranks-1; ++j) {
- slice = ring->devUserRanks[nranks-j];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
+ // Make final copy from buffer to dest.
+ chunk = ring->devUserRanks[1];
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
nelem = min(chunkSize, size-offset);
- LLprims.recvCopySend(thisOutput+offset, nelem);
+ // Here we need to copy from buffer to this output.
+ LLprims.recv(thisOutput+offset, nelem);
}
-
- // Make final copy from buffer to dest.
- slice = ring->devUserRanks[1];
- offset = gridOffset + (slice*args->nChannels+bid) * chunkSize;
- nelem = min(chunkSize, size-offset);
-
- // Here we need to copy from buffer to this output.
- LLprims.recv(thisOutput+offset, nelem);
- }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->bid;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclTree* treeUp = &channel->treeUp;
- struct ncclTree* treeDn = &channel->treeDn;
- const ssize_t size = args->N;
- ssize_t chunkSize = args->lastChunkSize;
- const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
- const ssize_t loopSize = args->nChannels*chunkSize;
- int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
-
- if (loopSize > size) {
- chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
}
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclTree* tree = &channel->tree;
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+ const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
+ }
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
- if (treeUp->up == -1) {
- // ReduceAndBroadcast : max number of recv is 3, max number of send is 3
- ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, channel, comm, args->opCount);
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- ssize_t offset = gridOffset + bid*chunkSize;
- int nelem = min(chunkSize, size-offset);
- LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
- }
- } else {
- if (tid < nthreadsSplit) {
+ do {
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
- ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, channel, comm, args->opCount);
+ ncclLLPrimitives<T, FUNC, NCCL_MAX_DEV_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
- if (treeUp->down[0] == -1) {
+ if (tree->up == -1) {
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
- } else {
+ } while(0);
+
+ do {
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
- ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, channel, comm, args->opCount);
+ ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_DEV_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ LLprims.send(thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ LLprims.recv(thisOutput+offset, nelem);
+ } else {
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
+ } while(0);
+ }
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclTree* tree = &channel->collTree;
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+ const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
+ }
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ if (blockIdx.x < nChannels) { // first half of the channels do reduce
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ LLprims.send(thisInput+offset, nelem);
+ } else {
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ }
+
+ if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
- if (treeDn->down[0] == -1) {
+ if (tree->up == -1) {
+ LLprims.send(thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
LLprims.recv(thisOutput+offset, nelem);
} else {
LLprims.recvCopySend(thisOutput+offset, nelem);
@@ -407,4 +451,154 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
}
}
}
-}
+};
+
+#include "prims_ll128.h"
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+ ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+ // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*nranks*chunkSize;
+ const ssize_t size = args->coll.count;
+
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
+
+ /////////////// begin AllReduce steps ///////////////
+ ssize_t offset;
+ int nelem;
+ int chunk;
+
+ // step 0: push data to next GPU
+ chunk = ring->devUserRanks[nranks-1];
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+ nelem = min(chunkSize, size-offset);
+
+ LLprims.send(thisInput+offset, nelem);
+
+ // k-2 steps: reduce and copy to next GPU
+ for (int j=2; j<nranks; ++j) {
+ chunk = ring->devUserRanks[nranks-j];
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+ nelem = min(chunkSize, size-offset);
+
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+
+ // step k-1: reduce this buffer and data, which will produce the final
+ // result that we store in this data and push to the next GPU
+ chunk = ring->devUserRanks[0];
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+ nelem = min(chunkSize, size-offset);
+
+ LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
+
+ // k-2 steps: copy to next GPU
+ for (int j=1; j<nranks-1; ++j) {
+ chunk = ring->devUserRanks[nranks-j];
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+ nelem = min(chunkSize, size-offset);
+
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
+
+ // Make final copy from buffer to dest.
+ chunk = ring->devUserRanks[1];
+ offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
+ nelem = min(chunkSize, size-offset);
+
+ // Here we need to copy from buffer to this output.
+ LLprims.recv(thisOutput+offset, nelem);
+ }
+ }
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_TREE, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclTree* tree = &channel->tree;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+ ssize_t chunkSize = args->coll.lastChunkSize;
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
+ const ssize_t loopSize = nChannels*chunkSize;
+ int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
+ const ssize_t size = args->coll.count;
+
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
+ }
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ if (tree->up == -1) {
+ // ReduceAndBroadcast : max number of recv is 3, max number of send is 3
+ ncclLL128Primitives<T, FUNC, NCCL_MAX_DEV_ARITY, NCCL_MAX_DEV_ARITY> LLprims(tid, nthreads, tree->down, tree->down, stepSize, channel, comm);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
+ }
+ } else {
+ if (tid < nthreadsSplit) {
+ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+ ncclLL128Primitives<T, FUNC, NCCL_MAX_DEV_ARITY, 1> LLprims(tid, nthreadsSplit, tree->down, &tree->up, stepSize, channel, comm);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->down[0] == -1) {
+ LLprims.send(thisInput+offset, nelem);
+ } else {
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ } else {
+ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+ ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_DEV_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, stepSize, channel, comm);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->down[0] == -1) {
+ LLprims.recv(thisOutput+offset, nelem);
+ } else {
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
+ }
+ }
+ }
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncAllReduce, NCCL_ALGO_COLLNET, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+ public:
+__device__ void run(struct ncclWorkElem* args) { }
+};
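
For reference, the LL128 ring all-reduce specialization above follows the standard 2*(k-1)-step ring schedule: one send, k-2 recvReduceSend steps, one recvReduceCopySend, k-2 recvCopySend steps, and a final recv. The host-side sketch below is an illustration only, not NCCL code: it uses plain C++ with one element per chunk, replaces the LLprims calls with array reads/writes, and identifies each rank with its ring position so that devUserRanks[i] becomes (r+i) mod k. Under those assumptions it reproduces the same chunk indexing and checks that every rank ends up with the full sum.

// Host-side model of the ring all-reduce chunk schedule (illustration only).
#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  const int k = 4;  // number of ranks
  // in[r][c]: rank r's input for chunk c (one element per chunk in this model).
  std::vector<std::vector<float>> in(k, std::vector<float>(k));
  std::vector<std::vector<float>> out(k, std::vector<float>(k, 0.f));
  for (int r = 0; r < k; r++)
    for (int c = 0; c < k; c++) in[r][c] = float(r + 1);

  std::vector<float> inbox(k), next(k);

  // step 0: each rank pushes chunk devUserRanks[k-1] = (r+k-1)%k to its next rank.
  for (int r = 0; r < k; r++) next[(r + 1) % k] = in[r][(r + k - 1) % k];
  inbox = next;

  // k-2 steps: recvReduceSend on chunk (r+k-j)%k.
  for (int j = 2; j < k; j++) {
    for (int r = 0; r < k; r++) next[(r + 1) % k] = inbox[r] + in[r][(r + k - j) % k];
    inbox = next;
  }

  // step k-1: recvReduceCopySend on chunk r -- the final value lands in out[r][r].
  for (int r = 0; r < k; r++) {
    out[r][r] = inbox[r] + in[r][r];
    next[(r + 1) % k] = out[r][r];
  }
  inbox = next;

  // k-2 steps: recvCopySend on chunk (r+k-j)%k.
  for (int j = 1; j < k - 1; j++) {
    for (int r = 0; r < k; r++) {
      out[r][(r + k - j) % k] = inbox[r];
      next[(r + 1) % k] = inbox[r];
    }
    inbox = next;
  }

  // final step: recv into chunk devUserRanks[1] = (r+1)%k.
  for (int r = 0; r < k; r++) out[r][(r + 1) % k] = inbox[r];

  const float expect = k * (k + 1) / 2.0f;  // sum of the per-rank inputs 1..k
  for (int r = 0; r < k; r++)
    for (int c = 0; c < k; c++) assert(out[r][c] == expect);
  printf("ring all-reduce model: every rank holds %.0f in all %d chunks\n", expect, k);
  return 0;
}
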
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu
index 8c8dbb6..7759585 100644
--- a/src/collectives/device/broadcast.cu
+++ b/src/collectives/device/broadcast.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
-IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast);
+IMPL_COLL_C(Broadcast);
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index de8b989..72216ac 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,143 +8,155 @@
#include "primitives.h"
#include "collectives.h"
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int nthreads = args->nThreads-WARP_SIZE;
- const int bid = args->bid;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
- const ssize_t size = args->N;
- const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
- const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
- const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
- const int rank = ring->devUserRanks[0];
- const int nextRank = ring->devUserRanks[1];
- const int root = args->root;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
- ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t offset = gridOffset + bid*realChunkSize;
- int nelem = min(realChunkSize, size-offset);
-
- if (rank == root) {
- if (thisInput == thisOutput) {
- prims.send(thisInput+offset, nelem);
- } else {
- prims.copySend(thisInput+offset, thisOutput+offset, nelem);
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads-WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
+ const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+ const ssize_t size = args->coll.count;
+ const int rank = ring->devUserRanks[0];
+ const int nextRank = ring->devUserRanks[1];
+ const int root = args->coll.root;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t offset = gridOffset + bid*realChunkSize;
+ int nelem = min(realChunkSize, size-offset);
+
+ if (rank == root) {
+ if (thisInput == thisOutput) {
+ prims.send(thisInput+offset, nelem);
+ } else {
+ prims.copySend(thisInput+offset, thisOutput+offset, nelem);
+ }
+ } else if (nextRank == root) {
+ prims.recv(thisOutput+offset, nelem);
+ } else {
+ prims.recvCopySend(thisOutput+offset, nelem);
+ }
}
- } else if (nextRank == root) {
- prims.recv(thisOutput+offset, nelem);
- } else {
- prims.recvCopySend(thisOutput+offset, nelem);
}
- }
-}
-
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
-
- ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- const int rank = ring->devUserRanks[0];
- const int nextRank = ring->devUserRanks[1];
- const int root = args->root;
-
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- if (size-gridOffset < loopSize) {
- chunkSize = args->lastChunkSize;
- }
- ssize_t offset = gridOffset + bid*chunkSize;
-
- int nelem = min(chunkSize, size-offset);
- if (rank == root) {
- if (thisInput == thisOutput) {
- LLprims.send(thisInput+offset, nelem);
- } else {
- LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+ const int rank = ring->devUserRanks[0];
+ const int nextRank = ring->devUserRanks[1];
+ const int root = args->coll.root;
+
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ if (size-gridOffset < loopSize) {
+ chunkSize = args->coll.lastChunkSize;
+ }
+ ssize_t offset = gridOffset + bid*chunkSize;
+
+ int nelem = min(chunkSize, size-offset);
+ if (rank == root) {
+ if (thisInput == thisOutput) {
+ LLprims.send(thisInput+offset, nelem);
+ } else {
+ LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+ }
+ } else if (nextRank == root) {
+ LLprims.recv(thisOutput + offset, nelem);
+ } else {
+ LLprims.recvCopySend(thisOutput + offset, nelem);
+ }
}
- } else if (nextRank == root) {
- LLprims.recv(thisOutput + offset, nelem);
- } else {
- LLprims.recvCopySend(thisOutput + offset, nelem);
}
- }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
+};
#include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
-
- ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- const int rank = ring->devUserRanks[0];
- const int nextRank = ring->devUserRanks[1];
- const int root = args->root;
-
- ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
- const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
-
- const ssize_t loopSize = args->nChannels*chunkSize;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
- ssize_t offset = gridOffset + bid*chunkSize;
-
- int nelem = min(chunkSize, size-offset);
- if (rank == root) {
- if (thisInput == thisOutput) {
- LLprims.send(thisInput+offset, nelem);
- } else {
- LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+ ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+ const int rank = ring->devUserRanks[0];
+ const int nextRank = ring->devUserRanks[1];
+ const int root = args->coll.root;
+
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
+ ssize_t offset = gridOffset + bid*chunkSize;
+
+ int nelem = min(chunkSize, size-offset);
+ if (rank == root) {
+ if (thisInput == thisOutput) {
+ LLprims.send(thisInput+offset, nelem);
+ } else {
+ LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
+ }
+ } else if (nextRank == root) {
+ LLprims.recv(thisOutput + offset, nelem);
+ } else {
+ LLprims.recvCopySend(thisOutput + offset, nelem);
+ }
}
- } else if (nextRank == root) {
- LLprims.recv(thisOutput + offset, nelem);
- } else {
- LLprims.recvCopySend(thisOutput + offset, nelem);
}
- }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
+};
+
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {}
+};
+
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncBroadcast, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {}
+};
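
The three ring-broadcast variants above (SIMPLE, LL, LL128) share the same per-rank role logic; only the chunking and the primitives differ. The plain-C++ sketch below is an illustration of that role choice only; the enum and the broadcastRole helper are made up for this sketch and stand in for the prims/LLprims calls.

#include <cstdio>

enum class Op { Send, CopySend, Recv, RecvCopySend };

Op broadcastRole(int rank, int nextRank, int root, bool inPlace) {
  if (rank == root) return inPlace ? Op::Send : Op::CopySend;
  if (nextRank == root) return Op::Recv;   // last rank before root: stop forwarding
  return Op::RecvCopySend;                 // middle of the ring: keep forwarding
}

int main() {
  const int nranks = 4, root = 2;
  for (int r = 0; r < nranks; r++) {
    Op op = broadcastRole(r, (r + 1) % nranks, root, /*inPlace=*/true);
    printf("rank %d: %s\n", r,
           op == Op::Send ? "send" : op == Op::CopySend ? "copySend" :
           op == Op::Recv ? "recv" : "recvCopySend");
  }
  return 0;
}
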
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index 46eb9f5..265218a 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,6 +10,15 @@
#include "collectives.h"
#include "devcomm.h"
+
+#if __CUDA_ARCH__ >= 800
+#define COLL_UNROLL 8
+#define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1) // Using balanced tree instead of split tree
+#else
+#define COLL_UNROLL 4
+#define NCCL_MAX_DEV_ARITY NCCL_MAX_TREE_ARITY
+#endif
+
// Exit If Abort Barrier across CTA: make sure all threads exit consistently
// Each thread sets a predicate to true if abort == 1
// all CTA's threads enter the barrier and do a popc on their predicates being True
@@ -19,12 +28,12 @@ static inline __device__ void exitIfAbortBarrier(int abort) {
asm ("{");
asm volatile (" .reg .pred barr_pred;");
asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
- asm volatile (" bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc));
+ asm volatile (" bar.red.popc.u32 %0, 0, barr_pred;" : "=r"(popc));
asm ("}");
if (popc) { asm volatile ("exit;"); }
}
-typedef void(*ncclKern_t)(struct CollectiveArgs* args);
+typedef void(*ncclKern_t)(struct ncclWorkElem* args);
extern __device__ ncclKern_t ncclFuncs[];
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
@@ -32,130 +41,143 @@ static __device__ void load_parallel(void* dst, void* src, size_t size, int tid)
int* s = (int*)src;
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
}
-static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm) {
+static __device__ void load_coll(struct ncclWork* localWork, struct ncclWork* hostWork, int tid, struct ncclDevComm* comm) {
+ __syncthreads();
+ load_parallel(localWork, hostWork, sizeof(struct ncclWork), tid);
// Check whether the last operation was aborted and make sure all threads exit
int abort = tid == 0 ? *(comm->abortFlag) : 0;
exitIfAbortBarrier(abort);
- load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid);
- __syncthreads();
- if (tid == 0) hostColl->active = 0;
+ if (tid == 0) hostWork->elems[0].active = 0;
}
-extern __device__ volatile uint64_t* ncclShmem;
+template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {}
+};
+
+struct ncclShmemPtrs {
+ void* srcs[NCCL_MAX_DEV_ARITY+1];
+ void* dsts[NCCL_MAX_DEV_ARITY+1];
+};
+
+struct ncclShmemData {
+ union {
+ volatile uint64_t data[NCCL_LL128_SHMEM_SIZE];
+ struct ncclShmemPtrs ptrs[NCCL_MAX_GROUPS];
+ };
+ struct ncclWork localWork;
+};
-/* Functions for aggregation case */
-#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
-__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
- coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \
+extern __device__ struct ncclShmemData *ncclShmem;
+template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL, int FINDEX>
+__device__ void ncclKernel(struct ncclWorkElem first) {
+ int tid = threadIdx.x;
+ int bid = blockIdx.x;
+ __shared__ struct ncclShmemData shmem;
+ ncclShmem = &shmem;
+
+ auto f = ncclFunction<FUNCTION, ALGO, PROTO, REDOP, T, UNROLL>();
+
+ struct ncclDevComm* comm = first.comm;
+ struct ncclChannel* channel = comm->channels+bid;
+ struct ncclWorkElem* w = NULL;
+ uint16_t index = first.index;
+
+ /* To optimize for latency, (only) the first operation is passed as argument.*/
+ if (bid == 0 && first.funcIndex != FUNC_INDEX_P2P) w = &first;
+
+ while (1) {
+ if (w == NULL) {
+ w = shmem.localWork.elems;
+ load_coll(&shmem.localWork, channel->workFifo+index, tid, comm);
+ }
+ if (tid < w->nThreads) {
+ if (w->funcIndex == FINDEX) {
+ f.run(w);
+ } else {
+ ncclFuncs[w->funcIndex](w);
+ }
+ }
+ index = (index+1) % NCCL_MAX_OPS;
+ if (w->active == 2) {
+ return;
+ }
+ w = NULL;
+ }
}
+// Only generate kernels for SUM
#if NCCL_OP == 0
-/* Kernels with the first operation inlined */
-#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
-__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
- int tid = threadIdx.x; \
- int bid = blockIdx.x; \
- __shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \
- ncclShmem = shmem; \
- __shared__ struct ncclColl localColl; \
- \
- struct ncclDevComm* comm = firstColl.args.comm; \
- struct ncclChannel* channel = comm->channels+bid; \
- struct ncclColl* c; \
- if (bid == 0) { \
- /* To optimize for latency, (only) the first operation is passed as argument.*/ \
- c = &firstColl; \
- } else { \
- c = &localColl; \
- load_coll(c, channel->devCollectives+channel->collFifoHead, tid, comm); \
- } \
- while (1) { \
- if (tid < c->args.nThreads) { \
- if (c->funcIndex == fIndex) { \
- coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
- } else { \
- ncclFuncs[c->funcIndex](&c->args); \
- } \
- } \
- int nextIndex = c->nextIndex; \
- if (tid == 0) channel->collFifoHead = nextIndex; \
- \
- if (c->active == 2) { \
- return; \
- } \
- \
- /* Load next collective operation*/ \
- c = &localColl; /* for bid 0 */ \
- load_coll(c, channel->devCollectives+nextIndex, tid, comm); \
- } \
+#define IMPL_COLL_KERN(func, algo, proto, redop, type, fIndex) \
+__global__ void NCCL_KERN_NAME(func, algo, proto, redop, type)(struct ncclWorkElem first) { \
+ ncclKernel<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL, fIndex>(first); \
}
#else
-#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
+#define IMPL_COLL_KERN(func, algo, proto, redop, type, fIndex)
#endif
+// Example: AllReduce, RING, LL, Sum, uint8
+#define IMPL_COLL_FUNC(func, algo, proto, redop, type) \
+__device__ void NCCL_FUNC_NAME(func, algo, proto, redop, type)(struct ncclWorkElem* args) { \
+ auto f = ncclFunction<ncclFunc##func, NCCL_ALGO_##algo, NCCL_PROTO_##proto, Func##redop<type>, type, COLL_UNROLL>(); \
+ f.run(args); \
+}
+
// Only generate inline kernels for LL
-#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
- IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
- IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) \
- IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
- IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, al, NCCL_PROTO_LL)) \
+#define IMPL_COLL4(func, algo, redop, type, ncclType) \
+ IMPL_COLL_FUNC(func, algo, LL, redop, type) \
+ IMPL_COLL_FUNC(func, algo, LL128, redop, type) \
+ IMPL_COLL_FUNC(func, algo, SIMPLE, redop, type) \
+ IMPL_COLL_KERN(func, algo, LL, redop, type, FUNC_INDEX(ncclFunc##func, nccl##redop, ncclType, NCCL_ALGO_##algo, NCCL_PROTO_LL)) \
-#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
- IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
- IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING)
+#define IMPL_COLL3(func, redop, type, ncclType) \
+ IMPL_COLL4(func, TREE, redop, type, ncclType) \
+ IMPL_COLL4(func, RING, redop, type, ncclType) \
+ IMPL_COLL4(func, COLLNET, redop, type, ncclType)
#if NCCL_TYPE == 0
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
- IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, int8_t, ncclInt8)
#elif NCCL_TYPE == 1
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
- IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, uint8_t, ncclUint8)
#elif NCCL_TYPE == 2
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
- IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, int32_t, ncclInt32)
#elif NCCL_TYPE == 3
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
- IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, uint32_t, ncclUint32)
#elif NCCL_TYPE == 4
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
- IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, int64_t, ncclInt64)
#elif NCCL_TYPE == 5
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
- IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, uint64_t, ncclUint64)
#elif NCCL_TYPE == 6
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
- IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, half, ncclFloat16)
#elif NCCL_TYPE == 7
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
- IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, float, ncclFloat32)
#elif NCCL_TYPE == 8
-#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
- IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64)
+#define IMPL_COLL2(func, redop) IMPL_COLL3(func, redop, double, ncclFloat64)
#endif
// Reduction define all functions
#if NCCL_OP == 0
-#define IMPL_COLL_R(collf, colln) \
- IMPL_COLL2(collf, sum, FuncSum, colln, ncclSum);
+#define IMPL_COLL_R(func) IMPL_COLL2(func, Sum);
#elif NCCL_OP == 1
-#define IMPL_COLL_R(collf, colln) \
- IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd);
+#define IMPL_COLL_R(func) IMPL_COLL2(func, Prod);
#elif NCCL_OP == 2
-#define IMPL_COLL_R(collf, colln) \
- IMPL_COLL2(collf, min, FuncMin, colln, ncclMin);
+#define IMPL_COLL_R(func) IMPL_COLL2(func, Min);
#elif NCCL_OP == 3
-#define IMPL_COLL_R(collf, colln) \
- IMPL_COLL2(collf, max, FuncMax, colln, ncclMax);
+#define IMPL_COLL_R(func) IMPL_COLL2(func, Max);
#endif
-// Copy primitives only define one
#if NCCL_OP == 0 && NCCL_TYPE == 0
-#define IMPL_COLL_C(collf, colln) \
- IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8);
+// Copy primitives only define one function for copy
+#define IMPL_COLL_C(func) IMPL_COLL3(func, Sum, int8_t, ncclInt8);
+
+// Point-to-point primitives only have one function/kernel.
+#define IMPL_COLL_P(func) \
+ IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \
+ IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, 0);
#else
-#define IMPL_COLL_C(collf, colln)
+#define IMPL_COLL_C(func)
+#define IMPL_COLL_P(func)
#endif
-#define COLL_UNROLL 4
-
#endif
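
The new ncclKernel above turns each thread block into a small dispatcher: block 0 may receive the first work element as a kernel argument, every other element is loaded from the channel's work FIFO, and each element either runs the function compiled into this kernel (when funcIndex == FINDEX) or goes through the ncclFuncs[] table, stopping when an element is marked as the last one. A host-side model of that control flow, ignoring the bid-0 first-element shortcut, is sketched below; names such as Work and runChannel are invented for the sketch and are not NCCL identifiers.

#include <cstdio>
#include <functional>
#include <vector>

struct Work { int funcIndex; int active; };   // active==2 marks the last op

void runChannel(const std::vector<Work>& fifo, int inlinedIndex,
                const std::vector<std::function<void(const Work&)>>& funcTable,
                const std::function<void(const Work&)>& inlined) {
  size_t index = 0;
  while (true) {
    const Work& w = fifo[index % fifo.size()];     // stands in for load_coll()
    if (w.funcIndex == inlinedIndex) inlined(w);   // fast path: function compiled into this kernel
    else funcTable[w.funcIndex](w);                // slow path: indirect call through the table
    if (w.active == 2) return;                     // queue drained for this launch
    index++;
  }
}

int main() {
  auto report = [](const char* name) {
    return [name](const Work& w) { printf("%s (active=%d)\n", name, w.active); };
  };
  std::vector<std::function<void(const Work&)>> table =
      { report("allreduce LL"), report("broadcast simple") };
  std::vector<Work> fifo = { {0, 1}, {1, 1}, {0, 2} };
  runChannel(fifo, /*inlinedIndex=*/0, table, report("allreduce LL (inlined)"));
  return 0;
}
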
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index aa1e936..ff466a0 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -16,6 +16,12 @@
// Define min for ssize_t
static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
+template <typename T>
+inline __device__ void loadPtr(void** ptr, T* &v) {
+ asm volatile("ld.volatile.global.u64 %0, [%1];"
+ : "=l"(v) : "l"(ptr));
+}
+
typedef uint64_t PackType;
// unpack x and y to elements of type T and apply FUNC to each element
@@ -245,28 +251,57 @@ inline __device__ void Store128(Pack128* p, Pack128& v) {
asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory");
}
-template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
-__device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthreads,
- int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
- const int offset, const int N) {
- for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
- T val = vFetch(srcs[0]+idx);
+template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ __forceinline__ void ReduceCopyMulti(const int w, const int nw, const int t,
+ int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Nelem) {
+ const int inc = nw * UNROLL * WARP_SIZE;
+ int offset = w * UNROLL * WARP_SIZE + t;
+
+ const T* srcs[MAXSRCS];
+ for (int i=0; i<MAXSRCS; i++) srcs[i] = s[i]+elemOffset+offset;
+ T* dsts[MAXDSTS];
+ for (int i=0; i<MAXDSTS; i++) dsts[i] = d[i]+elemOffset+offset;
+
+ while (offset < Nelem) {
+ T vals[UNROLL];
+ // Load and reduce
+ for (int u = 0; u < UNROLL; ++u) vals[u] = vFetch(srcs[0]+u*WARP_SIZE);
+
+ #pragma unroll
+ for (int i=1; i<MINSRCS; i++) {
+ T vals2[UNROLL];
+ for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
+ for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
+ }
#pragma unroll
- for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
- #pragma unroll 1
- for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+ for (int i=MINSRCS; i<MAXSRCS; i++) {
+ if (i<nsrcs) {
+ T vals2[UNROLL];
+ for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
+ for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
+ }
+ }
+ // Store
#pragma unroll
- for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
- #pragma unroll 1
- for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
+ for (int i = 0; i < MINDSTS; i++) {
+ for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
+ }
+ #pragma unroll
+ for (int i=MINDSTS; i<MAXDSTS; i++) {
+ if (i<ndsts) {
+ for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
+ }
+ }
+ for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
+ for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
+ offset += inc;
}
}
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
-__device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
- int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
- const int elemOffset, const int Npack) {
+__device__ __forceinline__ void ReduceCopy128bMulti(const int w, const int nw, const int t,
+ int nsrcs, const T** s, int ndsts, T** d, const int elemOffset, const int Npack) {
const int inc = nw * UNROLL * WARP_SIZE;
int offset = w * UNROLL * WARP_SIZE + t;
@@ -280,25 +315,31 @@ __device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw,
// Load and reduce
for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
+ #pragma unroll
for (int i=1; i<MINSRCS; i++) {
Pack128 vals2[UNROLL];
for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
}
- #pragma unroll 1
- for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
- Pack128 vals2[UNROLL];
- for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
- for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+ #pragma unroll
+ for (int i=MINSRCS; i<MAXSRCS; i++) {
+ if (i<nsrcs) {
+ Pack128 vals2[UNROLL];
+ for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+ for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
+ }
}
// Store
+ #pragma unroll
for (int i = 0; i < MINDSTS; i++) {
for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
}
- #pragma unroll 1
- for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
- for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+ #pragma unroll
+ for (int i=MINDSTS; i<MAXDSTS; i++) {
+ if (i<ndsts) {
+ for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+ }
}
for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
@@ -309,72 +350,65 @@ __device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw,
template <typename T>
__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
-// Try to limit consecutive load/stores to 8.
-// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
-#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
+#define PACKELEMS (sizeof(Pack128) / sizeof(T))
template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads,
- int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
+ int nsrcs, const T** srcs, int ndsts, T** dsts,
int N) {
int Nrem = N;
if (Nrem <= 0) return;
- int alignDiff = 0;
- int align = ptrAlign128(srcs[0]);
- #pragma unroll
- for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
- for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
- #pragma unroll
- for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
- for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
-
- int Npreamble = alignDiff ? Nrem :
- N < alignof(Pack128) ? N :
- (alignof(Pack128) - align) % alignof(Pack128);
-
- // stage 1: preamble: handle any elements up to the point of everything coming
- // into alignment
- if (Npreamble) {
- ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
- Nrem -= Npreamble;
- if (Nrem == 0) return;
- }
- int offset = Npreamble;
-
- // stage 2: fast path: use 128b loads/stores to do the bulk of the work,
- // assuming the pointers we have are all 128-bit alignable.
int w = tid / WARP_SIZE; // Warp number
int nw = nthreads / WARP_SIZE; // Number of warps
int t = tid % WARP_SIZE; // Thread (inside the warp)
- const int packFactor = sizeof(Pack128) / sizeof(T);
+ // Check that everything is 16B aligned; if not, don't use 16B loads/stores.
+ int align = 0;
+ #pragma unroll
+ for (int i=0; i<MINSRCS; i++) align |= ptrAlign128(srcs[i]);
+ for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) align |= ptrAlign128(srcs[i]);
+ #pragma unroll
+ for (int i=0; i<MINDSTS; i++) align |= ptrAlign128(dsts[i]);
+ for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) align |= ptrAlign128(dsts[i]);
- // stage 2a: main loop
- int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
- * (AUTOUNROLL * WARP_SIZE); // round down
- int Nelem2a = Npack2a * packFactor;
+ int offset = 0;
+ if (align == 0) {
+ // fast path: use 128b loads/stores to do the bulk of the work,
+ // assuming the pointers we have are all 128-bit aligned.
- ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
+ // main loop
+ int Npack = (Nrem / (PACKELEMS*UNROLL*WARP_SIZE)) * (UNROLL*WARP_SIZE); // round down
+ int Nelem = Npack * PACKELEMS;
- Nrem -= Nelem2a;
- if (Nrem == 0) return;
- offset += Nelem2a;
+ ReduceCopy128bMulti<FUNC, T, UNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
- // stage 2b: slightly less optimized for section when we don't have full
- // unrolling
+ Nrem -= Nelem;
+ if (Nrem == 0) return;
+ offset += Nelem;
+
+ // slightly less optimized for section when we don't have full unrolling
+ Npack = Nrem / PACKELEMS;
+ Nelem = Npack * PACKELEMS;
+
+ ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);
+
+ Nrem -= Nelem;
+ if (Nrem == 0) return;
+ offset += Nelem;
+ }
- int Npack2b = Nrem / packFactor;
- int Nelem2b = Npack2b * packFactor;
+ // unrolled, by-type (mostly for unaligned buffers)
+ int Nelem = (Nrem / (UNROLL*PACKELEMS/2*WARP_SIZE)) * (UNROLL*PACKELEMS/2*WARP_SIZE); // round down
- ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
+ ReduceCopyMulti<FUNC, T, UNROLL*PACKELEMS/2, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nelem);
- Nrem -= Nelem2b;
+ Nrem -= Nelem;
if (Nrem == 0) return;
- offset += Nelem2b;
+ offset += Nelem;
- // stage 2c: tail
- ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
+ // no unroll, by type. Should finish what's remaining.
+ ReduceCopyMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nrem);
}
#endif // COMMON_KERNEL_H_
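
The rewritten ReduceOrCopyMulti above replaces the old preamble/alignment-fixup stages with a single decision: OR together the 16-byte misalignment of every source and destination pointer, and take the Pack128 (16-byte) path only when that OR is zero; otherwise everything goes through the element-wise ReduceCopyMulti, which is now unrolled. A minimal stand-alone illustration of that check (plain C++, not the device code) is:

#include <cstdint>
#include <cstdio>
#include <vector>

static int ptrAlign128(const void* p) { return (int)((uintptr_t)p % 16); }

bool canUse128bitPath(const std::vector<const void*>& srcs,
                      const std::vector<void*>& dsts) {
  int align = 0;
  for (const void* s : srcs) align |= ptrAlign128(s);
  for (void* d : dsts) align |= ptrAlign128(d);
  return align == 0;   // all pointers are 16-byte aligned
}

int main() {
  alignas(16) float a[64], b[64];
  std::vector<const void*> srcs = { a };
  std::vector<void*> dsts = { b };
  printf("aligned buffers  -> 128-bit path: %d\n", canUse128bitPath(srcs, dsts));
  dsts[0] = b + 1;   // a 4-byte offset breaks 16-byte alignment
  printf("offset by 1 elem -> 128-bit path: %d\n", canUse128bitPath(srcs, dsts));
  return 0;
}
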
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index 034fe96..553a882 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,70 +8,73 @@
#include "collectives.h"
#include "common.h"
-__device__ volatile uint64_t* ncclShmem;
+__device__ struct ncclShmemData* ncclShmem;
-#define NCCL_FUNC5(coll, op, dtype) \
- NCCL_COLL_NAME(coll##LL, op, dtype), \
- NCCL_COLL_NAME(coll##LL128, op, dtype), \
- NCCL_COLL_NAME(coll, op, dtype)
+#define NCCL_FUNC5(func, algo, redop, type) \
+ NCCL_FUNC_NAME(func, algo, LL, redop, type), \
+ NCCL_FUNC_NAME(func, algo, LL128, redop, type), \
+ NCCL_FUNC_NAME(func, algo, SIMPLE, redop, type)
-#define NCCL_FUNC4(coll, op, dtype) \
- NCCL_FUNC5(coll##Tree, op, dtype), \
- NCCL_FUNC5(coll##Ring, op, dtype)
+#define NCCL_FUNC4(func, redop, type) \
+ NCCL_FUNC5(func, TREE, redop, type), \
+ NCCL_FUNC5(func, RING, redop, type), \
+ NCCL_FUNC5(func, COLLNET, redop, type)
// Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(coll, op) \
- NCCL_FUNC4(coll, op, i8), \
- NCCL_FUNC4(coll, op, u8), \
- NCCL_FUNC4(coll, op, i32), \
- NCCL_FUNC4(coll, op, u32), \
- NCCL_FUNC4(coll, op, i64), \
- NCCL_FUNC4(coll, op, u64), \
- NCCL_FUNC4(coll, op, f16), \
- NCCL_FUNC4(coll, op, f32), \
- NCCL_FUNC4(coll, op, f64)
-#define NCCL_FUNCS3B(coll, op) \
- NCCL_FUNC4(coll, op, i8), \
- NCCL_FUNC4(coll, op, i8), \
- NCCL_FUNC4(coll, op, i8), \
- NCCL_FUNC4(coll, op, i8), \
- NCCL_FUNC4(coll, op, i8), \
- NCCL_FUNC4(coll, op, i8), \
- NCCL_FUNC4(coll, op, i8), \
- NCCL_FUNC4(coll, op, i8), \
- NCCL_FUNC4(coll, op, i8)
+#define NCCL_FUNCS3A(func, redop) \
+ NCCL_FUNC4(func, redop, int8_t), \
+ NCCL_FUNC4(func, redop, uint8_t), \
+ NCCL_FUNC4(func, redop, int32_t), \
+ NCCL_FUNC4(func, redop, uint32_t), \
+ NCCL_FUNC4(func, redop, int64_t), \
+ NCCL_FUNC4(func, redop, uint64_t), \
+ NCCL_FUNC4(func, redop, half), \
+ NCCL_FUNC4(func, redop, float), \
+ NCCL_FUNC4(func, redop, double)
+#define NCCL_FUNCS3B(func, redop) \
+ NCCL_FUNC4(func, redop, int8_t), \
+ NCCL_FUNC4(func, redop, int8_t), \
+ NCCL_FUNC4(func, redop, int8_t), \
+ NCCL_FUNC4(func, redop, int8_t), \
+ NCCL_FUNC4(func, redop, int8_t), \
+ NCCL_FUNC4(func, redop, int8_t), \
+ NCCL_FUNC4(func, redop, int8_t), \
+ NCCL_FUNC4(func, redop, int8_t), \
+ NCCL_FUNC4(func, redop, int8_t)
// Must be consistent with ncclRedOp_t
-#define NCCL_FUNCS2A(coll) \
- NCCL_FUNCS3A(coll, sum ), \
- NCCL_FUNCS3A(coll, prod), \
- NCCL_FUNCS3A(coll, max ), \
- NCCL_FUNCS3A(coll, min )
-#define NCCL_FUNCS2B(coll) \
- NCCL_FUNCS3B(coll, copy), \
- NCCL_FUNCS3B(coll, copy), \
- NCCL_FUNCS3B(coll, copy), \
- NCCL_FUNCS3B(coll, copy)
+#define NCCL_FUNCS2A(func) \
+ NCCL_FUNCS3A(func, Sum ), \
+ NCCL_FUNCS3A(func, Prod), \
+ NCCL_FUNCS3A(func, Max ), \
+ NCCL_FUNCS3A(func, Min )
+#define NCCL_FUNCS2B(func) \
+ NCCL_FUNCS3B(func, Sum), \
+ NCCL_FUNCS3B(func, Sum), \
+ NCCL_FUNCS3B(func, Sum), \
+ NCCL_FUNCS3B(func, Sum)
// Must be consistent with ncclFunc_t
#define NCCL_FUNCS() { \
- NCCL_FUNCS2B(ncclBroadcast), \
- NCCL_FUNCS2A(ncclReduce), \
- NCCL_FUNCS2B(ncclAllGather), \
- NCCL_FUNCS2A(ncclReduceScatter), \
- NCCL_FUNCS2A(ncclAllReduce) }
+ NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),\
+ NCCL_FUNCS2B(Broadcast), \
+ NCCL_FUNCS2A(Reduce), \
+ NCCL_FUNCS2B(AllGather), \
+ NCCL_FUNCS2A(ReduceScatter), \
+ NCCL_FUNCS2A(AllReduce) }
// Must be consistent with the ncclFuncSet enum
-__device__ ncclKern_t ncclFuncs[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
+__device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
#if __CUDA_ARCH__
- NCCL_FUNCS2B(ncclBroadcast),
- NCCL_FUNCS2A(ncclReduce),
- NCCL_FUNCS2B(ncclAllGather),
- NCCL_FUNCS2A(ncclReduceScatter),
- NCCL_FUNCS2A(ncclAllReduce)
+ NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
+ NCCL_FUNCS2B(Broadcast),
+ NCCL_FUNCS2A(Reduce),
+ NCCL_FUNCS2B(AllGather),
+ NCCL_FUNCS2A(ReduceScatter),
+ NCCL_FUNCS2A(AllReduce)
#endif
};
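
The table above is indexed by a flat function index: one leading slot for SendRecv, then each collective expanded over reduction op, data type, algorithm and protocol, in that nesting order, matching the NCCL_FUNCS2/3/4/5 macros. The sketch below shows one such flattening for illustration; the constants, the helper name and the example enum values are assumptions here, not NCCL's actual FUNC_INDEX definition.

#include <cstdio>

constexpr int kNumOps = 4, kNumTypes = 9, kNumAlgos = 3, kNumProtos = 3;

constexpr int funcIndex(int func, int redop, int type, int algo, int proto) {
  // func counts the collectives Broadcast=0 .. AllReduce=4; +1 skips the SendRecv slot.
  return 1 + (((func * kNumOps + redop) * kNumTypes + type) * kNumAlgos + algo) * kNumProtos + proto;
}

int main() {
  // e.g. AllReduce (func 4), Sum (0), float (7), RING (1), LL (0) -- hypothetical values.
  printf("flat index = %d (table size = %d)\n",
         funcIndex(4, 0, 7, 1, 0), 1 + 5 * kNumOps * kNumTypes * kNumAlgos * kNumProtos);
  return 0;
}
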
diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh
index 4413213..97dc0ae 100755
--- a/src/collectives/device/gen_rules.sh
+++ b/src/collectives/device/gen_rules.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -9,7 +9,7 @@ dir=$1
targets="GENOBJS := \\\\\n"
-for base in all_reduce all_gather broadcast reduce reduce_scatter; do
+for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
opn=0
for op in sum prod min max; do
dtn=0
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index aa3d20d..69348db 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -31,278 +31,235 @@
} \
} while (0)
+#define ROLE_SRC 0x01
+#define ROLE_DST 0x02
+#define ROLE_WAIT_RECV 0x04
+#define ROLE_WAIT_SEND 0x08
+#define ROLE_POST_SEND 0x10
+#define ROLE_POST_RECV 0x20
+
// Implementation of primitive types
-template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, class FUNC>
+template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, int DIRECT, class FUNC>
class ncclPrimitives {
private:
const int tid;
- const int nthreads;
- const int wid;
+ int nthreads;
+ int nworkers;
const int stepSize;
int nrecv = 0;
int nsend = 0;
- struct ncclConnInfo* recvConn = NULL;
- volatile uint64_t* recvConnHeadPtr = NULL;
- uint64_t recvConnHead;
- volatile uint64_t* recvConnTailPtr = NULL;
- uint64_t recvConnTail;
- uint64_t recvConnTailCache; // Cache last seen value
-
- struct ncclConnInfo* sendConn = NULL;
- volatile int* sendConnFifoPtr = NULL;
- volatile uint64_t* sendConnTailPtr = NULL;
- uint64_t sendConnTail;
- volatile uint64_t* sendConnHeadPtr = NULL;
- uint64_t sendConnHead;
- uint64_t sendConnHeadCache; // Cache last seen value
-
- uint64_t recvStep[NRECV];
- uint64_t sendStep[NSEND];
- const T* recvDirectBuff[NRECV];
- T* sendDirectBuff[NSEND];
- const T* recvBuff[NRECV];
- T* sendBuff[NSEND];
+ struct ncclConnInfo* conn = NULL;
+ volatile int* connSizesFifoPtr = NULL;
+ void** connPtrsFifoPtr = NULL;
+ volatile uint64_t* connHeadPtr = NULL;
+ volatile uint64_t* connTailPtr = NULL;
+ uint64_t connTailCache; // Cache last seen value
+ uint64_t connHeadCache; // Cache last seen value
+
+ int index; // Peer index I'm responsible for
+ int peer = -1;
+ int role = 0;
+ int group;
+ uint64_t step;
+ T* direct = NULL;
+ T* buff;
struct ncclDevComm* comm;
- inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
- inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
- inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
- inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
+ const T** srcs;
+ T** dsts;
+ // Don't use barrier 0 as it's used by the final sync
inline __device__ void barrier() {
- asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+ if (nthreads == WARP_SIZE) __syncwarp();
+ else asm volatile ("bar.sync %0, %1;" :: "r"(group+1), "r"(nthreads));
}
inline __device__ void subBarrier() {
- asm volatile ("bar.sync 2, %0;" :: "r"(nthreads-WARP_SIZE));
- }
-
- uint32_t mismatch = 0;
- const uint64_t opCount;
-
- inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
- if (mismatch) {
- // In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch
- *(comm->fatalDevError) = ncclDevAssertedMismatch;
- } else if (conn && *conn->opCountRem > opCount) {
- mismatch += 1;
- }
+ if (nworkers == nthreads) barrier();
+ else asm volatile ("bar.sync %0, %1;" :: "r"(group+2), "r"(nworkers));
}
uint32_t spins = 0;
uint32_t abort = 0;
- inline __device__ int checkAbort(int i, int send) {
+ inline __device__ int checkAbort() {
spins++;
if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
abort = *(comm->abortFlag);
- if (wid == i) checkMismatch(send ? sendConn : recvConn);
spins = 0;
}
return abort;
}
- inline __device__ void waitSend(int nbytes) {
+ template <int DIRECTPTR>
+ inline __device__ T* directPtr(ssize_t directOffset) {
+ return DIRECTPTR && direct ? direct+directOffset : buff+(step%NCCL_STEPS)*stepSize;
+ }
+
+ template <int DST, int DIRECTSEND>
+ inline __device__ void waitSend(ssize_t directOffset, int nbytes) {
spins = 0;
- mismatch = 0;
- if (sendConnHeadPtr) {
- while (sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS) {
- sendConnHeadCache = *sendConnHeadPtr;
- if (checkAbort(wid, 1)) break;
- }
- if (sendConnFifoPtr) {
- sendConnFifoPtr[sendConnHead%NCCL_STEPS] = nbytes;
- }
- sendConnHead += SLICESTEPS;
+ while (connHeadCache + NCCL_STEPS < step + SLICESTEPS) {
+ connHeadCache = *connHeadPtr;
+ if (checkAbort()) break;
+ }
+ if (connSizesFifoPtr) {
+ connSizesFifoPtr[step%NCCL_STEPS] = nbytes;
}
+
+ if (connPtrsFifoPtr) loadPtr(connPtrsFifoPtr+step%NCCL_STEPS, dsts[DST+index]);
+ else dsts[DST+index] = directPtr<DIRECTSEND>(directOffset);
+ step += SLICESTEPS;
}
- inline __device__ void waitRecv() {
+ template <int SRC, int DIRECTRECV>
+ inline __device__ void waitRecv(ssize_t directOffset) {
spins = 0;
- mismatch = 0;
- if (recvConnTailPtr) {
- while (recvConnTailCache < recvConnTail + SLICESTEPS) {
- recvConnTailCache = *recvConnTailPtr;
- if (checkAbort(wid, 0)) break;
- }
- recvConnTail += SLICESTEPS;
+ while (connTailCache < step + SLICESTEPS) {
+ connTailCache = *connTailPtr;
+ if (checkAbort()) break;
}
+ if (connPtrsFifoPtr) loadPtr(connPtrsFifoPtr+step%NCCL_STEPS, srcs[SRC+index]);
+ else srcs[SRC+index] = directPtr<DIRECTRECV>(directOffset);
+ step += SLICESTEPS;
}
- inline __device__ void incRecv(int i) {
- recvStep[i] += SLICESTEPS;
- }
inline __device__ void postRecv() {
- if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += SLICESTEPS;
+ *connHeadPtr = step += SLICESTEPS;
}
- inline __device__ void incSend(int i) {
- sendStep[i] += SLICESTEPS;
- }
inline __device__ void postSend() {
- if (sendConnTailPtr) *sendConnTailPtr = sendConnTail += SLICESTEPS;
- }
-
- template <int DIRECTRECV>
- inline __device__ const T* directRecvPtr(int i, int directOffset) {
- return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
- }
-
- template <int DIRECTSEND>
- inline __device__ T* directSendPtr(int i, int directOffset) {
- return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
- }
-
- template <int DIRECTRECV>
- inline __device__ int directRecvInc(int i, int directInc, int sliceInc) {
- return DIRECTRECV && recvDirectBuff[i] ? directInc : sliceInc;
- }
-
- template <int DIRECTSEND>
- inline __device__ int directSendInc(int i, int directInc, int sliceInc) {
- return DIRECTSEND && sendDirectBuff[i] ? directInc : sliceInc;
+ *connTailPtr = step += SLICESTEPS;
}
template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
inline __device__ void
- GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
+ GenericOp(const T* srcPtr, T* dstPtr, int nelem, ssize_t directOffset) {
int offset = 0;
int sliceSize = stepSize*SLICESTEPS;
int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32);
- const T* srcs[RECV*NRECV+SRC];
- srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
- if (RECV) {
- if (SRC) srcs[1] = recvPtr(0);
- for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i);
- }
-
- T* dsts[SEND*NSEND+DST];
- dsts[0] = DST ? dstPtr : directSendPtr<DIRECTSEND>(0, directOffset);
- if (SEND) {
- if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset);
- for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
- }
-
- bool syncThread = tid >= nthreads-WARP_SIZE;
-
#pragma unroll
for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
int realSize = max(0, min(dataSize, nelem-offset));
- if (!syncThread) {
- if (SEND) waitSend(realSize*sizeof(T));
- if (RECV) waitRecv();
+ if (tid < nworkers) {
+ if (SRC && (role & ROLE_SRC)) srcs[0] = srcPtr+offset;
+ if (RECV && (role & ROLE_WAIT_RECV)) waitRecv<SRC, DIRECTRECV>(directOffset+offset);
+ if (DST && (role & ROLE_DST)) dsts[0] = dstPtr+offset;
+ if (SEND && (role & ROLE_WAIT_SEND)) waitSend<DST, DIRECTSEND>(directOffset+offset, realSize*sizeof(T));
if (realSize > 0) {
subBarrier();
- if (DIRECTRECV && recvDirectBuff[0]) {
+ if (DIRECTRECV && srcs[0] == dsts[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (SEND) {
- ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads-WARP_SIZE, 1, srcs, nsend, dsts+1, realSize);
+ // (1-SEND) is only there to avoid compilation errors in case NSEND=0 (and SEND=0).
+ ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, (1-SEND)+NSEND>(tid, nworkers, 1, srcs, nsend, dsts+1, realSize);
}
} else {
- ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads-WARP_SIZE, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
+ ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nworkers, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
}
}
}
barrier();
- FOR_SEND(incSend);
- FOR_RECV(incRecv);
- if (syncThread) {
- if (SEND) {
- if (realSize > 0 && wid == 0) __threadfence_system();
- __syncwarp();
- postSend();
- }
- if (RECV) postRecv();
- }
- srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize);
- for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize;
- dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize);
- for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize);
+ if (SEND && (role & ROLE_POST_SEND) && realSize > 0 && index == 0) __threadfence_system();
+ __syncwarp();
+ if (SEND && (role & ROLE_POST_SEND)) postSend();
+ if (RECV && (role & ROLE_POST_RECV)) postRecv();
offset += realSize;
}
}
- __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
- recvBuff[i] = (const T*)conn->buff;
- recvStep[i] = conn->step;
- recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
- recvDirectBuff[i] = NULL;
- if (directBuff && conn->direct) {
- recvDirectBuff[i] = directBuff;
- if (tid == 0) *conn->ptrExchange = directBuff;
- }
- if (wid == i) recvConn = conn;
- if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up
- nrecv++;
- }
- __device__ __forceinline__ void loadRecvSync() {
- if (tid >= WARP_SIZE && tid < 2*WARP_SIZE && wid<nrecv) {
- recvConnTailPtr = recvConn->tail;
- recvConnTailCache = *recvConnTailPtr;
- }
- if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
- recvConnHeadPtr = recvConn->head;
- // Return credits in case we rounded up.
- *recvConnHeadPtr = recvConnHead;
- // Update opCount in case we skipped some operations
- *(recvConn->opCountLoc) = opCount;
- }
- }
-
- __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
- sendBuff[i] = (T*)conn->buff;
- sendStep[i] = conn->step;
- sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
- sendDirectBuff[i] = NULL;
- if (directBuff && conn->direct) {
- void* volatile* ptr = conn->ptrExchange;
- while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
- barrier();
- if (tid == 0) *ptr = NULL;
- }
- if (wid == i) sendConn = conn;
- if (wid == i) sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up
- nsend++;
- }
- __device__ __forceinline__ void loadSendSync() {
- if (tid < nsend) {
- sendConnHeadPtr = sendConn->head;
- sendConnHeadCache = *sendConnHeadPtr;
- sendConnFifoPtr = sendConn->fifo;
- *(sendConn->opCountLoc) = opCount;
- }
- if (tid >= nthreads-WARP_SIZE && wid<nsend) {
- sendConnTailPtr = sendConn->tail;
+ __device__ __forceinline__ void loadRecvConn(struct ncclChannel* channel, T* directBuff) {
+ if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) {
+ conn = &channel->devPeers[peer].recv.conn;
+ step = conn->step;
+ step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
+ if (role & ROLE_POST_RECV) {
+ connHeadPtr = conn->head;
+ // Return credits in case we rounded up.
+ *connHeadPtr = step;
+ }
+ if (role & ROLE_WAIT_RECV) {
+ buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+ if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
+ direct = directBuff;
+ *conn->ptrExchange = directBuff;
+ }
+ connTailPtr = conn->tail;
+ connTailCache = *connTailPtr;
+ connPtrsFifoPtr = conn->ptrsFifo;
+ }
}
}
- __device__ __forceinline__ void saveRecvSync() {
- if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
- recvConn->step = recvConnHead;
- *(recvConn->opCountLoc) = opCount+1;
- __threadfence_system();
+ __device__ __forceinline__ void loadSendConn(struct ncclChannel* channel) {
+ if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) {
+ conn = &channel->devPeers[peer].send.conn;
+ step = conn->step;
+ step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS);
+ if (role & ROLE_POST_SEND) {
+ connTailPtr = conn->tail;
+ }
+ if (role & ROLE_WAIT_SEND) {
+ buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+ if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
+ void* volatile* ptr = conn->ptrExchange;
+ while ((direct = (T*)(*ptr)) == NULL);
+ *ptr = NULL;
+ }
+ connHeadPtr = conn->head;
+ connHeadCache = *connHeadPtr;
+ connSizesFifoPtr = conn->sizesFifo;
+ connPtrsFifoPtr = conn->ptrsFifo;
+ }
}
}
- __device__ __forceinline__ void saveSendSync() {
- if (tid < nsend) {
- sendConn->step = sendConnHead;
- *(sendConn->opCountLoc) = opCount+1;
+ __device__ __forceinline__ void saveSync() {
+ if (role & (ROLE_POST_SEND|ROLE_POST_RECV)) {
+ conn->step = step;
__threadfence_system();
}
}
public:
__device__ __forceinline__
- ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
- : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize), opCount(opCount) {
+ ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group)
+ : comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group) {
+ nthreads = nworkers;
+ // For send operations, we need an extra warp to overlap the threadfence and the copy
+ int postThreads = NSEND && nworkers >= 64 ? WARP_SIZE : 0;
+ nthreads += postThreads;
+
// Make sure step is updated before we read it.
barrier();
- for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff);
- for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff);
- loadRecvSync();
- loadSendSync();
+ for (int i=0; i<NRECV; i++) if (recvPeers[i] != -1) nrecv++;
+ for (int i=0; i<NSEND; i++) if (sendPeers[i] != -1) nsend++;
+
+ #define SYNC_GROUP 8
+ static_assert(NSEND < SYNC_GROUP && NRECV < SYNC_GROUP, "Not enough threads to cover all peers");
+
+ int g = tid / SYNC_GROUP;
+ int ng = nthreads / SYNC_GROUP;
+ index = tid % SYNC_GROUP;
+
+ if (g == 0) {
+ if (index < nrecv) role |= ROLE_WAIT_RECV;
+ if (index == nrecv) role |= ROLE_SRC;
+ } else if (g == 1) {
+ if (index < nsend) role |= ROLE_WAIT_SEND;
+ if (index == nsend) role |= ROLE_DST;
+ } else if (g == ng - 2) {
+ if (index < nrecv) role |= ROLE_POST_RECV;
+ } else if (g == ng - 1) {
+ if (index < nsend) role |= ROLE_POST_SEND;
+ }
+
+ if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) peer = recvPeers[index];
+ if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) peer = sendPeers[index];
+
+ loadRecvConn(channel, directBuff);
+ loadSendConn(channel);
}
__device__ __forceinline__ void
@@ -310,7 +267,7 @@ class ncclPrimitives {
GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0);
}
__device__ __forceinline__ void
- directSend(const T* src, int directOffset, int nelem) {
+ directSend(const T* src, ssize_t directOffset, int nelem) {
GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset);
}
@@ -319,7 +276,7 @@ class ncclPrimitives {
GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0);
}
__device__ __forceinline__ void
- directRecv(T* dst, int directOffset, int nelem) {
+ directRecv(T* dst, ssize_t directOffset, int nelem) {
GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset);
}
@@ -328,7 +285,7 @@ class ncclPrimitives {
GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0);
}
__device__ __forceinline__ void
- directCopySend(const T* src, T* dst, int directOffset, int nelem) {
+ directCopySend(const T* src, T* dst, ssize_t directOffset, int nelem) {
GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset);
}
@@ -337,7 +294,7 @@ class ncclPrimitives {
GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0);
}
__device__ __forceinline__ void
- directRecvCopySend(T* dst, int directOffset, int nelem) {
+ directRecvCopySend(T* dst, ssize_t directOffset, int nelem) {
GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset);
}
@@ -356,15 +313,14 @@ class ncclPrimitives {
GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0);
}
__device__ __forceinline__ void
- directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) {
+ directRecvReduceCopySend(const T* src, T* dst, ssize_t directOffset, int nelem) {
// Direct is only for the send part
GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset);
}
__device__ __forceinline__ ~ncclPrimitives() {
// Save steps for the next operation
- saveRecvSync();
- saveSendSync();
+ saveSync();
}
};
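
The rewritten ncclPrimitives constructor above replaces the old per-warp load/save sync helpers with a per-thread role bitmask computed from groups of SYNC_GROUP threads. A minimal host-side sketch of just that index arithmetic may help when tracing which threads wait on or post to each peer; the ROLE_* bit values and the 288-thread example below are assumptions for illustration, not values copied from an NCCL header.

// role_sketch.cc -- reproduces only the SYNC_GROUP role split from the
// ncclPrimitives constructor above. The ROLE_* bit values are assumptions
// for this sketch, not copied from an NCCL header.
#include <cstdio>

enum { ROLE_WAIT_RECV = 0x01, ROLE_WAIT_SEND = 0x02, ROLE_SRC = 0x04,
       ROLE_DST = 0x08, ROLE_POST_RECV = 0x10, ROLE_POST_SEND = 0x20 };

constexpr int SYNC_GROUP = 8;

int assignRole(int tid, int nthreads, int nrecv, int nsend) {
  int role = 0;
  int g = tid / SYNC_GROUP;        // group of 8 threads this tid belongs to
  int ng = nthreads / SYNC_GROUP;  // total number of groups
  int index = tid % SYNC_GROUP;    // lane within the group
  if (g == 0) {                    // first group waits on recv peers, one lane is SRC
    if (index < nrecv) role |= ROLE_WAIT_RECV;
    if (index == nrecv) role |= ROLE_SRC;
  } else if (g == 1) {             // second group waits on send peers, one lane is DST
    if (index < nsend) role |= ROLE_WAIT_SEND;
    if (index == nsend) role |= ROLE_DST;
  } else if (g == ng - 2) {        // second-to-last group posts recv head updates
    if (index < nrecv) role |= ROLE_POST_RECV;
  } else if (g == ng - 1) {        // last group posts send tail updates
    if (index < nsend) role |= ROLE_POST_SEND;
  }
  return role;
}

int main() {
  // Example: 288 threads (256 workers plus the extra 32-thread post warp),
  // 1 recv peer and 1 send peer.
  int samples[] = {0, 1, 8, 9, 272, 280};
  for (int tid : samples)
    printf("tid %3d -> role mask 0x%02x\n", tid, assignRole(tid, 288, 1, 1));
  return 0;
}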
diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h
index f919493..9e362f9 100644
--- a/src/collectives/device/prims_ll.h
+++ b/src/collectives/device/prims_ll.h
@@ -1,9 +1,16 @@
+/*************************************************************************
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
template <typename T, class FUNC, int NRECV, int NSEND>
class ncclLLPrimitives {
private:
const int tid;
const int nthreads;
const int wid;
+ const int stepLines;
int nrecv = 0;
int nsend = 0;
struct ncclConnInfo* recvConn = NULL;
@@ -22,8 +29,8 @@ class ncclLLPrimitives {
union ncclLLFifoLine* sendBuff[NSEND];
struct ncclDevComm* comm;
- inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
- inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+ inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepLines; }
+ inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepLines; }
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
@@ -33,19 +40,6 @@ class ncclLLPrimitives {
asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
}
- uint32_t mismatch = 0;
- const uint64_t opCount;
-
- inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
- if (mismatch > 20) {
- // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
- // Note that we are not using _threadfence_system in LL so the error cannot be asserted
- *(comm->fatalDevError) = ncclDevSuspectedMismatch;
- } else if (conn && *conn->opCountRem > opCount) {
- mismatch += 1;
- }
- }
-
uint32_t spins = 0;
uint32_t abort = 0;
@@ -53,7 +47,6 @@ class ncclLLPrimitives {
spins++;
if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
abort = *(comm->abortFlag);
- if (wid == i) checkMismatch(send ? sendConn : recvConn);
spins = 0;
}
return abort;
@@ -61,14 +54,13 @@ class ncclLLPrimitives {
inline __device__ void waitSend(int nbytes) {
spins = 0;
- mismatch = 0;
if (sendConnHeadPtr) {
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
sendConnHeadCache = *sendConnHeadPtr;
if (checkAbort(wid, 1)) break;
}
if (sendConnFifoPtr) {
- int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
+ int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes;
sendConnFifoPtr[sendConnHead%NCCL_STEPS] = size;
}
sendConnHead += 1;
@@ -88,7 +80,7 @@ class ncclLLPrimitives {
// LL Cleanup : write all flags in the slice to make sure we don't have
// data corruption when flag loops over.
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
- for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
+ for (int o = offset; o<stepLines; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
}
sendStep[i]++;
}
@@ -98,7 +90,6 @@ class ncclLLPrimitives {
uint32_t flag = recvFlag(i);
uint32_t data1, flag1, data2, flag2;
spins = 0;
- mismatch = 0;
do {
asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
if (checkAbort(i, 0)) break;
@@ -164,7 +155,7 @@ class ncclLLPrimitives {
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
- recvBuff[i] = conn->llBuff;
+ recvBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL];
recvStep[i] = conn->step;
if (wid == i) recvConn = conn;
nrecv++;
@@ -173,13 +164,11 @@ class ncclLLPrimitives {
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
recvConnHeadPtr = recvConn->head;
recvConnHead = recvConn->step;
- // Update opCount in case we skipped some operations
- *(recvConn->opCountLoc) = opCount;
}
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
- sendBuff[i] = conn->llBuff;
+ sendBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL];
sendStep[i] = conn->step;
if (wid == i) sendConn = conn;
nsend++;
@@ -189,15 +178,13 @@ class ncclLLPrimitives {
sendConnHeadPtr = sendConn->head;
sendConnHeadCache = *sendConnHeadPtr;
sendConnHead = sendConn->step;
- sendConnFifoPtr = sendConn->fifo;
- *(sendConn->opCountLoc) = opCount;
+ sendConnFifoPtr = sendConn->sizesFifo;
}
}
__device__ __forceinline__ void saveRecvSync() {
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
recvConn->step = recvConnHead;
- *(recvConn->opCountLoc) = opCount+1;
__threadfence_block();
}
}
@@ -205,15 +192,14 @@ class ncclLLPrimitives {
__device__ __forceinline__ void saveSendSync() {
if (tid < nsend) {
sendConn->step = sendConnHead;
- *(sendConn->opCountLoc) = opCount+1;
__threadfence_block();
}
}
public:
__device__ __forceinline__
- ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
- : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), opCount(opCount) {
+ ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepLines, struct ncclChannel* channel, struct ncclDevComm* comm)
+ : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepLines(stepLines) {
// Make sure step is updated before we read it.
barrier();
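
With the change above, the LL FIFO geometry comes from the stepLines constructor argument, which the kernels later in this diff derive from comm->buffSizes[NCCL_PROTO_LL], instead of the fixed NCCL_LL_SLICE_LINES. A small sketch of that arithmetic follows; NCCL_STEPS is taken as 8 and the 16-byte line matches the data1/flag1/data2/flag2 load visible above, while the 256 KB buffer size is a made-up example rather than a real default.

// ll_offset_sketch.cc -- the buffer arithmetic used by the LL kernels above.
#include <cstdio>
#include <cstdint>

constexpr int NCCL_STEPS = 8;
struct LLFifoLine { uint32_t data1, flag1, data2, flag2; };  // 16 bytes, data/flag interleaved

int main() {
  size_t llBuffSize = 256 * 1024;  // stand-in for comm->buffSizes[NCCL_PROTO_LL]
  // As in the kernels below: lines per step = buffer / (line size * NCCL_STEPS).
  int stepLines = (int)(llBuffSize / (sizeof(LLFifoLine) * NCCL_STEPS));
  printf("stepLines = %d\n", stepLines);

  // recvOffset/sendOffset in the class above: (step % NCCL_STEPS) * stepLines,
  // so the FIFO simply wraps around every NCCL_STEPS steps.
  for (uint64_t step = 0; step < 10; step++) {
    int offsetLines = (int)(step % NCCL_STEPS) * stepLines;
    printf("step %2llu -> line offset %d\n", (unsigned long long)step, offsetLines);
  }
  return 0;
}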
diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h
index 40a8cff..999d0d5 100644
--- a/src/collectives/device/prims_ll128.h
+++ b/src/collectives/device/prims_ll128.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -14,6 +14,7 @@ class ncclLL128Primitives {
const int tid;
const int nthreads;
const int wid;
+ const int stepSize;
const int warp;
const bool flagThread;
int nrecv = 0;
@@ -38,8 +39,8 @@ class ncclLL128Primitives {
volatile uint64_t* shmem;
- inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; }
- inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; }
+ inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
+ inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; }
@@ -47,22 +48,9 @@ class ncclLL128Primitives {
inline __device__ void barrier() {
if (NSEND>NRECV) {
- asm volatile ("bar.sync 2, %0;" :: "r"(nthreads));
+ asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
} else {
- asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
- }
- }
-
- uint32_t mismatch = 0;
- const uint64_t opCount;
-
- inline __device__ void checkMismatch(struct ncclConnInfo* conn) {
- if (mismatch > 20) {
- // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch
- // Note that we are not using _threadfence_system in LL so the error cannot be asserted
- *(comm->fatalDevError) = ncclDevSuspectedMismatch;
- } else if (conn && *conn->opCountRem > opCount) {
- mismatch += 1;
+ asm volatile ("bar.sync 2, %0;" :: "r"(nthreads));
}
}
@@ -73,7 +61,6 @@ class ncclLL128Primitives {
spins++;
if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) {
abort = *(comm->abortFlag);
- if (wid == i) checkMismatch(send ? sendConn : recvConn);
spins = 0;
}
return abort;
@@ -81,7 +68,6 @@ class ncclLL128Primitives {
inline __device__ void waitSend(int nbytes) {
spins = 0;
- mismatch = 0;
if (sendConnHeadPtr) {
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
sendConnHeadCache = *sendConnHeadPtr;
@@ -225,14 +211,14 @@ class ncclLL128Primitives {
/************************ Send **************************/
if (SEND) {
for (int i=1; i<NSEND && i<nsend; i++) {
- int flag = sendFlag(i);
+ uint64_t flag = sendFlag(i);
uint64_t* ptr = sendPtr(i)+ll128Offset;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]);
}
}
- int flag = sendFlag(0);
+ uint64_t flag = sendFlag(0);
uint64_t* ptr = sendPtr(0)+ll128Offset;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
@@ -309,7 +295,7 @@ class ncclLL128Primitives {
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
- recvBuff[i] = conn->ll128Buff;
+ recvBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128];
recvStep[i] = conn->step;
if (wid == i) recvConn = conn;
nrecv++;
@@ -318,13 +304,11 @@ class ncclLL128Primitives {
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
recvConnHeadPtr = recvConn->head;
recvConnHead = recvConn->step;
- // Update opCount in case we skipped some operations
- *(recvConn->opCountLoc) = opCount;
}
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
- sendBuff[i] = conn->ll128Buff;
+ sendBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128];
sendStep[i] = conn->step;
if (wid == i) sendConn = conn;
nsend++;
@@ -334,11 +318,10 @@ class ncclLL128Primitives {
sendConnHeadPtr = sendConn->head;
sendConnHeadCache = *sendConnHeadPtr;
sendConnHead = sendConn->step;
- sendConnFifoPtr = sendConn->fifo;
- *(sendConn->opCountLoc) = opCount;
+ sendConnFifoPtr = sendConn->sizesFifo;
}
if (tid >= nthreads-WARP_SIZE && wid<nsend) {
- if (sendConn->fifo) {
+ if (sendConn->sizesFifo) {
sendConnTailPtr = sendConn->tail;
sendConnTail = sendConn->step;
}
@@ -348,7 +331,6 @@ class ncclLL128Primitives {
__device__ __forceinline__ void saveRecvSync() {
if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
recvConn->step = recvConnHead;
- *(recvConn->opCountLoc) = opCount+1;
__threadfence_block();
}
}
@@ -356,15 +338,14 @@ class ncclLL128Primitives {
__device__ __forceinline__ void saveSendSync() {
if (tid < nsend) {
sendConn->step = sendConnHead;
- *(sendConn->opCountLoc) = opCount+1;
__threadfence_block();
}
}
public:
__device__ __forceinline__
- ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
- : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), opCount(opCount), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
+ ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm)
+ : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), shmem(ncclShmem->data+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) {
// Make sure step is updated before we read it.
barrier();
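
One easy-to-miss change above is that the LL128 send flag is now held in a uint64_t rather than an int; the flag is derived from the 64-bit step counter, so the effect of the change is that it is no longer truncated to 32 bits on very long-running channels (the motivation is inferred here, not stated in the diff). A short sketch of that truncation, plus the flagThread pattern used by this class; the numbers are illustrative only.

// ll128_flag_sketch.cc -- why widening the LL128 flag from int to uint64_t matters.
#include <cstdio>
#include <cstdint>

int main() {
  uint64_t step = (1ull << 31) + 5;     // a hypothetical long-running channel
  uint64_t flag64 = step + 1;           // what the updated code keeps
  int flag32 = (int)(step + 1);         // what the old 'int flag' would have held
  printf("flag64 = %llu, truncated flag32 = %d\n", (unsigned long long)flag64, flag32);

  // flagThread in the class above: within each group of 8 threads, the one
  // with (tid % 8) == 7 stores the flag word instead of a data word.
  for (int tid = 0; tid < 16; tid++)
    printf("tid %2d -> %s\n", tid, ((tid % 8) == 7) ? "flag thread" : "data thread");
  return 0;
}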
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu
index a2caac5..66f1bb2 100644
--- a/src/collectives/device/reduce.cu
+++ b/src/collectives/device/reduce.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
-IMPL_COLL_R(ncclReduce, ncclCollReduce);
+IMPL_COLL_R(Reduce);
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index 0680abe..313209d 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,133 +8,145 @@
#include "primitives.h"
#include "collectives.h"
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int nthreads = args->nThreads-WARP_SIZE;
- const int bid = args->bid;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
- const ssize_t size = args->N;
- const int nranks = comm->nRanks;
- const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
- const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
- const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
- const int rank = ring->devUserRanks[0];
- const int prevRank = ring->devUserRanks[nranks-1];
- const int root = args->root;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
- ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t offset = gridOffset + bid*realChunkSize;
- int nelem = min(realChunkSize, size-offset);
- if (prevRank == root) {
- prims.send(thisInput+offset, nelem);
- } else if (rank == root) {
- prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
- } else {
- prims.recvReduceSend(thisInput+offset, nelem);
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads-WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+ const ssize_t size = args->coll.count;
+ const int rank = ring->devUserRanks[0];
+ const int prevRank = ring->devUserRanks[nranks-1];
+ const int root = args->coll.root;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t offset = gridOffset + bid*realChunkSize;
+ int nelem = min(realChunkSize, size-offset);
+ if (prevRank == root) {
+ prims.send(thisInput+offset, nelem);
+ } else if (rank == root) {
+ prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else {
+ prims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
}
- }
-}
-
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
-
- ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- const int rank = comm->rank;
- const int nranks = comm->nRanks;
- const int prevRank = ring->devUserRanks[nranks-1];
- const int root = args->root;
-
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- if (size-gridOffset < loopSize) {
- chunkSize = args->lastChunkSize;
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+ const int rank = comm->rank;
+ const int prevRank = ring->devUserRanks[nranks-1];
+ const int root = args->coll.root;
+
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ if (size-gridOffset < loopSize) {
+ chunkSize = args->coll.lastChunkSize;
+ }
+ ssize_t offset = gridOffset + bid*chunkSize;
+
+ int nelem = min(chunkSize, size-offset);
+ if (prevRank == root) {
+ LLprims.send(thisInput+offset, nelem);
+ } else if (rank == root) {
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else {
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
}
- ssize_t offset = gridOffset + bid*chunkSize;
-
- int nelem = min(chunkSize, size-offset);
- if (prevRank == root) {
- LLprims.send(thisInput+offset, nelem);
- } else if (rank == root) {
- LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
- } else {
- LLprims.recvReduceSend(thisInput+offset, nelem);
- }
- }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
+};
#include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
-
- ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- const int rank = comm->rank;
- const int nranks = comm->nRanks;
- const int prevRank = ring->devUserRanks[nranks-1];
- const int root = args->root;
-
- ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
- const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
-
- const ssize_t loopSize = args->nChannels*chunkSize;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
- ssize_t offset = gridOffset + bid*chunkSize;
-
- int nelem = min(chunkSize, size-offset);
- if (prevRank == root) {
- LLprims.send(thisInput+offset, nelem);
- } else if (rank == root) {
- LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
- } else {
- LLprims.recvReduceSend(thisInput+offset, nelem);
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+ ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+ const int rank = comm->rank;
+ const int prevRank = ring->devUserRanks[nranks-1];
+ const int root = args->coll.root;
+
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
+ ssize_t offset = gridOffset + bid*chunkSize;
+
+ int nelem = min(chunkSize, size-offset);
+ if (prevRank == root) {
+ LLprims.send(thisInput+offset, nelem);
+ } else if (rank == root) {
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else {
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
}
- }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }
+};
+
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {}
+};
+
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduce, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {}
+};
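
The ring Reduce kernels above all follow the same role scheme: the rank whose ring predecessor is the root injects its data, intermediate ranks reduce and forward, and the root performs the final reduction into the output. A host-side walk-through of those roles, assuming for simplicity that the ring follows increasing rank order (real NCCL derives the ring order from the topology).

// ring_reduce_sketch.cc -- per-rank roles used by the ring Reduce kernels above.
#include <cstdio>

int main() {
  const int nranks = 4, root = 0;
  for (int rank = 0; rank < nranks; rank++) {
    int prevRank = (rank - 1 + nranks) % nranks;  // stand-in for devUserRanks[nranks-1]
    if (prevRank == root) {
      printf("rank %d: send(input)                    (first hop after the root)\n", rank);
    } else if (rank == root) {
      printf("rank %d: recvReduceCopy(input, output)  (final reduction at the root)\n", rank);
    } else {
      printf("rank %d: recvReduceSend(input)          (reduce and forward)\n", rank);
    }
  }
  return 0;
}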
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu
index 8b45299..c2c6d42 100644
--- a/src/collectives/device/reduce_scatter.cu
+++ b/src/collectives/device/reduce_scatter.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,4 +8,4 @@
#include "common.h"
#include "collectives.h"
-IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter);
+IMPL_COLL_R(ReduceScatter);
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index 1985148..a0d45dc 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,177 +8,189 @@
#include "primitives.h"
#include "collectives.h"
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int nthreads = args->nThreads-WARP_SIZE;
- const int bid = args->bid;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
- const ssize_t size = args->N;
- const int nranks = comm->nRanks;
- const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
- const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
- const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
- prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
- ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t chunkOffset = gridOffset + bid*realChunkSize;
-
- /////////////// begin ReduceScatter steps ///////////////
- ssize_t offset;
- int nelem = min(realChunkSize, size-chunkOffset);
- int rankDest;
-
- // step 0: push data to next GPU
- rankDest = ring->devUserRanks[nranks-1];
- offset = chunkOffset + rankDest * size;
-
- prims.send(thisInput+offset, nelem);
-
- // k-2 steps: reduce and copy to next GPU
- for (int j=2; j<nranks; ++j) {
- rankDest = ring->devUserRanks[nranks-j];
- offset = chunkOffset + rankDest * size;
-
- prims.recvReduceSend(thisInput+offset, nelem);
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads-WARP_SIZE;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+ const ssize_t size = args->coll.count;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t chunkOffset = gridOffset + bid*realChunkSize;
+
+ /////////////// begin ReduceScatter steps ///////////////
+ ssize_t offset;
+ int nelem = min(realChunkSize, size-chunkOffset);
+ int rankDest;
+
+ // step 0: push data to next GPU
+ rankDest = ring->devUserRanks[nranks-1];
+ offset = chunkOffset + rankDest * size;
+
+ prims.send(thisInput+offset, nelem);
+
+ // k-2 steps: reduce and copy to next GPU
+ for (int j=2; j<nranks; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + rankDest * size;
+
+ prims.recvReduceSend(thisInput+offset, nelem);
+ }
+
+ // step k-1: reduce this buffer and data, which will produce the final result
+ rankDest = ring->devUserRanks[0];
+ offset = chunkOffset + rankDest * size;
+
+ prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
+ }
}
-
- // step k-1: reduce this buffer and data, which will produce the final result
- rankDest = ring->devUserRanks[0];
- offset = chunkOffset + rankDest * size;
-
- prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
- }
-}
-
-template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
-
- ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- //const int rank = comm->rank;
- const int nranks = comm->nRanks;
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nChannels*chunkSize;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- if (size-gridOffset < loopSize) {
- chunkSize = args->lastChunkSize;
- }
- ssize_t chunkOffset = gridOffset + bid*chunkSize;
-
- /////////////// begin ReduceScatter steps ///////////////
- ssize_t offset;
- int nelem = min(chunkSize, size-chunkOffset);
- int rankDest;
-
- // step 0: push data to next GPU
- rankDest = ring->devUserRanks[nranks-1];
- offset = chunkOffset + rankDest * size;
-
- LLprims.send(thisInput+offset, nelem);
-
- // k-2 steps: reduce and copy to next GPU
- for (int j=2; j<nranks; ++j) {
- rankDest = ring->devUserRanks[nranks-j];
- offset = chunkOffset + rankDest * size;
-
- LLprims.recvReduceSend(thisInput+offset, nelem);
+};
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+ ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ if (size-gridOffset < loopSize) {
+ chunkSize = args->coll.lastChunkSize;
+ }
+ ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+ /////////////// begin ReduceScatter steps ///////////////
+ ssize_t offset;
+ int nelem = min(chunkSize, size-chunkOffset);
+ int rankDest;
+
+ // step 0: push data to next GPU
+ rankDest = ring->devUserRanks[nranks-1];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.send(thisInput+offset, nelem);
+
+ // k-2 steps: reduce and copy to next GPU
+ for (int j=2; j<nranks; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+
+      // step k-1: reduce this buffer and data, which produces the final
+      // result, stored in this rank's chunk of the output
+ rankDest = ring->devUserRanks[0];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
+ }
}
-
- // step k-1: reduce this buffer and data, which will produce the final
- // result that we store in this data
- rankDest = ring->devUserRanks[0];
- offset = chunkOffset + rankDest * size;
-
- LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
- }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
+};
#include "prims_ll128.h"
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
- const int tid = threadIdx.x;
- const int bid = args->bid;
- const int nthreads = args->nThreads;
- struct ncclDevComm* comm = args->comm;
- struct ncclChannel* channel = comm->channels+blockIdx.x;
- struct ncclRing* ring = &channel->ring;
-
- ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
- const ssize_t size = args->N;
- //const int rank = comm->rank;
- const int nranks = comm->nRanks;
- ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
- // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
- const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
-
- const ssize_t loopSize = args->nChannels*chunkSize;
-
- // Compute pointers
- const T * __restrict__ thisInput = (const T*)args->ThisInput;
- T * __restrict__ thisOutput = (T*)args->ThisOutput;
-
- for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
-
- ssize_t chunkOffset = gridOffset + bid*chunkSize;
-
- /////////////// begin ReduceScatter steps ///////////////
- ssize_t offset;
- int nelem = min(chunkSize, size-chunkOffset);
- int rankDest;
-
- // step 0: push data to next GPU
- rankDest = ring->devUserRanks[nranks-1];
- offset = chunkOffset + rankDest * size;
-
- LLprims.send(thisInput+offset, nelem);
-
- // k-2 steps: reduce and copy to next GPU
- for (int j=2; j<nranks; ++j) {
- rankDest = ring->devUserRanks[nranks-j];
- offset = chunkOffset + rankDest * size;
-
- LLprims.recvReduceSend(thisInput+offset, nelem);
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_RING, NCCL_PROTO_LL128, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->coll.bid;
+ const int nChannels = args->coll.nChannels;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+ const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+ ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+ // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+ const int nranks = comm->nRanks;
+ const ssize_t loopSize = nChannels*chunkSize;
+ const ssize_t size = args->coll.count;
+
+ ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->sendbuff;
+ T * __restrict__ thisOutput = (T*)args->recvbuff;
+
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
+
+ ssize_t chunkOffset = gridOffset + bid*chunkSize;
+
+ /////////////// begin ReduceScatter steps ///////////////
+ ssize_t offset;
+ int nelem = min(chunkSize, size-chunkOffset);
+ int rankDest;
+
+ // step 0: push data to next GPU
+ rankDest = ring->devUserRanks[nranks-1];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.send(thisInput+offset, nelem);
+
+ // k-2 steps: reduce and copy to next GPU
+ for (int j=2; j<nranks; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+
+      // step k-1: reduce this buffer and data, which produces the final
+      // result, stored in this rank's chunk of the output
+ rankDest = ring->devUserRanks[0];
+ offset = chunkOffset + rankDest * size;
+
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
+ }
}
-
- // step k-1: reduce this buffer and data, which will produce the final
- // result that we store in this data
- rankDest = ring->devUserRanks[0];
- offset = chunkOffset + rankDest * size;
-
- LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
- }
-}
-
-template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { }
+};
+
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_TREE, PROTO, REDOP, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {}
+};
+
+template<int PROTO, class REDOP, typename T, int UNROLL>
+class ncclFunction<ncclFuncReduceScatter, NCCL_ALGO_COLLNET, PROTO, REDOP, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* args) {}
+};
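
The ReduceScatter ring schedule above sends the chunk destined for the ring predecessor first, reduce-forwards k-2 more chunks, and finally reduce-copies the local rank's own chunk into recvbuff. Below is a host-side trace of that chunk order for one rank, under the assumption that the ring follows increasing rank order, so devUserRanks[j] is treated as the rank j hops ahead and devUserRanks[nranks-1] as the ring predecessor (an assumption based on how the kernels index it).

// ring_reduce_scatter_sketch.cc -- chunk schedule of the ReduceScatter ring kernels above.
#include <cstdio>

int main() {
  const int nranks = 4;
  const int rank = 0;  // trace a single rank; the others are symmetric
  int devUserRanks[nranks];
  for (int j = 0; j < nranks; j++) devUserRanks[j] = (rank + j) % nranks;

  // step 0: push the chunk destined for the ring predecessor
  int rankDest = devUserRanks[nranks - 1];
  printf("step 0: send chunk for rank %d\n", rankDest);

  // k-2 steps: reduce the incoming chunk with the local contribution and forward it
  for (int j = 2; j < nranks; j++) {
    rankDest = devUserRanks[nranks - j];
    printf("step %d: recvReduceSend chunk for rank %d\n", j - 1, rankDest);
  }

  // step k-1: reduce the local rank's own chunk and store it in recvbuff
  rankDest = devUserRanks[0];
  printf("step %d: recvReduceCopy chunk for rank %d into recvbuff\n", nranks - 1, rankDest);
  return 0;
}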
diff --git a/src/collectives/device/sendrecv.cu b/src/collectives/device/sendrecv.cu
new file mode 100644
index 0000000..59e38b5
--- /dev/null
+++ b/src/collectives/device/sendrecv.cu
@@ -0,0 +1,11 @@
+/*************************************************************************
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "sendrecv.h"
+#include "common.h"
+#include "collectives.h"
+
+IMPL_COLL_P(SendRecv);
diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h
new file mode 100644
index 0000000..1cb34f3
--- /dev/null
+++ b/src/collectives/device/sendrecv.h
@@ -0,0 +1,92 @@
+/*************************************************************************
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "devcomm.h"
+#include "primitives.h"
+#include "collectives.h"
+
+template<class FUNC, typename T, int UNROLL>
+class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
+ public:
+ __device__ void run(struct ncclWorkElem* firstArgs) {
+ struct ncclWorkElem* args = firstArgs;
+ int tid = threadIdx.x;
+ int group = 0;
+ for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
+ int nThreadsSegment = args->p2p.nThreads;
+ if (nThreadsSegment == 0) return; // Nothing else to do
+ int groupRecv = group;
+ group += 1;
+ int groupSend = group;
+ group += nThreadsSegment > 128 ? 2 : 1;
+ if (tid < nThreadsSegment) {
+ const int nThreads = nThreadsSegment > 128 ? nThreadsSegment-WARP_SIZE : nThreadsSegment;
+
+ // Compute pointers
+ const T* sendbuff = (const T*)args->sendbuff;
+ T* recvbuff = (T*)args->recvbuff;
+ const ssize_t sendCount = args->p2p.sendCount;
+ const ssize_t recvCount = args->p2p.recvCount;
+
+ const int delta = args->p2p.delta;
+ if (delta == 0) {
+ if (tid < nThreads && sendbuff != recvbuff) {
+ // local copy : ReduceOrCopyMulti takes an int as number of elements,
+ // so we split it in blocks of 1G elements.
+ int blockSize = 1<<30;
+ for (size_t offset=0; offset<sendCount; offset += blockSize) {
+ size_t remaining = sendCount - offset;
+ if (remaining < blockSize) blockSize = remaining;
+ ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nThreads, 1, &sendbuff, 1, &recvbuff, blockSize);
+ sendbuff += blockSize; recvbuff += blockSize;
+ }
+ }
+ } else {
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+
+ const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize/SENDRECV_SLICEFACTOR;
+
+ int nThreadsSplit = nThreads/2;
+ if ((tid < nThreadsSplit) && recvCount >= 0) {
+ int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks;
+ int nt = nThreadsSplit;
+ ncclPrimitives<UNROLL, 1, 1, T, 1, 0, 1, FUNC>
+ prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv);
+
+ if (recvCount == 0) {
+ prims.recv(recvbuff, 0);
+ } else for (ssize_t offset = 0; offset < recvCount; offset += chunkSize) {
+ int realChunkSize = min(chunkSize, recvCount-offset);
+ ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
+ int nelem = min(realChunkSize, recvCount-offset);
+ prims.directRecv(recvbuff+offset, offset, nelem);
+ }
+ }
+ if ((tid >= nThreadsSplit) && sendCount >= 0) {
+ int peer = (comm->rank+delta)%comm->nRanks;
+ int nt = nThreads-nThreadsSplit;
+ ncclPrimitives<UNROLL, 1, 1, T, 0, 1, 1, FUNC>
+ prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend);
+
+ if (sendCount == 0) {
+ prims.send(sendbuff, 0);
+ } else for (ssize_t offset = 0; offset < sendCount; offset += chunkSize) {
+ int realChunkSize = min(chunkSize, sendCount-offset);
+ ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
+ int nelem = min(realChunkSize, sendCount-offset);
+ prims.directSend(sendbuff+offset, offset, nelem);
+ }
+ }
+ }
+ }
+ tid -= nThreadsSegment;
+ if (tid < 0) return;
+ args++;
+ }
+ }
+};
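
The new SendRecv kernel above walks up to NCCL_MAX_WORK_ELEMENTS work elements, and for each non-local element splits its thread segment in half: the lower half receives from (rank - delta) while the upper half sends to (rank + delta), each walking its buffer in chunkSize pieces. A small sketch of that peer pairing and chunk loop; every concrete number below (rank, delta, thread and chunk counts) is made up for illustration.

// sendrecv_split_sketch.cc -- peer pairing and chunking in the SendRecv kernel above.
#include <cstdio>

int main() {
  const int nRanks = 8, rank = 2, delta = 3;
  const int nThreads = 256;             // hypothetical per-segment thread count
  const int nThreadsSplit = nThreads / 2;

  int recvPeer = (rank - delta + nRanks) % nRanks;  // peer we receive from
  int sendPeer = (rank + delta) % nRanks;           // peer we send to
  printf("rank %d: threads [0,%d) recv from rank %d, threads [%d,%d) send to rank %d\n",
         rank, nThreadsSplit, recvPeer, nThreadsSplit, nThreads, sendPeer);

  // Chunking mirrors the kernel's loop over sendCount in chunkSize pieces.
  const long long chunkSize = 16384;    // stand-in for stepSize/SENDRECV_SLICEFACTOR
  const long long sendCount = 50000;
  for (long long offset = 0; offset < sendCount; offset += chunkSize) {
    long long nelem = (sendCount - offset < chunkSize) ? sendCount - offset : chunkSize;
    printf("  send chunk at element offset %lld, %lld elements\n", offset, nelem);
  }
  return 0;
}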
diff --git a/src/collectives/reduce.cc b/src/collectives/reduce.cc
index 67f2fae..86388df 100644
--- a/src/collectives/reduce.cc
+++ b/src/collectives/reduce.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,7 +11,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- struct ncclInfo info = { ncclCollReduce, "Reduce",
+ NVTX3_FUNC_RANGE_IN(nccl_domain);
+ struct ncclInfo info = { ncclFuncReduce, "Reduce",
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
return ncclEnqueueCheck(&info);
diff --git a/src/collectives/reduce_scatter.cc b/src/collectives/reduce_scatter.cc
index 5ad7f5f..57c67bf 100644
--- a/src/collectives/reduce_scatter.cc
+++ b/src/collectives/reduce_scatter.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,7 +11,8 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
- struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter",
+ NVTX3_FUNC_RANGE_IN(nccl_domain);
+ struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
return ncclEnqueueCheck(&info);
diff --git a/src/collectives/sendrecv.cc b/src/collectives/sendrecv.cc
new file mode 100644
index 0000000..65222a5
--- /dev/null
+++ b/src/collectives/sendrecv.cc
@@ -0,0 +1,39 @@
+/*************************************************************************
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "collectives.h"
+#include "argcheck.h" // Need some checks here since we access comm
+
+NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream) {
+ NVTX3_FUNC_RANGE_IN(nccl_domain);
+ struct ncclInfo info = { ncclFuncSendRecv, "Send",
+ sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */
+ 1, 1 };
+ ncclResult_t ret;
+ NCCLCHECK(ncclGroupStart());
+ ret = ncclEnqueueCheck(&info);
+ NCCLCHECK(ncclGroupEnd());
+ return ret;
+}
+
+NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream) {
+ NVTX3_FUNC_RANGE_IN(nccl_domain);
+ struct ncclInfo info = { ncclFuncSendRecv, "Recv",
+ NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
+ 1, 1 };
+ ncclResult_t ret;
+ NCCLCHECK(ncclGroupStart());
+ ret = ncclEnqueueCheck(&info);
+ NCCLCHECK(ncclGroupEnd());
+ return ret;
+}
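
ncclSend and ncclRecv above each wrap their single enqueue in ncclGroupStart/ncclGroupEnd internally; callers that want a send and its matching receive to proceed concurrently still group the two calls explicitly. A minimal usage sketch follows, assuming the communicator and stream were already created elsewhere (ncclCommInitRank and cudaStreamCreate are not shown) and that the buffers are device pointers.

// sendrecv_usage.cc -- minimal sketch of the ncclSend/ncclRecv API added in this diff.
#include <nccl.h>
#include <cuda_runtime.h>

// Exchange `count` floats with `peer`: send our buffer, receive theirs.
ncclResult_t exchangeWithPeer(const float* sendbuf, float* recvbuf, size_t count,
                              int peer, ncclComm_t comm, cudaStream_t stream) {
  // Group the send and the matching receive so NCCL can issue them together
  // and neither rank blocks waiting on the other's call order.
  ncclResult_t res = ncclGroupStart();
  if (res != ncclSuccess) return res;
  res = ncclSend(sendbuf, count, ncclFloat, peer, comm, stream);
  if (res == ncclSuccess) res = ncclRecv(recvbuf, count, ncclFloat, peer, comm, stream);
  ncclResult_t endRes = ncclGroupEnd();   // always close the group
  return (res != ncclSuccess) ? res : endRes;
}

Grouping both directions lets the enqueue layer fuse them into a single SendRecv work element, which the kernel in sendrecv.h above then services.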