author | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2020-05-13 00:40:18 +0300
---|---|---
committer | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2020-06-08 19:31:44 +0300
commit | 5949d96f36d050e59d05872f8bbffd2549318e95
tree | e56476c71668bbd1ce4ddbc189b1be7d037b065c
parent | f36540f55a15683a121b6c330657af442b85c796
2.7.3-1
Add support for A100 GPUs and related platforms.
Add support for CUDA 11.
Add support for send/receive operations (beta).
54 files changed, 2047 insertions, 1250 deletions
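Two usage notes on the new features follow; the full diff is below.

The makefiles/common.mk hunk adds compute_80/sm_80 (Ampere) to the default fat binary whenever CUDA 11 or newer is detected, which is what enables A100 support. As the updated comment says, you should set NVCC_GENCODE to the minimal set of architectures you actually target to keep compile time down; for example, on an A100-only cluster an invocation along the lines of `make -j src.build NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80"` (the `src.build` target follows the NCCL README; treat the exact command as illustrative) compiles only the sm_80 kernels.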
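Point-to-point send/receive is the headline API addition: ncclSend and ncclRecv become public, group-able operations, wired in below through collectives/sendrecv.cc and the ncclSendRecv device function. A minimal caller-side sketch follows, assuming an already-initialized communicator and stream; the CHECKNCCL macro is illustrative and not part of NCCL:

    #include <stdio.h>
    #include <nccl.h>

    // Illustrative error-check helper; not part of the NCCL API.
    #define CHECKNCCL(cmd) do {                                         \
      ncclResult_t res = (cmd);                                         \
      if (res != ncclSuccess) {                                         \
        fprintf(stderr, "NCCL error: %s\n", ncclGetErrorString(res));   \
        return res;                                                     \
      }                                                                 \
    } while (0)

    // Each rank sends `sendbuf` to the next rank and receives into
    // `recvbuf` from the previous one. Grouping the two calls lets
    // them progress concurrently, so the exchange cannot deadlock.
    ncclResult_t ringExchange(const float* sendbuf, float* recvbuf,
                              size_t count, int rank, int nranks,
                              ncclComm_t comm, cudaStream_t stream) {
      int next = (rank + 1) % nranks;
      int prev = (rank - 1 + nranks) % nranks;
      CHECKNCCL(ncclGroupStart());
      CHECKNCCL(ncclSend(sendbuf, count, ncclFloat, next, comm, stream));
      CHECKNCCL(ncclRecv(recvbuf, count, ncclFloat, prev, comm, stream));
      CHECKNCCL(ncclGroupEnd());
      return ncclSuccess;
    }

Send/receive is tagged beta in this release, so the exact semantics may still evolve.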
diff --git a/makefiles/common.mk b/makefiles/common.mk
index ece18c7..8e91a45 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -23,19 +23,24 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
 #$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR})

-# Better define NVCC_GENCODE in your environment to the minimal set
+# You should define NVCC_GENCODE in your environment to the minimal set
 # of archs to reduce compile time.
 CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
                 -gencode=arch=compute_50,code=sm_50 \
                 -gencode=arch=compute_60,code=sm_60 \
                 -gencode=arch=compute_61,code=sm_61
 CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
+CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80

 CUDA8_PTX  = -gencode=arch=compute_61,code=compute_61
 CUDA9_PTX  = -gencode=arch=compute_70,code=compute_70
+CUDA11_PTX = -gencode=arch=compute_80,code=compute_80

+# Include Ampere support if we're using CUDA11 or above
+ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
+  NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) $(CUDA11_GENCODE) $(CUDA11_PTX)
 # Include Volta support if we're using CUDA9 or above
-ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0)
+else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0)
   NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
 else
   NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
diff --git a/makefiles/version.mk b/makefiles/version.mk
index 883e625..4a82cb9 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 6
-NCCL_PATCH   := 4
+NCCL_MINOR   := 7
+NCCL_PATCH   := 3
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/src/Makefile b/src/Makefile
index db1698a..d065888 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -9,10 +9,10 @@ include ../makefiles/version.mk
 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc \
                misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
                transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
-               collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
+               collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
                graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc

 ##### lib files
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index 11ffc35..e90dd66 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -240,6 +240,7 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
   char* env = getenv("NCCL_COMM_ID");
   if (env) {
+    INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
     if (bootstrapNetCreateHandle(netHandle, env) != 0) {
       WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
       return ncclInvalidArgument;
diff --git a/src/channel.cc b/src/channel.cc
index 0a43e17..d22ea63 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -1,29 +1,17 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/

 #include "channel.h"
 #include "param.h"
-#include "graph.h"
-
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-#define DEFAULT_BUFFER_SIZE_BYTES_ARM (1LL << 20) /* 1MiB */
-
-NCCL_PARAM(Buffsize, "BUFFSIZE", -2);

 ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
   struct ncclChannel* channel = comm->channels+channelid;
+  if (channel->id != -1) return ncclSuccess;
   channel->id = channelid;

-  // Setup intermediate buffering
-  int buffSize = ncclParamBuffsize();
-  int cpuArch, cpuVendor, cpuModel;
-  NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
-  channel->buffSize = buffSize != -2 ? buffSize :
-    cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES;
-
   // Ring index to user rank table.
   NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
   NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
@@ -37,11 +25,12 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
   }

   // Per-channel operation list.
-  NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
+  NCCLCHECK(ncclCudaHostCalloc(&channel->collectives, NCCL_MAX_OPS));
   return ncclSuccess;
 }

 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
+  if (channel->id == -1) return ncclSuccess;
   // Operation list
   NCCLCHECK(ncclCudaHostFree(channel->collectives));
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
index 001059c..3796fb1 100644
--- a/src/collectives/device/Makefile
+++ b/src/collectives/device/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -10,7 +10,7 @@ include ../../../makefiles/version.mk
 BUILDDIR ?= $(abspath ../../../build)
 OBJDIR := $(BUILDDIR)/obj/collectives/device

-LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
+LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu

 LIBSRCFILES += functions.cu

diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index 059092c..724b1aa 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,26 +11,27 @@ template<int UNROLL, class FUNC, typename T>
 __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = args->nThreads-WARP_SIZE;
-  const int bid = args->bid;
+  const int nthreads = args->coll.nThreads-WARP_SIZE;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
-  const ssize_t size = args->N;
-  const int nranks = comm->nRanks;
-  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
   const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS;
-  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
+  const int nranks = comm->nRanks;
+  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+  const ssize_t size = args->coll.count;

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

-  ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
+  ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
     ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
     ssize_t chunkOffset = gridOffset + bid*realChunkSize;
@@ -75,27 +76,27 @@ __device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) {
 }

 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int bid = args->bid;
-  const int nthreads = args->nThreads;
+  const int nthreads = args->coll.nThreads;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
-
-  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
-  const ssize_t size = args->N;
-  //const int rank = comm->rank;
+  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
   const int nranks = comm->nRanks;
-  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nChannels*chunkSize;
+  const ssize_t loopSize = nChannels*chunkSize;
+  const ssize_t size = args->coll.count;
+
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     if (size-gridOffset < loopSize) {
-      chunkSize = args->lastChunkSize;
+      chunkSize = args->coll.lastChunkSize;
     }
     ssize_t chunkOffset = gridOffset + bid*chunkSize;
@@ -140,29 +141,28 @@ __device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) {
 }

 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int bid = args->bid;
-  const int nthreads = args->nThreads;
+  const int nthreads = args->coll.nThreads;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
-
-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
-  const ssize_t size = args->N;
-  //const int rank = comm->rank;
-  const int nranks = comm->nRanks;
-  ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
   // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
   const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+  const int nranks = comm->nRanks;
+  const ssize_t loopSize = nChannels*chunkSize;
+  const ssize_t size = args->coll.count;

-  const ssize_t loopSize = args->nChannels*chunkSize;
+  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
+    chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
     ssize_t chunkOffset = gridOffset + bid*chunkSize;
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index 4e04f88..6891ac0 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,26 +11,27 @@ template<int UNROLL, class FUNC, typename T>
 __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = args->nThreads-WARP_SIZE;
-  const int bid = args->bid;
+  const int nthreads = args->coll.nThreads-WARP_SIZE;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
-  const ssize_t size = args->N;
-  const int nranks = comm->nRanks;
-  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
   const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
-  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
+  const int nranks = comm->nRanks;
+  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+  const ssize_t size = args->coll.count;

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

-  ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
+  ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
-    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
+    ssize_t realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*nChannels));
     ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
     ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
@@ -85,28 +86,29 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {

 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = args->nThreads-WARP_SIZE;
-  const int bid = args->bid;
+  const int nthreads = args->coll.nThreads-WARP_SIZE;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
-  const ssize_t size = args->N;
-  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
-  int chunkSize = args->lastChunkSize;
+  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+  int chunkSize = args->coll.lastChunkSize;
   const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nChannels*chunkSize;
+  const ssize_t loopSize = nChannels*chunkSize;
+  const ssize_t size = args->coll.count;

   if (loopSize > size) {
-    chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
   }

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

   do {
     struct ncclTree* tree = &channel->treeUp;
     // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
-    ncclPrimitives<UNROLL/2, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+    ncclPrimitives<UNROLL/2, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, 0, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       // Up
       ssize_t offset = gridOffset + bid*chunkSize;
@@ -124,17 +126,17 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
   do {
     struct ncclTree* tree = &channel->treeDn;
     // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
-    ncclPrimitives<UNROLL/2, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+    ncclPrimitives<UNROLL/2, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, &tree->up, tree->down, thisOutput, stepSize, channel, comm, args->opCount);
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       // Down
       ssize_t offset = gridOffset + bid*chunkSize;
       int nelem = min(chunkSize, size-offset);
       if (tree->up == -1) {
-        prims.send(thisOutput+offset, nelem);
+        prims.directSend(thisOutput+offset, offset, nelem);
       } else if (tree->down[0] == -1) {
-        prims.recv(thisOutput+offset, nelem);
+        prims.directRecv(thisOutput+offset, offset, nelem);
       } else {
-        prims.recvCopySend(thisOutput+offset, nelem);
+        prims.directRecvCopySend(thisOutput+offset, offset, nelem);
       }
     }
   } while(0);
@@ -143,27 +145,28 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {

 template<int UNROLL, class FUNC, typename T>
 __device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = args->nThreads-WARP_SIZE;
-  const int bid = args->bid;
+  const int nthreads = args->coll.nThreads-WARP_SIZE;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
-  const ssize_t size = args->N;
-  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
-  int chunkSize = args->lastChunkSize;
+  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
+  int chunkSize = args->coll.lastChunkSize;
   const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nChannels*chunkSize;
+  const ssize_t loopSize = nChannels*chunkSize;
+  const ssize_t size = args->coll.count;

   if (loopSize > size) {
-    chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
   }

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

-  if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
+  if (blockIdx.x < nChannels) { // first half of the channels do reduce
     struct ncclTree* tree = &channel->collTreeUp;
-    ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+    ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       // Up
       ssize_t offset = gridOffset + bid*chunkSize;
@@ -178,9 +181,9 @@ __device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
     }
   }

-  if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
+  if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
     struct ncclTree* tree = &channel->collTreeDn;
-    ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+    ncclPrimitives<UNROLL, 1, 1, T, 1, 1, 0, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       // Down
       ssize_t offset = gridOffset + bid*chunkSize;
@@ -199,28 +202,27 @@ __device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {

 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int bid = args->bid;
-  const int nthreads = args->nThreads;
+  const int nthreads = args->coll.nThreads;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
-
-  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
-  const ssize_t size = args->N;
-  //const int rank = comm->rank;
-  const int nranks = comm->nRanks;
-  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
   const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T);
+  const int nranks = comm->nRanks;
+  const ssize_t loopSize = nChannels*nranks*chunkSize;
+  const ssize_t size = args->coll.count;

-  const ssize_t loopSize = args->nChannels*nranks*chunkSize;
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
+    chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);

     /////////////// begin AllReduce steps ///////////////
     ssize_t offset;
@@ -229,7 +231,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {

     // step 0: push data to next GPU
     chunk = ring->devUserRanks[nranks-1];
-    offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
     nelem = min(chunkSize, size-offset);

     LLprims.send(thisInput+offset, nelem);
@@ -237,7 +239,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
     // k-2 steps: reduce and copy to next GPU
     for (int j=2; j<nranks; ++j) {
       chunk = ring->devUserRanks[nranks-j];
-      offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
       nelem = min(chunkSize, size-offset);

       LLprims.recvReduceSend(thisInput+offset, nelem);
@@ -246,7 +248,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
     // step k-1: reduce this buffer and data, which will produce the final
     // result that we store in this data and push to the next GPU
     chunk = ring->devUserRanks[0];
-    offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
     nelem = min(chunkSize, size-offset);

     LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
@@ -254,7 +256,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
     // k-2 steps: copy to next GPU
     for (int j=1; j<nranks-1; ++j) {
       chunk = ring->devUserRanks[nranks-j];
-      offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
       nelem = min(chunkSize, size-offset);

       LLprims.recvCopySend(thisOutput+offset, nelem);
@@ -262,7 +264,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {

     // Make final copy from buffer to dest.
     chunk = ring->devUserRanks[1];
-    offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
     nelem = min(chunkSize, size-offset);

     // Here we need to copy from buffer to this output.
@@ -273,27 +275,29 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {

 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = args->nThreads;
-  const int bid = args->bid;
+  const int nthreads = args->coll.nThreads;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
-  const ssize_t size = args->N;
-  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
   const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nChannels*chunkSize;
+  const ssize_t loopSize = nChannels*chunkSize;
+  const ssize_t size = args->coll.count;

   if (loopSize > size) {
-    chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
   }

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

   do {
     struct ncclTree* tree = &channel->treeUp;
     // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
-    ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
+    ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm, args->opCount);
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       // Up
       ssize_t offset = gridOffset + bid*chunkSize;
@@ -311,7 +315,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
   do {
     struct ncclTree* tree = &channel->treeDn;
     // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
-    ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
+    ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm, args->opCount);
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       // Down
       ssize_t offset = gridOffset + bid*chunkSize;
@@ -330,26 +334,28 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {

 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = args->nThreads;
-  const int bid = args->bid;
+  const int nthreads = args->coll.nThreads;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
-  const ssize_t size = args->N;
-  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
   const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nChannels*chunkSize;
+  const ssize_t loopSize = nChannels*chunkSize;
+  const ssize_t size = args->coll.count;

   if (loopSize > size) {
-    chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
   }

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

-  if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
+  if (blockIdx.x < nChannels) { // first half of the channels do reduce
     struct ncclTree* tree = &channel->collTreeUp;
-    ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
+    ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm, args->opCount);
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       // Up
       ssize_t offset = gridOffset + bid*chunkSize;
@@ -364,9 +370,9 @@ __device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
     }
   }

-  if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
+  if (blockIdx.x >= nChannels) { // second half of the channels do broadcast
     struct ncclTree* tree = &channel->collTreeDn;
-    ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
+    ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm, args->opCount);
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       // Down
       ssize_t offset = gridOffset + bid*chunkSize;
@@ -386,29 +392,28 @@ __device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {

 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int bid = args->bid;
-  const int nthreads = args->nThreads;
+  const int nthreads = args->coll.nThreads;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
-
-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
-  const ssize_t size = args->N;
-  //const int rank = comm->rank;
-  const int nranks = comm->nRanks;
-  ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
   // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
   const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
+  const int nranks = comm->nRanks;
+  const ssize_t loopSize = nChannels*nranks*chunkSize;
+  const ssize_t size = args->coll.count;

-  const ssize_t loopSize = args->nChannels*nranks*chunkSize;
+  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
+    chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);

     /////////////// begin AllReduce steps ///////////////
     ssize_t offset;
@@ -417,7 +422,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {

     // step 0: push data to next GPU
     chunk = ring->devUserRanks[nranks-1];
-    offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
     nelem = min(chunkSize, size-offset);

     LLprims.send(thisInput+offset, nelem);
@@ -425,7 +430,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
     // k-2 steps: reduce and copy to next GPU
     for (int j=2; j<nranks; ++j) {
       chunk = ring->devUserRanks[nranks-j];
-      offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
       nelem = min(chunkSize, size-offset);

       LLprims.recvReduceSend(thisInput+offset, nelem);
@@ -434,7 +439,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
     // step k-1: reduce this buffer and data, which will produce the final
     // result that we store in this data and push to the next GPU
     chunk = ring->devUserRanks[0];
-    offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
     nelem = min(chunkSize, size-offset);

     LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
@@ -442,7 +447,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
     // k-2 steps: copy to next GPU
     for (int j=1; j<nranks-1; ++j) {
       chunk = ring->devUserRanks[nranks-j];
-      offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
       nelem = min(chunkSize, size-offset);

       LLprims.recvCopySend(thisOutput+offset, nelem);
@@ -450,7 +455,7 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {

     // Make final copy from buffer to dest.
     chunk = ring->devUserRanks[1];
-    offset = gridOffset + (chunk*args->nChannels+bid) * chunkSize;
+    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
     nelem = min(chunkSize, size-offset);

     // Here we need to copy from buffer to this output.
@@ -461,29 +466,31 @@ __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {

 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = args->nThreads;
-  const int bid = args->bid;
+  const int nthreads = args->coll.nThreads;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclTree* treeUp = &channel->treeUp;
   struct ncclTree* treeDn = &channel->treeDn;
-  const ssize_t size = args->N;
-  ssize_t chunkSize = args->lastChunkSize;
+  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+  ssize_t chunkSize = args->coll.lastChunkSize;
   const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8;
-  const ssize_t loopSize = args->nChannels*chunkSize;
+  const ssize_t loopSize = nChannels*chunkSize;
   int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
+  const ssize_t size = args->coll.count;

   if (loopSize > size) {
-    chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
   }

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

   if (treeUp->up == -1) {
     // ReduceAndBroadcast : max number of recv is 3, max number of send is 3
-    ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, channel, comm, args->opCount);
+    ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, stepSize, channel, comm, args->opCount);
     for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
       ssize_t offset = gridOffset + bid*chunkSize;
       int nelem = min(chunkSize, size-offset);
@@ -492,7 +499,7 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
   } else {
     if (tid < nthreadsSplit) {
       // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
-      ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, channel, comm, args->opCount);
+      ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, stepSize, channel, comm, args->opCount);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         // Up
         ssize_t offset = gridOffset + bid*chunkSize;
@@ -505,7 +512,7 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
       }
     } else {
       // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
-      ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, channel, comm, args->opCount);
+      ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, stepSize, channel, comm, args->opCount);
       for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
         // Down
         ssize_t offset = gridOffset + bid*chunkSize;
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index 5146682..b141a5d 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,28 +11,29 @@ template<int UNROLL, class FUNC, typename T>
 __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int nthreads = args->nThreads-WARP_SIZE;
-  const int bid = args->bid;
+  const int nthreads = args->coll.nThreads-WARP_SIZE;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
-  const ssize_t size = args->N;
-  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
   const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
-  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
+  const ssize_t loopSize = nChannels*(ssize_t)chunkSize;
+  const ssize_t size = args->coll.count;
   const int rank = ring->devUserRanks[0];
   const int nextRank = ring->devUserRanks[1];
-  const int root = args->root;
+  const int root = args->coll.root;

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

-  ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
-    prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
+  ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, 0, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels));
     ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
     ssize_t offset = gridOffset + bid*realChunkSize;
     int nelem = min(realChunkSize, size-offset);
@@ -60,29 +61,29 @@ __device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) {
 }

 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int bid = args->bid;
-  const int nthreads = args->nThreads;
+  const int nthreads = args->coll.nThreads;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
-
-  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
-  const ssize_t size = args->N;
+  const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS);
+  ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T);
+  const ssize_t loopSize = nChannels*chunkSize;
+  const ssize_t size = args->coll.count;
   const int rank = ring->devUserRanks[0];
   const int nextRank = ring->devUserRanks[1];
-  const int root = args->root;
+  const int root = args->coll.root;

-  ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nChannels*chunkSize;
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount);

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     if (size-gridOffset < loopSize) {
-      chunkSize = args->lastChunkSize;
+      chunkSize = args->coll.lastChunkSize;
     }
     ssize_t offset = gridOffset + bid*chunkSize;
@@ -111,30 +112,29 @@ __device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) {
 }

 template<int UNUSED, class FUNC, typename T>
 __device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
-  const int bid = args->bid;
-  const int nthreads = args->nThreads;
+  const int nthreads = args->coll.nThreads;
+  const int bid = args->coll.bid;
+  const int nChannels = args->coll.nChannels;
   struct ncclDevComm* comm = args->comm;
   struct ncclChannel* channel = comm->channels+blockIdx.x;
   struct ncclRing* ring = &channel->ring;
-
-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
-
-  const ssize_t size = args->N;
+  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
+  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
+  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+  const ssize_t loopSize = nChannels*chunkSize;
+  const ssize_t size = args->coll.count;
   const int rank = ring->devUserRanks[0];
   const int nextRank = ring->devUserRanks[1];
-  const int root = args->root;
-
-  ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
-  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T));
+  const int root = args->coll.root;

-  const ssize_t loopSize = args->nChannels*chunkSize;
+  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount);

   // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->ThisInput;
-  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+  const T * __restrict__ thisInput = (const T*)args->sendbuff;
+  T * __restrict__ thisOutput = (T*)args->recvbuff;

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize);
+    chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize);
     ssize_t offset = gridOffset + bid*chunkSize;
     int nelem = min(chunkSize, size-offset);
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index 6e06369..a76f4e8 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -67,10 +67,10 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
     c = &firstColl; \
   } else { \
     c = &localColl; \
-    load_coll(c, channel->devCollectives+channel->collFifoHead, tid, comm); \
+    load_coll(c, channel->collectives+channel->collFifoHead, tid, comm); \
   } \
   while (1) { \
-    if (tid < c->args.nThreads) { \
+    if (tid < c->args.common.nThreads) { \
       if (c->funcIndex == fIndex) { \
         coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
       } else { \
@@ -86,7 +86,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
       \
       /* Load next collective operation*/ \
       c = &localColl; /* for bid 0 */ \
-      load_coll(c, channel->devCollectives+nextIndex, tid, comm); \
+      load_coll(c, channel->collectives+nextIndex, tid, comm); \
     } \
   }
 #else
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index d10f11e..119cd36 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -56,6 +56,7 @@ __device__ volatile uint64_t* ncclShmem;

 // Must be consistent with ncclFunc_t
 #define NCCL_FUNCS() { \
+  NCCL_COLL_NAME(ncclSendRecv, copy, i8),\
   NCCL_FUNCS2B(ncclBroadcast), \
   NCCL_FUNCS2A(ncclReduce), \
   NCCL_FUNCS2B(ncclAllGather), \
@@ -63,11 +64,12 @@ __device__ volatile uint64_t* ncclShmem;
   NCCL_FUNCS2A(ncclAllReduce) }

 // Must be consistent with the ncclFuncSet enum
-__device__ ncclKern_t ncclFuncs[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
+__device__ ncclKern_t ncclFuncs[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
 // Don't try to initialize the host shadow copy of this device-side global
 // variable. There is no host pointer to a device-side function, which
 // confuses clang. This will be fixed in the next clang release.
 #if __CUDA_ARCH__
+  NCCL_COLL_NAME(ncclSendRecv, copy, i8),
   NCCL_FUNCS2B(ncclBroadcast),
   NCCL_FUNCS2A(ncclReduce),
   NCCL_FUNCS2B(ncclAllGather),
diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh
index 4413213..97dc0ae 100755
--- a/src/collectives/device/gen_rules.sh
+++ b/src/collectives/device/gen_rules.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -9,7 +9,7 @@ dir=$1

 targets="GENOBJS := \\\\\n"

-for base in all_reduce all_gather broadcast reduce reduce_scatter; do
+for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
   opn=0
   for op in sum prod min max; do
     dtn=0
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index c1067bf..bbbde25 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -32,7 +32,7 @@
   } while (0)

 // Implementation of primitive types
-template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, class FUNC>
+template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, int DIRECT, class FUNC>
 class ncclPrimitives {
  private:
   const int tid;
@@ -70,10 +70,18 @@ class ncclPrimitives {
   inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }

   inline __device__ void barrier() {
-    asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+    if (NSEND>NRECV) {
+      asm volatile ("bar.sync 1, %0;" :: "r"(nthreads+WARP_SIZE));
+    } else {
+      asm volatile ("bar.sync 2, %0;" :: "r"(nthreads+WARP_SIZE));
+    }
   }
   inline __device__ void subBarrier() {
-    asm volatile ("bar.sync 2, %0;" :: "r"(nthreads-WARP_SIZE));
+    if (NSEND>NRECV) {
+      asm volatile ("bar.sync 3, %0;" :: "r"(nthreads));
+    } else {
+      asm volatile ("bar.sync 4, %0;" :: "r"(nthreads));
+    }
   }

   uint32_t mismatch = 0;
@@ -183,7 +191,7 @@ class ncclPrimitives {
       for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
     }

-    bool syncThread = tid >= nthreads-WARP_SIZE;
+    bool syncThread = tid >= nthreads;

     #pragma unroll
     for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
@@ -196,10 +204,10 @@ class ncclPrimitives {
         if (DIRECTRECV && recvDirectBuff[0]) {
           // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
           if (SEND) {
-            ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads-WARP_SIZE, 1, srcs, nsend, dsts+1, realSize);
+            ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
           }
         } else {
-          ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads-WARP_SIZE, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
+          ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
         }
       }
     }
@@ -223,11 +231,11 @@ class ncclPrimitives {
   }

   __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
-    recvBuff[i] = (const T*)conn->buff;
+    recvBuff[i] = (const T*)conn->buffs[NCCL_PROTO_SIMPLE];
     recvStep[i] = conn->step;
     recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
     recvDirectBuff[i] = NULL;
-    if (directBuff && (conn->direct & NCCL_DIRECT_GPU)) {
+    if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
       recvDirectBuff[i] = directBuff;
       if (tid == 0) *conn->ptrExchange = directBuff;
     }
@@ -240,7 +248,7 @@ class ncclPrimitives {
       recvConnTailPtr = recvConn->tail;
       recvConnTailCache = *recvConnTailPtr;
     }
-    if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+    if (tid >= nthreads && wid < nrecv) {
       recvConnHeadPtr = recvConn->head;
       // Return credits in case we rounded up.
       *recvConnHeadPtr = recvConnHead;
@@ -249,12 +257,12 @@ class ncclPrimitives {
     }
   }

-  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
-    sendBuff[i] = (T*)conn->buff;
+  __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
+    sendBuff[i] = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
     sendStep[i] = conn->step;
     sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
     sendDirectBuff[i] = NULL;
-    if (directBuff && (conn->direct & NCCL_DIRECT_GPU)) {
+    if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) {
       void* volatile* ptr = conn->ptrExchange;
       while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
       barrier();
@@ -271,13 +279,13 @@ class ncclPrimitives {
       sendConnFifoPtr = sendConn->fifo;
       *(sendConn->opCountLoc) = opCount;
     }
-    if (tid >= nthreads-WARP_SIZE && wid<nsend) {
+    if (tid >= nthreads && wid<nsend) {
       sendConnTailPtr = sendConn->tail;
     }
   }

   __device__ __forceinline__ void saveRecvSync() {
-    if (tid >= nthreads-WARP_SIZE && wid < nrecv) {
+    if (tid >= nthreads && wid < nrecv) {
       recvConn->step = recvConnHead;
       *(recvConn->opCountLoc) = opCount+1;
       __threadfence_system();
@@ -300,7 +308,7 @@ class ncclPrimitives {
     barrier();

     for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff);
-    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff);
+    for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
     loadRecvSync();
     loadSendSync();
   }
diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h
index f919493..5518061 100644
--- a/src/collectives/device/prims_ll.h
+++ b/src/collectives/device/prims_ll.h
@@ -1,9 +1,16 @@
+/*************************************************************************
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
 template <typename T, class FUNC, int NRECV, int NSEND>
 class ncclLLPrimitives {
  private:
   const int tid;
   const int nthreads;
   const int wid;
+  const int stepLines;
   int nrecv = 0;
   int nsend = 0;
   struct ncclConnInfo* recvConn = NULL;
@@ -22,8 +29,8 @@ class ncclLLPrimitives {
   union ncclLLFifoLine* sendBuff[NSEND];
   struct ncclDevComm* comm;

-  inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
-  inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+  inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepLines; }
+  inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepLines; }
   inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
   inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
   inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
@@ -68,7 +75,7 @@ class ncclLLPrimitives {
       if (checkAbort(wid, 1)) break;
     }
     if (sendConnFifoPtr) {
-      int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes;
+      int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes;
       sendConnFifoPtr[sendConnHead%NCCL_STEPS] = size;
     }
     sendConnHead += 1;
@@ -88,7 +95,7 @@ class ncclLLPrimitives {
     // LL Cleanup : write all flags in the slice to make sure we don't have
     // data corruption when flag loops over.
     if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
-      for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
+      for (int o = offset; o<stepLines; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
     }
     sendStep[i]++;
   }
@@ -164,7 +171,7 @@ class ncclLLPrimitives {
   }

   __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
-    recvBuff[i] = conn->llBuff;
+    recvBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL];
     recvStep[i] = conn->step;
     if (wid == i) recvConn = conn;
     nrecv++;
@@ -179,7 +186,7 @@ class ncclLLPrimitives {
   }

   __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
-    sendBuff[i] = conn->llBuff;
+    sendBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL];
     sendStep[i] = conn->step;
     if (wid == i) sendConn = conn;
     nsend++;
@@ -212,8 +219,8 @@ class ncclLLPrimitives {

  public:
   __device__ __forceinline__
-  ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
-    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), opCount(opCount) {
+  ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepLines, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount)
+    : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepLines(stepLines), opCount(opCount) {
     // Make sure step is updated before we read it.
     barrier();
diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h
index 40a8cff..f445e0d 100644
--- a/src/collectives/device/prims_ll128.h
+++ b/src/collectives/device/prims_ll128.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* * See LICENSE.txt for license information ************************************************************************/ @@ -14,6 +14,7 @@ class ncclLL128Primitives { const int tid; const int nthreads; const int wid; + const int stepSize; const int warp; const bool flagThread; int nrecv = 0; @@ -38,8 +39,8 @@ class ncclLL128Primitives { volatile uint64_t* shmem; - inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; } - inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; } + inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } + inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; } @@ -47,9 +48,9 @@ class ncclLL128Primitives { inline __device__ void barrier() { if (NSEND>NRECV) { - asm volatile ("bar.sync 2, %0;" :: "r"(nthreads)); + asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); } else { - asm volatile ("bar.sync 3, %0;" :: "r"(nthreads)); + asm volatile ("bar.sync 2, %0;" :: "r"(nthreads)); } } @@ -309,7 +310,7 @@ class ncclLL128Primitives { } __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { - recvBuff[i] = conn->ll128Buff; + recvBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128]; recvStep[i] = conn->step; if (wid == i) recvConn = conn; nrecv++; @@ -324,7 +325,7 @@ class ncclLL128Primitives { } __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { - sendBuff[i] = conn->ll128Buff; + sendBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128]; sendStep[i] = conn->step; if (wid == i) sendConn = conn; nsend++; @@ -363,8 +364,8 @@ class ncclLL128Primitives { public: __device__ __forceinline__ - ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) - : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), opCount(opCount), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) { + ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), stepSize(stepSize), opCount(opCount), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) { // Make sure step is updated before we read it. barrier(); diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index e36613f..19b090e 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -11,29 +11,30 @@ template<int UNROLL, class FUNC, typename T> __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads-WARP_SIZE; - const int bid = args->bid; + const int nthreads = args->coll.nThreads-WARP_SIZE; + const int bid = args->coll.bid; + const int nChannels = args->coll.nChannels; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; - const ssize_t size = args->N; - const int nranks = comm->nRanks; - const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS); const int chunkSize = stepSize * REDUCE_CHUNKSTEPS; - const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; + const int nranks = comm->nRanks; + const ssize_t loopSize = nChannels*(ssize_t)chunkSize; + const ssize_t size = args->coll.count; const int rank = ring->devUserRanks[0]; const int prevRank = ring->devUserRanks[nranks-1]; - const int root = args->root; + const int root = args->coll.root; // Compute pointers - const T * __restrict__ thisInput = (const T*)args->ThisInput; - T * __restrict__ thisOutput = (T*)args->ThisOutput; + const T * __restrict__ thisInput = (const T*)args->sendbuff; + T * __restrict__ thisOutput = (T*)args->recvbuff; - ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC> - prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); + ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, 0, FUNC> + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels)); ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); ssize_t offset = gridOffset + bid*realChunkSize; int nelem = min(realChunkSize, size-offset); @@ -56,30 +57,30 @@ __device__ void ncclReduceCollNetKernel(struct CollectiveArgs* args) { } template<int UNUSED, class FUNC, typename T> __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int bid = args->bid; - const int nthreads = args->nThreads; + const int nthreads = args->coll.nThreads; + const int bid = args->coll.bid; + const int nChannels = args->coll.nChannels; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; - - ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); - - const ssize_t size = args->N; - const int rank = comm->rank; + const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS); + ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T); const int nranks = comm->nRanks; + const ssize_t loopSize = nChannels*chunkSize; + const ssize_t size = args->coll.count; + const int rank = comm->rank; const int prevRank = ring->devUserRanks[nranks-1]; - const int root = args->root; + const int root = args->coll.root; - ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t 
loopSize = args->nChannels*chunkSize; + ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount); // Compute pointers - const T * __restrict__ thisInput = (const T*)args->ThisInput; - T * __restrict__ thisOutput = (T*)args->ThisOutput; + const T * __restrict__ thisInput = (const T*)args->sendbuff; + T * __restrict__ thisOutput = (T*)args->recvbuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { - chunkSize = args->lastChunkSize; + chunkSize = args->coll.lastChunkSize; } ssize_t offset = gridOffset + bid*chunkSize; @@ -104,31 +105,30 @@ __device__ void ncclReduceCollNetLLKernel(struct CollectiveArgs* args) { } template<int UNUSED, class FUNC, typename T> __device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int bid = args->bid; - const int nthreads = args->nThreads; + const int nthreads = args->coll.nThreads; + const int bid = args->coll.bid; + const int nChannels = args->coll.nChannels; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; - - ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); - - const ssize_t size = args->N; - const int rank = comm->rank; + const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS); + ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T)); + const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); const int nranks = comm->nRanks; + const ssize_t loopSize = nChannels*chunkSize; + const ssize_t size = args->coll.count; + const int rank = comm->rank; const int prevRank = ring->devUserRanks[nranks-1]; - const int root = args->root; - - ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); - const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + const int root = args->coll.root; - const ssize_t loopSize = args->nChannels*chunkSize; + ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount); // Compute pointers - const T * __restrict__ thisInput = (const T*)args->ThisInput; - T * __restrict__ thisOutput = (T*)args->ThisOutput; + const T * __restrict__ thisInput = (const T*)args->sendbuff; + T * __restrict__ thisOutput = (T*)args->recvbuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize); + chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize); ssize_t offset = gridOffset + bid*chunkSize; int nelem = min(chunkSize, size-offset); diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index 0b0ae81..a0a9cc0 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -11,26 +11,27 @@ template<int UNROLL, class FUNC, typename T> __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads-WARP_SIZE; - const int bid = args->bid; + const int nthreads = args->coll.nThreads-WARP_SIZE; + const int bid = args->coll.bid; + const int nChannels = args->coll.nChannels; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; - const ssize_t size = args->N; - const int nranks = comm->nRanks; - const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS); const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS; - const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; + const int nranks = comm->nRanks; + const ssize_t loopSize = nChannels*(ssize_t)chunkSize; + const ssize_t size = args->coll.count; // Compute pointers - const T * __restrict__ thisInput = (const T*)args->ThisInput; - T * __restrict__ thisOutput = (T*)args->ThisOutput; + const T * __restrict__ thisInput = (const T*)args->sendbuff; + T * __restrict__ thisOutput = (T*)args->recvbuff; - ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC> - prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); + ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, 0, FUNC> + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nChannels)); ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); ssize_t chunkOffset = gridOffset + bid*realChunkSize; @@ -70,27 +71,27 @@ __device__ void ncclReduceScatterCollNetKernel(struct CollectiveArgs* args) { } template<int UNUSED, class FUNC, typename T> __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int bid = args->bid; - const int nthreads = args->nThreads; + const int nthreads = args->coll.nThreads; + const int bid = args->coll.bid; + const int nChannels = args->coll.nChannels; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; - - ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); - - const ssize_t size = args->N; - //const int rank = comm->rank; + const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS); + ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T); const int nranks = comm->nRanks; - ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nChannels*chunkSize; + const ssize_t loopSize = nChannels*chunkSize; + const ssize_t size = args->coll.count; + + ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepLines, channel, comm, args->opCount); // Compute pointers - const T * __restrict__ thisInput = (const T*)args->ThisInput; - T * __restrict__ thisOutput 
= (T*)args->ThisOutput; + const T * __restrict__ thisInput = (const T*)args->sendbuff; + T * __restrict__ thisOutput = (T*)args->recvbuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { - chunkSize = args->lastChunkSize; + chunkSize = args->coll.lastChunkSize; } ssize_t chunkOffset = gridOffset + bid*chunkSize; @@ -132,29 +133,28 @@ __device__ void ncclReduceScatterCollNetLLKernel(struct CollectiveArgs* args) { template<int UNUSED, class FUNC, typename T> __device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int bid = args->bid; - const int nthreads = args->nThreads; + const int nthreads = args->coll.nThreads; + const int bid = args->coll.bid; + const int nChannels = args->coll.nChannels; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; - - ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); - - const ssize_t size = args->N; - //const int rank = comm->rank; - const int nranks = comm->nRanks; - ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS); + ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T)); // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2; + const int nranks = comm->nRanks; + const ssize_t loopSize = nChannels*chunkSize; + const ssize_t size = args->coll.count; - const ssize_t loopSize = args->nChannels*chunkSize; + ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm, args->opCount); // Compute pointers - const T * __restrict__ thisInput = (const T*)args->ThisInput; - T * __restrict__ thisOutput = (T*)args->ThisOutput; + const T * __restrict__ thisInput = (const T*)args->sendbuff; + T * __restrict__ thisOutput = (T*)args->recvbuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize); + chunkSize = min(DIVUP(size-gridOffset, nChannels*minChunkSize)*minChunkSize, chunkSize); ssize_t chunkOffset = gridOffset + bid*chunkSize; diff --git a/src/collectives/device/sendrecv.cu b/src/collectives/device/sendrecv.cu new file mode 100644 index 0000000..34e7adf --- /dev/null +++ b/src/collectives/device/sendrecv.cu @@ -0,0 +1,14 @@ +/************************************************************************* + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "sendrecv.h" +#include "common.h" +#include "collectives.h" + +#if NCCL_OP == 0 && NCCL_TYPE == 0 +IMPL_COLL_FUNC(ncclSendRecv, copy, FuncSum, i8, int8_t); +IMPL_COLL_KERN(ncclSendRecv, copy, FuncSum, i8, int8_t, 0); +#endif diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h new file mode 100644 index 0000000..2fc64af --- /dev/null +++ b/src/collectives/device/sendrecv.h @@ -0,0 +1,81 @@ +/************************************************************************* + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "devcomm.h" +#include "primitives.h" +#include "collectives.h" + +template<int UNROLL, class FUNC, typename T> +__device__ void ncclSendRecvKernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int nthreads = args->p2p.nThreads-2*WARP_SIZE; + + // Compute pointers + const T* sendbuff = (const T*)args->sendbuff; + T* recvbuff = (T*)args->recvbuff; + + if (args->p2p.delta < 0 ) return; // No-op + + if (args->p2p.delta == 0) { + if (tid < nthreads && sendbuff != recvbuff) { + // local copy : ReduceOrCopyMulti takes an int as number of elements, + // so we split it in blocks of 1G elements. + int blockSize = 1<<30; + for (size_t offset=0; offset<args->p2p.sendCount; offset += blockSize) { + size_t remaining = args->p2p.sendCount - offset; + if (remaining < blockSize) blockSize = remaining; + ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, blockSize); + sendbuff += blockSize; recvbuff += blockSize; + } + } + return; + } + + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + + const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS)/SENDRECV_SLICEFACTOR; + + int nthreadsSplit = nthreads/2; + // We set NRECV or NSEND to 2 to use different barriers in primitives for the send threads and + // receive threads, but then we define all peers to -1 since sender threads don't receive and + // receive threads don't send. 
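A worked example of the peer arithmetic used just below (a sketch with illustrative values, not part of the patch): for a given delta, every rank sends to (rank+delta)%nRanks and receives from (rank-delta+nRanks)%nRanks, so the two sides of each pair always agree.

  // Sketch: peer selection for one delta, assuming nRanks = 4 and delta = 1.
  #include <cstdio>
  int main() {
    const int nRanks = 4, delta = 1;
    for (int rank = 0; rank < nRanks; rank++) {
      int sendPeer = (rank + delta) % nRanks;           // send path below
      int recvPeer = (rank - delta + nRanks) % nRanks;  // recv path below
      printf("rank %d: send->%d recv<-%d\n", rank, sendPeer, recvPeer);
    }
    return 0;
  }
  // Prints: rank 0: send->1 recv<-3, rank 1: send->2 recv<-0, and so on.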
+ int peerNone[2] = {-1,-1}; + + if (tid < nthreadsSplit + WARP_SIZE ) { + const ssize_t sendSize = args->p2p.sendCount; + if (sendSize < 0) return; + + int peer = (comm->rank+(int)args->p2p.delta)%comm->nRanks; + ncclPrimitives<UNROLL, 1, 1, T, 2, 1, 1, FUNC> + prims(tid, nthreadsSplit, peerNone, &peer, recvbuff, stepSize*4, channel, comm, args->opCount); + + if (sendSize == 0) { + prims.send(sendbuff, 0); + } else for (ssize_t offset = 0; offset < sendSize; offset += stepSize) { + int realChunkSize = min(stepSize, sendSize-offset); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + int nelem = min(realChunkSize, sendSize-offset); + prims.directSend(sendbuff+offset, offset, nelem); + } + } else { + const ssize_t recvSize = args->p2p.recvCount; + if (recvSize < 0) return; + + int peer = (comm->rank-(int)args->p2p.delta+comm->nRanks)%comm->nRanks; + ncclPrimitives<UNROLL, 1, 1, T, 1, 2, 1, FUNC> + prims(tid-nthreadsSplit-WARP_SIZE, nthreads-nthreadsSplit, &peer, peerNone, recvbuff, stepSize*4, channel, comm, args->opCount); + + if (recvSize == 0) { + prims.recv(recvbuff, 0); + } else for (ssize_t offset = 0; offset < recvSize; offset += stepSize) { + int realChunkSize = min(stepSize, recvSize-offset); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + int nelem = min(realChunkSize, recvSize-offset); + prims.directRecv(recvbuff+offset, offset, nelem); + } + } +} diff --git a/src/collectives/sendrecv.cc b/src/collectives/sendrecv.cc new file mode 100644 index 0000000..2e32875 --- /dev/null +++ b/src/collectives/sendrecv.cc @@ -0,0 +1,37 @@ +/************************************************************************* + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "enqueue.h" +#include "collectives.h" +#include "argcheck.h" // Need some checks here since we access comm + +NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream) { + struct ncclInfo info = { ncclCollSendRecv, "Send", + sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */ + 1, 1 }; + ncclResult_t ret; + NCCLCHECK(ncclGroupStart()); + ret = ncclEnqueueCheck(&info); + NCCLCHECK(ncclGroupEnd()); + return ret; +} + +NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream) { + struct ncclInfo info = { ncclCollSendRecv, "Recv", + NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ + 1, 1 }; + ncclResult_t ret; + NCCLCHECK(ncclGroupStart()); + ret = ncclEnqueueCheck(&info); + NCCLCHECK(ncclGroupEnd()); + return ret; +} diff --git a/src/debug.cc b/src/debug.cc index b2fc03c..3b99201 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -17,7 +17,7 @@ pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; void ncclDebugInit() { pthread_mutex_lock(&ncclDebugLock); - if (ncclDebugLevel != -1) return; + if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } const char* nccl_debug = getenv("NCCL_DEBUG"); if (nccl_debug == NULL) { ncclDebugLevel = NCCL_LOG_NONE; @@ -60,6 
+60,8 @@ void ncclDebugInit() { mask = NCCL_GRAPH; } else if (strcasecmp(subsys, "TUNING") == 0) { mask = NCCL_TUNING; + } else if (strcasecmp(subsys, "ENV") == 0) { + mask = NCCL_ENV; } else if (strcasecmp(subsys, "ALL") == 0) { mask = NCCL_ALL; } @@ -125,27 +127,32 @@ void ncclDebugInit() { void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) { if (ncclDebugLevel == -1) ncclDebugInit(); if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; } + if (ncclDebugLevel < level) return; + // Gather the rank information. This can take > 1us so we want to make sure + // we only do it when needed. char hostname[1024]; getHostName(hostname, 1024, '.'); int cudaDev; cudaGetDevice(&cudaDev); + int pid = getpid(); + int tid = gettid(); char buffer[1024]; size_t len = 0; pthread_mutex_lock(&ncclDebugLock); - if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN) + if (level == NCCL_LOG_WARN) len = snprintf(buffer, sizeof(buffer), - "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line); - else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask)) + "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, pid, tid, cudaDev, filefunc, line); + else if (level == NCCL_LOG_INFO && (flags & ncclDebugMask)) len = snprintf(buffer, sizeof(buffer), - "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); + "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev); #ifdef ENABLE_TRACE - else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) { + else if (level == NCCL_LOG_TRACE && (flags & ncclDebugMask)) { auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000; len = snprintf(buffer, sizeof(buffer), - "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line); + "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, pid, tid, cudaDev, timestamp, filefunc, line); } #endif if (len) { @@ -157,11 +164,4 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file fflush(ncclDebugFile); } pthread_mutex_unlock(&ncclDebugLock); - - // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort() - if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) { - fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", - hostname, getpid(), gettid(), cudaDev, filefunc, line); - abort(); - } } diff --git a/src/enqueue.cc b/src/enqueue.cc index 92f3467..2aeaf65 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -54,7 +54,8 @@ NCCL_FUNCS3B(coll, copy) // Must be consistent with the ncclFuncSet enum -static void* const ncclKerns[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { +static void* const ncclKerns[1+NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { + (void*)NCCL_KERN_NAME(ncclSendRecv, copy, i8), NCCL_FUNCS2B(ncclBroadcast), NCCL_FUNCS2A(ncclReduce), NCCL_FUNCS2B(ncclAllGather), @@ -87,11 +88,29 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par } ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) { - params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels); + // Only launch blocks where we have work to do. 
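To make the sizing rule below concrete (hypothetical values, not from the patch): with p2pnChannels = 8 and work queued only on channels 0 and 3, the loop leaves gridDim.x at 4; channels 1 and 2 are then padded with the delta = -1 no-op added further down, so every launched block finds an operation to dequeue.

  // Standalone sketch of the gridDim sizing rule below.
  int collCount[8] = {1, 0, 0, 2, 0, 0, 0, 0};  // hypothetical queue depths
  int gridDimX = 0;
  for (int c = 0; c < 8; c++)
    if (collCount[c]) gridDimX = c + 1;  // ends at 4: highest busy channel + 1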
+ for (int c=0; c<comm->p2pnChannels; c++) { + if (comm->channels[c].collCount) params->gridDim.x = c+1; + } - // Set active = 2 for the last operation - for (int r=0; r<params->gridDim.x; r++) { - struct ncclChannel* channel = comm->channels+r; + // Set active = 2 for the last operation and add a no-op on empty channels (p2p case). + for (int c=0; c<params->gridDim.x; c++) { + struct ncclChannel* channel = comm->channels+c; + if (channel->collCount == 0) { + int opIndex = channel->collFifoTail; + struct ncclColl* c = channel->collectives+opIndex; + volatile uint8_t* activePtr = (volatile uint8_t*)&c->active; + while (activePtr[0] != 0) sched_yield(); + + c->args.p2p.delta = -1; // no-op + c->funcIndex = FUNC_INDEX_P2P; + c->args.comm = comm->devComm; + c->active = 1; + opIndex = (opIndex+1)%NCCL_MAX_OPS; + c->nextIndex = opIndex; + channel->collFifoTail = opIndex; + channel->collCount++; + } channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active = 2; } @@ -146,8 +165,8 @@ ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { } ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) { - if (comm->nRanks == 1) return ncclSuccess; struct cudaLaunchParams* params = comm->myParams; + if (params->gridDim.x == 0) return ncclSuccess; NCCLCHECK(setupLaunch(comm, params)); @@ -166,21 +185,22 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) { params->stream = comm->userStream; } - int isLast = 0; - NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); - - if (isLast) { - if (comm->launchMode == ncclComm::GROUP) { + if (comm->launchMode == ncclComm::GROUP) { + int isLast = 0; + NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); + if (isLast) { // I'm the last. Launch all operations. NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode)); + NCCLCHECK(ncclCpuBarrierLast(comm)); } - NCCLCHECK(ncclCpuBarrierLast(comm)); } return ncclSuccess; } ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { - if (comm->nRanks == 1) return ncclSuccess; + struct cudaLaunchParams *params = comm->myParams; + if (params->gridDim.x == 0) return ncclSuccess; + // We can't print the CG mode before the first barrier happened. if (comm->rank == 0 && *comm->intraCGMode & 0x10) { *comm->intraCGMode ^= 0x10; @@ -190,15 +210,16 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : ""); } - NCCLCHECK(ncclCpuBarrierOut(comm)); - struct cudaLaunchParams *params = comm->myParams; if (comm->launchMode == ncclComm::PARALLEL) { CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); + } else { + NCCLCHECK(ncclCpuBarrierOut(comm)); } + // Start the network proxies as soon as the kernel has been launched. We can't // perform any CUDA call between the two or having a cudaFree between the CUDA - // launch and the transportStartProxy call could cause a deadlock. + // launch and the ncclProxyStart call could cause a deadlock. // Also, starting the proxies after the CUDA launch seems to be better for // performance (latency). 
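The constraint described above boils down to the following ordering (a sketch reusing the calls from this hunk, not the full launch path):

  // Launch first, then start the proxies, with no CUDA call in between.
  CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim,
                             params->args, params->sharedMem, params->stream));
  // No CUDA API here: a concurrent cudaFree could deadlock against the
  // kernel we just launched, and delaying the proxies costs latency.
  NCCLCHECK(ncclProxyStart(comm));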
for (int r=0; r<params->gridDim.x; r++) { @@ -208,7 +229,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { } params->gridDim.x = params->blockDim.x = 0; comm->lastOpCount = comm->opCount; - NCCLCHECK(transportStartProxy(comm)); + NCCLCHECK(ncclProxyStart(comm)); return ncclSuccess; } @@ -313,23 +334,32 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { } static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) { + coll->args.sendbuff = info->sendbuff; + coll->args.recvbuff = info->recvbuff; + coll->args.comm = info->comm->devComm; + coll->args.opCount = info->comm->opCount; + + if (info->coll == ncclCollSendRecv) { + coll->args.p2p.sendCount = info->sendbytes; + coll->args.p2p.recvCount = info->recvbytes; + coll->args.p2p.delta = info->delta; + coll->funcIndex = FUNC_INDEX_P2P; + coll->args.p2p.nThreads = info->nThreads = info->comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]+2*WARP_SIZE; + return ncclSuccess; + } // Set nstepsPerLoop and nchunksPerLoop NCCLCHECK(getAlgoInfo(info)); NCCLCHECK(getPatternInfo(info)); NCCLCHECK(getLoopInfo(info)); - coll->args.root = info->root; - coll->args.N = info->count; - coll->args.ThisInput = info->sendbuff; - coll->args.ThisOutput = info->recvbuff; - coll->args.comm = info->comm->devComm; - coll->args.opCount = info->comm->opCount; - coll->args.nChannels = info->nChannels; - coll->args.nThreads = info->nThreads; + coll->args.coll.root = info->root; + coll->args.coll.count = info->count; + coll->args.coll.nChannels = info->nChannels; + coll->args.coll.nThreads = info->nThreads; coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol); - int stepSize = (info->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : info->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS; + int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS; int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? 
info->sliceSteps : 1; int chunkSize = stepSize*chunkSteps; @@ -343,25 +373,28 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2; } // Use lastChunkSize as chunkSize - coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); + coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) { // Optimize chunkSize / nSteps while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*16 && chunkSize > 131072) chunkSize /= 2; while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*4 && chunkSize > 65536) chunkSize /= 2; while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth && chunkSize > 32768) chunkSize /= 2; // Use lastChunkSize as chunkSize - coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); + coll->args.coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->protocol == NCCL_PROTO_LL) { - int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t); + const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine); const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; - coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop); - ALIGN_SIZE(coll->args.lastChunkSize, info->nThreads*sizeof(uint64_t)); - coll->args.lastChunkSize /= ncclTypeSize(info->datatype); + coll->args.coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop); + ALIGN_SIZE(coll->args.coll.lastChunkSize, info->nThreads*sizeof(uint64_t)); + coll->args.coll.lastChunkSize /= ncclTypeSize(info->datatype); } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) { - int nstepsInter = 1+log2i(info->comm->nNodes); - while (info->nBytes / (info->nChannels*chunkSize) < nstepsInter*4 && chunkSize > 32768) chunkSize /= 2; + int nNodes = info->comm->nNodes; + float ppn = info->comm->nRanks / (float)nNodes; + float nstepsLL128 = 1+log2i(nNodes) + 0.1*ppn; + while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2; + while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2; // Use lastChunkSize as chunkSize - coll->args.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype)); + coll->args.coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype)); } // Compute nSteps for proxies @@ -383,8 +416,19 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo return ncclSuccess; } -static ncclResult_t saveKernel(struct ncclInfo* info) { - if (info->comm->nRanks == 1) { +static ncclResult_t checkSetStream(struct ncclInfo* info) { + if (info->comm->userStreamSet == false) { + info->comm->userStream = info->stream; + info->comm->userStreamSet = true; + } else if (info->stream != info->comm->userStream) { + WARN("Error : mixing different streams within a group call is not supported."); + return ncclInvalidUsage; + } + return ncclSuccess; +} + +ncclResult_t ncclSaveKernel(struct ncclInfo* info) { + if (info->comm->nRanks == 1 && info->coll 
!= ncclCollSendRecv) { if (info->sendbuff != info->recvbuff) CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream)); return ncclSuccess; @@ -395,22 +439,18 @@ static ncclResult_t saveKernel(struct ncclInfo* info) { memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs)); NCCLCHECK(computeColl(info, &coll, &proxyArgs)); - info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads); - if (info->comm->userStreamSet == false) { - info->comm->userStream = info->stream; - info->comm->userStreamSet = true; - } else if (info->stream != info->comm->userStream) { - WARN("Error : mixing different streams within a group call is not supported."); - return ncclInvalidUsage; - } + info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads); + int nChannels = info->coll == ncclCollSendRecv ? 1 : coll.args.coll.nChannels; int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1; - for (int bid=0; bid<coll.args.nChannels*nSubChannels; bid++) { - int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels; + + for (int bid=0; bid<nChannels*nSubChannels; bid++) { + int channelId = (info->coll == ncclCollSendRecv) ? info->channelId : + info->comm->myParams->gridDim.x % info->comm->nChannels; struct ncclChannel* channel = info->comm->channels+channelId; if (channel->collCount == NCCL_MAX_OPS) { - WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS); + WARN("Too many aggregated operations on channel %d (%d max)", channel->id, NCCL_MAX_OPS); return ncclInvalidUsage; } @@ -420,18 +460,22 @@ static ncclResult_t saveKernel(struct ncclInfo* info) { if (nSubChannels == 2) { info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown; } - NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks)); + if (info->coll == ncclCollSendRecv) { + info->comm->myParams->gridDim.x = std::max<unsigned>(info->comm->myParams->gridDim.x, channelId+1); + NCCLCHECK(ncclProxySaveP2p(info, channel)); + } else { + NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks)); + } info->comm->myParams->gridDim.x++; - int opIndex = channel->collFifoTail; struct ncclColl* c = channel->collectives+opIndex; volatile uint8_t* activePtr = (volatile uint8_t*)&c->active; while (activePtr[0] != 0) sched_yield(); memcpy(c, &coll, sizeof(struct ncclColl)); + if (info->coll != ncclCollSendRecv) c->args.coll.bid = bid % coll.args.coll.nChannels; - c->args.bid = bid % coll.args.nChannels; c->active = 1; opIndex = (opIndex+1)%NCCL_MAX_OPS; c->nextIndex = opIndex; @@ -442,35 +486,82 @@ static ncclResult_t saveKernel(struct ncclInfo* info) { return ncclSuccess; } +// Save p2p operations in comm->p2plist. 
Operations will be posted to channels +// during ncclGroupEnd() +ncclResult_t ncclSaveP2p(struct ncclInfo* info) { + struct ncclComm* comm = info->comm; + struct ncclP2Plist* p2plist = &comm->p2plist; + int peer = info->root; + p2plist->count++; + ssize_t nBytes = info->count*ncclTypeSize(info->datatype); + if (info->recvbuff == NULL) { + if (peer != comm->rank) { + int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks; + for (int c=0; c<comm->p2pnChannelsPerPeer; c++) { + int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; + if (comm->channels[channelId].peers[peer].send.connected == 0) { + p2plist->connect.send[channelId*comm->nRanks+p2plist->connect.nsend[channelId]++] = peer; + } + } + } + p2plist->peerlist[info->root].sendbytes = nBytes; + p2plist->peerlist[info->root].sendbuff = info->sendbuff; + } else { + if (peer != comm->rank) { + int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks; + for (int c=0; c<comm->p2pnChannelsPerPeer; c++) { + int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; + if (comm->channels[channelId].peers[peer].recv.connected == 0) { + p2plist->connect.recv[channelId*comm->nRanks+p2plist->connect.nrecv[channelId]++] = peer; + } + } + } + p2plist->peerlist[info->root].recvbytes = nBytes; + p2plist->peerlist[info->root].recvbuff = info->recvbuff; + } + return ncclSuccess; +} ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { - if (info->comm == NULL) return ncclInvalidArgument; - - INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", - info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, - info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); - // Launch asynchronously if needed if (ncclAsyncMode()) { ncclResult_t ret = ncclSuccess; int savedDev = -1; + // Check arguments + NCCLCHECK(PtrCheck(info->comm, info->opName, "comm")); if (info->comm->checkPointers) { CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end); CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end); } - // Check arguments NCCLCHECKGOTO(ArgsCheck(info), ret, end); // Always register comm even in case of error to make sure ncclGroupEnd // cleans it up. 
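A worked example of the delta math in ncclSaveP2p above (illustrative values, not part of the patch): with nRanks = 8, the sender's delta for a pair equals the receiver's delta on the other end, so both sides select the same set of channels.

  // Sketch: rank 2 exchanging with peer 5 out of 8 ranks.
  const int nRanks = 8, rank = 2, peer = 5;
  int sendDelta = (nRanks - (rank - peer)) % nRanks;  // = 3 (send side)
  int recvDelta = (nRanks + (rank - peer)) % nRanks;  // = 5 (recv side)
  // On rank 5 the same formulas give sendDelta = 5 and recvDelta = 3, so
  // rank 2's send (delta 3) pairs with rank 5's recv (delta 3). Each delta
  // is then spread over p2pnChannelsPerPeer channels via
  //   channelId = (delta + p2pChannels[c]) % p2pnChannels;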
NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end); - NCCLCHECKGOTO(saveKernel(info), ret, end); + NCCLCHECKGOTO(checkSetStream(info), ret, end); + + INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", + info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, + info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); + + if (info->coll == ncclCollSendRecv) { //p2p stored separately + NCCLCHECKGOTO(ncclSaveP2p(info), ret, end); + } else { + NCCLCHECKGOTO(ncclSaveKernel(info), ret, end); + } end: if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev)); ncclAsyncErrCheck(ret); return ret; } else { + NCCLCHECK(PtrCheck(info->comm, info->opName, "comm")); NCCLCHECK(ArgsCheck(info)); - NCCLCHECK(saveKernel(info)); + NCCLCHECK(checkSetStream(info)); + + INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", + info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, + info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); + + NCCLCHECK(ncclSaveKernel(info)); NCCLCHECK(ncclBarrierEnqueue(info->comm)); NCCLCHECK(ncclBarrierEnqueueWait(info->comm)); NCCLCHECK(ncclEnqueueEvents(info->comm)); diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 0872ae7..b711874 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -9,6 +9,7 @@ #include "topo.h" #include "comm.h" #include "net.h" +#include "channel.h" // Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths @@ -231,15 +232,16 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE } } } - if (l >= 0) INFO(NCCL_GRAPH, "%s set from environment to %s", levelEnv, topoPathTypeStr[l]); + if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %s", levelEnv, topoPathTypeStr[l]); *level = l >= 0 ? l : -2; } return ncclSuccess; } int ncclTopoUserP2pLevel = -1; -ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p) { +ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read) { *p2p = 0; + *read = 0; // Get GPUs from topology int g1, g2; @@ -254,21 +256,33 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_ // In general, use P2P whenever we can. int p2pLevel = PATH_SYS; + // User override + if (ncclTopoUserP2pLevel == -1) + NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL")); + if (ncclTopoUserP2pLevel != -2) { + p2pLevel = ncclTopoUserP2pLevel; + goto compare; + } + // Don't use P2P through ARM CPUs int arch, vendor, model; NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB; - if (arch == NCCL_TOPO_CPU_ARCH_X86 && - vendor == NCCL_TOPO_CPU_VENDOR_INTEL && - model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB; - - // User override - NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL")); - if (ncclTopoUserP2pLevel != -2) p2pLevel = ncclTopoUserP2pLevel; + if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { + if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB; + else p2pLevel = PATH_PHB; + } +compare: // Compute the PCI distance and compare with the p2pLevel. 
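The comparison below reads naturally if one keeps the path-type ordering in mind; assuming the usual NCCL ordering PATH_NVL < PATH_PIX < PATH_PXB < PATH_PHB < PATH_SYS (smaller means closer), a threshold admits every path at least as close as itself:

  // Sketch under the ordering assumption above (enum values illustrative).
  enum Path { PATH_NVL, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS };
  inline bool p2pOk(Path pathType, Path p2pLevel) { return pathType <= p2pLevel; }
  // p2pOk(PATH_NVL, PATH_PXB) -> true   (NVLink passes a PXB threshold)
  // p2pOk(PATH_PHB, PATH_PXB) -> false  (a through-CPU path is too far)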
if (path->type <= p2pLevel) *p2p = 1; + if (path->type == PATH_NVL) { + struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2; + // Enable P2P Read for Ampere/NVLink only + if ((gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1; + } + return ncclSuccess; } @@ -341,8 +355,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer // Update path when we don't want to / can't use GPU Direct P2P for (int p=0; p<system->nodes[GPU].count; p++) { - int p2p; - NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p)); + int p2p, read; + NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, &read)); if (p2p == 0) { // Divert all traffic through the CPU int cpu; @@ -437,3 +451,69 @@ void ncclTopoFree(struct ncclTopoSystem* system) { for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t); free(system); } + +static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) { + int peer; + struct ncclTopoLinkList* path = NULL; + if (ncclTopoRankToIndex(system, peerRank, &peer) == ncclSuccess) { + // Same rank + if (g == peer) { + *nChannels = -1; + return ncclSuccess; + } + // Local rank + path = system->nodes[GPU].nodes[peer].paths[GPU]+g; + if (path->type == PATH_NVL) { + int sm = system->nodes[GPU].nodes[g].gpu.cudaCompCap; + double nvlWidth = sm < 70 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH; + *nChannels = 2*std::max(1, (int)(path->width / nvlWidth)); + } else { + *nChannels = 2; + } + } else { + // Remote rank, use network + *nChannels = 1; + } + return ncclSuccess; +} + +NCCL_PARAM(MinP2pNChannels, "MIN_P2P_NCHANNELS", 1); +NCCL_PARAM(MaxP2pNChannels, "MAX_P2P_NCHANNELS", MAXCHANNELS); + +static int nextPow2(int v) { + int pow2 = 1; + while (pow2 < v) pow2 <<= 1; + return pow2; +} + +ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { + comm->p2pnChannels = std::min(comm->nChannels, (int)ncclParamMaxP2pNChannels()); + comm->p2pnChannels = std::max(comm->p2pnChannels, (int)ncclParamMinP2pNChannels()); + int minChannels = comm->p2pnChannels; + // We need to loop through all local GPUs to have a global picture + for (int g=0; g<comm->topo->nodes[GPU].count; g++) { + for (int r=0; r<comm->nRanks; r++) { + int nChannels; + NCCLCHECK(ncclTopoGetNchannels(comm->topo, g, r, &nChannels)); + if (nChannels >= 0) minChannels = std::min(minChannels, nChannels); + } + } + + // Round to next pow2 nChannelsPerPeer and nChannels + comm->p2pnChannelsPerPeer = nextPow2(minChannels); + comm->p2pnChannels = nextPow2(comm->p2pnChannels); + + // Init channels that weren't used so far + for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c)); + + // We want to spread channels used when there aren't many and progressively + // fill the whole space of nChannels. To do so we mirror the bits in the + // nChannels space. 
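A worked example of the mirroring below (illustrative: p2pnChannels = 8, so three bits are mirrored): c = 0,1,2,3 map to 0,4,2,6, so with p2pnChannelsPerPeer = 2 each peer's two channels land half the channel space apart.

  // Sketch of the bit-mirror loop below for p2pnChannels = 8.
  int p2pnChannels = 8;
  for (int c = 0; c < 4; c++) {
    int mirror = 0;
    for (int b = 1, mb = p2pnChannels >> 1; b < p2pnChannels; b <<= 1, mb >>= 1)
      if (c & b) mirror |= mb;
    // c = 0 (000) -> 0 (000), c = 1 (001) -> 4 (100),
    // c = 2 (010) -> 2 (010), c = 3 (011) -> 6 (110)
  }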
+ for (int c=0; c<comm->p2pnChannelsPerPeer; c++) { + int mirror = 0; + for (int b=1, mb=(comm->p2pnChannels>>1); b<comm->p2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb; + comm->p2pChannels[c] = mirror; + } + INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); + return ncclSuccess; +} diff --git a/src/graph/search.cc b/src/graph/search.cc index 1bbb7d3..42e1bb9 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -13,13 +13,11 @@ // Initialize system->maxWidth. This is the per-channel (i.e. per-SM) // max speed. static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) { - float nvLinkWidth = gpu->gpu.cudaCompCap > 60 ? VOLTA_NVLINK_WIDTH : PASCAL_NVLINK_WIDTH; float maxWidth = 0.0; for (int i=0; i<system->nodes[type].count; i++) { struct ncclTopoLinkList* path = gpu->paths[type]+i; float width = path->width; if (path->count == 0) continue; - if (path->type == PATH_NVL) width = std::min(nvLinkWidth, width); maxWidth = std::max(maxWidth, width); } return maxWidth; @@ -73,7 +71,7 @@ static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNod struct ncclTopoLink* revLink = NULL; float fwSpeed = link->type == LINK_PCI ? pciSpeed : speed; float revSpeed = 0; - if (link->remNode->type == GPU && start->type != GPU) { + if (link->remNode->type == GPU && link->remNode->gpu.cudaCompCap < 80 && start->type != GPU) { if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink)); revSpeed += fwSpeed/8; } @@ -326,6 +324,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; for (int n=0; n<system->nodes[NET].count; n++) { struct ncclTopoNode* net = system->nodes[NET].nodes+n; + if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); if (net) { @@ -394,8 +393,10 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } if (graph->nChannels == 0 || graph->sameChannels == 0) { if (graph->nChannels == 0) { - // Always try the PCI order first to set a reference - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, NET, n, 0)); + // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long + int t = 1 << 10; + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0)); + if (t == -1) *time = -1; } // Then try the most local GPUs @@ -528,7 +529,7 @@ ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, st } return ncclSuccess; } -ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { +ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels) { int id; NCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &id)); if (graph->id != id) return ncclSuccess; @@ -551,11 +552,12 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc for (int s=0; s<xmlGraph->nSubs; s++) { 
NCCLCHECK(ncclTopoGetChannelFromXml(xmlGraph->subs[s], s, system, graph)); } + *nChannels = xmlGraph->nSubs; return ncclSuccess; } -ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { +ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels) { for (int s=0; s<xmlGraphs->nSubs; s++) { - NCCLCHECK(ncclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph)); + NCCLCHECK(ncclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph, nChannels)); } return ncclSuccess; } @@ -621,7 +623,7 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs return ncclSuccess; } -float speedArray[] = { 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +float speedArray[] = { 42.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDS (sizeof(speedArray)/sizeof(float)) ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { @@ -636,10 +638,13 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph char* str = getenv("NCCL_GRAPH_FILE"); if (str) { + INFO(NCCL_ENV, "NCCL_GRAPH_FILE set by environment to %s", str); struct ncclXml* xml; NCCLCHECK(ncclCalloc(&xml, 1)); NCCLCHECK(ncclTopoGetXmlGraphFromFile(str, xml)); - NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph)); + int nChannels; + NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph, &nChannels)); + INFO(NCCL_GRAPH, "Search %d : %d channels loaded from XML graph", graph->id, nChannels); free(xml); if (graph->nChannels > 0) return ncclSuccess; } @@ -764,6 +769,15 @@ done: graph->typeIntra = graph->typeInter = PATH_SYS; graph->nChannels = 1; } + + if (graph->speedIntra >= 25.0) { + int dupChannels = std::min(graph->nChannels*2, graph->maxChannels); + memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int)); + memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int)); + graph->speedIntra /= 2; + graph->speedInter /= 2; + graph->nChannels = dupChannels; + } return ncclSuccess; } @@ -795,6 +809,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) { char* str = getenv("NCCL_GRAPH_DUMP_FILE"); if (str) { + INFO(NCCL_ENV, "NCCL_GRAPH_DUMP_FILE set by environment to %s", str); struct ncclXml* xml; NCCLCHECK(ncclCalloc(&xml, 1)); NCCLCHECK(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml)); @@ -804,10 +819,17 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru return ncclSuccess; } -ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int rank, int channelId, int* dev) { - int channel = channelId%graph->nChannels; - int ngpus = system->nodes[GPU].count; - int index = graph->intra[channel*ngpus] == rank ? 0 : 1; - *dev = graph->inter[channel*2+index]; +ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* dev) { + if (graph) { + // Honor the net device in the graph + int channel = channelId%graph->nChannels; + int ngpus = system->nodes[GPU].count; + int index = graph->intra[channel*ngpus] == rank ? 
0 : 1; + *dev = graph->inter[channel*2+index]; + } else { + int64_t id; + NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, channelId)); + *dev = id; + } return ncclSuccess; } diff --git a/src/graph/topo.cc b/src/graph/topo.cc index ac6b111..ed79e09 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -504,6 +504,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy NCCLCHECK(ncclCalloc(&xml, 1)); char* xmlTopoFile = getenv("NCCL_TOPO_FILE"); if (xmlTopoFile) { + INFO(NCCL_ENV, "NCCL_TOPO_FILE set by environment to %s", xmlTopoFile); NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml)); } if (xml->maxIndex == 0) { @@ -562,6 +563,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE"); if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) { + INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile); NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml)); } @@ -570,6 +572,28 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy return ncclSuccess; } +ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr) { + int g; + NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); + int minType = PATH_SYS; + float maxWidth = 0; + int count = 0; + int* nets; + NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); + for (int n=0; n<system->nodes[NET].count; n++) { + struct ncclTopoLinkList* path = system->nodes[NET].nodes[n].paths[GPU]+g; + if (path->width > maxWidth || (path->width == maxWidth && path->type < minType)) { + maxWidth = path->width; + minType = path->type; + count = 0; + } + if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id; + } + *id = nets[rr % count]; + free(nets); + return ncclSuccess; +} + /****************************/ /* External query functions */ /****************************/ diff --git a/src/graph/topo.h b/src/graph/topo.h index 848fc03..950cff8 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -126,8 +126,10 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system); +ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr); + ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem); -ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph); +ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels); ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml); static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) { @@ -141,4 +143,15 @@ static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, i return ncclInternalError; } +static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index) { + *index = -1; + for (int i=0; i<system->nodes[GPU].count; i++) { + if (system->nodes[GPU].nodes[i].gpu.rank == rank) { + *index = i; + return ncclSuccess; + } + } + return ncclInternalError; +} + #endif diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 
8a0b4cd..29424b0 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -51,13 +51,9 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li return ncclSuccess; } -static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" }; -static const char* ncclAlgoStr[] = { "Tree", "Ring", "CollNet" }; -static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" }; - // Latencies in us, Bandwidths in GB/s // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } -static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 3.6, 8.4 }, { 4.4, 4.4, 0 } }; +static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 10.0, 8.4 }, { 4.4, 4.4, 0 } }; // NVLink, PCI, Network #define NCCL_HW_NVLINK 0 @@ -66,17 +62,18 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, // Tree/Simple is the latency a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network). static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { .5, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { .4, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { .5, 1.9, 4.0 } }, + { /* Tree (LL/LL128/Simple)*/ { .52, 1.2, 28 }, /* Ring (LL/LL128/Simple)*/ { .47, 1.9, 3.4 }, /* CollNet (LL/LL128/Simple)*/ { .5, 1.2, 4.0 } }, /* PCI */ { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 1.0, 1.9, 5.5 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 5.0, 7.5, 50 }, /* Ring (LL/LL128/Simple)*/ { .9, 2.5, 6.6 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } } + { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 50 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } } }; -// LL128 max BW for the different collectives -static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 }; +// LL128 max BW (per channel) for the different collectives +// ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce +static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15.2, 16.7 }; -ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) { +ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) { int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads); @@ -89,6 +86,8 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma if (comm->nRanks <= 1) return ncclSuccess; + int compCap80 = minCompCap == 80 && maxCompCap == 80 ? 1 : 0; + float ppn = (float)comm->nRanks / comm->nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph }; int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? 
NCCL_HW_NVLINK : NCCL_HW_PCI; @@ -98,6 +97,9 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) : coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 : comm->nRanks; + int nInterSteps = coll == ncclCollAllReduce ? 2*(comm->nNodes-1) : + coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nNodes-1 : + comm->nNodes; for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { if (coll != ncclCollAllReduce && a != NCCL_ALGO_RING) continue; @@ -105,13 +107,17 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter; float busBw = graphs[a]->nChannels * speed; + if (compCap80) busBw *= 0.92; // Various model refinements - if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/4.0; - if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw*120.0/128.0, ll128MaxBw[coll]); - if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, comm->nNodes > 2 ? 80.0 : 110.0); - if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.0; - if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0; + if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= (comm->nNodes > 1 || coll == ncclCollAllReduce || coll == ncclCollReduce) ? 1.0/4.0 : 1.0/3.0; + if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels); + double maxTreeBw = comm->nNodes > 2 ? + compCap80 && p == NCCL_PROTO_LL128 ? 105.0 : 80.0 : + compCap80 && p == NCCL_PROTO_LL128 ? 130.0 : 110.0; + if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, maxTreeBw); + if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.8; + if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (comm->nNodes == 1 ? 
7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels*7.0/9.0); if (a == NCCL_ALGO_COLLNET) busBw *= .9; if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128 @@ -121,6 +127,9 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma comm->bandwidths[coll][a][p] = busBw * ratio; comm->latencies[coll][a][p] = baseLat[a][p]; + float intraLat = hwLat[intraHw[a]][a][p]; + float interLat = hwLat[NCCL_HW_NET][a][p]; + if (comm->nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8; if (a == NCCL_ALGO_RING) { float lat = hwLat[hw[a]][a][p]; if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) { @@ -131,16 +140,12 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma comm->latencies[coll][a][p] += nsteps*lat; } } else { - comm->latencies[coll][a][p] += nsteps*lat; + comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat; } } else if (a == NCCL_ALGO_TREE) { - float intraLat = hwLat[intraHw[a]][a][p]; - float interLat = hwLat[NCCL_HW_NET][a][p]; comm->latencies[coll][a][p] += 2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat); } else { - float intraLat = hwLat[intraHw[a]][a][p]; - float interLat = hwLat[NCCL_HW_NET][a][p]; comm->latencies[coll][a][p] += 2 * (comm->nRanks/comm->nNodes-1) * intraLat + interLat; } @@ -154,17 +159,26 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1 }; const char *protoStr = getenv("NCCL_PROTO"); - if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable)); + if (protoStr) { + INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr); + NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable)); + } const char *algoStr = getenv("NCCL_ALGO"); - if (algoStr) NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable)); + if (algoStr) { + INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr); + NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable)); + } for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { int pEnable = protoEnable[p]; if (pEnable == 2 && p == NCCL_PROTO_LL128) { - // Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption. - pEnable = (graphs[a]->typeInter <= LINK_PCI) && graphs[a]->typeIntra == LINK_NVL && minCompCap == 70 && maxCompCap == 70 ? 1 : 0; + // Enable LL128 by default only on Volta/Ampere+NVLink. Other cases are not tested and may cause silent data corruption. + pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL && + ((minCompCap == 70 && maxCompCap == 70) || (minCompCap == 80 && maxCompCap == 80)) ? 
1 : 0; } - if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; + if (pEnable == 0) comm->bandwidths[c][a][p] = 0; + // Only disable algo for Allreduce since others only have one + if (c == ncclCollAllReduce && algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; } if (comm->rank == 0) { @@ -205,6 +219,7 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma // Override defaults with user env char* str = getenv("NCCL_THREAD_THRESHOLDS"); if (str) { + INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str); ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2}}; sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2); for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { @@ -228,20 +243,23 @@ ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int ma } // Trees are not perfectly sticking to the model for medium sizes. Applying a static correction -// factor is not ideal but works quite well. Powers of two, 64 B to 1 GB. +// factor is not ideal but works quite well. Powers of two, 64 B to 128MB. static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = { - { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .5, .5, .6, .7, .8, .9, .9, 1.0, 1.0, 1.0 }, - { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .8, .7, .7, .7, .6, .6, .7, .7, .8, .8, .9, .9, 1.0 }, + { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .4, .4, .5, .6, .7, .8, .9, 1.0, 1.0, 1.0 }, + { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .7, .6, .6, .6, .5, .6, .6, .7, .7, .8, .9, .9, 1.0 }, { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .5, .5, .6, .6, .7, .8, .9 } }; ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time) { float bw = info->comm->bandwidths[info->coll][algorithm][protocol]; + float lat = info->comm->latencies[info->coll][algorithm][protocol]; if (bw == 0) { *time = -1.0; return ncclSuccess; } int logSize = log2i(info->nBytes>>6); if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize]; - *time = info->comm->latencies[info->coll][algorithm][protocol] + (info->nBytes) / (1000 * bw); + if (algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && info->comm->nNodes > 1 + && info->coll == ncclCollAllReduce && info->nBytes >= info->comm->nRanks/16.0*65536) lat *= 1.9; // Plateau effect of ring + *time = lat + (info->nBytes) / (1000 * bw); return ncclSuccess; } diff --git a/src/graph/xml.cc b/src/graph/xml.cc index f138d0b..2885787 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -590,7 +590,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm NCCLCHECK(xmlGetSub(pciNode, "nvlink", &nvlNode)); if (nvlNode == NULL) { // NVML NVLink detection - int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : 6; + int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12; if (maxNvLinks > 0 && nvmlDev == NULL) { WARN("No NVML device handle. Skipping nvlink detection.\n"); diff --git a/src/group.cc b/src/group.cc index 9bf8ac9..549a4fd 100644 --- a/src/group.cc +++ b/src/group.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
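Stepping back to the tuning.cc changes above: ncclTopoGetAlgoTime now predicts a collective's completion time as base latency plus bytes over bandwidth, with a tree correction factor for medium sizes and a 1.9x latency penalty for large multi-node ring/Simple allreduces. A minimal sketch of that cost model, with illustrative numbers only — the real values come from the comm->latencies/comm->bandwidths tables filled in by ncclTopoTuneModel:

#include <cstddef>

// Latency in microseconds, bandwidth in GB/s, so nBytes/(1000*bw) is also in us.
static float algoTimeUs(size_t nBytes, float latUs, float bwGBps) {
  if (bwGBps == 0) return -1.0f;               // algorithm/protocol disabled
  return latUs + nBytes / (1000.0f * bwGBps);
}
// e.g. a 4 MiB operation at 24 GB/s with a 10 us base latency:
//   10 + 4194304/(1000*24) ~= 185 us.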
* * See LICENSE.txt for license information ************************************************************************/ @@ -7,6 +7,7 @@ #include "group.h" #include "debug.h" #include "enqueue.h" +#include "transport.h" #define MAX_ASYNC_OPS 128 thread_local pthread_t ncclGroupThreads[MAX_ASYNC_OPS]; @@ -33,6 +34,7 @@ struct ncclInitArgs { }; struct ncclCollArgs { ncclComm_t comm; + int connect; }; enum ncclAsyncFuncType { @@ -51,16 +53,24 @@ struct ncclAsyncArgs { thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS]; -#define CHECK(a) do { \ +#define NCCLCHECKTHREAD(a) do { \ if ((args->ret = (a)) != ncclSuccess) { \ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ return args; \ } \ } while(0) +#define CUDACHECKTHREAD(a) do { \ + if ((a) != cudaSuccess) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ + args->ret = ncclUnhandledCudaError; \ + return args; \ + } \ +} while(0) + void* ncclAsyncThreadMain(void* args_) { struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_; - CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev)); + NCCLCHECKTHREAD(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev)); return args; } @@ -99,20 +109,50 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) { NCCL_API(ncclResult_t, ncclGroupStart); ncclResult_t ncclGroupStart() { + if (ncclGroupMode == 0) { + memset(ncclGroupArgs, 0, sizeof(struct ncclAsyncArgs)*MAX_ASYNC_OPS); + } ncclGroupMode++; return ncclSuccess; } +static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) { + struct ncclInfo info = { ncclCollSendRecv, "SendRecv", + sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */ + 1, 1 }; + info.delta = delta; + info.channelId = channelId; + info.sendbytes = sendbytes; + info.recvbytes = recvbytes; + if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage; + NCCLCHECK(ncclSaveKernel(&info)); + return ncclSuccess; +} + +void* ncclAsyncThreadPreconnect(void* args_) { + struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_; + CUDACHECKTHREAD(cudaSetDevice(args->coll.comm->cudaDev)); + for (int c=0; c<args->coll.comm->p2pnChannels; c++) { + struct ncclComm* comm = args->coll.comm; + struct ncclChannel* channel = comm->channels+c; + struct ncclP2PConnect* connect = &comm->p2plist.connect; + NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, channel, connect->nrecv[c], connect->recv+c*comm->nRanks, connect->nsend[c], connect->send+c*comm->nRanks)); + connect->nrecv[c] = 0; + connect->nsend[c] = 0; + } + return args; +} + NCCL_API(ncclResult_t, ncclGroupEnd); ncclResult_t ncclGroupEnd() { + if (ncclGroupMode == 0) return ncclInvalidUsage; ncclGroupMode--; if (ncclGroupMode > 0) return ncclSuccess; int savedDev; CUDACHECK(cudaGetDevice(&savedDev)); - int done = ncclGroupIndex; + int activeThreads = 0; int doneArray[MAX_ASYNC_OPS]; - for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0; - + for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 1; ncclResult_t ret = ncclGroupError; if (ret != ncclSuccess) goto group_cleanup; @@ -121,6 +161,97 @@ ncclResult_t ncclGroupEnd() { struct ncclAsyncArgs* args = ncclGroupArgs+i; if (args->funcType == ASYNC_FUNC_INIT) { pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args); + 
activeThreads++; + doneArray[i] = 0; + } + } + /* For init, since we use threads, we just wait for threads to complete */ + while (activeThreads) { + for (int i=0; i<ncclGroupIndex; i++) { + struct ncclAsyncArgs* args = ncclGroupArgs+i; + if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) { + int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL); + if (err == EBUSY) continue; + if (err != 0) ret = ncclSystemError; + if (args->ret != ncclSuccess) ret = args->ret; + doneArray[i] = 1; + activeThreads--; + } + } + } + + for (int i=0; i<ncclGroupIndex; i++) { + struct ncclAsyncArgs* args = ncclGroupArgs+i; + if (args->funcType == ASYNC_FUNC_COLL) { + struct ncclP2Plist* p2plist = &args->coll.comm->p2plist; + if (p2plist->count != 0) { + struct ncclComm* comm = args->coll.comm; + args->coll.connect = 0; + for (int c=0; c<comm->p2pnChannels; c++) + args->coll.connect += comm->p2plist.connect.nsend[c] + comm->p2plist.connect.nrecv[c]; + if (args->coll.connect) { + pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadPreconnect, args); + } + } + } + } + + for (int i=0; i<ncclGroupIndex; i++) { + struct ncclAsyncArgs* args = ncclGroupArgs+i; + if (args->funcType == ASYNC_FUNC_COLL && (args->coll.connect)) { + int err = pthread_join(ncclGroupThreads[i], NULL); + if (err != 0) { + WARN("Error waiting for pthread_join : %s\n", strerror(errno)); + return ncclSystemError; + } + NCCLCHECKGOTO(args->ret, ret, end); + } + } + + for (int i=0; i<ncclGroupIndex; i++) { + struct ncclAsyncArgs* args = ncclGroupArgs+i; + if (args->funcType == ASYNC_FUNC_COLL) { + struct ncclComm* comm = args->coll.comm; + int rank = comm->rank; + int nRanks = comm->nRanks; + struct ncclP2Plist* p2plist = &args->coll.comm->p2plist; + if (p2plist->count) { + for (int delta=0; delta<nRanks; delta++) { + uint32_t from = (rank+nRanks-delta)%nRanks; + uint32_t to = (rank+delta)%nRanks; + + // Compute how much to split operations + // Natural step size matching buffer steps. + ssize_t stepSize = 4*comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS; + // Split each operation on p2pnChannelsPerPeer max. 
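// Illustration (assumed defaults, not part of the patch): with the Simple-protocol
// buffer at its DEFAULT_BUFFSIZE of 4 MiB (see init.cc below) and NCCL_STEPS == 8
// (devcomm.h below), stepSize = 4*4MiB/8 = 2 MiB. Each peer operation is split
// across p2pnChannelsPerPeer channels and rounded up to whole steps, e.g. a 5 MiB
// send over 2 channels: DIVUP(5MiB,2) = 2.5 MiB -> DIVUP(2.5MiB,2MiB)*2MiB = 4 MiB
// per-channel chunk.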
+ ssize_t recvChunkSize = DIVUP(p2plist->peerlist[from].recvbytes, comm->p2pnChannelsPerPeer); + ssize_t sendChunkSize = DIVUP(p2plist->peerlist[to].sendbytes, comm->p2pnChannelsPerPeer); + recvChunkSize = std::max((ssize_t)1, DIVUP(recvChunkSize, stepSize)) * stepSize; + sendChunkSize = std::max((ssize_t)1, DIVUP(sendChunkSize, stepSize)) * stepSize; + + ssize_t sendOffset = 0; + ssize_t recvOffset = 0; + int remaining = 1; + int chunk = 0; + while (remaining) { + int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels; + remaining = 0; + ssize_t recvbytes = p2plist->peerlist[from].recvbytes-recvOffset; + ssize_t sendbytes = p2plist->peerlist[to].sendbytes-sendOffset; + if (recvbytes > recvChunkSize) { remaining = 1; recvbytes = recvChunkSize; } else p2plist->peerlist[from].recvbytes = -1; + if (sendbytes > sendChunkSize) { remaining = 1; sendbytes = sendChunkSize; } else p2plist->peerlist[to].sendbytes = -1; + if (sendbytes >= 0 || recvbytes >= 0) { + NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId, + recvbytes, ((char*)(p2plist->peerlist[from].recvbuff)) + recvOffset, + sendbytes, ((const char*)(p2plist->peerlist[to].sendbuff)) + sendOffset), ret, end); + } + recvOffset += recvChunkSize; + sendOffset += sendChunkSize; + chunk++; + } + } + p2plist->count = 0; + } } } @@ -154,25 +285,9 @@ ncclResult_t ncclGroupEnd() { if (args->coll.comm->userStream == NULL) CUDACHECKGOTO(cudaSetDevice(args->coll.comm->cudaDev), ret, end); NCCLCHECKGOTO(ncclEnqueueEvents(args->coll.comm), ret, end); - doneArray[i] = 1; - done--; } } - /* For init, since we use threads, we just wait for threads to complete */ - while (done) { - for (int i=0; i<ncclGroupIndex; i++) { - struct ncclAsyncArgs* args = ncclGroupArgs+i; - if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) { - int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL); - if (err == EBUSY) continue; - if (err != 0) ret = ncclSystemError; - if (args->ret != ncclSuccess) ret = args->ret; - doneArray[i] = 1; - done--; - } - } - } goto end; group_cleanup: if (ret != ncclSuccess) { @@ -180,12 +295,12 @@ group_cleanup: // an atomic operation, we need to cancel all operations. for (int i=0; i<ncclGroupIndex; i++) { struct ncclAsyncArgs* args = ncclGroupArgs+i; - if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) { - if (args->init.newcomm) NCCLCHECK(ncclCommDestroy(*args->init.newcomm)); + if (args->funcType == ASYNC_FUNC_INIT) { + if (args->init.newcomm) ncclCommDestroy(*args->init.newcomm); *args->init.newcomm = NULL; } else { struct ncclComm* comm = args->coll.comm; - for (int c=0; c<comm->nChannels; c++) { + for (int c=0; c<comm->p2pnChannels; c++) { struct ncclChannel* channel = comm->channels+c; for (int i=0; i<channel->collCount; i++) { channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0; diff --git a/src/include/alloc.h b/src/include/alloc.h index 27e206f..cc652ce 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -12,10 +12,10 @@ #include "align.h" #include <sys/mman.h> -static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { - CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped)); - memset(*ptr, 0, size); - *devPtr = *ptr; +template <typename T> +static ncclResult_t ncclCudaHostCalloc(T** ptr, size_t nelem) { + CUDACHECK(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped)); + memset(*ptr, 0, nelem*sizeof(T)); return ncclSuccess; } diff --git a/src/include/checks.h b/src/include/checks.h index 257e9ca..ce81312 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,17 +11,17 @@ // Check CUDA calls #define CUDACHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ return ncclUnhandledCudaError; \ } \ } while(false) #define CUDACHECKGOTO(cmd, res, label) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ res = ncclUnhandledCudaError; \ goto label; \ } \ diff --git a/src/include/collectives.h b/src/include/collectives.h index bd64106..f854364 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
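An aside on the alloc.h hunk above: the untyped ncclCudaHostAlloc(void**, void**, size_t) becomes a typed, templated ncclCudaHostCalloc, so the element type and size are inferred from the pointer at the call site, and the separate devPtr out-parameter disappears (the cudaHostAllocMapped memory is reached through the same pointer on the device; init.cc below simply aliases it into hostDevComm). A hypothetical call, mirroring the init.cc usage:

uint32_t* abortFlag;
NCCLCHECK(ncclCudaHostCalloc(&abortFlag, 1));  // mapped, zeroed, sizeof(uint32_t) inferred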
* * See LICENSE.txt for license information ************************************************************************/ @@ -7,10 +7,8 @@ #ifndef NCCL_COLLECTIVES_H_ #define NCCL_COLLECTIVES_H_ -#include "core.h" -#include "info.h" - -#define FUNC_INDEX(coll, redop, dtype, al, pr) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)) +#define FUNC_INDEX_P2P 0 +#define FUNC_INDEX(coll, redop, dtype, al, pr) (1+(((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)) #define NCCL_COLL_NAME(coll, op, dtype) \ coll##_##op##_##dtype @@ -56,6 +54,7 @@ DECL_COLL2(ncclAllGather, copy) \ DECL_COLL(ncclReduceScatter) \ DECL_COLL(ncclAllReduce) \ + DECL_COLL5(ncclSendRecv,copy,i8) \ DECL_ALL_COLLS @@ -70,5 +69,6 @@ DECL_ALL_COLLS #define BROADCAST_CHUNKSTEPS 1 #define REDUCE_SLICESTEPS 1 #define REDUCE_CHUNKSTEPS 1 +#define SENDRECV_SLICEFACTOR 4 #endif diff --git a/src/include/comm.h b/src/include/comm.h index cc87a42..40143f4 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -8,6 +8,7 @@ #define NCCL_COMM_H_ #include "transport.h" +#include "p2p.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -40,6 +41,7 @@ struct ncclSendMem { }; char pad3[MEM_ALIGN]; }; + char buff[1]; // Actually larger than that }; struct ncclRecvMem { @@ -53,8 +55,6 @@ struct ncclRecvMem { }; char pad4[MEM_ALIGN]; }; - ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; - uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS]; char buff[1]; // Actually larger than that }; @@ -88,6 +88,13 @@ struct ncclComm { // Channels for collectives int nChannels; + // Channels (per peer) for p2p + int p2pnChannels; + int p2pnChannelsPerPeer; + int p2pChannels[MAXCHANNELS]; + + // Buffer sizes + int buffSizes[NCCL_NUM_PROTOCOLS]; // Algorithm/Protocols thresholds ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; @@ -134,6 +141,8 @@ struct ncclComm { // Whether this communicator uses collNet int collNetSupport; + //list of async p2p operation queued in a group semantics + struct ncclP2Plist p2plist; }; #endif diff --git a/src/include/core.h b/src/include/core.h index ac5fa85..0435d9b 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -50,19 +50,6 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) { } } -#define NCCL_NUM_FUNCTIONS 5 -typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t; - -#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet -#define NCCL_ALGO_TREE 0 -#define NCCL_ALGO_RING 1 -#define NCCL_ALGO_COLLNET 2 - -#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 -#define NCCL_PROTO_LL 0 -#define NCCL_PROTO_LL128 1 -#define NCCL_PROTO_SIMPLE 2 - #include "debug.h" #include "checks.h" #include "alloc.h" diff --git a/src/include/devcomm.h b/src/include/devcomm.h index 96c69ba..f00e6d6 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -11,6 +11,22 @@ #include "align.h" #include <stdint.h> +#define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now +typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollSendRecv} ncclFunc_t; +extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; + +#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet +#define NCCL_ALGO_TREE 0 +#define NCCL_ALGO_RING 1 +#define NCCL_ALGO_COLLNET 2 +extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS]; + +#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 +#define NCCL_PROTO_LL 0 +#define NCCL_PROTO_LL128 1 +#define NCCL_PROTO_SIMPLE 2 +extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; + #define NCCL_MAX_OPS 2048 #define NCCL_STEPS 8 @@ -34,9 +50,6 @@ union ncclLLFifoLine { #define NCCL_MAX_NTHREADS 512 #define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS #define NCCL_LL_LINES_PER_THREAD 8 -#define NCCL_LL_SLICE_LINES (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) -#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS) -#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine)) #ifdef TEST_LL_CLEANUP #define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup #define NCCL_LL_FLAG_MAX 0x100 @@ -59,10 +72,6 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK // to 3 dests. Use 70% for reduce and 30% for bcast. #define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32) -#define NCCL_LL128_SLICE_ELEMS (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) -#define NCCL_LL128_BUFF_ELEMS (NCCL_LL128_SLICE_ELEMS*NCCL_STEPS) -#define NCCL_LL128_BUFF_SIZE (NCCL_LL128_BUFF_ELEMS*sizeof(uint64_t)) - #define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) @@ -71,7 +80,7 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK struct ncclConnInfo { // Regular comm mechanism - char *buff; // Local for recv, remote for send + char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send uint64_t *tail; // Local for recv, remote for send uint64_t *head; // Local for send, remote for recv uint64_t *opCountLoc; // opCount of local rank @@ -83,13 +92,7 @@ struct ncclConnInfo { int *fifo; // Size fifo for proxy uint64_t step; // Keep where we are - - // Low latency mechanism - union ncclLLFifoLine *llBuff; // Local for recv, remote for send uint64_t llLastCleaning; - - // High bandwidth, low latency protocol - uint64_t* ll128Buff; // Local for recv, remote for send }; struct ncclConnector { @@ -136,17 +139,31 @@ struct CollectiveArgs { uint64_t opCount; // local and remote input, output, and buffer - const void * ThisInput; - void * ThisOutput; + const void * sendbuff; + void * recvbuff; - // general parameters - size_t N; - uint32_t root; - uint8_t bid; - uint8_t nChannels; - uint16_t nThreads; - - int lastChunkSize; + // Op-specific fields. 
Make sure the common part stays the + // same on all structs of the union + union { + struct { + uint16_t nThreads; + } common; + struct { + uint16_t nThreads; + uint8_t bid; + uint8_t nChannels; + uint32_t root; + size_t count; + size_t lastChunkSize; + } coll; + struct { + uint16_t nThreads; + uint16_t unused; + int32_t delta; + size_t sendCount; + size_t recvCount; + } p2p; + }; }; struct ncclColl { union { @@ -171,8 +188,6 @@ struct ncclChannel { struct ncclTree collTreeDn; int id; - int nthreads; - int buffSize; // Communication structures struct ncclPeer* peers; @@ -180,7 +195,6 @@ struct ncclChannel { // Operation list for aggregation struct ncclColl* collectives; - struct ncclColl* devCollectives; int collStart; int collCount; int collFifoHead; // Only used by GPU @@ -200,6 +214,7 @@ typedef enum { struct ncclDevComm { int rank; int nRanks; + int buffSizes[NCCL_NUM_PROTOCOLS]; // Flag to ask NCCL kernels to abort volatile uint32_t *abortFlag; diff --git a/src/include/enqueue.h b/src/include/enqueue.h index cea486e..a7e6e50 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -12,11 +12,12 @@ #include "collectives.h" ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); -ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast); -ncclResult_t ncclCpuBarrierLast(ncclComm_t comm); -ncclResult_t ncclCpuBarrierOut(ncclComm_t comm); -ncclResult_t ncclBarrierEnqueue(ncclComm_t comm); -ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm); -ncclResult_t ncclEnqueueEvents(ncclComm_t comm); +ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast); +ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm); +ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm); +ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm); +ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm); +ncclResult_t ncclEnqueueEvents(struct ncclComm* comm); +ncclResult_t ncclSaveKernel(struct ncclInfo* info); #endif // End include guard diff --git a/src/include/graph.h b/src/include/graph.h index 1814440..70117d5 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -25,10 +25,11 @@ ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info); void ncclTopoFree(struct ncclTopoSystem* system); ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); +ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); // Query topology -ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int rank, int channelId, int* net); -ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p); +ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net); +ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read); ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); // Set CPU affinity @@ -96,7 +97,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, ncclResult_t 
ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank); -ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph); +ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph); #include "info.h" ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time); diff --git a/src/include/info.h b/src/include/info.h index 46b9795..8f125e1 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,7 +8,7 @@ #define NCCL_INFO_H_ #include "nccl.h" -#include "core.h" +#include "devcomm.h" typedef enum { ncclPatternRing, @@ -47,6 +47,10 @@ struct ncclInfo { size_t nBytes; int nstepsPerLoop; int nchunksPerLoop; + ssize_t sendbytes; + ssize_t recvbytes; + uint32_t delta; + int channelId; }; #endif diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index 95dce5b..fd19f81 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -16,7 +16,7 @@ #define NCCL_PTR_CUDA 0x2 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ALL=~0} ncclDebugLogSubSys; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); diff --git a/src/include/p2p.h b/src/include/p2p.h new file mode 100644 index 0000000..9d3730e --- /dev/null +++ b/src/include/p2p.h @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include <stdlib.h> + +#ifndef NCCL_P2P_H_ +#define NCCL_P2P_H_ + +struct ncclP2Pinfo { + const void* sendbuff; + void* recvbuff; + ssize_t sendbytes; + ssize_t recvbytes; +}; + +struct ncclP2PConnect { + int nrecv[MAXCHANNELS]; + int nsend[MAXCHANNELS]; + int* recv; + int* send; +}; + +struct ncclP2Plist { + struct ncclP2Pinfo *peerlist; + int count; + struct ncclP2PConnect connect; +}; + +#endif diff --git a/src/include/proxy.h b/src/include/proxy.h new file mode 100644 index 0000000..04daa84 --- /dev/null +++ b/src/include/proxy.h @@ -0,0 +1,77 @@ +/************************************************************************* + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROXY_H_ +#define NCCL_PROXY_H_ + +#include <pthread.h> + +enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; + +struct ncclProxyArgs; +typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*); + +struct ncclProxyArgs { + proxyProgressFunc_t progress; + struct ncclChannel* channel; + struct ncclConnector* connector; + int sliceSteps; + int chunkSteps; + int nsteps; + uint64_t opCount; + int protocol; + ncclDataType_t dtype; + ncclRedOp_t redOp; + int state; // add component before this line -- it is left out during initialization + + // Internal state + uint64_t head; + uint64_t tail; + uint64_t end; + void* requests[NCCL_STEPS]; + int idle; + + // Element linking + pthread_mutex_t mutex; + struct ncclProxyArgs* next; + struct ncclProxyArgs* nextPeer; +}; + +struct ncclProxyPool; +struct ncclProxyState { + pthread_cond_t cond; + pthread_mutex_t mutex; + bool stop; + struct ncclProxyArgs* ops; + struct ncclProxyArgs* pool; + struct ncclProxyPool* pools; +}; + +typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); + +enum proxyMode { + proxyRing = 0, + proxyFrom = 1, + proxyTo = 2 +}; + +ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks); +ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel); +ncclResult_t ncclProxyStart(struct ncclComm* comm); +ncclResult_t ncclProxyCreate(struct ncclComm* comm); +ncclResult_t ncclProxyDestroy(struct ncclComm* comm); + +#include <unistd.h> + +// Spin wait until func evaluates to true +template<typename FUNC> +inline void transportProxyWait(const FUNC& func) { + while (!func()) { + sched_yield(); + } +} + +#endif diff --git a/src/include/socket.h b/src/include/socket.h index 9376062..46b204d 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -53,6 +53,8 @@ static inline int envSocketFamily(void) { if (env == NULL) return family; + INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env); + if (strcmp(env, "AF_INET") == 0) family = AF_INET; // IPv4 else if (strcmp(env, "AF_INET6") == 0) @@ -290,6 +292,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam // User specified interface char* env = getenv("NCCL_SOCKET_IFNAME"); if (env && strlen(env) > 1) { + INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env); // Specified by user : find or fail if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env); nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); @@ -301,7 +304,8 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam if (nIfs == 0) { char* commId = getenv("NCCL_COMM_ID"); if (commId && strlen(commId) > 1) { - // Try to find interface that is in the same subnet as the IP in comm id + INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId); + // Try to find interface that is in the same subnet as the IP in comm id union socketAddress idAddr; GetSocketAddrFromString(&idAddr, commId); nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs); diff --git a/src/include/transport.h b/src/include/transport.h index e25132f..5a85688 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA 
CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,6 +11,7 @@ #include "graph.h" #include "nvmlwrap.h" #include "core.h" +#include "proxy.h" #define NTRANSPORTS 3 #define TRANSPORT_P2P 0 @@ -39,49 +40,8 @@ struct ncclConnect { char data[CONNECT_SIZE]; }; -enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; - -struct ncclProxyArgs; -typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*); - -struct ncclProxyArgs { - proxyProgressFunc_t progress; - struct ncclChannel* channel; - struct ncclConnector* connector; - int sliceSteps; - int chunkSteps; - int nsteps; - uint64_t opCount; - int protocol; - ncclDataType_t dtype; - ncclRedOp_t redOp; - int state; // add component before this line -- it is left out during initialization - - // Internal state - uint64_t head; - uint64_t tail; - uint64_t end; - void* requests[NCCL_STEPS]; - int idle; - - // Element linking - pthread_mutex_t mutex; - struct ncclProxyArgs* next; - struct ncclProxyArgs* nextPeer; -}; - -struct ncclProxyPool; -struct ncclProxyState { - pthread_cond_t cond; - pthread_mutex_t mutex; - bool stop; - struct ncclProxyArgs* ops; - struct ncclProxyArgs* pool; - struct ncclProxyPool* pools; -}; - struct ncclTransportComm { - ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId); + ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId); ncclResult_t (*connect)(struct ncclConnect*, int nranks, int rank, struct ncclConnector*); ncclResult_t (*free)(void*); ncclResult_t (*proxy)(struct ncclProxyArgs*); @@ -94,30 +54,6 @@ struct ncclTransport { struct ncclTransportComm recv; }; -#include <pthread.h> - -typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); - -enum proxyMode { - proxyRing = 0, - proxyFrom = 1, - proxyTo = 2 -}; - -ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr); -ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks); -ncclResult_t transportStartProxy(struct ncclComm* comm); -ncclResult_t transportCreateProxy(struct ncclComm* comm); -ncclResult_t transportDestroyProxy(struct ncclComm* comm); - -#include <unistd.h> - -// Spin wait until func evaluates to true -template<typename FUNC> -inline void transportProxyWait(const FUNC& func) { - while (!func()) { - sched_yield(); - } -} +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend); #endif diff --git a/src/init.cc b/src/init.cc index 0a02760..2be994d 100644 --- a/src/init.cc +++ b/src/init.cc @@ -37,6 +37,10 @@ std::chrono::high_resolution_clock::time_point ncclEpoch; #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream #endif +const char* ncclFuncStr[NCCL_NUM_FUNCTIONS] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" }; +const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" }; +const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; + NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", 
NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); @@ -116,7 +120,7 @@ static ncclResult_t ncclInit() { pthread_mutex_lock(&initLock); if (!initialized) { initEnv(); - initNet(); + NCCLCHECK(initNet()); INFO(NCCL_INIT, "Using network %s", ncclNetName()); initialized = true; } @@ -154,6 +158,9 @@ void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) { static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + free(comm->p2plist.peerlist); + free(comm->p2plist.connect.recv); + free(comm->p2plist.connect.send); free(comm->peerInfo); ncclTopoFree(comm->topo); @@ -164,7 +171,7 @@ static ncclResult_t commFree(ncclComm_t comm) { CUDACHECK(cudaFree(comm->hostDevComm.channels)); CUDACHECK(cudaFree(comm->devComm)); - for (int channel=0; channel<comm->nChannels; channel++) + for (int channel=0; channel<MAXCHANNELS; channel++) NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks)); if (comm->doneEvent != NULL) @@ -228,14 +235,24 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { #endif comm->fatalError = ncclSuccess; - NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t))); + NCCLCHECK(ncclCudaHostCalloc((ncclDevError_t**)&comm->fatalDevError, 1)); + comm->hostDevComm.fatalDevError = comm->fatalDevError; *comm->fatalDevError = ncclDevSuccess; - NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t))); + NCCLCHECK(ncclCudaHostCalloc((uint32_t**)&comm->abortFlag, 1)); + comm->hostDevComm.abortFlag = comm->abortFlag; *comm->abortFlag = 0; comm->argsptr = &comm->args; comm->collNetSupport = 0; + comm->p2plist.count=0; + NCCLCHECK(ncclCalloc(&comm->p2plist.peerlist, comm->nRanks)); + for (int r=0; r<comm->nRanks; r++) comm->p2plist.peerlist[r].sendbytes = comm->p2plist.peerlist[r].recvbytes = -1; + NCCLCHECK(ncclCalloc(&comm->p2plist.connect.recv, MAXCHANNELS*comm->nRanks)); + NCCLCHECK(ncclCalloc(&comm->p2plist.connect.send, MAXCHANNELS*comm->nRanks)); + + // Mark channels as non-initialized.
+ for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1; *comret = comm; return ncclSuccess; @@ -243,13 +260,12 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { static ncclResult_t devCommSetup(ncclComm_t comm) { // Duplicate the channels on the device - NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels)); - NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels)); + NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels)); + NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels)); // Copy userRanks and peers - for (int r=0; r<comm->nChannels; r++) { + for (int r=0; r<comm->p2pnChannels; r++) { NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); - NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks+1)); } // Duplicate the dev comm on the device @@ -290,23 +306,6 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u return ncclSuccess; } -template <int type> -static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) { - for (int t=0; t<NTRANSPORTS; t++) { - struct ncclTransport *transport = ncclTransports+t; - struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv; - int ret = 0; - NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo)); - if (ret) { - connector->transportComm = transportComm; - NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId)); - return ncclSuccess; - } - } - WARN("No transport found !"); - return ncclInternalError; -} - static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) { TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); NCCLCHECK(initChannel(comm, channelId)); @@ -379,6 +378,7 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct // Set CG Mode comm->launchMode = ncclComm::GROUP; char* str = getenv("NCCL_LAUNCH_MODE"); + if (str) INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", str); if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) { comm->launchMode = ncclComm::PARALLEL; } @@ -399,50 +399,26 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct return ncclSuccess; } -static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { - TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); - uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */ - struct ncclConnect connect; - struct ncclConnector* conn; - for (int i=0; i<nrecv; i++) { - int peer = peerRecv[i]; - if (peer == -1 || peer >= comm->nRanks) continue; - conn = &channel->peers[peer].recv; - if (conn->connected) { ++nSkippedRecv; continue; } - memset(&connect, 0, sizeof(connect)); - NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); - NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); - } - for (int i=0; i<nsend; i++) { - int peer = peerSend[i]; - if (peer == -1 || peer >= comm->nRanks) 
continue; - conn = &channel->peers[peer].send; - if (conn->connected) { ++nSkippedSend; continue; } - memset(&connect, 0, sizeof(connect)); - NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); - NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); - } - for (int i=0; i<nsend; i++) { - int peer = peerSend[i]; - if (peer == -1 || peer >= comm->nRanks) continue; - conn = &channel->peers[peer].send; - if (conn->connected) {++nSkippedSend; continue; } - memset(&connect, 0, sizeof(connect)); - NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); - NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn)); - conn->connected = 1; - } - for (int i=0; i<nrecv; i++) { - int peer = peerRecv[i]; - if (peer == -1 || peer >= comm->nRanks) continue; - conn = &channel->peers[peer].recv; - if (conn->connected) {++nSkippedRecv; continue; } - memset(&connect, 0, sizeof(connect)); - NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); - NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn)); - conn->connected = 1; +#define DEFAULT_LL_BUFFSIZE (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_STEPS*sizeof(union ncclLLFifoLine)) +#define DEFAULT_LL128_BUFFSIZE (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS*NCCL_STEPS*sizeof(uint64_t)) +#define DEFAULT_BUFFSIZE (1LL << 22) /* 4MiB */ +#define DEFAULT_BUFFSIZE_ARM (1LL << 20) /* 1MiB */ +NCCL_PARAM(BuffSize, "BUFFSIZE", -2); +NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2); +NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2); + +static ncclResult_t computeBuffSizes(struct ncclComm* comm) { + int cpuArch, cpuVendor, cpuModel; + NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); + + int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() }; + int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE }; + + if (cpuArch == NCCL_TOPO_CPU_ARCH_ARM) defaults[NCCL_PROTO_SIMPLE] = DEFAULT_BUFFSIZE_ARM; + + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + comm->buffSizes[p] = comm->hostDevComm.buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p]; } - TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv); return ncclSuccess; } @@ -451,7 +427,8 @@ extern struct ncclTransport collNetTransport; // All ranks must participate in collNetSetup call // type: 0 for send, 1 for recv // return: 0 - unsupported, 1 - supported -static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int collNetChannels, int rank, int nranks, int masterRank, int masterPeer, int nMasters, int type) { +// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails +static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int rank, int nranks, int masterRank, int masterPeer, int nMasters, int type) { int rankInCollNet = -1; int supported = 0; int isMaster = (rank == masterRank) ? 
1 : 0; @@ -483,7 +460,7 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap // setup struct ncclConnect myConnect; if (isMaster && ret > 0) { - NCCLCHECK(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->buffSize, channel->id)); + NCCLCHECK(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id)); } // prepare connect handles ncclResult_t res; @@ -514,12 +491,15 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap // connect if (isMaster && ret > 0) { NCCLCHECKGOTO(transportComm->connect(masterConnects, nMasters, rankInCollNet, conn), res, cleanup); + struct ncclPeer* devRoot = channel->devPeers+nranks; + struct ncclConnector* devConn = (type == 1) ? &devRoot->recv : &devRoot->send; + CUDACHECKGOTO(cudaMemcpy(devConn, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice), res, cleanup); } // recv side sends connect info to send side if (isMaster && type == 1) { sendrecvExchange.collNetRank = rankInCollNet; memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect)); - NCCLCHECK(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange))); + NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup); INFO(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer); } if (ret > 0) { @@ -746,7 +726,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings)); if (comm->nNodes > 1 && ncclParamCollNetEnable() == 1 && - collNetSupport()) { + collNetSupport() && collNetGraph.nChannels) { NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank)); } @@ -758,7 +738,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels); - NCCLCHECK(ncclTopoSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph)); + NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph)); char line[1024]; line[0]='\0'; @@ -779,6 +759,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclTopoSetAffinity(comm->topo, comm->rank)); ncclResult_t ret; + NCCLCHECK(computeBuffSizes(comm)); + // Connect with prev/next for each ring struct ncclConnect *connect; NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore); @@ -786,15 +768,15 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm struct ncclChannel* channel = comm->channels+c; NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore); if (comm->nRanks == 1) continue; - NCCLCHECKGOTO(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore); - NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore); - NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore); + 
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore); } // Check if we can setup CollNet if (comm->nNodes > 1 && ncclParamCollNetEnable() == 1 && - collNetSupport()) { + collNetSupport() && collNetGraph.nChannels) { int logicChannels = comm->nChannels/2; int collNetSetupFail = 0; const int recvIndex = 0; // recv GPU index is always 0 @@ -802,13 +784,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm for (int c=0; c<logicChannels; c++) { struct ncclChannel* channelRecv = comm->channels+logicChannels+c; struct ncclChannel* channelSend = comm->channels+c; - NCCLCHECK(p2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down)); - NCCLCHECK(p2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up)); + NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down)); + NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up)); const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex]; const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex]; - if (collNetSetup(comm, &collNetGraph, channelRecv, logicChannels, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1) + if (collNetSetup(comm, &collNetGraph, channelRecv, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1) collNetSetupFail = 1; - if (collNetSetup(comm, &collNetGraph, channelSend, logicChannels, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1) + else if (collNetSetup(comm, &collNetGraph, channelSend, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1) collNetSetupFail = 1; } // Verify CollNet setup across ranks @@ -818,6 +800,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm free(connect); free(rings); + // Compute nChannels per peer for p2p + NCCLCHECK(ncclTopoComputeP2pChannels(comm)); + // We should have allocated all buffers, collective fifos, ... we can // restore the affinity. 
affinity_restore: @@ -846,7 +831,7 @@ affinity_restore: // Done with AllGather1 data free(allGather1Data); - if (comm->nNodes) NCCLCHECK(transportCreateProxy(comm)); + if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm)); TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); return ncclSuccess; @@ -873,6 +858,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni ncclResult_t res; char* env = getenv("NCCL_COMM_ID"); if (env && myrank == 0) { + INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env); NCCLCHECKGOTO(bootstrapCreateRoot(&commId, true), res, end); } @@ -941,7 +927,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) { TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError); CUDACHECK(cudaStreamSynchronize(comm->groupStream)); - NCCLCHECK(transportDestroyProxy(comm)); + NCCLCHECK(ncclProxyDestroy(comm)); NCCLCHECK(commFree(comm)); if (savedDevice != commDevice) diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc index 67931f8..27623b2 100644 --- a/src/misc/argcheck.cc +++ b/src/misc/argcheck.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -34,7 +34,6 @@ ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { } ncclResult_t ArgsCheck(struct ncclInfo* info) { - NCCLCHECK(PtrCheck(info->comm, info->opName, "comm")); // First, the easy ones if (info->root < 0 || info->root >= info->comm->nRanks) { WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks); @@ -44,7 +43,7 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) { WARN("%s : invalid type %d", info->opName, info->datatype); return ncclInvalidArgument; } - // Type is OK, compute nbytes. Convert Allgather/Broadcast calls to chars. + // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars. 
info->nBytes = info->count * ncclTypeSize(info->datatype); if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) { info->count = info->nBytes; @@ -58,12 +57,20 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) { } if (info->comm->checkPointers) { - // Check CUDA device pointers - if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) { - NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName)); - } - if (info->coll != ncclCollReduce || info->comm->rank == info->root) { - NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName)); + if (info->coll == ncclCollSendRecv) { + if (strcmp(info->opName, "Send") == 0) { + NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", "Send")); + } else { + NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", "Recv")); + } + } else { + // Check CUDA device pointers + if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) { + NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName)); + } + if (info->coll != ncclCollReduce || info->comm->rank == info->root) { + NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName)); + } } } return ncclSuccess; diff --git a/src/misc/utils.cc b/src/misc/utils.cc index 782e9c0..b231eb1 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -93,6 +93,7 @@ uint64_t getHostHash(void) { int offset = strlen(hostHash); if ((hostId = getenv("NCCL_HOSTID")) != NULL) { + INFO(NCCL_ENV, "NCCL_HOSTID set by environment to %s", hostId); strncpy(hostHash, hostId, sizeof(hostHash)); } else { FILE *file = fopen(HOSTID_FILE, "r"); diff --git a/src/nccl.h.in b/src/nccl.h.in index f07e0a4..b4f34ef 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -221,6 +221,40 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); /* + * Send + * + * Send data from sendbuff to rank peer. + * + * Rank peer needs to call ncclRecv with the same datatype and the same count from this + * rank. + * + * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations + * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ + * ncclGroupEnd section. + */ +ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, + ncclComm_t comm, cudaStream_t stream); + +/* + * Receive + * + * Receive data from rank peer into recvbuff. + * + * Rank peer needs to call ncclSend with the same datatype and the same count to this + * rank. + * + * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations + * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ + * ncclGroupEnd section. 
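Concretely, the grouping requirement described above means that a ring-shift exchange, where every rank sends to the next rank and receives from the previous one, must fuse both calls; issued back to back without a group, the two blocking operations could deadlock. An illustrative sketch using the new API declared here (sendbuf, recvbuf, count, comm and stream assumed already set up):

  ncclGroupStart();
  ncclSend(sendbuf, count, ncclFloat, (rank + 1) % nranks, comm, stream);
  ncclRecv(recvbuf, count, ncclFloat, (rank + nranks - 1) % nranks, comm, stream);
  ncclGroupEnd();  /* both operations start here and progress concurrently */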
+ */
+ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+ ncclComm_t comm, cudaStream_t stream);
+
+/* * Group semantics * * When managing multiple GPUs from a single thread, and since NCCL collective @@ -235,14 +269,19 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou * the operation is effectively done. * * Both collective communication and ncclCommInitRank can be used in conjunction - of ncclGroupStart/ncclGroupEnd. + with ncclGroupStart/ncclGroupEnd, but not together. + + Group semantics also allow fusing multiple operations on the same device + to improve performance (for aggregated collective calls), or to permit + concurrent progress of multiple send/receive operations. */ /* * Group Start * - * Start a group call. All subsequent calls to NCCL may not block due to - * inter-CPU synchronization. + * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into + a single NCCL operation. Nothing will be started on the CUDA stream until + ncclGroupEnd. */ ncclResult_t ncclGroupStart(); ncclResult_t pncclGroupStart(); @@ -250,8 +289,9 @@ ncclResult_t pncclGroupStart(); /* * Group End * - * End a group call. Wait for all calls since ncclGroupStart to complete - * before returning. + * End a group call. Start a fused NCCL operation consisting of all calls since + * ncclGroupStart. Operations on the CUDA stream that depend on the NCCL operations + * must be issued after ncclGroupEnd. */ ncclResult_t ncclGroupEnd(); ncclResult_t pncclGroupEnd(); diff --git a/src/proxy.cc b/src/proxy.cc new file mode 100644 index 0000000..19dbced --- /dev/null +++ b/src/proxy.cc @@ -0,0 +1,283 @@ +/************************************************************************* + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "info.h" +#include "collectives.h" + +#define RECV 0 +#define SEND 1 + +static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) { + if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true; + + /* In chains, one rank does not need a proxy. Let's figure out which one it is */ + // Which index in the reorganized rings should we compare root against? + const int myrank = 0, nextrank = 1, prevrank = nranks-1; + int index = pattern == ncclPatternPipelineFrom ? + /* no recv / no send if root = */ + /* bcast */ (type == RECV ? myrank : nextrank ): + /* reduce */ (type == RECV ?
prevrank : myrank ); + int rank = ring->userRanks[index]; + return (root != rank); +} + +enum { proxyRecv=0, proxySend=1 }; + +#define PROXYARGS_ALLOCATE_SIZE 32 +struct ncclProxyPool { + struct ncclProxyPool *next; + struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; +}; + +static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) { + struct ncclProxyState* state = &comm->proxyState; + struct ncclProxyArgs* elem; + pthread_mutex_lock(&state->mutex); + if (state->pool == NULL) { + // Allocate a new pool of elements + struct ncclProxyPool* newPool; + NCCLCHECK(ncclCalloc(&newPool, 1)); + struct ncclProxyArgs* newElems = newPool->elems; + // Chain newly allocated elements + for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) { + if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1; + } + // Add them all to the pool list + state->pool = newElems; + // Save the pool memory block for later resource release + newPool->next = state->pools; + state->pools = newPool; + } + elem = state->pool; + state->pool = state->pool->next; + pthread_mutex_unlock(&state->mutex); + elem->next = elem->nextPeer = NULL; + *argsptr = elem; + return ncclSuccess; +} + +static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) { + struct ncclComm* comm = connector->comm; + struct ncclProxyState* state = &comm->proxyState; + pthread_mutex_lock(&state->mutex); + if (connector->proxyAppend == NULL) { + // Nothing running for that peer. Add to the circular list + if (state->ops == NULL) { + // Create the list + args->next = args; + state->ops = args; + } else { + // Insert element in the list + args->next = state->ops->next; + state->ops->next = args; + } + connector->proxyAppend = args; + } else { + // There is an active operation already for that peer. + // Add it to the per-peer list + connector->proxyAppend->nextPeer = args; + connector->proxyAppend = args; + } + pthread_mutex_unlock(&state->mutex); +} + +template <int type> +static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) { + if (peer < 0) return ncclSuccess; + + struct ncclPeer* peerComm = args->channel->peers+peer; + struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send; + if (connector->transportComm == NULL) { + WARN("[%d] Error no transport for %s peer %d on channel %d\n", connector->comm->rank, + type == proxyRecv ? 
"recv" : "send", peer, args->channel->id); + return ncclInternalError; + } + if (connector->transportComm->proxy == NULL) return ncclSuccess; + + struct ncclProxyArgs* op; + NCCLCHECK(allocateArgs(connector->comm, &op)); + memcpy(op, args, sizeof(struct ncclProxyArgs)); + op->connector = connector; + op->progress = connector->transportComm->proxy; + op->state = ncclProxyOpReady; + ProxyAppend(connector, op); + return ncclSuccess; +} + +ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks) { + if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) { + struct ncclRing* ring = &args->channel->ring; + if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args)); + if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxySend>(ring->next, args)); + } + if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) { + // Tree up + struct ncclTree* tree = &args->channel->treeUp; + for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args)); + NCCLCHECK(SaveProxy<proxySend>(tree->up, args)); + } + if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) { + // Tree down + struct ncclTree* tree = &args->channel->treeDn; + for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args)); + NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args)); + } + if (pattern == ncclPatternCollTreeUp) { + // CollTree up + struct ncclTree* tree = &args->channel->collTreeUp; + NCCLCHECK(SaveProxy<proxyRecv>(tree->down[0], args)); + NCCLCHECK(SaveProxy<proxySend>(tree->up, args)); + } + if (pattern == ncclPatternCollTreeDown) { + // CollTree down + struct ncclTree* tree = &args->channel->collTreeDn; + NCCLCHECK(SaveProxy<proxySend>(tree->down[0], args)); + NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args)); + } + return ncclSuccess; +} + +ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel) { + struct ncclProxyArgs args; + memset(&args, 0, sizeof(struct ncclProxyArgs)); + args.channel = channel; + args.sliceSteps = 1; + args.chunkSteps = 1; + args.protocol = NCCL_PROTO_SIMPLE; + args.opCount = info->comm->opCount; + args.dtype = info->datatype; + if (info->delta > 0 && info->sendbytes >= 0) { + int peersend = (info->comm->rank+info->delta)%info->comm->nRanks; + args.nsteps = DIVUP(info->sendbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR); + if (args.nsteps == 0) args.nsteps = 1; + NCCLCHECK(SaveProxy<proxySend>(peersend, &args)); + } + if (info->delta > 0 && info->recvbytes >= 0) { + int peerrecv = (info->comm->nRanks+info->comm->rank-info->delta)%info->comm->nRanks; + args.nsteps = DIVUP(info->recvbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR); + if (args.nsteps == 0) args.nsteps = 1; + NCCLCHECK(SaveProxy<proxyRecv>(peerrecv, &args)); + } + return ncclSuccess; +} + +void* persistentThread(void *comm_) { + struct ncclComm* comm = (struct ncclComm*)comm_; + struct ncclProxyState* state = &comm->proxyState; + struct ncclProxyArgs* op = NULL; + ncclResult_t ret = ncclSuccess; + int idle = 1; + int idleSpin = 0; + while (1) { + do { + if (*comm->abortFlag) return NULL; + if (op == NULL) { + pthread_mutex_lock(&state->mutex); + op = state->ops; + if (op == NULL) { + if (state->stop) { + // No more commands to process and proxy has been requested to stop + 
pthread_mutex_unlock(&state->mutex); + return NULL; + } + pthread_cond_wait(&state->cond, &state->mutex); + } + pthread_mutex_unlock(&state->mutex); + } + } while (op == NULL); + op->idle = 0; + // opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started + // yet and might be cancelled before they even start. Hold off on those. + if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op); + if (ret != ncclSuccess) { + comm->fatalError = ret; + INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); + return NULL; + } + idle &= op->idle; + pthread_mutex_lock(&state->mutex); + if (!idle) idleSpin = 0; + struct ncclProxyArgs *next = op->next; + if (next->state == ncclProxyOpNone) { + struct ncclProxyArgs *freeOp = next; + if (next->nextPeer) { + // Replace next by its next per-peer element. + next = next->nextPeer; + if (op != freeOp) { + next->next = freeOp->next; + op->next = next; + } else { + next->next = next; + } + } else { + // Remove next from circular list + next->connector->proxyAppend = NULL; + if (op != freeOp) { + next = next->next; + op->next = next; + } else { + next = NULL; + } + } + if (freeOp == state->ops) state->ops = next; + freeOp->next = state->pool; + state->pool = freeOp; + } + op = next; + if (op == state->ops) { + if (idle == 1) { + if (++idleSpin == 10) { + sched_yield(); + idleSpin = 0; + } + } + idle = 1; + } + pthread_mutex_unlock(&state->mutex); + } +} + +ncclResult_t ncclProxyStart(struct ncclComm* comm) { + pthread_mutex_lock(&comm->proxyState.mutex); + if (comm->proxyState.ops != NULL) + pthread_cond_signal(&comm->proxyState.cond); + pthread_mutex_unlock(&comm->proxyState.mutex); + return ncclSuccess; +} + +ncclResult_t ncclProxyCreate(struct ncclComm* comm) { + if (!comm->proxyThread) { + comm->proxyState.cond = PTHREAD_COND_INITIALIZER; + comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER; + comm->proxyState.ops = NULL; + pthread_create(&comm->proxyThread, NULL, persistentThread, comm); + } + return ncclSuccess; +} + +ncclResult_t ncclProxyDestroy(struct ncclComm* comm) { + struct ncclProxyState* state = &comm->proxyState; + + // Request the proxy to stop and then wake it + pthread_mutex_lock(&state->mutex); + state->stop = true; + pthread_cond_signal(&state->cond); + pthread_mutex_unlock(&state->mutex); + if (comm->proxyThread) pthread_join(comm->proxyThread, NULL); + + // Free any memory allocated for the proxy arg pools + pthread_mutex_lock(&state->mutex); + struct ncclProxyState* proxyState = &comm->proxyState; + while (proxyState->pools != NULL) { + struct ncclProxyPool *next = proxyState->pools->next; + free(proxyState->pools); + proxyState->pools = next; + } + pthread_mutex_unlock(&state->mutex); + + return ncclSuccess; +} diff --git a/src/transport.cc b/src/transport.cc index cc8d5d1..7219ea3 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -1,11 +1,12 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
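Backing all of the proxy code above, allocateArgs hands out ncclProxyArgs elements from a pool that grows in fixed-size blocks and never shrinks until ncclProxyDestroy; free elements are threaded into a singly linked list, and the blocks themselves are remembered so they can be released whole. A standalone sketch of that free-list pattern, with simplified stand-in types rather than the NCCL structs:

  #include <stdlib.h>
  #define POOL_BLOCK 32
  struct elem  { struct elem* next; };
  struct block { struct block* next; struct elem elems[POOL_BLOCK]; };
  static struct elem*  freeList = NULL;  /* plays the role of state->pool  */
  static struct block* blocks   = NULL;  /* plays the role of state->pools */

  static struct elem* poolGet(void) {
    if (freeList == NULL) {              /* grow by one block when empty */
      struct block* b = (struct block*)calloc(1, sizeof(struct block));
      for (int i = 0; i + 1 < POOL_BLOCK; i++) b->elems[i].next = &b->elems[i+1];
      freeList = b->elems;
      b->next = blocks; blocks = b;      /* kept so teardown can free() whole blocks */
    }
    struct elem* e = freeList;
    freeList = e->next;
    e->next = NULL;
    return e;
  }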
* * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "info.h" +#include "bootstrap.h" extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; @@ -17,248 +18,70 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = { netTransport, }; -#define RECV 0 -#define SEND 1 - -static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) { - if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true; - - /* In chains, one rank does not need a proxy. Let's figure out which one it is */ - // Which index in the reorganized rings should we compare root against */ - const int myrank = 0, nextrank = 1, prevrank = nranks-1; - int index = pattern == ncclPatternPipelineFrom ? - /* no recv / no send if root = */ - /* bcast */ (type == RECV ? myrank : nextrank ): - /* reduce */ (type == RECV ? prevrank : myrank ); - int rank = ring->userRanks[index]; - return (root != rank); -} - -enum { proxyRecv=0, proxySend=1 }; - -#define PROXYARGS_ALLOCATE_SIZE 32 -struct ncclProxyPool { - struct ncclProxyPool *next; - struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; -}; - -ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) { - struct ncclProxyState* state = &comm->proxyState; - struct ncclProxyArgs* elem; - pthread_mutex_lock(&state->mutex); - if (state->pool == NULL) { - // Allocate a new pool of elements - struct ncclProxyPool* newPool; - NCCLCHECK(ncclCalloc(&newPool, 1)); - struct ncclProxyArgs* newElems = newPool->elems; - // Chain newly allocated elements - for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) { - if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1; - } - // Add them all to the pool list - state->pool = newElems; - // Save the pool memory block for later resource release - newPool->next = state->pools; - state->pools = newPool; - } - elem = state->pool; - state->pool = state->pool->next; - pthread_mutex_unlock(&state->mutex); - elem->next = elem->nextPeer = NULL; - *argsptr = elem; - return ncclSuccess; -} - -static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) { - struct ncclComm* comm = connector->comm; - struct ncclProxyState* state = &comm->proxyState; - pthread_mutex_lock(&state->mutex); - if (connector->proxyAppend == NULL) { - // Nothing running for that peer. Add to the circular list - if (state->ops == NULL) { - // Create the list - args->next = args; - state->ops = args; - } else { - // Insert element in the list - args->next = state->ops->next; - state->ops->next = args; +template <int type> +static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) { + for (int t=0; t<NTRANSPORTS; t++) { + struct ncclTransport *transport = ncclTransports+t; + struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv; + int ret = 0; + NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo)); + if (ret) { + connector->transportComm = transportComm; + NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, channelId)); + return ncclSuccess; } - connector->proxyAppend = args; - } else { - // There is an active operation already for that peer. 
- // Add it to the per-peer list - connector->proxyAppend->nextPeer = args; - connector->proxyAppend = args; } - pthread_mutex_unlock(&state->mutex); -} - -template <int type> -static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) { - if (peer < 0) return ncclSuccess; - - struct ncclPeer* peerComm = args->channel->peers+peer; - struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send; - if (connector->transportComm == NULL) return ncclInternalError; - if (connector->transportComm->proxy == NULL) return ncclSuccess; - - struct ncclProxyArgs* op; - NCCLCHECK(transportAllocateProxyArgs(connector->comm, &op)); - memcpy(op, args, sizeof(struct ncclProxyArgs)); - op->connector = connector; - op->progress = connector->transportComm->proxy; - op->state = ncclProxyOpReady; - ProxyAppend(connector, op); - return ncclSuccess; + WARN("No transport found !"); + return ncclInternalError; } -ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks) { - if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) { - struct ncclRing* ring = &args->channel->ring; - if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args)); - if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxySend>(ring->next, args)); +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { + TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); + uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */ + struct ncclConnect connect; + struct ncclConnector* conn; + for (int i=0; i<nrecv; i++) { + int peer = peerRecv[i]; + if (peer == -1 || peer >= comm->nRanks) continue; + conn = &channel->peers[peer].recv; + if (conn->connected) { ++nSkippedRecv; continue; } + memset(&connect, 0, sizeof(connect)); + NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id)); + NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); } - if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) { - // Tree up - struct ncclTree* tree = &args->channel->treeUp; - for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args)); - NCCLCHECK(SaveProxy<proxySend>(tree->up, args)); + for (int i=0; i<nsend; i++) { + int peer = peerSend[i]; + if (peer == -1 || peer >= comm->nRanks) continue; + conn = &channel->peers[peer].send; + if (conn->connected) { ++nSkippedSend; continue; } + memset(&connect, 0, sizeof(connect)); + NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->id)); + NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); } - if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) { - // Tree down - struct ncclTree* tree = &args->channel->treeDn; - for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args)); - NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args)); + for (int i=0; i<nsend; i++) { + int peer = peerSend[i]; + if (peer == -1 || peer >= comm->nRanks) continue; + conn = &channel->peers[peer].send; + if (conn->connected) {++nSkippedSend; continue; } + memset(&connect, 0, sizeof(connect)); + 
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn)); + conn->connected = 1; + CUDACHECK(cudaMemcpy(&channel->devPeers[peer].send, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice)); } - if (pattern == ncclPatternCollTreeUp) { - // CollTree up - struct ncclTree* tree = &args->channel->collTreeUp; - NCCLCHECK(SaveProxy<proxyRecv>(tree->down[0], args)); - NCCLCHECK(SaveProxy<proxySend>(tree->up, args)); - } - if (pattern == ncclPatternCollTreeDown) { - // CollTree down - struct ncclTree* tree = &args->channel->collTreeDn; - NCCLCHECK(SaveProxy<proxySend>(tree->down[0], args)); - NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args)); + for (int i=0; i<nrecv; i++) { + int peer = peerRecv[i]; + if (peer == -1 || peer >= comm->nRanks) continue; + conn = &channel->peers[peer].recv; + if (conn->connected) {++nSkippedRecv; continue; } + memset(&connect, 0, sizeof(connect)); + NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn)); + conn->connected = 1; + CUDACHECK(cudaMemcpy(&channel->devPeers[peer].recv, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice)); } + TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv); return ncclSuccess; } -void* persistentThread(void *comm_) { - struct ncclComm* comm = (struct ncclComm*)comm_; - struct ncclProxyState* state = &comm->proxyState; - struct ncclProxyArgs* op = NULL; - ncclResult_t ret = ncclSuccess; - int idle = 1; - int idleSpin = 0; - while (1) { - do { - if (*comm->abortFlag) return NULL; - if (op == NULL) { - pthread_mutex_lock(&state->mutex); - op = state->ops; - if (op == NULL) { - if (state->stop) { - // No more commands to process and proxy has been requested to stop - pthread_mutex_unlock(&state->mutex); - return NULL; - } - pthread_cond_wait(&state->cond, &state->mutex); - } - pthread_mutex_unlock(&state->mutex); - } - } while (op == NULL); - op->idle = 0; - // opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started - // yet and might be cancelled before they even start. Hold on on those. - if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op); - if (ret != ncclSuccess) { - comm->fatalError = ret; - INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); - return NULL; - } - idle &= op->idle; - pthread_mutex_lock(&state->mutex); - if (!idle) idleSpin = 0; - struct ncclProxyArgs *next = op->next; - if (next->state == ncclProxyOpNone) { - struct ncclProxyArgs *freeOp = next; - if (next->nextPeer) { - // Replace next by its next per-peer element. 
- next = next->nextPeer; - if (op != freeOp) { - next->next = freeOp->next; - op->next = next; - } else { - next->next = next; - } - } else { - // Remove next from circular list - next->connector->proxyAppend = NULL; - if (op != freeOp) { - next = next->next; - op->next = next; - } else { - next = NULL; - } - } - if (freeOp == state->ops) state->ops = next; - freeOp->next = state->pool; - state->pool = freeOp; - } - op = next; - if (op == state->ops) { - if (idle == 1) { - if (++idleSpin == 10) { - sched_yield(); - idleSpin = 0; - } - } - idle = 1; - } - pthread_mutex_unlock(&state->mutex); - } -} - -ncclResult_t transportStartProxy(struct ncclComm* comm) { - pthread_mutex_lock(&comm->proxyState.mutex); - if (comm->proxyState.ops != NULL) - pthread_cond_signal(&comm->proxyState.cond); - pthread_mutex_unlock(&comm->proxyState.mutex); - return ncclSuccess; -} -ncclResult_t transportCreateProxy(struct ncclComm* comm) { - if (!comm->proxyThread) { - comm->proxyState.cond = PTHREAD_COND_INITIALIZER; - comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER; - comm->proxyState.ops = NULL; - pthread_create(&comm->proxyThread, NULL, persistentThread, comm); - } - return ncclSuccess; -} - -ncclResult_t transportDestroyProxy(struct ncclComm* comm) { - struct ncclProxyState* state = &comm->proxyState; - - // Request the proxy to stop and then wake it - pthread_mutex_lock(&state->mutex); - state->stop = true; - pthread_cond_signal(&state->cond); - pthread_mutex_unlock(&state->mutex); - if (comm->proxyThread) pthread_join(comm->proxyThread, NULL); - - // Free off any memory allocated for the proxy arg pools - pthread_mutex_lock(&state->mutex); - struct ncclProxyState* proxyState = &comm->proxyState; - while (proxyState->pools != NULL) { - struct ncclProxyPool *next = proxyState->pools->next; - free(proxyState->pools); - proxyState->pools = next; - } - pthread_mutex_unlock(&state->mutex); - - return ncclSuccess; -} diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 435c88d..a11f8be 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. 
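In the coll_net.cc hunks below, the separate mhandle/llMhandle (and per-channel buffSize) fields give way to per-protocol arrays indexed by NCCL_PROTO_*, and every connector's protocol buffers are carved out of one allocation with a running offset, matching the new comm->buffSizes[] model. A hedged sketch of that carving pattern (the enum mirrors NCCL's LL/LL128/Simple protocols; sizes would come from the communicator):

  enum { PROTO_LL, PROTO_LL128, PROTO_SIMPLE, NUM_PROTOCOLS };
  static void carveBuffers(char* base, const int buffSizes[NUM_PROTOCOLS],
                           char* buffs[NUM_PROTOCOLS]) {
    int offset = 0;
    for (int p = 0; p < NUM_PROTOCOLS; p++) {
      buffs[p] = base + offset;  /* becomes conn.buffs[p] */
      offset += buffSizes[p];
    }
  }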
* * See LICENSE.txt for license information ************************************************************************/ @@ -15,17 +15,10 @@ struct collNetRecvConnectInfo { struct collNetSendConnectInfo { void* collNetComm; - void* mhandle; - void* llMhandle; + void* mhandles[NCCL_NUM_PROTOCOLS]; struct reqSlot* reqFifo; }; -struct ncclLLDataLine { - uint32_t data1; - uint32_t data2; -}; -static_assert(sizeof(struct ncclLLDataLine) == sizeof(union ncclLLFifoLine)>>1, "ncclLLDataLine is not half size of ncclLLFifoLine"); - struct reqSlot { volatile void* recvBuff; volatile int size; @@ -37,14 +30,11 @@ struct collNetSendResources { struct ncclRecvMem* hostRecvMem; struct ncclSendMem* devHostSendMem; struct ncclRecvMem* devHostRecvMem; - struct ncclLLDataLine* llData; + uint32_t* llData; int netDev; int useGdr; - int buffSize; - void* sendMhandle; - void* llSendMhandle; - void* recvMhandle; - void* llRecvMhandle; + void* sendMhandles[NCCL_NUM_PROTOCOLS]; + void* recvMhandles[NCCL_NUM_PROTOCOLS]; struct ncclRecvMem* devRecvMem; uint64_t step; uint64_t llLastCleaning; @@ -59,12 +49,10 @@ struct collNetRecvResources { struct ncclRecvMem* hostRecvMem; struct ncclSendMem* devHostSendMem; struct ncclRecvMem* devHostRecvMem; - struct ncclLLDataLine* llData; + uint32_t* llData; int netDev; int useGdr; - int buffSize; - void* mhandle; - void* llMhandle; + void* mhandles[NCCL_NUM_PROTOCOLS]; struct ncclRecvMem* devRecvMem; uint64_t step; uint64_t llLastCleaning; @@ -79,112 +67,120 @@ ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncc } /* Setup send connector, and return connect information for others in the coll communicator to connect to me */ -ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { - struct collNetSendResources* sendResources; - NCCLCHECK(ncclCalloc(&sendResources, 1)); - send->transportResources = sendResources; +ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) { + struct collNetSendResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + send->transportResources = resources; - NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &sendResources->netDev)); - NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, sendResources->netDev, 1, &sendResources->useGdr)); + NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev)); + NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr)); - int sendSize = sizeof(struct ncclSendMem); - NCCLCHECK(ncclCudaHostAlloc((void**)&sendResources->hostSendMem, (void**)&sendResources->devHostSendMem, sendSize)); + NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1)); + resources->devHostSendMem = resources->hostSendMem; - int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; - if (sendResources->useGdr) { - NCCLCHECK(ncclCudaCalloc((char**)(&sendResources->devRecvMem), recvSize)); - } - NCCLCHECK(ncclCudaHostAlloc((void**)&sendResources->hostRecvMem, (void**)&sendResources->devHostRecvMem, recvSize)); - NCCLCHECK(ncclIbMalloc((void**)&(sendResources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine))); - sendResources->buffSize = buffSize; + int recvSize = offsetof(struct 
ncclRecvMem, buff); + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += send->comm->buffSizes[p]; - INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), sendResources->netDev, - sendResources->useGdr ? "/GDRDMA" : ""); + if (resources->useGdr) { + NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize)); + } + NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize)); + resources->devHostRecvMem = resources->hostRecvMem; + NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), send->comm->buffSizes[NCCL_PROTO_LL]/2)); + INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev, + resources->useGdr ? "/GDRDMA" : ""); return ncclSuccess; } /* Setup recv connector */ -ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { - struct collNetRecvResources* recvResources; - NCCLCHECK(ncclCalloc(&recvResources, 1)); - recv->transportResources = recvResources; +ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) { + struct collNetRecvResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + recv->transportResources = resources; - NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &recvResources->netDev)); - NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, recvResources->netDev, 0, &recvResources->useGdr)); + NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev)); + NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr)); - int sendSize = sizeof(struct ncclSendMem); - NCCLCHECK(ncclCudaHostAlloc((void**)&recvResources->hostSendMem, (void**)&recvResources->devHostSendMem, sendSize)); + NCCLCHECK(ncclCudaHostCalloc(&resources->hostSendMem, 1)); + resources->devHostSendMem = resources->hostSendMem; - int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; - if (recvResources->useGdr) { - NCCLCHECK(ncclCudaCalloc((char**)(&recvResources->devRecvMem), recvSize)); + int recvSize = offsetof(struct ncclRecvMem, buff); + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) recvSize += recv->comm->buffSizes[p]; + + if (resources->useGdr) { + NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize)); } - NCCLCHECK(ncclCudaHostAlloc((void**)&recvResources->hostRecvMem, (void**)&recvResources->devHostRecvMem, recvSize)); - NCCLCHECK(ncclIbMalloc((void**)&(recvResources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine))); - recvResources->buffSize = buffSize; + NCCLCHECK(ncclCudaHostCalloc((char**)&resources->hostRecvMem, recvSize)); + resources->devHostRecvMem = resources->hostRecvMem; - INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), recvResources->netDev, - recvResources->useGdr ? "/GDRDMA" : ""); + NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), recv->comm->buffSizes[NCCL_PROTO_LL]/2)); + INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev, + resources->useGdr ? 
"/GDRDMA" : ""); struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; - NCCLCHECK(collNetListen(recvResources->netDev, &info->collNetHandle, &recvResources->netListenComm)); - + NCCLCHECK(collNetListen(resources->netDev, &info->collNetHandle, &resources->netListenComm)); return ncclSuccess; } ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) { // Setup device pointers - struct collNetSendResources* sendResources = (struct collNetSendResources*)send->transportResources; - sendResources->collNetRank = rank; - - // Get info from recv side - struct collNetSendConnectInfo* sInfo = (struct collNetSendConnectInfo*)(connectInfos+rank); - sendResources->reqFifo = sInfo->reqFifo; - sendResources->collNetSendComm = sInfo->collNetComm; - sendResources->recvMhandle = sInfo->mhandle; - sendResources->llRecvMhandle = sInfo->llMhandle; + struct collNetSendResources* resources = (struct collNetSendResources*)send->transportResources; + struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank); // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host - struct ncclRecvMem* sRecvMem = sendResources->useGdr ? sendResources->devRecvMem : sendResources->devHostRecvMem; - // Register buffers - NCCLCHECK(collNetRegMr(sendResources->collNetSendComm, sRecvMem->buff, sendResources->buffSize, - sendResources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &sendResources->sendMhandle)); - NCCLCHECK(collNetRegMr(sendResources->collNetSendComm, sendResources->llData, - NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &sendResources->llSendMhandle)); - - send->conn.buff = sRecvMem->buff; - send->conn.llBuff = sendResources->devHostRecvMem->llBuff; - send->conn.direct |= sendResources->useGdr ? NCCL_DIRECT_NIC : 0; + struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem; + int offset = 0; + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + send->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset; + offset += send->comm->buffSizes[p]; + } + send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; // Head/Tail/Opcount/Fifos are always on host - send->conn.tail = &sendResources->devHostRecvMem->tail; - send->conn.opCountRem = &sendResources->devHostRecvMem->opCount; - send->conn.fifo = sendResources->devHostRecvMem->sizesFifo; - send->conn.head = &sendResources->devHostSendMem->head; - send->conn.opCountLoc = &sendResources->devHostSendMem->opCount; + send->conn.tail = &resources->devHostRecvMem->tail; + send->conn.opCountRem = &resources->devHostRecvMem->opCount; + send->conn.fifo = resources->devHostRecvMem->sizesFifo; + send->conn.head = &resources->devHostSendMem->head; + send->conn.opCountLoc = &resources->devHostSendMem->opCount; for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1; + // Get info from recv side + resources->collNetRank = rank; + resources->reqFifo = info->reqFifo; + resources->collNetSendComm = info->collNetComm; + + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) + resources->recvMhandles[p] = info->mhandles[p]; + + // Register buffers + NCCLCHECK(collNetRegMr(resources->collNetSendComm, send->conn.buffs[NCCL_PROTO_SIMPLE], send->comm->buffSizes[NCCL_PROTO_SIMPLE], + resources->useGdr ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_SIMPLE])); + NCCLCHECK(collNetRegMr(resources->collNetSendComm, resources->llData, send->comm->buffSizes[NCCL_PROTO_LL]/2, + NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_LL])); return ncclSuccess; } ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) { // Setup device pointers - struct collNetRecvResources* recvResources = (struct collNetRecvResources*)recv->transportResources; - struct collNetSendConnectInfo* sInfo = (struct collNetSendConnectInfo*)(connectInfos+rank); - recvResources->collNetRank = rank; + struct collNetRecvResources* resources = (struct collNetRecvResources*)recv->transportResources; + struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank); + resources->collNetRank = rank; // Intermediate buffering on GPU for GPU Direct RDMA - struct ncclRecvMem* rRecvMem = recvResources->useGdr ? recvResources->devRecvMem : recvResources->devHostRecvMem; - recv->conn.buff = rRecvMem->buff; - recv->conn.llBuff = recvResources->devHostRecvMem->llBuff; // recv LL buff always on host - recv->conn.direct |= recvResources->useGdr ? NCCL_DIRECT_NIC : 0; + struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem; + int offset = 0; + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->devHostRecvMem->buff : recvMem->buff) + offset; + offset += recv->comm->buffSizes[p]; + } + recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; // Head/Tail/Opcount are always on host - recv->conn.tail = &recvResources->devHostRecvMem->tail; - recv->conn.opCountLoc = &recvResources->devHostRecvMem->opCount; - recv->conn.head = &recvResources->devHostSendMem->head; - recv->conn.opCountRem = &recvResources->devHostSendMem->opCount; + recv->conn.tail = &resources->devHostRecvMem->tail; + recv->conn.opCountLoc = &resources->devHostRecvMem->opCount; + recv->conn.head = &resources->devHostSendMem->head; + recv->conn.opCountRem = &resources->devHostSendMem->opCount; // Connect to coll comm collNetHandle_t** handlePtrs = NULL; @@ -194,64 +190,64 @@ ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, in handlePtrs[i] = &(info->collNetHandle); } ncclResult_t res; - NCCLCHECKGOTO(collNetConnect((void**)handlePtrs, nranks, rank, recvResources->netListenComm, &recvResources->collNetRecvComm), res, cleanup); + NCCLCHECKGOTO(collNetConnect((void**)handlePtrs, nranks, rank, resources->netListenComm, &resources->collNetRecvComm), res, cleanup); // Register buffers - NCCLCHECK(collNetRegMr(recvResources->collNetRecvComm, rRecvMem->buff, recvResources->buffSize, - recvResources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &recvResources->mhandle)); - NCCLCHECK(collNetRegMr(recvResources->collNetRecvComm, recvResources->llData, - NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &recvResources->llMhandle)); + NCCLCHECK(collNetRegMr(resources->collNetRecvComm, recv->conn.buffs[NCCL_PROTO_SIMPLE], recv->comm->buffSizes[NCCL_PROTO_SIMPLE], + resources->useGdr ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_SIMPLE])); + NCCLCHECK(collNetRegMr(resources->collNetRecvComm, resources->llData, recv->comm->buffSizes[NCCL_PROTO_LL]/2, + NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_LL])); // Create shared info between send and recv proxies - NCCLCHECK(ncclCalloc(&(recvResources->reqFifo), NCCL_STEPS)); + NCCLCHECK(ncclCalloc(&(resources->reqFifo), NCCL_STEPS)); // Pass info to send side - sInfo->reqFifo = recvResources->reqFifo; - sInfo->collNetComm = recvResources->collNetRecvComm; - sInfo->mhandle = recvResources->mhandle; - sInfo->llMhandle = recvResources->llMhandle; + info->reqFifo = resources->reqFifo; + info->collNetComm = resources->collNetRecvComm; + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) + info->mhandles[p] = resources->mhandles[p]; cleanup: if (handlePtrs != NULL) free(handlePtrs); // Close listen comm - NCCLCHECK(collNetCloseListen(recvResources->netListenComm)); + NCCLCHECK(collNetCloseListen(resources->netListenComm)); return res; } ncclResult_t collNetSendFree(void* sendTransportResources) { - struct collNetSendResources* sendResources = (struct collNetSendResources*)sendTransportResources; - NCCLCHECK(ncclCudaHostFree(sendResources->hostSendMem)); - NCCLCHECK(ncclCudaHostFree(sendResources->hostRecvMem)); - if (sendResources->collNetSendComm) { - NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->sendMhandle)); - NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->llSendMhandle)); + struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources; + NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); + NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); + if (resources->collNetSendComm) { + NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_LL])); + NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_SIMPLE])); } - if (sendResources->useGdr) - CUDACHECK(cudaFree(sendResources->devRecvMem)); - free(sendResources->llData); - free(sendResources); + if (resources->useGdr) + CUDACHECK(cudaFree(resources->devRecvMem)); + free(resources->llData); + free(resources); return ncclSuccess; } ncclResult_t collNetRecvFree(void* recvTransportResources) { - struct collNetRecvResources* recvResources = (struct collNetRecvResources*)recvTransportResources; - NCCLCHECK(ncclCudaHostFree(recvResources->hostSendMem)); - if (recvResources->collNetRecvComm) { - NCCLCHECK(collNetDeregMr(recvResources->collNetRecvComm, recvResources->mhandle)); - NCCLCHECK(collNetDeregMr(recvResources->collNetRecvComm, recvResources->llMhandle)); + struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources; + NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); + if (resources->collNetRecvComm) { + NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_LL])); + NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_SIMPLE])); } - NCCLCHECK(ncclCudaHostFree(recvResources->hostRecvMem)); - if (recvResources->useGdr) - CUDACHECK(cudaFree(recvResources->devRecvMem)); - free(recvResources->llData); - free(recvResources->reqFifo); + NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); + if (resources->useGdr) + CUDACHECK(cudaFree(resources->devRecvMem)); + free(resources->llData); + free(resources->reqFifo); // Make sure SendFree is called before RecvFree - if (recvResources->collNetRecvComm) { - 
NCCLCHECK(collNetCloseColl(recvResources->collNetRecvComm)); + if (resources->collNetRecvComm) { + NCCLCHECK(collNetCloseColl(resources->collNetRecvComm)); } - free(recvResources); + free(resources); return ncclSuccess; } @@ -273,6 +269,11 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { args->state = ncclProxyOpProgress; } if (args->state == ncclProxyOpProgress) { + int p = args->protocol; + int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS; + char* localBuff = args->connector->conn.buffs[p]; + void* sendMhandle = resources->sendMhandles[p]; + void* recvMhandle = resources->recvMhandles[p]; args->idle = 1; struct reqSlot* reqFifo = resources->reqFifo; if (args->head < args->end) { @@ -286,7 +287,7 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { if (size != -1) { uint32_t flag = NCCL_LL_FLAG(args->tail + 1); int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); - union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES; + union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize); int ready = 1; for (int i=0; i<nFifoLines; i++) { volatile uint32_t *f1 = &lines[i].flag1; @@ -294,16 +295,17 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { if (f1[0] != flag || f2[0] != flag) { ready = 0; break; } } if (ready) { + int stepLines = stepSize / sizeof(union ncclLLFifoLine); //separate data from flag - struct ncclLLDataLine* sendBuff = resources->llData+buffSlot*NCCL_LL_SLICE_LINES; + uint32_t* sendBuff = resources->llData+buffSlot*2*stepLines; // each line has two data elements for (int i=0; i<nFifoLines; i++) { volatile uint32_t *d1 = &lines[i].data1; volatile uint32_t *d2 = &lines[i].data2; - sendBuff[i].data1 = d1[0]; - sendBuff[i].data2 = d2[0]; + sendBuff[2*i] = d1[0]; + sendBuff[2*i+1] = d2[0]; } - int count = nFifoLines*sizeof(struct ncclLLDataLine) / ncclTypeSize(args->dtype); - NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->llSendMhandle, resources->llRecvMhandle, args->requests+buffSlot)); + int count = nFifoLines*2*sizeof(uint32_t) / ncclTypeSize(args->dtype); + NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot)); if (args->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce (LL) posted, req %p", args->head, buffSlot, args->requests[buffSlot]); sizesFifo[buffSlot] = -1; @@ -315,12 +317,10 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { } } } else if (args->tail < *recvTail) { - int stepSize = args->channel->buffSize/NCCL_STEPS; - struct ncclRecvMem* localMem = resources->useGdr ? 
resources->devRecvMem : resources->hostRecvMem; // Send through network if (sizesFifo[buffSlot] != -1) { int count = sizesFifo[buffSlot]/ncclTypeSize(args->dtype); - NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localMem->buff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->sendMhandle, resources->recvMhandle, args->requests+buffSlot)); + NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localBuff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot)); if (args->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p count %d", args->head, buffSlot, args->requests[buffSlot], count); sizesFifo[buffSlot] = -1; @@ -377,16 +377,18 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { } if (args->state == ncclProxyOpProgress) { args->idle = 1; - int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine) : args->channel->buffSize ) / NCCL_STEPS; + int p = args->protocol; + int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS; + char* localBuff = args->connector->conn.buffs[p]; + void* mhandle = resources->mhandles[p]; struct reqSlot* reqFifo = resources->reqFifo; if (args->head < args->end) { - struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; - char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)resources->llData : localMem->buff; - void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : resources->mhandle; if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) { int buffSlot = args->tail%NCCL_STEPS; - reqFifo[buffSlot].recvBuff = localBuff+buffSlot*stepSize; - TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, localBuff+buffSlot*stepSize); + char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff; + int recvStepSize = p == NCCL_PROTO_LL ? 
stepSize/2 : stepSize; + reqFifo[buffSlot].recvBuff = recvBuff+buffSlot*recvStepSize; + TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, reqFifo[buffSlot].recvBuff); args->tail += args->sliceSteps; args->idle = 0; } @@ -398,15 +400,16 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { if (args->protocol == NCCL_PROTO_LL) { // ll // re-attach flag uint32_t flag = args->head; - union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(resources->hostRecvMem->llBuff)+buffSlot*NCCL_LL_SLICE_LINES; - struct ncclLLDataLine* recvData = resources->llData+buffSlot*NCCL_LL_SLICE_LINES; - int nFifoLines = DIVUP(reqFifo[buffSlot].size, sizeof(struct ncclLLDataLine)); + int stepLines = stepSize / sizeof(union ncclLLFifoLine); + union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize); + uint32_t* recvData = resources->llData+buffSlot*2*stepLines; + int nFifoLines = DIVUP(reqFifo[buffSlot].size, 2*sizeof(uint32_t)); for (int i=0; i<nFifoLines; i++) { - lines[i].v[0] = ((uint64_t)flag << 32) + recvData[i].data1; - lines[i].v[1] = ((uint64_t)flag << 32) + recvData[i].data2; + lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i]; + lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1]; } } else if (args->protocol == NCCL_PROTO_SIMPLE) { - if (resources->useGdr) collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle); + if (resources->useGdr) NCCLCHECK(collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle)); resources->hostRecvMem->tail = args->head; } args->idle = 0; diff --git a/src/transport/net.cc b/src/transport/net.cc index 288ad92..7cf4d09 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. 
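A note on the LL handling just above: each ncclLLFifoLine packs two 32-bit data words, each tagged by a 32-bit flag, into two 64-bit words. The collnet send proxy strips the flags before the network operation, which is why the llData staging buffers are sized buffSizes[NCCL_PROTO_LL]/2, and the recv proxy re-attaches the step's flag exactly as in the loop above. A standalone sketch of the pack/unpack, assuming v[] is the raw 64-bit view of a line:

  #include <stdint.h>
  typedef struct { uint64_t v[2]; } llLine;  /* stand-in for union ncclLLFifoLine */

  /* Send side: keep only the two data words (half the bytes on the wire). */
  static void llStrip(const llLine* line, uint32_t data[2]) {
    data[0] = (uint32_t)line->v[0];
    data[1] = (uint32_t)line->v[1];
  }
  /* Recv side: re-attach the flag in the high 32 bits of each word. */
  static void llAttach(uint32_t flag, const uint32_t data[2], llLine* line) {
    line->v[0] = ((uint64_t)flag << 32) + data[0];
    line->v[1] = ((uint64_t)flag << 32) + data[1];
  }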
* * See LICENSE.txt for license information ************************************************************************/ @@ -12,19 +12,20 @@ struct netConnectInfo { ncclNetHandle_t netHandle; }; +#define LOC_HOSTMEM 0 +#define LOC_DEVMEM 1 +#define LOC_COUNT 2 + struct netSendResources { void* netSendComm; - struct ncclSendMem* hostSendMem; - struct ncclRecvMem* hostRecvMem; - struct ncclSendMem* devHostSendMem; - struct ncclRecvMem* devHostRecvMem; + struct ncclSendMem* sendMem; + struct ncclRecvMem* recvMem; int netDev; int useGdr; - int buffSize; - void* mhandle; - void* llMhandle; - void* ll128Mhandle; - struct ncclRecvMem* devRecvMem; + char* buffers[LOC_COUNT]; + int buffSizes[LOC_COUNT]; + void* mhandles[LOC_COUNT]; + void** mhandlesProto[NCCL_NUM_PROTOCOLS]; uint64_t step; uint64_t llLastCleaning; }; @@ -32,17 +33,14 @@ struct netSendResources { struct netRecvResources { void* netListenComm; void* netRecvComm; - struct ncclSendMem* hostSendMem; - struct ncclRecvMem* hostRecvMem; - struct ncclSendMem* devHostSendMem; - struct ncclRecvMem* devHostRecvMem; + struct ncclSendMem* sendMem; + struct ncclRecvMem* recvMem; int netDev; int useGdr; - int buffSize; - void* mhandle; - void* llMhandle; - void* ll128Mhandle; - struct ncclRecvMem* devRecvMem; + char* buffers[LOC_COUNT]; + int buffSizes[LOC_COUNT]; + void* mhandles[LOC_COUNT]; + void** mhandlesProto[NCCL_NUM_PROTOCOLS]; uint64_t step; uint64_t llLastCleaning; }; @@ -55,84 +53,123 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop /* Determine if we will use this transport for this peer and return connect * information for this peer */ -ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { +ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) { struct netSendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; - NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &resources->netDev)); + NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev)); NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr)); - int sendSize = sizeof(struct ncclSendMem); - NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); + NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); + NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1)); + + send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; + send->conn.tail = &resources->recvMem->tail; + send->conn.opCountRem = &resources->recvMem->opCount; + send->conn.fifo = resources->recvMem->sizesFifo; + send->conn.head = &resources->sendMem->head; + send->conn.opCountLoc = &resources->sendMem->opCount; + for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1; + + int protoLoc[NCCL_NUM_PROTOCOLS]; + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? 
LOC_DEVMEM : LOC_HOSTMEM; + } - int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; - if (resources->useGdr) { - NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize)); + int buffSizes[NCCL_NUM_PROTOCOLS]; + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + // Only allocate buffers for simple for p2p connections + buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : send->comm->buffSizes[p]; + resources->buffSizes[protoLoc[p]] += buffSizes[p]; } - NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize)); - resources->buffSize = buffSize; - INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev, + if (resources->buffSizes[LOC_DEVMEM]) { + NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM])); + } + if (resources->buffSizes[LOC_HOSTMEM]) { + NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM])); + } + + int offsets[LOC_COUNT]; + offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0; + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + resources->mhandlesProto[p] = resources->mhandles+protoLoc[p]; + send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]]; + offsets[protoLoc[p]] += buffSizes[p]; + } + + INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev, resources->useGdr ? "/GDRDMA" : ""); return ncclSuccess; } -ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { +ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) { struct netRecvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; - NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &resources->netDev)); + NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev)); NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr)); - int sendSize = sizeof(struct ncclSendMem); - NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); + NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); + NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1)); + + recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; + recv->conn.tail = &resources->recvMem->tail; + recv->conn.opCountLoc = &resources->recvMem->opCount; + recv->conn.head = &resources->sendMem->head; + recv->conn.opCountRem = &resources->sendMem->opCount; + + int protoLoc[NCCL_NUM_PROTOCOLS]; + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM; + } + + int buffSizes[NCCL_NUM_PROTOCOLS]; + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + // Only allocate buffers for simple for p2p connections + buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 
-ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
   struct netRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
 
-  NCCLCHECK(ncclTopoGetNetDev(topo, graph, myInfo->rank, channelId, &resources->netDev));
+  NCCLCHECK(ncclTopoGetNetDev(topo, myInfo->rank, graph, channelId, &resources->netDev));
   NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
 
-  int sendSize = sizeof(struct ncclSendMem);
-  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
+  NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
+  NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
+
+  recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
+  recv->conn.tail = &resources->recvMem->tail;
+  recv->conn.opCountLoc = &resources->recvMem->opCount;
+  recv->conn.head = &resources->sendMem->head;
+  recv->conn.opCountRem = &resources->sendMem->opCount;
+
+  int protoLoc[NCCL_NUM_PROTOCOLS];
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
+  }
+
+  int buffSizes[NCCL_NUM_PROTOCOLS];
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    // Only allocate buffers for the SIMPLE protocol for p2p connections
+    buffSizes[p] = graph == NULL && p != NCCL_PROTO_SIMPLE ? 0 : recv->comm->buffSizes[p];
+    resources->buffSizes[protoLoc[p]] += buffSizes[p];
+  }
+
+  if (resources->buffSizes[LOC_DEVMEM]) {
+    NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM]));
+  }
+  if (resources->buffSizes[LOC_HOSTMEM]) {
+    NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
+  }
 
-  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
-  if (resources->useGdr) {
-    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+  int offsets[LOC_COUNT];
+  offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
+    recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
+    offsets[protoLoc[p]] += buffSizes[p];
   }
-  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
-  resources->buffSize = buffSize;
 
-  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
+  INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
     resources->useGdr ? "/GDRDMA" : "");
 
   struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
   NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
+
   return ncclSuccess;
 }
 
 ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
   // Setup device pointers
   struct netSendResources* resources = (struct netSendResources*)send->transportResources;
-
-  // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
-  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
-  send->conn.buff = recvMem->buff;
-  send->conn.llBuff = resources->devHostRecvMem->llBuff;
-  send->conn.ll128Buff = recvMem->ll128Buff;
-  send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
-
-  // Head/Tail/Opcount/Fifos are always on host
-  send->conn.tail = &resources->devHostRecvMem->tail;
-  send->conn.opCountRem = &resources->devHostRecvMem->opCount;
-  send->conn.fifo = resources->devHostRecvMem->sizesFifo;
-  send->conn.head = &resources->devHostSendMem->head;
-  send->conn.opCountLoc = &resources->devHostSendMem->opCount;
-  for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
+  struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
 
   // Connect to remote peer
-  struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
   NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
 
-  NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->buff, resources->buffSize,
-        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
-  NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff,
-        NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle));
-  NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
-        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
-
+  if (resources->buffSizes[LOC_DEVMEM]) {
+    NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
+  }
+  if (resources->buffSizes[LOC_HOSTMEM]) {
+    NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
+  }
   return ncclSuccess;
 }
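The registration change above replaces three per-protocol ncclNetRegMr calls with at most one call per memory location, and each protocol's handle becomes an alias into the per-location table. A compilable sketch of that pattern, assuming a placeholder regMr() in place of the real network-plugin call and invented sizes:

  // Sketch only: regMr() and the Resources layout are stand-ins, not NCCL's API.
  typedef int Result;

  static Result regMr(void* comm, void* data, int size, int type, void** mhandle) {
    (void)comm; (void)size; (void)type;
    *mhandle = data;                    // placeholder "registration"
    return 0;
  }

  struct Resources {
    char* buffers[2];                   // [0]=host slab, [1]=device slab
    int   buffSizes[2];
    void* mhandles[2];                  // one handle per location
    void** mhandlesProto[3];            // per-protocol alias into mhandles[]
  };

  static Result registerAll(void* comm, Resources* r, const int* protoLoc) {
    for (int loc = 0; loc < 2; loc++) {
      // one registration covers every protocol buffer placed at this location
      if (r->buffSizes[loc] && regMr(comm, r->buffers[loc], r->buffSizes[loc], loc, &r->mhandles[loc]) != 0) return 1;
    }
    // each protocol points at the handle of the location its buffer lives in
    for (int p = 0; p < 3; p++) r->mhandlesProto[p] = &r->mhandles[protoLoc[p]];
    return 0;
  }

  int main() {
    static char host[64], dev[64];
    Resources r = {{host, dev}, {64, 64}, {}, {}};
    int protoLoc[3] = {0, 1, 1};        // e.g. LL on host, LL128/SIMPLE on device
    return registerAll(nullptr, &r, protoLoc);
  }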
@@ -141,42 +178,29 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
   // Setup device pointers
   struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
 
-  // Intermediate buffering on GPU for GPU Direct RDMA
-  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
-  recv->conn.buff = recvMem->buff;
-  recv->conn.llBuff = recvMem->llBuff;
-  recv->conn.ll128Buff = recvMem->ll128Buff;
-  recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
-
-  // Head/Tail/Opcount are always on host
-  recv->conn.tail = &resources->devHostRecvMem->tail;
-  recv->conn.opCountLoc = &resources->devHostRecvMem->opCount;
-  recv->conn.head = &resources->devHostSendMem->head;
-  recv->conn.opCountRem = &resources->devHostSendMem->opCount;
-
   // Finish connection establishment from remote peer
   NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
   NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
 
-  NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->buff, resources->buffSize,
-        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
-  NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE,
-        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle));
-  NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE,
-        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle));
-
+  if (resources->buffSizes[LOC_DEVMEM]) {
+    NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
+  }
+  if (resources->buffSizes[LOC_HOSTMEM]) {
+    NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
+  }
   return ncclSuccess;
 }
 
 ncclResult_t netSendFree(void* transportResources) {
   struct netSendResources* resources = (struct netSendResources*)transportResources;
-  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
-  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle));
-  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
-  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->ll128Mhandle));
-  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
-  if (resources->useGdr)
-    CUDACHECK(cudaFree(resources->devRecvMem));
+  NCCLCHECK(ncclCudaHostFree(resources->sendMem));
+  NCCLCHECK(ncclCudaHostFree(resources->recvMem));
+  for (int l=0; l<LOC_COUNT; l++) {
+    if (resources->buffers[l])
+      NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l]));
+  }
+  NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
+  CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM]));
   NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
   free(resources);
   return ncclSuccess;
@@ -184,13 +208,14 @@ ncclResult_t netSendFree(void* transportResources) {
 
 ncclResult_t netRecvFree(void* transportResources) {
   struct netRecvResources* resources = (struct netRecvResources*)transportResources;
-  NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
-  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle));
-  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
-  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->ll128Mhandle));
-  NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
-  if (resources->useGdr)
-    CUDACHECK(cudaFree(resources->devRecvMem));
+  NCCLCHECK(ncclCudaHostFree(resources->sendMem));
+  NCCLCHECK(ncclCudaHostFree(resources->recvMem));
+  for (int l=0; l<LOC_COUNT; l++) {
+    if (resources->buffers[l])
+      NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[l]));
+  }
+  NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
+  CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM]));
   NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
   free(resources);
   return ncclSuccess;
@@ -200,7 +225,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
   struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources);
   if (args->state == ncclProxyOpReady) {
     // Update opCount
-    resources->hostRecvMem->opCount = args->opCount;
+    resources->recvMem->opCount = args->opCount;
 
     // Round to next multiple of sliceSteps
     resources->step = ROUNDUP(resources->step, args->chunkSteps);
@@ -210,18 +235,19 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
     args->state = ncclProxyOpProgress;
   }
   if (args->state == ncclProxyOpProgress) {
+    int p = args->protocol;
+    int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
+    char* localBuff = args->connector->conn.buffs[p];
+    void* mhandle = *(resources->mhandlesProto[p]);
     args->idle = 1;
     if (args->head < args->end) {
+      int buffSlot = args->tail%NCCL_STEPS;
      if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
-        volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
-        volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
+        volatile int* sizesFifo = resources->recvMem->sizesFifo;
+        volatile uint64_t* recvTail = &resources->recvMem->tail;
         if (args->protocol == NCCL_PROTO_LL128) {
-          int stepSize = NCCL_LL128_BUFF_SIZE/NCCL_STEPS;
           if (args->tail < *recvTail) {
-            int buffSlot = args->tail%NCCL_STEPS;
             if (sizesFifo[buffSlot] != -1) {
-              struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
-              char* localBuff = (char*)localMem->ll128Buff;
               int ready = resources->useGdr;
               if (!ready) {
                 // When data is in sysmem, we need to wait until all flags are correct since the GPU only
@@ -236,7 +262,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
               }
               if (ready) {
                 // Send through network
-                NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], resources->ll128Mhandle, args->requests+buffSlot));
+                NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], mhandle, args->requests+buffSlot));
                 if (args->requests[buffSlot] != NULL) {
                   sizesFifo[buffSlot] = -1;
                   // Make sure size is reset to zero before we update the head.
@@ -248,13 +274,12 @@
             }
           }
         } else if (args->protocol == NCCL_PROTO_LL) {
-          int buffSlot = args->tail%NCCL_STEPS;
           int size = sizesFifo[buffSlot];
           if (size != -1) {
             uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
             int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
             size = nFifoLines * sizeof(union ncclLLFifoLine);
-            union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
+            union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
             int ready = 1;
             for (int i=0; i<nFifoLines; i++) {
               volatile uint32_t *f1 = &lines[i].flag1;
@@ -262,7 +287,7 @@
               if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
             }
             if (ready) {
-              NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, resources->llMhandle, args->requests+buffSlot));
+              NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, mhandle, args->requests+buffSlot));
               if (args->requests[buffSlot] != NULL) {
                 sizesFifo[buffSlot] = -1;
                 // Make sure size is reset to zero before we update the head.
@@ -273,12 +298,9 @@
             }
           }
         } else if (args->tail < *recvTail) {
-          int stepSize = args->channel->buffSize/NCCL_STEPS;
-          struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
           // Send through network
-          int buffSlot = args->tail%NCCL_STEPS;
           if (sizesFifo[buffSlot] != -1) {
-            NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
+            NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], mhandle, args->requests+buffSlot));
             if (args->requests[buffSlot] != NULL) {
               sizesFifo[buffSlot] = -1;
               // Make sure size is reset to zero before we update the head.
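With per-protocol buffers, the proxy above derives everything from two numbers: the protocol's buffer size divided into NCCL_STEPS slots, and a monotonically increasing step counter mapped onto those slots modulo NCCL_STEPS. A toy illustration of that arithmetic (the buffer size is invented, and NCCL_STEPS is hard-coded here purely for the demo):

  #include <cstdint>
  #include <cstdio>

  int main() {
    const int NCCL_STEPS = 8;                  // slots per buffer; demo value
    const int buffSize  = 1 << 22;             // per-protocol buffer size, invented
    const int stepSize  = buffSize / NCCL_STEPS;
    for (uint64_t tail = 0; tail < 12; tail++) {
      int buffSlot = (int)(tail % NCCL_STEPS); // slots are reused cyclically
      printf("tail=%llu -> slot %d, byte offset %d\n",
             (unsigned long long)tail, buffSlot, buffSlot * stepSize);
    }
    return 0;
  }

Because the counter never wraps, head/tail comparisons like args->tail < args->head + NCCL_STEPS stay simple while the underlying slots recycle.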
@@ -295,7 +317,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
         NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
         if (done) {
           args->head += args->sliceSteps;
-          resources->hostSendMem->head = args->head;
+          resources->sendMem->head = args->head;
           args->idle = 0;
         }
       }
@@ -313,7 +335,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
   struct netRecvResources* resources = (struct netRecvResources*) (args->connector->transportResources);
   if (args->state == ncclProxyOpReady) {
     // Update opCount
-    resources->hostSendMem->opCount = args->opCount;
+    resources->sendMem->opCount = args->opCount;
 
     // Round to next multiple of sliceSteps
     resources->step = ROUNDUP(resources->step, args->chunkSteps);
@@ -324,12 +346,12 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
   }
   if (args->state == ncclProxyOpProgress) {
     args->idle = 1;
-    int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : args->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
+    int p = args->protocol;
+    int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS;
+    char* localBuff = args->connector->conn.buffs[p];
+    void* mhandle = *(resources->mhandlesProto[p]);
     if (args->head < args->end) {
-      struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
-      char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)localMem->llBuff : args->protocol == NCCL_PROTO_LL128 ? (char*)localMem->ll128Buff : localMem->buff;
-      void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : args->protocol == NCCL_PROTO_LL128 ? resources->ll128Mhandle : resources->mhandle;
-      volatile uint64_t* sendHead = &resources->hostSendMem->head;
+      volatile uint64_t* sendHead = &resources->sendMem->head;
       if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) {
         int buffSlot = args->tail%NCCL_STEPS;
         int sliceSize = stepSize * args->sliceSteps;
@@ -347,7 +369,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
         args->head += args->sliceSteps;
         if (args->protocol == NCCL_PROTO_SIMPLE) {
           if (resources->useGdr) NCCLCHECK(ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle));
-          resources->hostRecvMem->tail = args->head;
+          resources->recvMem->tail = args->head;
         }
         args->idle = 0;
       }
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index 1a832f2..97eca9f 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -112,6 +112,7 @@ static int ncclIbSpeed(int speed) {
 }
 
 ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
+  static int shownIbHcaEnv = 0;
   if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
   if (ncclParamIbDisable()) return ncclInternalError;
@@ -131,6 +132,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
 
   // Check if user defined which IB device:port to use
   char* userIbEnv = getenv("NCCL_IB_HCA");
+  if (userIbEnv != NULL && shownIbHcaEnv++ == 0) INFO(NCCL_NET|NCCL_ENV, "NCCL_IB_HCA set to %s", userIbEnv);
   struct netIf userIfs[MAX_IB_DEVS];
   bool searchNot = userIbEnv && userIbEnv[0] == '^';
   if (searchNot) userIbEnv++;
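The net_ib.cc change above logs the NCCL_IB_HCA override exactly once, even if ncclIbInit is entered more than once. A minimal sketch of the same print-once pattern, with printf standing in for the INFO macro (and, like the original, making no attempt at thread safety since the increment is not atomic):

  #include <cstdio>
  #include <cstdlib>

  static void logIbHcaOnce() {
    static int shown = 0;                   // persists across calls, like shownIbHcaEnv
    const char* env = getenv("NCCL_IB_HCA");
    if (env != NULL && shown++ == 0) printf("NCCL_IB_HCA set to %s\n", env);
  }

  int main() {
    logIbHcaOnce();   // logs once (if the variable is set)
    logIbHcaOnce();   // silent on subsequent calls
    return 0;
  }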
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 6586ce7..15816ce 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -10,6 +10,7 @@
 struct p2pConnectInfo {
   int direct;
+  int read;
   union {
     void* directPtr;
     cudaIpcMemHandle_t devIpc;
@@ -54,7 +55,8 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
   }
 
   // Check topology / p2p level.
-  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret));
+  int read;
+  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, &read));
   if (*ret == 0) return ncclSuccess;
 
   // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
@@ -95,23 +97,44 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
     TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
 } while (0)
 
+// Setting this to a non-zero value causes P2P to use Reads rather than Writes
+NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
+
+static int p2pUseRead(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+  int readEnable = ncclParamP2pReadEnable();
+  if (readEnable != -2) return readEnable;
+
+  int p2p, read;
+  // Query the topology: if the GPUs are Ampere and connected via NVLink,
+  // enable P2P Read by default
+  NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, &p2p, &read));
+
+  return read;
+}
+
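NCCL_P2P_READ_ENABLE above is a tri-state knob: -2 (the default) means "decide from the topology", while 0 or 1 force writes or reads respectively. A self-contained sketch of that pattern; getParam() and checkTopo() are stand-ins for ncclParamP2pReadEnable() and ncclTopoCheckP2p(), and the stub topology always reports an NVLink-connected pair:

  #include <cstdio>
  #include <cstdlib>

  static int getParam() {
    const char* env = getenv("NCCL_P2P_READ_ENABLE");
    return env ? atoi(env) : -2;              // -2 = "no user preference"
  }

  static void checkTopo(int* p2p, int* read) { *p2p = 1; *read = 1; } // stub

  static int useRead() {
    int v = getParam();
    if (v != -2) return v;                    // explicit 0/1 overrides the default
    int p2p, read;
    checkTopo(&p2p, &read);                   // e.g. Ampere + NVLink => read=1
    return read;
  }

  int main() { printf("P2P read: %d\n", useRead()); return 0; }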
 /* Send: Create and return connect structures for this peer to connect to me */
 ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
-    struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+    struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
   struct p2pSendResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   send->transportResources = resources;
+  int useRead = p2pUseRead(topo, myInfo, peerInfo);
   int sendSize = sizeof(struct ncclSendMem);
+  // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
+  if (useRead) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
   ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
   NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
 
   struct p2pConnectInfo info;
+  info.read = useRead;
+  const char* useReadStr = info.read ? "/read" : "";
   if (myInfo->pidHash == peerInfo->pidHash) {
     info.direct = 1;
     info.directPtr = resources->devMem;
     if (myInfo->cudaDev == peerInfo->cudaDev) {
-      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+      INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%d] -> %d[%d] via P2P/common device%s",
+          channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev, useReadStr);
       return ncclInternalError;
     } else {
       // Enable P2P access
@@ -123,8 +146,8 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
             peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
         return ncclInternalError;
       }
-      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer",
-          channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+      INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
+          channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
     }
   } else {
     // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
@@ -137,8 +160,8 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
           myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
       return ncclInternalError;
     }
-    INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC",
-        channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+    INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
+        channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
     //TRACE_DUMP_IPC(&info.devIpc);
   }
   static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -148,16 +171,20 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
 
 /* Create and return connect structures for this peer to connect to me */
 ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
-    struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
+    struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId) {
   struct p2pRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
-  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+  int useRead = p2pUseRead(topo, myInfo, peerInfo);
+  int recvSize = offsetof(struct ncclRecvMem, buff);
+  // For P2P Read the SIMPLE buffer lives at the end of the sender's ncclSendMem structure, so skip it here
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(useRead && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
   ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
   NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
 
   struct p2pConnectInfo info;
+  info.read = useRead;
   if (myInfo->pidHash == peerInfo->pidHash) {
     info.direct = 1;
     info.directPtr = resources->devMem;
@@ -173,7 +200,7 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
           peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err));
       return ncclInternalError;
     }
-      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+      TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
     }
   } else {
     // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
@@ -186,7 +213,7 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
           myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err));
       return ncclInternalError;
     }
-    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+    TRACE(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
     //TRACE_DUMP_IPC(&info.devIpc);
   }
   static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -201,7 +228,7 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks,
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
   if (info->direct) {
     remDevMem = (struct ncclRecvMem*)(info->directPtr);
-    send->conn.direct |= NCCL_DIRECT_GPU;
+    if (info->read == 0) send->conn.direct |= NCCL_DIRECT_GPU;
   } else {
     //TRACE_DUMP_IPC(&info->devIpc);
     cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
@@ -213,9 +240,16 @@
     }
   }
 
-  send->conn.buff = remDevMem->buff;
-  send->conn.llBuff = remDevMem->llBuff;
-  send->conn.ll128Buff = remDevMem->ll128Buff;
+  int offset = 0;
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    if (info->read && p == NCCL_PROTO_SIMPLE) {
+      /* For P2P Read the SIMPLE buffer is local (ncclSendMem) */
+      send->conn.buffs[p] = resources->devMem->buff;
+    } else {
+      send->conn.buffs[p] = remDevMem->buff + offset;
+      offset += send->comm->buffSizes[p];
+    }
+  }
   send->conn.tail = &remDevMem->tail;
   send->conn.opCountRem = &remDevMem->opCount;
   send->conn.head = &resources->devMem->head;
@@ -231,8 +265,10 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
   if (info->direct) {
     remDevMem = (struct ncclSendMem*)(info->directPtr);
-    recv->conn.direct |= NCCL_DIRECT_GPU;
-    recv->conn.ptrExchange = &remDevMem->ptrExchange;
+    if (info->read == 0) {
+      recv->conn.direct |= NCCL_DIRECT_GPU;
+      recv->conn.ptrExchange = &remDevMem->ptrExchange;
+    }
   } else {
     //TRACE_DUMP_IPC(&info->devIpc);
     cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
@@ -244,9 +280,16 @@
     }
   }
 
-  recv->conn.buff = resources->devMem->buff;
-  recv->conn.llBuff = resources->devMem->llBuff;
-  recv->conn.ll128Buff = resources->devMem->ll128Buff;
+  int offset = 0;
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    if (info->read && p == NCCL_PROTO_SIMPLE) {
+      /* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */
+      recv->conn.buffs[p] = remDevMem->buff;
+    } else {
+      recv->conn.buffs[p] = resources->devMem->buff + offset;
+      offset += recv->comm->buffSizes[p];
+    }
+  }
   recv->conn.tail = &resources->devMem->tail;
   recv->conn.opCountLoc = &resources->devMem->opCount;
   recv->conn.head = &remDevMem->head;
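The connect-time buffer wiring above reduces to a small sketch: in read mode the SIMPLE buffer comes from the sender's own allocation, while the remaining protocols are carved back-to-back out of the peer's slab. Names, sizes, and the PROTO_* enum below are illustrative only:

  #include <cstdio>
  enum { PROTO_LL, PROTO_LL128, PROTO_SIMPLE, NUM_PROTOCOLS };

  static void wireBuffs(char** connBuffs, char* localBuff, char* remoteBuff,
                        const int* buffSizes, int read) {
    int offset = 0;
    for (int p = 0; p < NUM_PROTOCOLS; p++) {
      if (read && p == PROTO_SIMPLE) {
        connBuffs[p] = localBuff;            // read mode: SIMPLE buffer is the sender's own
      } else {
        connBuffs[p] = remoteBuff + offset;  // others packed back-to-back in the peer's slab
        offset += buffSizes[p];
      }
    }
  }

  int main() {
    static char local[1 << 10], remote[1 << 12];    // toy slabs
    int sizes[NUM_PROTOCOLS] = { 256, 512, 1024 };  // illustrative
    char* buffs[NUM_PROTOCOLS];
    wireBuffs(buffs, local, remote, sizes, 1);
    for (int p = 0; p < NUM_PROTOCOLS; p++)
      printf("proto %d -> %s + %ld\n", p, (p == PROTO_SIMPLE) ? "local" : "remote",
             (long)(buffs[p] - ((p == PROTO_SIMPLE) ? local : remote)));
    return 0;
  }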
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index 0b1d8ee..caac3f6 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -57,7 +57,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
 
 #define MAX_SHM_NAME_LEN 1024
 
 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) {
 
   struct shmSendResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
@@ -75,13 +75,13 @@ ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
 
   TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
   NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
 
-  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
+  INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
   static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
   memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
   return ncclSuccess;
 }
 
-ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) {
   struct shmRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
   recv->transportResources = resources;
@@ -94,7 +94,9 @@ ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
 
   char shmName[MAX_SHM_NAME_LEN];
   sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
-  info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+  int shmSize = offsetof(struct ncclRecvMem, buff);
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
+  info.shmSize = resources->shmSize = shmSize;
   TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
   NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
 
@@ -118,9 +120,11 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int ran
   NCCLCHECK(shmUnlink(shmName));
   send->transportResources = resources;
 
-  send->conn.buff = resources->devRemHostMem->buff;
-  send->conn.llBuff = resources->devRemHostMem->llBuff;
-  send->conn.ll128Buff = resources->devRemHostMem->ll128Buff;
+  int offset = 0;
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    send->conn.buffs[p] = resources->devRemHostMem->buff + offset;
+    offset += send->comm->buffSizes[p];
+  }
   send->conn.tail = &resources->devRemHostMem->tail;
   send->conn.opCountRem = &resources->devRemHostMem->opCount;
@@ -143,9 +147,11 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
   recv->conn.head = &resources->devRemHostMem->head;
   recv->conn.opCountRem = &resources->devRemHostMem->opCount;
 
-  recv->conn.buff = resources->devHostMem->buff;
-  recv->conn.llBuff = resources->devHostMem->llBuff;
-  recv->conn.ll128Buff = resources->devHostMem->ll128Buff;
+  int offset = 0;
+  for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    recv->conn.buffs[p] = resources->devHostMem->buff + offset;
+    offset += recv->comm->buffSizes[p];
+  }
   recv->conn.tail = &resources->devHostMem->tail;
   recv->conn.opCountLoc = &resources->devHostMem->opCount;
   return ncclSuccess;
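Finally, the shared-memory segment above is sized as the ncclRecvMem header plus the sum of all per-protocol buffer sizes. The struct below is a toy stand-in for ncclRecvMem, included purely to show the offsetof arithmetic; the sizes are invented:

  #include <cstddef>
  #include <cstdint>
  #include <cstdio>

  struct RecvMem {                  // toy version of ncclRecvMem
    uint64_t tail, opCount;
    int sizesFifo[8];
    char buff[1];                   // variable-length tail: protocol buffers start here
  };

  int main() {
    int buffSizes[3] = { 1<<17, 1<<20, 1<<22 };      // illustrative per-protocol sizes
    size_t shmSize = offsetof(struct RecvMem, buff); // header part of the segment
    for (int p = 0; p < 3; p++) shmSize += buffSizes[p];
    printf("shm segment: %zu bytes\n", shmSize);
    return 0;
  }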