diff options
Diffstat (limited to 'src/collectives')
22 files changed, 61 insertions, 77 deletions
diff --git a/src/collectives/all_gather.cu b/src/collectives/all_gather.cc index db21dee..348c176 100644 --- a/src/collectives/all_gather.cu +++ b/src/collectives/all_gather.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/all_reduce.cu b/src/collectives/all_reduce.cc index 1492c90..921f2de 100644 --- a/src/collectives/all_reduce.cu +++ b/src/collectives/all_reduce.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/broadcast.cu b/src/collectives/broadcast.cc index 6a3d0a8..042301b 100644 --- a/src/collectives/broadcast.cu +++ b/src/collectives/broadcast.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/collectives.h b/src/collectives/collectives.h index e6b19cb..73fe7d5 100644 --- a/src/collectives/collectives.h +++ b/src/collectives/collectives.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile index 8e92596..0ee587b 100644 --- a/src/collectives/device/Makefile +++ b/src/collectives/device/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu index 530bf14..109c341 100644 --- a/src/collectives/device/all_gather.cu +++ b/src/collectives/device/all_gather.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index 36809c9..8e78730 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" @@ -13,7 +13,7 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu index aaa96b4..85d007e 100644 --- a/src/collectives/device/all_reduce.cu +++ b/src/collectives/device/all_reduce.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index ea89a71..9b058cc 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" @@ -13,7 +13,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -87,7 +87,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclTree* tree = &channel->tree; const ssize_t size = args->N; @@ -139,7 +139,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; @@ -214,7 +214,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = args->nThreads; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclTree* tree = &channel->tree; const ssize_t size = args->N; diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu index b83ee70..8c8dbb6 100644 --- a/src/collectives/device/broadcast.cu +++ b/src/collectives/device/broadcast.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index fb18312..ae8667f 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" @@ -13,7 +13,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index e4aecbd..8c336bf 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,7 +8,7 @@ #define NCCL_DEVICE_COMMON_H_ #include "../collectives.h" -#include "core.h" +#include "devcomm.h" #include "nccl.h" // Exit If Abort Barrier across CTA: make sure all threads exit consistently @@ -57,7 +57,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ int bid = blockIdx.x; \ __shared__ struct ncclColl localColl; \ \ - struct ncclComm* comm = firstColl.args.comm; \ + struct ncclDevComm* comm = firstColl.args.comm; \ struct ncclChannel* channel = comm->channels+bid; \ struct ncclColl* c; \ if (bid == 0) { \ diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index e1fb096..435a598 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,7 +7,7 @@ #ifndef NCCL_COMMON_KERNEL_H_ #define NCCL_COMMON_KERNEL_H_ -#include "core.h" +#include "devcomm.h" #include <cstdio> #include <cstdint> diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index ea06b68..010c454 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "collectives.h" #include "common.h" diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh index 3942c8c..4413213 100755 --- a/src/collectives/device/gen_rules.sh +++ b/src/collectives/device/gen_rules.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index c5aaf54..7beeaf4 100644 --- a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -50,7 +50,7 @@ class ncclPrimitives { T* sendDirectBuff[NSEND]; const T* recvBuff[NRECV]; T* sendBuff[NSEND]; - struct ncclComm* comm; + struct ncclDevComm* comm; inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } @@ -239,7 +239,7 @@ class ncclPrimitives { public: __device__ __forceinline__ - ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) + ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) { // Make sure step is updated before we read it __syncthreads(); @@ -329,14 +329,14 @@ class ncclLLPrimitives { uint64_t sendConnHead; union ncclLLFifoLine* recvBuff[NRECV]; union ncclLLFifoLine* sendBuff[NSEND]; - struct ncclComm* comm; + struct ncclDevComm* comm; inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } - inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; } - inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; } + inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); } + inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } // Exit If Abort Barrier : make sure all threads exit consistently // Each thread sets a predicate to true if val == 1 @@ -393,7 +393,10 @@ class ncclLLPrimitives { sendConnHead = *waitPtr; if (checkAbort(sendConn[i]->opCountRem)) break; } - if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes; + if (fifoPtr) { + int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes; + fifoPtr[sendStep[i]%NCCL_STEPS] = size; + } } } @@ -402,7 +405,12 @@ class ncclLLPrimitives { if (tid == i) *postPtr = recvStep[i]; } - inline __device__ void postSend(int i) { + inline __device__ void postSend(int i, int offset) { + // LL Cleanup : write all flags in the slice to make sure we don't have + // data corruption when flag loops over. + if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) { + for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i)); + } sendStep[i]++; } @@ -443,9 +451,10 @@ class ncclLLPrimitives { uint32_t npack = DIVUP(nbytes, sizeof(uint64_t)); uint64_t* srcPack = (uint64_t*)srcPtr; uint64_t* dstPack = (uint64_t*)dstPtr; + int offset = tid; // Do multiples of 64 bits #pragma unroll 2 - for (int offset=tid; offset<npack; offset+=nthreads) { + for (; offset<npack; offset+=nthreads) { // Recv : local, then intra-node, then inter-node uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset); if (RECV) { @@ -471,7 +480,7 @@ class ncclLLPrimitives { } exitIfAbortLocalBarrier(); FOR_RECV(postRecv); - FOR_SEND(postSend); + FOR_SEND(postSend, offset); } __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { @@ -514,32 +523,9 @@ class ncclLLPrimitives { } } - __device__ __forceinline__ void llSendCleaning(int i) { - if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - /* Reset all flags */ - static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); - static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); - for (int s=0; s<NCCL_STEPS; s++) { - waitSend(i, 0); - for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) { - const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) }; - sendPtr(i)[o].i4 = resetLine.i4; - } - } - if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i]; - } - } - - __device__ __forceinline__ void llRecvCleaning(int i) { - if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - recvStep[i] += NCCL_STEPS; - if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i]; - } - } - public: __device__ __forceinline__ - ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) + ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) { // Make sure step is updated before we read it. barrier(); @@ -577,8 +563,6 @@ class ncclLLPrimitives { } __device__ __forceinline__ ~ncclLLPrimitives() { - for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i); - for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i); // Save steps for the next operation for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i); for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i); diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu index 1ef66d4..a2caac5 100644 --- a/src/collectives/device/reduce.cu +++ b/src/collectives/device/reduce.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index 302d053..d2d5d3b 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" @@ -13,7 +13,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -55,7 +55,7 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu index 10857ed..8b45299 100644 --- a/src/collectives/device/reduce_scatter.cu +++ b/src/collectives/device/reduce_scatter.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index c70c845..09ba56e 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" @@ -13,7 +13,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -69,7 +69,7 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; diff --git a/src/collectives/reduce.cu b/src/collectives/reduce.cc index 302d4bc..67f2fae 100644 --- a/src/collectives/reduce.cu +++ b/src/collectives/reduce.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/reduce_scatter.cu b/src/collectives/reduce_scatter.cc index 4ee77ef..5ad7f5f 100644 --- a/src/collectives/reduce_scatter.cu +++ b/src/collectives/reduce_scatter.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ |