diff options
Diffstat (limited to 'src/include/core.h')
-rw-r--r-- | src/include/core.h | 423 |
1 files changed, 10 insertions, 413 deletions
diff --git a/src/include/core.h b/src/include/core.h index d57d271..8a08b91 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,385 +7,20 @@ #ifndef NCCL_CORE_H_ #define NCCL_CORE_H_ -#define NCCL_MAX_OPS 2048 -#define NCCL_STEPS 8 - +#include <pthread.h> +#include <algorithm> #include "nccl.h" -#include "transport.h" #include "debug.h" +#include "checks.h" +#include "alloc.h" +#include "transport.h" +#include "devcomm.h" +#include "comm.h" +#include "info.h" +#include "argcheck.h" #include <cstdio> -#include <algorithm> // std::min/std::max #include <unistd.h> #include <stdlib.h> -#include <cuda_runtime.h> - -#if CUDART_VERSION < 9000 -struct cudaLaunchParams { - void *func; - dim3 gridDim; - dim3 blockDim; - void **args; - size_t sharedMem; - cudaStream_t stream; -}; -#endif - -#define MAXCHANNELS 16 -#define MAXTHREADS 256 -#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ - -// Channels / LL tuning -#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings -#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL -#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs -#define NCCL_LL_MAX_NTHREADS MAXTHREADS -#define NCCL_LL_MIN_NTHREADS 64 - -#define DIVUP(x, y) \ - (((x)+(y)-1)/(y)) -#define ROUNDUP(x, y) \ - (DIVUP((x), (y))*(y)) - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -union ncclLLFifoLine { - /* Flags have to be *after* data, because otherwise, an incomplete receive - from the network may receive the flag but not the data. - Note this is assuming that either we receive contiguous chunks of data - (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */ - struct { - uint32_t data1; - uint32_t flag1; - uint32_t data2; - uint32_t flag2; - }; - uint64_t v[2]; - int4 i4; -}; - -typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t; - -typedef enum { - ncclPatternRing, - ncclPatternRingTwice, - ncclPatternPipelineFrom, - ncclPatternPipelineTo, - ncclPatternTreeUp, - ncclPatternTreeDown, - ncclPatternTreeUpDown -} ncclPattern_t; - -typedef enum { - ncclDevSuccess, - ncclDevAssertedMismatch, - ncclDevSuspectedMismatch -} ncclDevError_t; - -// Used to pass NCCL call information between functions -struct ncclInfo { - ncclColl_t coll; - const char* opName; - // NCCL Coll Args - const void* sendbuff; - void* recvbuff; - size_t count; - ncclDataType_t datatype; - ncclRedOp_t op; - int root; - ncclComm_t comm; - cudaStream_t stream; - // Algorithm details - int chunkSteps; - int sliceSteps; - // Computed later - ncclPattern_t pattern; - size_t nBytes; - int nstepsPerLoop; - int nchunksPerLoop; -}; - -struct ncclConnInfo { - // Regular comm mechanism - char *buff; // Local for recv, remote for send - uint64_t *tail; // Local for recv, remote for send - uint64_t *head; // Local for send, remote for recv - uint64_t *opCountLoc; // opCount of local rank - uint64_t *opCountRem; // opCount of remote rank - - int direct; // Direct communication - void **ptrExchange; // Pointer exchange for direct communication - - int *fifo; // Size fifo for proxy - - uint64_t step; // Keep where we are - - // Low latency mechanism - union ncclLLFifoLine *llBuff; // Local for recv, remote for send - uint64_t llLastCleaning; -}; - -struct ncclConnector { - int connected; - struct ncclProxyArgs *proxyAppend; - struct ncclTransportComm* transportComm; - void* transportResources; // Host-side resources - struct ncclConnInfo conn; - struct ncclComm *comm; -}; - -#define CACHE_LINE_SIZE 128 -#define MEM_ALIGN 4096 -#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */ - -#define NUM_LINES_PER_THREAD 8 -#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) -#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS) -#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine)) -#define NCCL_LL_CLEAN_FREQ 0x10000000 - -struct ncclSendMem { - union { - struct { - uint64_t head; - char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - void* ptrExchange; - char pad2[CACHE_LINE_SIZE-sizeof(void*)]; - uint64_t opCount; - }; - char pad3[MEM_ALIGN]; - }; -}; - -struct ncclRecvMem { - union { - struct { - uint64_t tail; - char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - uint64_t opCount; - char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; - int sizesFifo[NCCL_STEPS]; - }; - char pad4[MEM_ALIGN]; - }; - ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; - char buff[1]; // Actually larger than that -}; - -struct ncclRing { - // Shortcuts for userRanks[1] and userRanks[n-1] - int prev; - int next; - - // Maps an internal nccl index to user-specified rank order. This is necessary - // since we need to know how the user expects data to be ordered across - // devices. Ordered from current device. - int* userRanks; - int* devUserRanks; -}; - -#define NCCL_MAX_TREE_ARITY 3 -struct ncclTree { - int depth; - int up; - int down[NCCL_MAX_TREE_ARITY]; -}; - -struct ncclPeer { - struct ncclConnector send; - struct ncclConnector recv; -}; - -struct ncclChannel { - union { - struct { - struct ncclRing ring; - struct ncclTree tree; - - int id; - int nthreads; - int buffSize; - - // Communication structures - struct ncclPeer* peers; - struct ncclPeer* devPeers; - - // Operation list for aggregation - struct ncclColl* collectives; - struct ncclColl* devCollectives; - int collStart; - int collCount; - int collFifoHead; // Only used by GPU - int collFifoTail; // Only used by CPU - }; - int data[0x80]; - }; -}; -static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); - -/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */ -/* to make sure reads to host from the CUDA kernel are aligned. */ -/* Make sure to adjust padding at the end of ncclColl. */ -struct CollectiveArgs { - struct ncclComm* comm; - uint64_t opCount; - - // local and remote input, output, and buffer - const void * ThisInput; - void * ThisOutput; - - // general parameters - size_t N; - uint32_t root; - uint8_t bid; - uint8_t nChannels; - uint16_t nThreads; - - int lastChunkSize; -}; -struct ncclColl { - union { - struct { - struct CollectiveArgs args; - uint16_t funcIndex; - uint16_t nextIndex; - uint8_t active; - }; - int data[0x10]; - }; -}; -static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size"); - -struct ncclComm { - struct ncclChannel channels[MAXCHANNELS]; - - struct ncclPeerInfo* peerInfo; - - void* bootstrap; - - int rank; // my rank in the communicator - int nRanks; // number of GPUs in communicator - int cudaDev; // my cuda device index - int nvmlDev; // my NVML device number - - enum { GROUP, PARALLEL } launchMode; - cudaStream_t userStream; - bool userStreamSet; - cudaEvent_t doneEvent; - bool checkPointers; - - // Counter to make sure collectives match (needed for bcast/reduce - // where syncs are not symmetric). - uint64_t opCount; - - // Channels for collectives - int nChannels; - int nThreads; - - // Low-latency algorithm threshold - ssize_t llThreshold; - ssize_t threadThreshold; - - // Tree algorithm threshold - ssize_t treeThreshold; - - // An internal CUDA stream for NCCL kernel CGMD launches - int groupCudaStream; - cudaStream_t groupStream; - - // Whether there has been a fatal error in this communicator. - ncclResult_t fatalError; - - // Error reported by GPU - volatile ncclDevError_t* fatalDevError; - - // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped) - // On device: this pointer has been obtained from cudaHostGetDevicePointer() - volatile uint32_t *abortFlag; - - // Device copy of the communicator - struct ncclComm *devComm; - - // Intra-process sync - int intraRank; - int intraRanks; - int* intraBarrier; - int intraPhase; - - // Storage for deferred intra-process launch - struct cudaLaunchParams * intraParams; - struct cudaLaunchParams *myParams; - int* intraCudaDevs; - int* intraCGMode; // Whether we can use CUDA9 CGMD or not - int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not - struct ncclColl args; - void* argsptr; - - // Global proxy thread - pthread_t proxyThread; - struct ncclProxyState proxyState; -}; - -// Check CUDA calls -#define CUDACHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ - return ncclUnhandledCudaError; \ - } \ -} while(false) - -#define CUDACHECKGOTO(cmd, res, label) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ - res = ncclUnhandledCudaError; \ - goto label; \ - } \ -} while(false) - -#include <errno.h> -// Check system calls -#define SYSCHECK(call, name) do { \ - int retval; \ - SYSCHECKVAL(call, name, retval); \ -} while (false) - -#define SYSCHECKVAL(call, name, retval) do { \ - SYSCHECKSYNC(call, name, retval); \ - if (retval == -1) { \ - WARN("Call to " name " failed : %s", strerror(errno)); \ - return ncclSystemError; \ - } \ -} while (false) - -#define SYSCHECKSYNC(call, name, retval) do { \ - retval = call; \ - if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ - INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ - } else { \ - break; \ - } \ -} while(true) - -// Propagate errors up -#define NCCLCHECK(call) do { \ - ncclResult_t res = call; \ - if (res != ncclSuccess) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ - return res; \ - } \ -} while (0); - -#define NCCLCHECKGOTO(call, res, label) do { \ - res = call; \ - if (res != ncclSuccess) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ - goto label; \ - } \ -} while (0); #ifdef PROFAPI #define NCCL_API(ret, func, args...) \ @@ -427,42 +62,4 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) { } } -#include <sys/mman.h> -static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { - CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped)); - memset(*ptr, 0, size); - *devPtr = *ptr; - return ncclSuccess; -} - -static inline ncclResult_t ncclCudaHostFree(void* ptr) { - CUDACHECK(cudaFreeHost(ptr)); - return ncclSuccess; -} - -template <typename T> -static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { - void* p = malloc(nelem*sizeof(T)); - if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); - return ncclSystemError; - } - memset(p, 0, nelem*sizeof(T)); - *ptr = (T*)p; - return ncclSuccess; -} - -template <typename T> -static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) { - CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T))); - CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T))); - return ncclSuccess; -} - -template <typename T> -static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { - CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault)); - return ncclSuccess; -} - #endif // end include guard |