github.com/marian-nmt/nccl.git

Diffstat (limited to 'src/include/core.h')
 src/include/core.h | 423 +----------------------------------------------------
 1 file changed, 10 insertions(+), 413 deletions(-)
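In summary, this diff strips core.h down to a thin aggregation header: the tuning macros, device-side structs, and error-checking helpers deleted below appear to move into the dedicated headers that core.h now includes instead (checks.h, alloc.h, devcomm.h, comm.h, info.h, argcheck.h).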
diff --git a/src/include/core.h b/src/include/core.h
index d57d271..8a08b91 100644
--- a/src/include/core.h
+++ b/src/include/core.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,385 +7,20 @@
#ifndef NCCL_CORE_H_
#define NCCL_CORE_H_
-#define NCCL_MAX_OPS 2048
-#define NCCL_STEPS 8
-
+#include <pthread.h>
+#include <algorithm>
#include "nccl.h"
-#include "transport.h"
#include "debug.h"
+#include "checks.h"
+#include "alloc.h"
+#include "transport.h"
+#include "devcomm.h"
+#include "comm.h"
+#include "info.h"
+#include "argcheck.h"
#include <cstdio>
-#include <algorithm> // std::min/std::max
#include <unistd.h>
#include <stdlib.h>
-#include <cuda_runtime.h>
-
-#if CUDART_VERSION < 9000
-struct cudaLaunchParams {
- void *func;
- dim3 gridDim;
- dim3 blockDim;
- void **args;
- size_t sharedMem;
- cudaStream_t stream;
-};
-#endif
-
-#define MAXCHANNELS 16
-#define MAXTHREADS 256
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-
-// Channels / LL tuning
-#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
-#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
-#define NCCL_LL_MAX_NTHREADS MAXTHREADS
-#define NCCL_LL_MIN_NTHREADS 64
-
-#define DIVUP(x, y) \
- (((x)+(y)-1)/(y))
-#define ROUNDUP(x, y) \
- (DIVUP((x), (y))*(y))
-
-#define ALIGN_SIZE(size, align) \
- size = ((size + (align) - 1) / (align)) * (align);
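The helpers above do integer ceiling division and rounding, and ALIGN_SIZE rounds a size variable up in place. A quick worked example (illustrative values only):

    DIVUP(10, 4)    // (10 + 4 - 1) / 4 == 3
    ROUNDUP(10, 4)  // DIVUP(10, 4) * 4 == 12

    size_t bytes = 1000;
    ALIGN_SIZE(bytes, 256);  // bytes is now ((1000 + 255) / 256) * 256 == 1024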
-
-union ncclLLFifoLine {
- /* Flags have to be *after* data, because otherwise, an incomplete receive
- from the network may receive the flag but not the data.
- Note this is assuming that either we receive contiguous chunks of data
- (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
- struct {
- uint32_t data1;
- uint32_t flag1;
- uint32_t data2;
- uint32_t flag2;
- };
- uint64_t v[2];
- int4 i4;
-};
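The flag-after-data layout is what lets a receiver validate a line without extra synchronization: it can spin on both flags and only then trust the data words. A minimal polling sketch (illustrative; `line` and the expected `flag` value are assumed to come from the surrounding protocol state):

    uint32_t data1, flag1, data2, flag2;
    do {
      uint64_t v0 = ((volatile uint64_t*)line->v)[0];  // data1 | flag1
      uint64_t v1 = ((volatile uint64_t*)line->v)[1];  // data2 | flag2
      data1 = (uint32_t)v0; flag1 = (uint32_t)(v0 >> 32);
      data2 = (uint32_t)v1; flag2 = (uint32_t)(v1 >> 32);
    } while (flag1 != flag || flag2 != flag);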
-
-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
-
-typedef enum {
- ncclPatternRing,
- ncclPatternRingTwice,
- ncclPatternPipelineFrom,
- ncclPatternPipelineTo,
- ncclPatternTreeUp,
- ncclPatternTreeDown,
- ncclPatternTreeUpDown
-} ncclPattern_t;
-
-typedef enum {
- ncclDevSuccess,
- ncclDevAssertedMismatch,
- ncclDevSuspectedMismatch
-} ncclDevError_t;
-
-// Used to pass NCCL call information between functions
-struct ncclInfo {
- ncclColl_t coll;
- const char* opName;
- // NCCL Coll Args
- const void* sendbuff;
- void* recvbuff;
- size_t count;
- ncclDataType_t datatype;
- ncclRedOp_t op;
- int root;
- ncclComm_t comm;
- cudaStream_t stream;
- // Algorithm details
- int chunkSteps;
- int sliceSteps;
- // Computed later
- ncclPattern_t pattern;
- size_t nBytes;
- int nstepsPerLoop;
- int nchunksPerLoop;
-};
-
-struct ncclConnInfo {
- // Regular comm mechanism
- char *buff; // Local for recv, remote for send
- uint64_t *tail; // Local for recv, remote for send
- uint64_t *head; // Local for send, remote for recv
- uint64_t *opCountLoc; // opCount of local rank
- uint64_t *opCountRem; // opCount of remote rank
-
- int direct; // Direct communication
- void **ptrExchange; // Pointer exchange for direct communication
-
- int *fifo; // Size fifo for proxy
-
- uint64_t step; // Keep where we are
-
- // Low latency mechanism
- union ncclLLFifoLine *llBuff; // Local for recv, remote for send
- uint64_t llLastCleaning;
-};
-
-struct ncclConnector {
- int connected;
- struct ncclProxyArgs *proxyAppend;
- struct ncclTransportComm* transportComm;
- void* transportResources; // Host-side resources
- struct ncclConnInfo conn;
- struct ncclComm *comm;
-};
-
-#define CACHE_LINE_SIZE 128
-#define MEM_ALIGN 4096
-#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
-
-#define NUM_LINES_PER_THREAD 8
-#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
-#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
-#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
-#define NCCL_LL_CLEAN_FREQ 0x10000000
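With NCCL_LL_MAX_NTHREADS = MAXTHREADS = 256 and NCCL_STEPS = 8 (both removed above), these work out to:

    NCCL_LL_SLICE_LINES = 8 * 256          == 2048 lines
    NCCL_LL_BUFF_LINES  = 2048 * 8         == 16384 lines
    NCCL_LL_BUFF_SIZE   = 16384 * 16 bytes == 256 KiB

since sizeof(union ncclLLFifoLine) is 16 bytes (two uint64_t words).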
-
-struct ncclSendMem {
- union {
- struct {
- uint64_t head;
- char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
- void* ptrExchange;
- char pad2[CACHE_LINE_SIZE-sizeof(void*)];
- uint64_t opCount;
- };
- char pad3[MEM_ALIGN];
- };
-};
-
-struct ncclRecvMem {
- union {
- struct {
- uint64_t tail;
- char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
- uint64_t opCount;
- char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
- int sizesFifo[NCCL_STEPS];
- };
- char pad4[MEM_ALIGN];
- };
- ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
- char buff[1]; // Actually larger than that
-};
-
-struct ncclRing {
- // Shortcuts for userRanks[1] and userRanks[n-1]
- int prev;
- int next;
-
- // Maps an internal nccl index to user-specified rank order. This is necessary
- // since we need to know how the user expects data to be ordered across
- // devices. Ordered from current device.
- int* userRanks;
- int* devUserRanks;
-};
-
-#define NCCL_MAX_TREE_ARITY 3
-struct ncclTree {
- int depth;
- int up;
- int down[NCCL_MAX_TREE_ARITY];
-};
-
-struct ncclPeer {
- struct ncclConnector send;
- struct ncclConnector recv;
-};
-
-struct ncclChannel {
- union {
- struct {
- struct ncclRing ring;
- struct ncclTree tree;
-
- int id;
- int nthreads;
- int buffSize;
-
- // Communication structures
- struct ncclPeer* peers;
- struct ncclPeer* devPeers;
-
- // Operation list for aggregation
- struct ncclColl* collectives;
- struct ncclColl* devCollectives;
- int collStart;
- int collCount;
- int collFifoHead; // Only used by GPU
- int collFifoTail; // Only used by CPU
- };
- int data[0x80];
- };
-};
-static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
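The anonymous union with int data[0x80] pads every channel to exactly 0x80 * sizeof(int) = 512 bytes, which is the power-of-two size the static_assert checks.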
-
-/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
-/* to make sure reads to host from the CUDA kernel are aligned. */
-/* Make sure to adjust padding at the end of ncclColl. */
-struct CollectiveArgs {
- struct ncclComm* comm;
- uint64_t opCount;
-
- // local and remote input, output, and buffer
- const void * ThisInput;
- void * ThisOutput;
-
- // general parameters
- size_t N;
- uint32_t root;
- uint8_t bid;
- uint8_t nChannels;
- uint16_t nThreads;
-
- int lastChunkSize;
-};
-struct ncclColl {
- union {
- struct {
- struct CollectiveArgs args;
- uint16_t funcIndex;
- uint16_t nextIndex;
- uint8_t active;
- };
- int data[0x10];
- };
-};
-static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
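Here the padding union yields 0x10 * sizeof(int) = 64 bytes, the power-of-two size the comment above calls for so that host reads from the CUDA kernel stay aligned.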
-
-struct ncclComm {
- struct ncclChannel channels[MAXCHANNELS];
-
- struct ncclPeerInfo* peerInfo;
-
- void* bootstrap;
-
- int rank; // my rank in the communicator
- int nRanks; // number of GPUs in communicator
- int cudaDev; // my cuda device index
- int nvmlDev; // my NVML device number
-
- enum { GROUP, PARALLEL } launchMode;
- cudaStream_t userStream;
- bool userStreamSet;
- cudaEvent_t doneEvent;
- bool checkPointers;
-
- // Counter to make sure collectives match (needed for bcast/reduce
- // where syncs are not symmetric).
- uint64_t opCount;
-
- // Channels for collectives
- int nChannels;
- int nThreads;
-
- // Low-latency algorithm threshold
- ssize_t llThreshold;
- ssize_t threadThreshold;
-
- // Tree algorithm threshold
- ssize_t treeThreshold;
-
- // An internal CUDA stream for NCCL kernel CGMD launches
- int groupCudaStream;
- cudaStream_t groupStream;
-
- // Whether there has been a fatal error in this communicator.
- ncclResult_t fatalError;
-
- // Error reported by GPU
- volatile ncclDevError_t* fatalDevError;
-
- // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped)
- // On device: this pointer has been obtained from cudaHostGetDevicePointer()
- volatile uint32_t *abortFlag;
-
- // Device copy of the communicator
- struct ncclComm *devComm;
-
- // Intra-process sync
- int intraRank;
- int intraRanks;
- int* intraBarrier;
- int intraPhase;
-
- // Storage for deferred intra-process launch
- struct cudaLaunchParams * intraParams;
- struct cudaLaunchParams *myParams;
- int* intraCudaDevs;
- int* intraCGMode; // Whether we can use CUDA9 CGMD or not
- int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
- struct ncclColl args;
- void* argsptr;
-
- // Global proxy thread
- pthread_t proxyThread;
- struct ncclProxyState proxyState;
-};
-
-// Check CUDA calls
-#define CUDACHECK(cmd) do { \
- cudaError_t e = cmd; \
- if( e != cudaSuccess ) { \
- WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
- return ncclUnhandledCudaError; \
- } \
-} while(false)
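CUDACHECK only works inside functions returning ncclResult_t, since it returns on failure. A minimal usage sketch (hypothetical wrapper):

    ncclResult_t setDevice(int dev) {
      CUDACHECK(cudaSetDevice(dev));  // returns ncclUnhandledCudaError on failure
      return ncclSuccess;
    }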
-
-#define CUDACHECKGOTO(cmd, res, label) do { \
- cudaError_t e = cmd; \
- if( e != cudaSuccess ) { \
- WARN("Cuda failure '%s'", cudaGetErrorString(e)); \
- res = ncclUnhandledCudaError; \
- goto label; \
- } \
-} while(false)
-
-#include <errno.h>
-// Check system calls
-#define SYSCHECK(call, name) do { \
- int retval; \
- SYSCHECKVAL(call, name, retval); \
-} while (false)
-
-#define SYSCHECKVAL(call, name, retval) do { \
- SYSCHECKSYNC(call, name, retval); \
- if (retval == -1) { \
- WARN("Call to " name " failed : %s", strerror(errno)); \
- return ncclSystemError; \
- } \
-} while (false)
-
-#define SYSCHECKSYNC(call, name, retval) do { \
- retval = call; \
- if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
- INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
- } else { \
- break; \
- } \
-} while(true)
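SYSCHECKSYNC retries the call as long as it fails with EINTR/EWOULDBLOCK/EAGAIN, and SYSCHECKVAL then turns any remaining -1 into ncclSystemError while capturing the return value. A usage sketch (illustrative; fd, buf, and len assumed in scope):

    int bytes;
    SYSCHECKVAL(read(fd, buf, len), "read", bytes);  // bytes holds the byte count on success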
-
-// Propagate errors up
-#define NCCLCHECK(call) do { \
- ncclResult_t res = call; \
- if (res != ncclSuccess) { \
- /* Print the back trace*/ \
- INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
- return res; \
- } \
-} while (0);
-
-#define NCCLCHECKGOTO(call, res, label) do { \
- res = call; \
- if (res != ncclSuccess) { \
- /* Print the back trace*/ \
- INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
- goto label; \
- } \
-} while (0);
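NCCLCHECK propagates a failing ncclResult_t up the call stack with a file:line trace, while NCCLCHECKGOTO stores it and jumps to a cleanup label. A usage sketch with hypothetical helpers:

    ncclResult_t init(struct ncclComm* comm) {
      ncclResult_t res = ncclSuccess;
      NCCLCHECK(firstStep(comm));                 // early return on error
      NCCLCHECKGOTO(secondStep(comm), res, done); // jump to cleanup on error
    done:
      return res;
    }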
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
@@ -427,42 +62,4 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
}
}
-#include <sys/mman.h>
-static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
- CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped));
- memset(*ptr, 0, size);
- *devPtr = *ptr;
- return ncclSuccess;
-}
-
-static inline ncclResult_t ncclCudaHostFree(void* ptr) {
- CUDACHECK(cudaFreeHost(ptr));
- return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
- void* p = malloc(nelem*sizeof(T));
- if (p == NULL) {
- WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
- return ncclSystemError;
- }
- memset(p, 0, nelem*sizeof(T));
- *ptr = (T*)p;
- return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) {
- CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T)));
- CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T)));
- return ncclSuccess;
-}
-
-template <typename T>
-static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
- CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault));
- return ncclSuccess;
-}
-
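Together these helpers cover the common allocation patterns; a usage sketch (illustrative, assuming a caller that returns ncclResult_t and an element count n):

    int *hostBuf, *devBuf;
    NCCLCHECK(ncclCalloc(&hostBuf, n));             // zeroed host memory
    NCCLCHECK(ncclCudaCalloc(&devBuf, n));          // zeroed device memory
    NCCLCHECK(ncclCudaMemcpy(devBuf, hostBuf, n));  // cudaMemcpyDefault copy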
#endif // end include guard