diff options
Diffstat (limited to 'src/include/comm.h')
-rw-r--r-- | src/include/comm.h | 30 |
1 files changed, 22 insertions, 8 deletions
diff --git a/src/include/comm.h b/src/include/comm.h index 3b2a85d..7164dc0 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -7,6 +7,8 @@ #ifndef NCCL_COMM_H_ #define NCCL_COMM_H_ +#include "transport.h" + #if CUDART_VERSION < 9000 struct cudaLaunchParams { void *func; @@ -18,13 +20,17 @@ struct cudaLaunchParams { }; #endif -#define MAXCHANNELS 16 #define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ #define CACHE_LINE_SIZE 128 #define MEM_ALIGN 4096 #define CUDA_IPC_MIN 2097152UL +// Channels / LL tuning +#define NCCL_LL_THREAD_THRESHOLD 8 +#define NCCL_LL128_THREAD_THRESHOLD 8 +#define NCCL_SIMPLE_THREAD_THRESHOLD 64 + struct ncclSendMem { union { struct { @@ -50,6 +56,7 @@ struct ncclRecvMem { char pad4[MEM_ALIGN]; }; ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; + uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS]; char buff[1]; // Actually larger than that }; @@ -57,13 +64,18 @@ struct ncclComm { struct ncclChannel channels[MAXCHANNELS]; struct ncclPeerInfo* peerInfo; + struct ncclTopoSystem* topo; void* bootstrap; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator int cudaDev; // my cuda device index - int nvmlDev; // my NVML device number + int64_t busId; // my PCI bus ID in int format + + int node; + int nNodes; + int localRanks; enum { GROUP, PARALLEL } launchMode; cudaStream_t userStream; @@ -74,17 +86,19 @@ struct ncclComm { // Counter to make sure collectives match (needed for bcast/reduce // where syncs are not symmetric). uint64_t opCount; + uint64_t lastOpCount; // Channels for collectives int nChannels; - int nThreads; - // Low-latency algorithm threshold - ssize_t llThreshold; - ssize_t threadThreshold; + // Only nvlink is used for inter-GPU communication + int nvlink; - // Tree algorithm threshold - ssize_t treeThreshold; + // Algorithm/Protocols thresholds + ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + int maxThreads[NCCL_NUM_PROTOCOLS]; // An internal CUDA stream for NCCL kernel CGMD launches int groupCudaStream; |