diff options
Diffstat (limited to 'src/include/comm.h')
-rw-r--r-- | src/include/comm.h | 45 |
1 files changed, 30 insertions, 15 deletions
diff --git a/src/include/comm.h b/src/include/comm.h index 7164dc0..56116e0 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,6 +8,7 @@ #define NCCL_COMM_H_ #include "transport.h" +#include "p2p.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -20,8 +21,6 @@ struct cudaLaunchParams { }; #endif -#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ - #define CACHE_LINE_SIZE 128 #define MEM_ALIGN 4096 #define CUDA_IPC_MIN 2097152UL @@ -38,10 +37,10 @@ struct ncclSendMem { char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; void* ptrExchange; char pad2[CACHE_LINE_SIZE-sizeof(void*)]; - uint64_t opCount; }; char pad3[MEM_ALIGN]; }; + char buff[1]; // Actually larger than that }; struct ncclRecvMem { @@ -49,14 +48,11 @@ struct ncclRecvMem { struct { uint64_t tail; char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - uint64_t opCount; - char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; int sizesFifo[NCCL_STEPS]; + void* ptrsFifo[NCCL_STEPS]; }; char pad4[MEM_ALIGN]; }; - ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; - uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS]; char buff[1]; // Actually larger than that }; @@ -67,6 +63,10 @@ struct ncclComm { struct ncclTopoSystem* topo; void* bootstrap; + // Bitmasks for ncclTransportP2pSetup + int connect; + uint32_t* connectSend; + uint32_t* connectRecv; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator @@ -90,15 +90,19 @@ struct ncclComm { // Channels for collectives int nChannels; + // Channels (per peer) for p2p + int p2pnChannels; + int p2pnChannelsPerPeer; + int p2pChannels[MAXCHANNELS]; - // Only nvlink is used for inter-GPU communication - int nvlink; + // Buffer sizes + int buffSizes[NCCL_NUM_PROTOCOLS]; // Algorithm/Protocols thresholds ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - int maxThreads[NCCL_NUM_PROTOCOLS]; + int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; // An internal CUDA stream for NCCL kernel CGMD launches int groupCudaStream; @@ -107,9 +111,6 @@ struct ncclComm { // Whether there has been a fatal error in this communicator. ncclResult_t fatalError; - // Error reported by GPU - volatile ncclDevError_t* fatalDevError; - // Flag to ask NCCL kernels to abort volatile uint32_t *abortFlag; @@ -130,12 +131,26 @@ struct ncclComm { int* intraCudaDevs; int* intraCGMode; // Whether we can use CUDA9 CGMD or not int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not - struct ncclColl args; + struct ncclWorkElem args; void* argsptr; // Global proxy thread pthread_t proxyThread; struct ncclProxyState proxyState; + + // Whether this communicator uses collNet + int collNetSupport; + + // Store info of async operations + struct ncclInfo* asyncOps; + int asyncOpCount; + size_t asyncTotalSize; + + //list of async p2p operation queued in a group semantics + struct ncclP2Plist* p2pSends; + struct ncclP2Plist* p2pRecvs; + int p2pSendCount; + int p2pRecvCount; }; #endif |