github.com/marian-nmt/nccl.git

Diffstat (limited to 'src/include/comm.h')
-rw-r--r--  src/include/comm.h  30
1 file changed, 22 insertions(+), 8 deletions(-)
diff --git a/src/include/comm.h b/src/include/comm.h
index 3b2a85d..7164dc0 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -7,6 +7,8 @@
#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_
+#include "transport.h"
+
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
void *func;
@@ -18,13 +20,17 @@ struct cudaLaunchParams {
};
#endif
-#define MAXCHANNELS 16
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
#define CACHE_LINE_SIZE 128
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL
+// Channels / LL tuning
+#define NCCL_LL_THREAD_THRESHOLD 8
+#define NCCL_LL128_THREAD_THRESHOLD 8
+#define NCCL_SIMPLE_THREAD_THRESHOLD 64
+
struct ncclSendMem {
union {
struct {
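
Note: the three *_THREAD_THRESHOLD constants added above feed NCCL's launch-size tuning. Each (algorithm, protocol) pair gets a minimum number of bytes per thread, and small operations are given fewer channels and threads rather than spreading data too thin. A minimal sketch of that shrinking loop, using the nChannels/maxThreads/threadThresholds fields added to ncclComm later in this diff (the function itself is an illustration, not the verbatim implementation):

    /* Sketch only: mirrors the shape of the tuning implied by this diff.
     * Field names match the struct members added below; the loop body is
     * illustrative, not the exact NCCL code. */
    static void pickLaunchGeometry(struct ncclComm* comm, int algorithm, int protocol,
                                   size_t nBytes, int* nChannelsOut, int* nThreadsOut) {
      int nc = comm->nChannels;                  // start from all channels
      int nt = comm->maxThreads[protocol];       // and the protocol's max thread count
      ssize_t thr = comm->threadThresholds[algorithm][protocol];
      // Shrink the launch while each thread would get less than `thr` bytes.
      while ((ssize_t)nBytes < nc * nt * thr) {
        if (nc >= 2) nc--;                       // drop channels first
        else if ((nt % 128) == 0) nt /= 2;       // then halve threads per block
        else break;
      }
      *nChannelsOut = nc;
      *nThreadsOut = nt;
    }
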
@@ -50,6 +56,7 @@ struct ncclRecvMem {
char pad4[MEM_ALIGN];
};
ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
+ uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS];
char buff[1]; // Actually larger than that
};
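
Note: ncclRecvMem now carries one staging buffer per protocol: llBuff for LL, the new ll128Buff for LL128, and the trailing buff[1] for Simple, which is over-allocated at runtime. A hedged sketch of how such a flexible-tail struct is typically sized; the helper name is hypothetical and error handling is elided:

    #include <stddef.h>
    #include <cuda_runtime.h>

    /* Illustrative allocation of ncclRecvMem: the fixed header (including
     * llBuff and ll128Buff) plus `buffSize` bytes for the Simple-protocol
     * `buff[1]` flexible tail. */
    static struct ncclRecvMem* allocRecvMem(size_t buffSize) {
      struct ncclRecvMem* mem = NULL;
      size_t total = offsetof(struct ncclRecvMem, buff) + buffSize;
      cudaMalloc((void**)&mem, total);  // device memory in practice; sketch here
      return mem;
    }
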
@@ -57,13 +64,18 @@ struct ncclComm {
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
+ struct ncclTopoSystem* topo;
void* bootstrap;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
- int nvmlDev; // my NVML device number
+ int64_t busId; // my PCI bus ID in int format
+
+ int node;       // index of my node among nNodes
+ int nNodes;     // number of nodes in the job
+ int localRanks; // number of ranks on my node
enum { GROUP, PARALLEL } launchMode;
cudaStream_t userStream;
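
Note: busId replaces nvmlDev as the device identity. The PCI bus ID string (e.g. "0000:0b:00.0") is packed into an int64 so it can be compared and exchanged cheaply across ranks. A minimal conversion sketch, assuming the canonical hex form; the helper name is hypothetical:

    #include <stdint.h>

    /* Hypothetical helper: pack a PCI bus ID string such as "0000:0b:00.0"
     * into an int64 by keeping its hex digits and dropping ':' and '.'. */
    static int64_t busIdStrToInt64(const char* busId) {
      int64_t id = 0;
      for (const char* c = busId; *c != '\0'; c++) {
        if (*c == ':' || *c == '.') continue;    // skip separators
        int digit;
        if (*c >= '0' && *c <= '9') digit = *c - '0';
        else if (*c >= 'a' && *c <= 'f') digit = *c - 'a' + 10;
        else if (*c >= 'A' && *c <= 'F') digit = *c - 'A' + 10;
        else break;                              // stop at anything non-hex
        id = (id << 4) | digit;
      }
      return id;
    }
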
@@ -74,17 +86,19 @@ struct ncclComm {
// Counter to make sure collectives match (needed for bcast/reduce
// where syncs are not symmetric).
uint64_t opCount;
+ uint64_t lastOpCount;
// Channels for collectives
int nChannels;
- int nThreads;
- // Low-latency algorithm threshold
- ssize_t llThreshold;
- ssize_t threadThreshold;
+ // Nonzero when only NVLink is used for inter-GPU communication
+ int nvlink;
- // Tree algorithm threshold
- ssize_t treeThreshold;
+ // Algorithm/Protocols thresholds
+ ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+ float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+ float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+ int maxThreads[NCCL_NUM_PROTOCOLS];
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
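
Note: the single llThreshold/treeThreshold cutoffs give way to full latency and bandwidth tables indexed by (function, algorithm, protocol). With those, the expected time of a collective can be modeled as latency + bytes/bandwidth and the cheapest combination chosen. A sketch of that selection, under the assumption that a bandwidth of 0 marks an unsupported combination (the function name is hypothetical):

    #include <float.h>

    /* Sketch: pick the (algorithm, protocol) pair minimizing the classic
     * alpha-beta cost model time = latency + bytes/bandwidth, using the
     * tables added to ncclComm in this diff. `func` indexes
     * NCCL_NUM_FUNCTIONS (e.g. all-reduce). */
    static void pickAlgoProto(struct ncclComm* comm, int func, size_t nBytes,
                              int* algoOut, int* protoOut) {
      float best = FLT_MAX;
      *algoOut = -1; *protoOut = -1;             // left at -1 if nothing is available
      for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) {
        for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) {
          float bw = comm->bandwidths[func][a][p];
          if (bw == 0.0f) continue;              // combination unsupported
          float time = comm->latencies[func][a][p] + nBytes / bw;
          if (time < best) { best = time; *algoOut = a; *protoOut = p; }
        }
      }
    }
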