1 files changed, 22 insertions, 8 deletions
diff --git a/src/include/comm.h b/src/include/comm.h
index 3b2a85d..7164dc0 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -7,6 +7,8 @@
 #ifndef NCCL_COMM_H_
 #define NCCL_COMM_H_
 
+#include "transport.h"
+
 #if CUDART_VERSION < 9000
 struct cudaLaunchParams {
   void *func;
@@ -18,13 +20,17 @@ struct cudaLaunchParams {
 };
 #endif
 
-#define MAXCHANNELS 16
 #define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
 
 #define CACHE_LINE_SIZE 128
 #define MEM_ALIGN 4096
 #define CUDA_IPC_MIN 2097152UL
 
+// Channels / LL tuning
+#define NCCL_LL_THREAD_THRESHOLD 8
+#define NCCL_LL128_THREAD_THRESHOLD 8
+#define NCCL_SIMPLE_THREAD_THRESHOLD 64
+
 struct ncclSendMem {
   union {
     struct {
@@ -50,6 +56,7 @@ struct ncclRecvMem {
     char pad4[MEM_ALIGN];
   };
   ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
+  uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS];
   char buff[1]; // Actually larger than that
 };
 
@@ -57,13 +64,18 @@ struct ncclComm {
   struct ncclChannel channels[MAXCHANNELS];
 
   struct ncclPeerInfo* peerInfo;
+  struct ncclTopoSystem* topo;
 
   void* bootstrap;
 
   int rank;    // my rank in the communicator
   int nRanks;  // number of GPUs in communicator
   int cudaDev; // my cuda device index
-  int nvmlDev; // my NVML device number
+  int64_t busId;   // my PCI bus ID in int format
+
+  int node;
+  int nNodes;
+  int localRanks;
 
   enum { GROUP, PARALLEL } launchMode;
   cudaStream_t userStream;
@@ -74,17 +86,19 @@ struct ncclComm {
   // Counter to make sure collectives match (needed for bcast/reduce
   // where syncs are not symmetric).
   uint64_t opCount;
+  uint64_t lastOpCount;
 
   // Channels for collectives
   int nChannels;
-  int nThreads;
 
-  // Low-latency algorithm threshold
-  ssize_t llThreshold;
-  ssize_t threadThreshold;
+  // Only nvlink is used for inter-GPU communication
+  int nvlink;
 
-  // Tree algorithm threshold
-  ssize_t treeThreshold;
+  // Algorithm/Protocols thresholds
+  ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
+  int maxThreads[NCCL_NUM_PROTOCOLS];
 
   // An internal CUDA stream for NCCL kernel CGMD launches
   int groupCudaStream;