1 files changed, 30 insertions, 15 deletions
diff --git a/src/include/comm.h b/src/include/comm.h
index 7164dc0..56116e0 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -8,6 +8,7 @@
 #define NCCL_COMM_H_
 
 #include "transport.h"
+#include "p2p.h"
 
 #if CUDART_VERSION < 9000
 struct cudaLaunchParams {
@@ -20,8 +21,6 @@ struct cudaLaunchParams {
 };
 #endif
 
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-
 #define CACHE_LINE_SIZE 128
 #define MEM_ALIGN 4096
 #define CUDA_IPC_MIN 2097152UL
@@ -38,10 +37,10 @@ struct ncclSendMem {
       char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
       void* ptrExchange;
       char pad2[CACHE_LINE_SIZE-sizeof(void*)];
-      uint64_t opCount;
     };
     char pad3[MEM_ALIGN];
   };
+  char buff[1]; // Actually larger than that
 };
 
 struct ncclRecvMem {
@@ -49,14 +48,11 @@ struct ncclRecvMem {
     struct {
       uint64_t tail;
       char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
-      uint64_t opCount;
-      char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
       int sizesFifo[NCCL_STEPS];
+      void* ptrsFifo[NCCL_STEPS];
     };
     char pad4[MEM_ALIGN];
   };
-  ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
-  uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS];
   char buff[1]; // Actually larger than that
 };
 
@@ -67,6 +63,10 @@ struct ncclComm {
   struct ncclTopoSystem* topo;
 
   void* bootstrap;
+  // Bitmasks for ncclTransportP2pSetup
+  int connect;
+  uint32_t* connectSend;
+  uint32_t* connectRecv;
 
   int rank;    // my rank in the communicator
   int nRanks;  // number of GPUs in communicator
@@ -90,15 +90,19 @@ struct ncclComm {
 
   // Channels for collectives
   int nChannels;
+  // Channels (per peer) for p2p
+  int p2pnChannels;
+  int p2pnChannelsPerPeer;
+  int p2pChannels[MAXCHANNELS];
 
-  // Only nvlink is used for inter-GPU communication
-  int nvlink;
+  // Buffer sizes
+  int buffSizes[NCCL_NUM_PROTOCOLS];
 
   // Algorithm/Protocols thresholds
   ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
   float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
   float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
-  int maxThreads[NCCL_NUM_PROTOCOLS];
+  int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
 
   // An internal CUDA stream for NCCL kernel CGMD launches
   int groupCudaStream;
@@ -107,9 +111,6 @@ struct ncclComm {
   // Whether there has been a fatal error in this communicator.
   ncclResult_t fatalError;
 
-  // Error reported by GPU
-  volatile ncclDevError_t* fatalDevError;
-
   // Flag to ask NCCL kernels to abort
   volatile uint32_t *abortFlag;
 
@@ -130,12 +131,26 @@ struct ncclComm {
   int* intraCudaDevs;
   int* intraCGMode; // Whether we can use CUDA9 CGMD or not
   int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
-  struct ncclColl args;
+  struct ncclWorkElem args;
   void* argsptr;
 
   // Global proxy thread
   pthread_t proxyThread;
   struct ncclProxyState proxyState;
+
+  // Whether this communicator uses collNet
+  int collNetSupport;
+
+  // Store info of async operations
+  struct ncclInfo* asyncOps;
+  int asyncOpCount;
+  size_t asyncTotalSize;
+
+  //list of async p2p operation queued in a group semantics
+  struct ncclP2Plist* p2pSends;
+  struct ncclP2Plist* p2pRecvs;
+  int p2pSendCount;
+  int p2pRecvCount;
 };
 
 #endif