Add profiling API: declare pnccl* counterparts of the nccl* functions in src/nccl.h

author: Sylvain Jeaugey <sjeaugey@nvidia.com> 2016-09-22 21:56:51 +0300
committer: Sylvain Jeaugey <sjeaugey@nvidia.com> 2016-09-22 21:56:51 +0300
commit: e3dbc6110ebefdf5792de0c60fda1d81822d1454 (patch)
tree: 1c036d23a18c14364755c20f30333e35669ec1b2 /src
parent: 1d6715fe2053148908a9c5f59518b8e58c623e2e (diff)
1 file changed, 44 insertions, 19 deletions
diff --git a/src/nccl.h b/src/nccl.h
index 7e24031..7bb5aa5 100644
--- a/src/nccl.h
+++ b/src/nccl.h
@@ -1,7 +1,7 @@
 /*************************************************************************
  * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
  *
- * See LICENCE.txt for license information
+ * See LICENSE.txt for license information
  ************************************************************************/
 
 #ifndef NCCL_H_
@@ -47,7 +47,8 @@ typedef enum { ncclSuccess                 =  0,
 /* Generates a unique Id with each call. Used to generate commId for
  * ncclCommInitAll. uniqueId will be created in such a way that it is
  * guaranteed to be unique accross the host. */
-ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
+ncclResult_t  ncclGetUniqueId(ncclUniqueId* uniqueId);
+ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
 
 /* Creates a new communicator (multi process version).
  * rank must be between 0 and ndev-1 and unique within a communicator clique.
@@ -55,7 +56,8 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
  * The communicator is created on the current CUDA device.
  * ncclCommInitRank implicitly syncronizes with other ranks, so INIT OF EACH RANK MUST
  * BE CALLED IN A SEPARATE HOST THREADS to avoid deadlock. */
-ncclResult_t ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank);
+ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank);
+ncclResult_t pncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank);
 
 /* Creates a clique of communicators.
  * This is a convenience function to create a single-process communicator clique.
@@ -63,22 +65,28 @@ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, i
  * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
  * If devlist is NULL, the first ndev CUDA devices are used.
  * Order of devlist defines user-order of processors within the communicator. */
-ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+ncclResult_t  ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
 
 /* Frees resources associated with communicator object. */
-void ncclCommDestroy(ncclComm_t comm);
+void  ncclCommDestroy(ncclComm_t comm);
+void pncclCommDestroy(ncclComm_t comm);
 
 /* Returns nice error message. */
-const char* ncclGetErrorString(ncclResult_t result);
+const char*  ncclGetErrorString(ncclResult_t result);
+const char* pncclGetErrorString(ncclResult_t result);
 
 /* Sets count to number of devices in the communicator clique. */
-ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
+ncclResult_t  ncclCommCount(const ncclComm_t comm, int* count);
+ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
 
 /* Returns cuda device number associated with communicator. */
 ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
+ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
 
 /* Returns user-ordered "rank" assocaiated with communicator. */
-ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
+ncclResult_t  ncclCommUserRank(const ncclComm_t comm, int* rank);
+ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
 
 /* Reduction opperation selector */
 typedef enum { ncclSum        = 0,
@@ -105,14 +113,18 @@ typedef enum { ncclChar       = 0,
  * the same device.
  * Must be called separately for each communicator in communicator clique.
 */
-ncclResult_t ncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype,
-                        ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
 
 /* Reduces data arrays of length count in sendbuff using op operation, and leaves
  * identical copies of result on each GPUs recvbuff.
  * Sendbuff and recvbuff are assumed to reside on the same device.
  * Must be called separately for each communicator in communicator clique. */
-ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
+ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, int count,
     ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
 
 /* Reduces data in sendbuff using op operation and leaves reduced result scattered
@@ -121,7 +133,10 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count,
  * sendbuff has size at least ndev*recvcount elements, where ndev is number of
  * communicators in communicator clique
  * Must be called separately for each communicator in communicator clique.*/
-ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
+ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
+    int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream);
+ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
     int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
     cudaStream_t stream);
 
@@ -129,7 +144,9 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
  * Root specifies the source device in user-order
  * (see ncclCommInit).
  * Must be called separately for each communicator in communicator clique. */
-ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
+ncclResult_t  ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream);
 
 
@@ -139,7 +156,9 @@ ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root,
  * in communicator clique.
  * Sendbuff and recvbuff are assumed to reside on same device.
  * Must be called separately for each communicator in communicator clique. */
-ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
+ncclResult_t  ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
+    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype,
     void* recvbuff, ncclComm_t comm, cudaStream_t stream);
 
 
@@ -150,7 +169,9 @@ ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datat
 // * On the root device, sendbuff and recvbuff are assumed to reside on the same device.
 // * Must be called separately for each communicator in communicator clique. */
 // * All GPUs, including root, perform copies into recvbuff.
-//ncclResult_t ncclGather(const void* sendbuff, int count, ncclDataType_t datatype,
+//ncclResult_t  ncclGather(const void* sendbuff, int count, ncclDataType_t datatype,
+//    void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream);
+//ncclResult_t pncclGather(const void* sendbuff, int count, ncclDataType_t datatype,
 //                        void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream);
 
 ///* Root device scatters count values to each devices.
@@ -159,16 +180,20 @@ ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datat
 // * recvbuff allocated on each gpu, including root, size=count.
 // * Result is ordered by comm's logical device order.
 // * Called separately for each device in the ncclComm. */
-//ncclResult_t ncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff,
-//                         int count, int root, ncclComm_t comm, cudaStream_t stream);
+//ncclResult_t  ncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff,
+//    int count, int root, ncclComm_t comm, cudaStream_t stream);
+//ncclResult_t pncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff,
+//    int count, int root, ncclComm_t comm, cudaStream_t stream);
 //
 ///* All GPUs scatter blocks of count elements to other devices.
 // * Must be called separately for each device in the ncclComm.
 // * sendbuff and recvbuff assumed to reside on same device and
 // * have size at least nGPUs*count.
 // * Called separately for each device in the ncclComm. */
-//ncclResult_t ncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype,
-//                          void* recvbuff, ncclComm_t comm, cudaStream_t stream);
+//ncclResult_t  ncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype,
+//    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
+//ncclResult_t pncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype,
+//    void* recvbuff, ncclComm_t comm, cudaStream_t stream);
 
 #ifdef __cplusplus
 } // end extern "C"
author	Sylvain Jeaugey <sjeaugey@nvidia.com>	2016-09-22 21:56:51 +0300
committer	Sylvain Jeaugey <sjeaugey@nvidia.com>	2016-09-22 21:56:51 +0300
commit	e3dbc6110ebefdf5792de0c60fda1d81822d1454 (patch)
tree	1c036d23a18c14364755c20f30333e35669ec1b2 /src
parent	1d6715fe2053148908a9c5f59518b8e58c623e2e (diff)