diff options
author | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2016-09-22 21:56:51 +0300 |
---|---|---|
committer | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2016-09-22 21:56:51 +0300 |
commit | e3dbc6110ebefdf5792de0c60fda1d81822d1454 (patch) | |
tree | 1c036d23a18c14364755c20f30333e35669ec1b2 /src | |
parent | 1d6715fe2053148908a9c5f59518b8e58c623e2e (diff) |
Add profiling API
Diffstat (limited to 'src')
-rw-r--r-- | src/nccl.h | 63 |
1 files changed, 44 insertions, 19 deletions
@@ -1,7 +1,7 @@ /************************************************************************* * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved. * - * See LICENCE.txt for license information + * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_H_ @@ -47,7 +47,8 @@ typedef enum { ncclSuccess = 0, /* Generates a unique Id with each call. Used to generate commId for * ncclCommInitAll. uniqueId will be created in such a way that it is * guaranteed to be unique across the host. */ -ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); +ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); +ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); /* Creates a new communicator (multi process version). * rank must be between 0 and ndev-1 and unique within a communicator clique. @@ -55,7 +56,8 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); * The communicator is created on the current CUDA device. * ncclCommInitRank implicitly synchronizes with other ranks, so INIT OF EACH RANK MUST * BE CALLED IN A SEPARATE HOST THREAD to avoid deadlock. */ -ncclResult_t ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank); +ncclResult_t ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank); +ncclResult_t pncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, int rank); /* Creates a clique of communicators. * This is a convenience function to create a single-process communicator clique. @@ -63,22 +65,28 @@ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int ndev, ncclUniqueId commId, i * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). * If devlist is NULL, the first ndev CUDA devices are used. * Order of devlist defines user-order of processors within the communicator. 
*/ -ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); +ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); +ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); /* Frees resources associated with communicator object. */ -void ncclCommDestroy(ncclComm_t comm); +void ncclCommDestroy(ncclComm_t comm); +void pncclCommDestroy(ncclComm_t comm); /* Returns nice error message. */ -const char* ncclGetErrorString(ncclResult_t result); +const char* ncclGetErrorString(ncclResult_t result); +const char* pncclGetErrorString(ncclResult_t result); /* Sets count to number of devices in the communicator clique. */ -ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); +ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); +ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); /* Returns cuda device number associated with communicator. */ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); +ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); /* Returns user-ordered "rank" associated with communicator. */ -ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); +ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); +ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); /* Reduction operation selector */ typedef enum { ncclSum = 0, @@ -105,14 +113,18 @@ typedef enum { ncclChar = 0, * the same device. * Must be called separately for each communicator in communicator clique. 
*/ -ncclResult_t ncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclReduce(const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); /* Reduces data arrays of length count in sendbuff using op operation, and leaves * identical copies of result on each GPUs recvbuff. * Sendbuff and recvbuff are assumed to reside on the same device. * Must be called separately for each communicator in communicator clique. */ -ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count, +ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, int count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); /* Reduces data in sendbuff using op operation and leaves reduced result scattered @@ -121,7 +133,10 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, int count, * sendbuff has size at least ndev*recvcount elements, where ndev is number of * communicators in communicator clique * Must be called separately for each communicator in communicator clique.*/ -ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, +ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); @@ -129,7 +144,9 @@ ncclResult_t 
ncclReduceScatter(const void* sendbuff, void* recvbuff, * Root specifies the source device in user-order * (see ncclCommInit). * Must be called separately for each communicator in communicator clique. */ -ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root, +ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclBcast(void* buff, int count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); @@ -139,7 +156,9 @@ ncclResult_t ncclBcast(void* buff, int count, ncclDataType_t datatype, int root, * in communicator clique. * Sendbuff and recvbuff are assumed to reside on same device. * Must be called separately for each communicator in communicator clique. */ -ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype, +ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype, + void* recvbuff, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAllGather(const void* sendbuff, int count, ncclDataType_t datatype, void* recvbuff, ncclComm_t comm, cudaStream_t stream); @@ -150,7 +169,9 @@ ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datat // * On the root device, sendbuff and recvbuff are assumed to reside on the same device. // * Must be called separately for each communicator in communicator clique. */ // * All GPUs, including root, perform copies into recvbuff. -//ncclResult_t ncclGather(const void* sendbuff, int count, ncclDataType_t datatype, +//ncclResult_t ncclGather(const void* sendbuff, int count, ncclDataType_t datatype, +// void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream); +//ncclResult_t pncclGather(const void* sendbuff, int count, ncclDataType_t datatype, // void* recvbuff, int root, ncclComm_t comm, cudaStream_t stream); ///* Root device scatters count values to each devices. 
@@ -159,16 +180,20 @@ ncclResult_t ncclAllGather(const void* sendbuff, int count, ncclDataType_t datat // * recvbuff allocated on each gpu, including root, size=count. // * Result is ordered by comm's logical device order. // * Called separately for each device in the ncclComm. */ -//ncclResult_t ncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff, -// int count, int root, ncclComm_t comm, cudaStream_t stream); +//ncclResult_t ncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff, +// int count, int root, ncclComm_t comm, cudaStream_t stream); +//ncclResult_t pncclScatter(void* sendbuff, ncclDataType_t datatype, void* recvbuff, +// int count, int root, ncclComm_t comm, cudaStream_t stream); // ///* All GPUs scatter blocks of count elements to other devices. // * Must be called separately for each device in the ncclComm. // * sendbuff and recvbuff assumed to reside on same device and // * have size at least nGPUs*count. // * Called separately for each device in the ncclComm. */ -//ncclResult_t ncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype, -// void* recvbuff, ncclComm_t comm, cudaStream_t stream); +//ncclResult_t ncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype, +// void* recvbuff, ncclComm_t comm, cudaStream_t stream); +//ncclResult_t pncclAllToAll(void* sendbuff, int count, ncclDataType_t datatype, +// void* recvbuff, ncclComm_t comm, cudaStream_t stream); #ifdef __cplusplus } // end extern "C" |