diff options
Diffstat (limited to 'src/nccl.h.in')
-rw-r--r-- | src/nccl.h.in | 251 |
1 files changed, 251 insertions, 0 deletions
diff --git a/src/nccl.h.in b/src/nccl.h.in new file mode 100644 index 0000000..7227625 --- /dev/null +++ b/src/nccl.h.in @@ -0,0 +1,251 @@ +/************************************************************************* + * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_H_ +#define NCCL_H_ + +#include <cuda_runtime.h> +#include <cuda_fp16.h> + +#define NCCL_MAJOR ${nccl:Major} +#define NCCL_MINOR ${nccl:Minor} +#define NCCL_PATCH ${nccl:Patch} +#define NCCL_SUFFIX "${nccl:Suffix}" + +#define NCCL_VERSION_CODE ${nccl:Version} +#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z)) + +#ifdef __cplusplus +extern "C" { +#endif + +/* Opaque handle to communicator */ +typedef struct ncclComm* ncclComm_t; + +#define NCCL_UNIQUE_ID_BYTES 128 +typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; + +/* Error type */ +typedef enum { ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclNumResults = 6 } ncclResult_t; + +/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. + * This integer is coded with the MAJOR, MINOR and PATCH level of the + * NCCL library + */ +ncclResult_t ncclGetVersion(int *version); +ncclResult_t pncclGetVersion(int *version); + +/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be + * called once and the Id should be distributed to all ranks in the + * communicator before calling ncclCommInitRank. */ +ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); +ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); + +/* Creates a new communicator (multi thread/process version). + * rank must be between 0 and nranks-1 and unique within a communicator clique. 
+ * Each rank is associated with a CUDA device, which has to be set before calling + * ncclCommInitRank. + * ncclCommInitRank implicitly synchronizes with other ranks, so it must be + * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */ +ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); +ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); + +/* Creates a clique of communicators (single process version). + * This is a convenience function to create a single-process communicator clique. + * Returns an array of ndev newly initialized communicators in comm. + * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). + * If devlist is NULL, the first ndev CUDA devices are used. + * Order of devlist defines user-order of processors within the communicator. */ +ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); +ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); + +/* Frees resources associated with communicator object. */ +ncclResult_t ncclCommDestroy(ncclComm_t comm); +ncclResult_t pncclCommDestroy(ncclComm_t comm); + +/* Returns a human-readable error message. */ +const char* ncclGetErrorString(ncclResult_t result); +const char* pncclGetErrorString(ncclResult_t result); + +/* Gets the number of ranks in the communicator clique. */ +ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); +ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); + +/* Returns the cuda device number associated with the communicator. */ +ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); +ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); + +/* Returns the user-ordered "rank" associated with the communicator. 
*/ +ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); +ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); + +/* Reduction operation selector */ +typedef enum { ncclSum = 0, + ncclProd = 1, + ncclMax = 2, + ncclMin = 3, + ncclNumOps = 4 } ncclRedOp_t; + +/* Data types */ +typedef enum { ncclInt8 = 0, ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, ncclHalf = 6, + ncclFloat32 = 7, ncclFloat = 7, + ncclFloat64 = 8, ncclDouble = 8, + ncclNumTypes = 9 } ncclDataType_t; + +/* + * Collective communication operations + * + * Collective communication operations must be called separately for each + * communicator in a communicator clique. + * + * They return when operations have been enqueued on the CUDA stream. + * + * Since they may perform inter-CPU synchronization, each call has to be done + * from a different thread or process, or needs to use Group Semantics (see + * below). + */ + +/* + * Reduce + * + * Reduces data arrays of length count in sendbuff into recvbuff using op + * operation. + * recvbuff may be NULL on all calls except for root device. + * root is the rank (not the CUDA device) where data will reside after the + * operation is complete. + * + * In-place operation will happen if sendbuff == recvbuff. + */ +ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); + +/* + * (deprecated) Broadcast (in-place) + * + * Copies count values from root to all other devices. + * root is the rank (not the CUDA device) where data resides before the + * operation is started. + * + * This operation is implicitly in place. 
+ */ +ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); + +/* + * Broadcast + * + * Copies count values from root to all other devices. + * root is the rank (not the CUDA device) where data resides before the + * operation is started. + * + * In-place operation will happen if sendbuff == recvbuff. + */ +ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); + +/* + * All-Reduce + * + * Reduces data arrays of length count in sendbuff using op operation, and + * leaves identical copies of result on each recvbuff. + * + * In-place operation will happen if sendbuff == recvbuff. + */ +ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); + +/* + * Reduce-Scatter + * + * Reduces data in sendbuff using op operation and leaves reduced result + * scattered over the devices so that recvbuff on rank i will contain the i-th + * block of the result. + * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff + * should have a size of at least nranks*recvcount elements. + * + * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. 
+ */ +ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream); +ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, + size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + cudaStream_t stream); + +/* + * All-Gather + * + * Each device gathers sendcount values from other GPUs into recvbuff, + * receiving data from rank i at offset i*sendcount. + * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff + * should have a size of at least nranks*sendcount elements. + * + * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. + */ +ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); + +/* + * Group semantics + * + * When managing multiple GPUs from a single thread, and since NCCL collective + * calls may perform inter-CPU synchronization, we need to "group" calls for + * different ranks/devices into a single call. + * + * Grouping NCCL calls as being part of the same collective operation is done + * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all + * collective calls until the ncclGroupEnd call, which will wait for all calls + * to be complete. Note that for collective communication, ncclGroupEnd only + * guarantees that the operations are enqueued on the streams, not that + * the operation is effectively done. + * + * Both collective communication and ncclCommInitRank can be used in conjunction + * with ncclGroupStart/ncclGroupEnd. + */ + +/* + * Group Start + * + * Start a group call. All subsequent calls to NCCL may not block due to + * inter-CPU synchronization. 
+ */ +ncclResult_t ncclGroupStart(); + +/* + * Group End + * + * End a group call. Wait for all calls since ncclGroupStart to complete + * before returning. + */ +ncclResult_t ncclGroupEnd(); + +#ifdef __cplusplus +} // end extern "C" +#endif + +#endif // end include guard |