/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_H_
#define NCCL_H_

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#define NCCL_MAJOR ${nccl:Major}
#define NCCL_MINOR ${nccl:Minor}
#define NCCL_PATCH ${nccl:Patch}
#define NCCL_SUFFIX "${nccl:Suffix}"

#define NCCL_VERSION_CODE ${nccl:Version}
#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z))

#ifdef __cplusplus
extern "C" {
#endif

/* Opaque handle to communicator */
typedef struct ncclComm* ncclComm_t;

#define NCCL_UNIQUE_ID_BYTES 128
typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;

/* Error type */
typedef enum { ncclSuccess            = 0,
               ncclUnhandledCudaError = 1,
               ncclSystemError        = 2,
               ncclInternalError      = 3,
               ncclInvalidArgument    = 4,
               ncclInvalidUsage       = 5,
               ncclNumResults         = 6 } ncclResult_t;

/* Returns the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
 * This integer is coded with the MAJOR, MINOR and PATCH level of the
 * NCCL library. */
ncclResult_t  ncclGetVersion(int *version);
ncclResult_t pncclGetVersion(int *version);

/* Generates an ID to be used in ncclCommInitRank. ncclGetUniqueId should be
 * called once, and the ID should be distributed to all ranks in the
 * communicator before calling ncclCommInitRank. */
ncclResult_t  ncclGetUniqueId(ncclUniqueId* uniqueId);
ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);

/* Creates a new communicator (multi-thread/process version).
 * rank must be between 0 and nranks-1 and unique within a communicator clique.
 * Each rank is associated with a CUDA device, which has to be set before
 * calling ncclCommInitRank.
 * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
 * called by different threads/processes or used within
 * ncclGroupStart/ncclGroupEnd. */
ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);

/* Creates a clique of communicators (single-process version).
 * This is a convenience function to create a single-process communicator clique.
 * Returns an array of ndev newly initialized communicators in comm.
 * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
 * If devlist is NULL, the first ndev CUDA devices are used.
 * The order of devlist defines the user-order of processors within the
 * communicator. */
ncclResult_t  ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);

/* Frees resources associated with the communicator object, but waits for any
 * operations that might still be running on the device. */
ncclResult_t  ncclCommDestroy(ncclComm_t comm);
ncclResult_t pncclCommDestroy(ncclComm_t comm);

/* Frees resources associated with the communicator object and aborts any
 * operations that might still be running on the device. */
ncclResult_t  ncclCommAbort(ncclComm_t comm);
ncclResult_t pncclCommAbort(ncclComm_t comm);
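/*
 * Usage sketch (illustrative, not part of the API): single-process
 * initialization of one communicator per visible CUDA device with
 * ncclCommInitAll, followed by teardown. Error handling is omitted and the
 * names ndev/comms are local to this example; assumes <stdlib.h>.
 *
 *   int ndev;
 *   cudaGetDeviceCount(&ndev);
 *   ncclComm_t* comms = (ncclComm_t*)malloc(ndev * sizeof(ncclComm_t));
 *   ncclCommInitAll(comms, ndev, NULL);   // NULL: use the first ndev devices
 *   // ... enqueue collective operations on each communicator ...
 *   for (int i = 0; i < ndev; i++) ncclCommDestroy(comms[i]);
 *   free(comms);
 */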
/* Returns a human-readable error message. */
const char*  ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);

/* Checks whether the comm has encountered any asynchronous errors. */
ncclResult_t  ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);

/* Gets the number of ranks in the communicator clique. */
ncclResult_t  ncclCommCount(const ncclComm_t comm, int* count);
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);

/* Returns the CUDA device number associated with the communicator. */
ncclResult_t  ncclCommCuDevice(const ncclComm_t comm, int* device);
ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);

/* Returns the user-ordered "rank" associated with the communicator. */
ncclResult_t  ncclCommUserRank(const ncclComm_t comm, int* rank);
ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);

/* Reduction operation selector */
typedef enum { ncclSum    = 0,
               ncclProd   = 1,
               ncclMax    = 2,
               ncclMin    = 3,
               ncclNumOps = 4 } ncclRedOp_t;

/* Data types */
typedef enum { ncclInt8     = 0, ncclChar   = 0,
               ncclUint8    = 1,
               ncclInt32    = 2, ncclInt    = 2,
               ncclUint32   = 3,
               ncclInt64    = 4,
               ncclUint64   = 5,
               ncclFloat16  = 6, ncclHalf   = 6,
               ncclFloat32  = 7, ncclFloat  = 7,
               ncclFloat64  = 8, ncclDouble = 8,
               ncclNumTypes = 9 } ncclDataType_t;

/*
 * Collective communication operations
 *
 * Collective communication operations must be called separately for each
 * communicator in a communicator clique.
 *
 * They return when the operations have been enqueued on the CUDA stream.
 *
 * Since they may perform inter-CPU synchronization, each call has to be made
 * from a different thread or process, or must use Group Semantics (see
 * below).
 */

/*
 * Reduce
 *
 * Reduces data arrays of length count in sendbuff into recvbuff using the op
 * operation.
 * recvbuff may be NULL on all calls except on the root device.
 * root is the rank (not the CUDA device) where data will reside after the
 * operation is complete.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);

/*
 * (deprecated) Broadcast (in-place)
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * This operation is implicitly in-place.
 */
ncclResult_t  ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
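/*
 * Usage sketch (illustrative, not part of the API): a rooted reduction that
 * sums count floats from every rank into recvbuff on rank 0. recvbuff may be
 * NULL on non-root ranks. comm and stream are assumed to be an initialized
 * communicator and CUDA stream for this rank; error checking is omitted.
 *
 *   ncclReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, 0, comm, stream);
 *   cudaStreamSynchronize(stream);  // wait until the result is ready on root
 */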
/*
 * Broadcast
 *
 * Copies count values from root to all other devices.
 * root is the rank (not the CUDA device) where data resides before the
 * operation is started.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t  ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, cudaStream_t stream);

/*
 * All-Reduce
 *
 * Reduces data arrays of length count in sendbuff using the op operation, and
 * leaves identical copies of the result in each recvbuff.
 *
 * In-place operation will happen if sendbuff == recvbuff.
 */
ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);

/*
 * Reduce-Scatter
 *
 * Reduces data in sendbuff using the op operation and leaves the reduced
 * result scattered over the devices so that recvbuff on rank i will contain
 * the i-th block of the result.
 * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
 * should have a size of at least nranks*recvcount elements.
 *
 * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
 */
ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);
ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    cudaStream_t stream);

/*
 * All-Gather
 *
 * Each device gathers sendcount values from the other GPUs into recvbuff,
 * receiving data from rank i at offset i*sendcount.
 * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
 * should have a size of at least nranks*sendcount elements.
 *
 * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
 */
ncclResult_t  ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);

/*
 * Send
 *
 * Sends data from sendbuff to rank peer.
 *
 * Rank peer needs to call ncclRecv with the same datatype and the same count
 * from this rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv
 * operations need to progress concurrently to complete, they must be fused
 * within a ncclGroupStart/ncclGroupEnd section.
 */
ncclResult_t  ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
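/*
 * Usage sketch (illustrative, not part of the API): a ring exchange where
 * every rank sends to the next rank and receives from the previous one.
 * Because each send blocks the GPU until the matching receive can progress,
 * the two calls must be fused in a group. rank, nranks, the buffers, comm and
 * stream are assumed to be set up by the caller; ncclRecv is declared just
 * below.
 *
 *   int next = (rank + 1) % nranks;
 *   int prev = (rank + nranks - 1) % nranks;
 *   ncclGroupStart();
 *   ncclSend(sendbuff, count, ncclFloat, next, comm, stream);
 *   ncclRecv(recvbuff, count, ncclFloat, prev, comm, stream);
 *   ncclGroupEnd();
 */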
/*
 * Receive
 *
 * Receives data from rank peer into recvbuff.
 *
 * Rank peer needs to call ncclSend with the same datatype and the same count
 * to this rank.
 *
 * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv
 * operations need to progress concurrently to complete, they must be fused
 * within a ncclGroupStart/ncclGroupEnd section.
 */
ncclResult_t  ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);
ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
    ncclComm_t comm, cudaStream_t stream);

/*
 * Group semantics
 *
 * When managing multiple GPUs from a single thread, and since NCCL collective
 * calls may perform inter-CPU synchronization, we need to "group" calls for
 * different ranks/devices into a single call.
 *
 * Grouping NCCL calls as being part of the same collective operation is done
 * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
 * collective calls until the ncclGroupEnd call, which will wait for all calls
 * to be complete. Note that for collective communication, ncclGroupEnd only
 * guarantees that the operations are enqueued on the streams, not that
 * the operations are effectively done.
 *
 * Both collective communication and ncclCommInitRank can be used in
 * conjunction with ncclGroupStart/ncclGroupEnd, but not together.
 *
 * Group semantics also allow fusing multiple operations on the same device
 * to improve performance (for aggregated collective calls), or permitting
 * concurrent progress of multiple send/receive operations.
 */

/*
 * Group Start
 *
 * Starts a group call. All calls to NCCL until ncclGroupEnd will be fused into
 * a single NCCL operation. Nothing will be started on the CUDA stream until
 * ncclGroupEnd.
 */
ncclResult_t  ncclGroupStart();
ncclResult_t pncclGroupStart();

/*
 * Group End
 *
 * Ends a group call. Starts a fused NCCL operation consisting of all calls
 * since ncclGroupStart. Operations on the CUDA stream depending on the NCCL
 * operations need to be called after ncclGroupEnd.
 */
ncclResult_t  ncclGroupEnd();
ncclResult_t pncclGroupEnd();

#ifdef __cplusplus
} // end extern "C"
#endif

#endif // end include guard
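/*
 * Usage sketch (illustrative, not part of the API): driving one ncclAllReduce
 * per device from a single thread using group semantics. Without the group,
 * the first call would block waiting for the other ranks. comms, sendbuffs,
 * recvbuffs and streams are assumed to be caller-prepared arrays of length
 * ndev, with one communicator and stream per device.
 *
 *   ncclGroupStart();
 *   for (int i = 0; i < ndev; i++)
 *     ncclAllReduce(sendbuffs[i], recvbuffs[i], count, ncclFloat, ncclSum,
 *                   comms[i], streams[i]);
 *   ncclGroupEnd();  // operations are enqueued; synchronize streams to wait
 */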