diff options
author | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2019-11-20 01:57:39 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-11-20 01:57:39 +0300 |
commit | 299c554dccf923230321ad7495946543f3e9b457 (patch) | |
tree | 6a70b52080f0570fc87285b3b2300dbd2f2918ad | |
parent | ccb1298148327bacb9b83452ed6ae0b29417e7e2 (diff) |
2.5.6-1 (#255)
Add LL128 Protocol.
Rewrite the topology detection and tree/ring creation (#179). Improve
tree performance by sending/receiving from different GPUs. Add
model-based tuning to switch between the different algorithms and
protocols.
Rework P2P/SHM detection in containers (#155, #248).
Detect duplicated devices and return an error (#231).
Add tuning for GCP.
65 files changed, 4770 insertions, 2819 deletions
diff --git a/makefiles/common.mk b/makefiles/common.mk index 2ad5c73..37e81be 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -25,8 +25,7 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) # Better define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. -CUDA8_GENCODE = -gencode=arch=compute_30,code=sm_30 \ - -gencode=arch=compute_35,code=sm_35 \ +CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \ -gencode=arch=compute_50,code=sm_50 \ -gencode=arch=compute_60,code=sm_60 \ -gencode=arch=compute_61,code=sm_61 @@ -46,7 +45,10 @@ endif CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla CXXFLAGS += -I $(CUDA_INC) -NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all +# Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) +# 512 : 120, 640 : 96, 768 : 80, 1024 : 60 +# We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. 
+NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all # Use addprefix so that we can specify more than one path NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt diff --git a/makefiles/version.mk b/makefiles/version.mk index bab58ec..efbdee7 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 4 -NCCL_PATCH := 8 +NCCL_MINOR := 5 +NCCL_PATCH := 6 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/pkg/debian/Makefile b/pkg/debian/Makefile index 7884cf2..2ce4390 100644 --- a/pkg/debian/Makefile +++ b/pkg/debian/Makefile @@ -17,7 +17,7 @@ DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES)) PKG_TIMESTAMP := $(shell date -R) ARCH := $(shell uname -m) -PKG_ARCH ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g") +PKG_ARCH ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g"| sed -e "s/aarch64/arm64/g") PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch) ifeq ($(PKG_MULTIARCH),) # Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it diff --git a/src/Makefile b/src/Makefile index 452adf5..b11de5e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -9,10 +9,11 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h -LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \ - misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \ +LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \ + misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \ - collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc + collectives/all_reduce.cc 
collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \ + graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc ##### lib files LIBNAME := libnccl.so @@ -94,17 +95,17 @@ $(PKGDIR)/nccl.pc : nccl.pc.in $(INCDIR)/%.h : %.h @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(INCDIR) - cp -f $< $@ + install -m 644 $< $@ $(INCDIR)/nccl_%.h : include/nccl_%.h @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(INCDIR) - cp -f $< $@ + install -m 644 $< $@ $(PKGDIR)/%.pc : %.pc @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(PKGDIR) - cp -f $< $@ + install -m 644 $< $@ $(OBJDIR)/%.o : %.cc @printf "Compiling %-35s > %s\n" $< $@ @@ -117,8 +118,8 @@ $(OBJDIR)/%.o : %.cc @rm -f $(@:%.o=%.d.tmp) clean : - rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} $(MAKE) -C collectives/device clean + rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} install : lib mkdir -p $(PREFIX)/lib diff --git a/src/bootstrap.cc b/src/bootstrap.cc index d7c2ac6..11ffc35 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -13,11 +13,6 @@ #include <unistd.h> #include <sys/types.h> -// Always use sockets for bootstrap -struct bootstrapNetHandle { - union socketAddress connectAddr; -}; - struct bootstrapNetComm { int fd; }; @@ -68,36 +63,36 @@ static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr /* Socket Interface Selection type */ enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 }; -static ncclResult_t bootstrapNetListen(int dev, void* opaqueHandle, void** listenComm) { - struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle; - static_assert(sizeof(struct bootstrapNetHandle) < NCCL_NET_HANDLE_MAXSIZE, "bootstrapNetHandle size too large"); +static ncclResult_t bootstrapNetListen(int dev, ncclNetHandle_t* netHandle, void** listenComm) { + union socketAddress* connectAddr = (union socketAddress*) netHandle; + static_assert(sizeof(union 
socketAddress) < NCCL_NET_HANDLE_MAXSIZE, "union socketAddress size is too large"); // if dev >= 0, listen based on dev if (dev >= 0) { - NCCLCHECK(bootstrapNetGetSocketAddr(dev, &(handle->connectAddr))); + NCCLCHECK(bootstrapNetGetSocketAddr(dev, connectAddr)); } else if (dev == findSubnetIf) { // handle stores a remote address // need to find a local addr that is in the same network as the remote addr union socketAddress localAddr; char ifName[MAX_IF_NAME_SIZE]; - if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) { + if (findInterfaceMatchSubnet(ifName, &localAddr, connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) { WARN("NET/Socket : No usable listening interface found"); return ncclSystemError; } // pass the local address back - memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr)); + memcpy(connectAddr, &localAddr, sizeof(localAddr)); } // Otherwise, handle stores a local address struct bootstrapNetComm* comm; NCCLCHECK(bootstrapNetNewComm(&comm)); - NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr)); + NCCLCHECK(createListenSocket(&comm->fd, connectAddr)); *listenComm = comm; return ncclSuccess; } -static ncclResult_t bootstrapNetConnect(int dev, void* opaqueHandle, void** sendComm) { +static ncclResult_t bootstrapNetConnect(int dev, ncclNetHandle_t* netHandle, void** sendComm) { + union socketAddress* connectAddr = (union socketAddress*) netHandle; struct bootstrapNetComm* comm; NCCLCHECK(bootstrapNetNewComm(&comm)); - struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle; - NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr)); + NCCLCHECK(connectAddress(&comm->fd, connectAddr)); *sendComm = comm; return ncclSuccess; } @@ -145,21 +140,12 @@ static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) { return ncclSuccess; } -ncclResult_t bootstrapNetCreateHandle(void* opaqueHandle, const char* str) { - struct bootstrapNetHandle* handle = (struct 
bootstrapNetHandle*) opaqueHandle; - NCCLCHECK(GetSocketAddrFromString(&handle->connectAddr, str)); +ncclResult_t bootstrapNetCreateHandle(ncclNetHandle_t* netHandle, const char* str) { + union socketAddress* connectAddr = (union socketAddress*) netHandle; + NCCLCHECK(GetSocketAddrFromString(connectAddr, str)); return ncclSuccess; } -struct extId { - ncclNetHandle_t extHandleRoot; - void* extListenComm; - uint64_t hostHash; - pid_t pid; - int fd; - pthread_t boostrapThread; -}; - struct extInfo { int rank; int nranks; @@ -177,9 +163,8 @@ static ncclResult_t setFilesLimit() { return ncclSuccess; } -static void *bootstrapRoot(void* commId) { +static void *bootstrapRoot(void* listenComm) { struct extInfo info; - struct extId* id = (struct extId*)commId; ncclNetHandle_t *rankHandles = NULL; ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange ncclNetHandle_t zero = { 0 }; // for sanity checking @@ -191,7 +176,7 @@ static void *bootstrapRoot(void* commId) { /* Receive addresses from all ranks */ int nranks = 0, c = 0; do { - NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out); + NCCLCHECKGOTO(bootstrapNetAccept(listenComm, &tmpComm), res, out); NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out); NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out); @@ -216,22 +201,22 @@ static void *bootstrapRoot(void* commId) { memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t)); ++c; + TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks); } while (c < nranks); - TRACE(NCCL_INIT, "COLLECTED HANDLES"); + TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks); // Send the connect handle for the next rank in the AllGather ring for (int r=0; r<nranks; ++r) { int next = (r+1) % nranks; void *tmpSendComm; - NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out); + NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, 
&tmpSendComm), res, out); NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out); NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out); } - TRACE(NCCL_INIT, "SENT OUT HANDLES"); + TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks); out: - bootstrapNetCloseListen(id->extListenComm); - free(commId); + bootstrapNetCloseListen(listenComm); if (rankHandles) free(rankHandles); if (rankHandlesRoot) free(rankHandlesRoot); @@ -239,31 +224,28 @@ out: return NULL; } -ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) { - struct extId* id = (struct extId*)commId; - id->hostHash = getHostHash(); - NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm)); - ncclUniqueId* threadIdCopy; - NCCLCHECK(ncclCalloc(&threadIdCopy, 1)); - memcpy(threadIdCopy, id, sizeof(ncclUniqueId)); - pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy); +ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) { + ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id; + void* listenComm; + NCCLCHECK(bootstrapNetListen(idFromEnv ? 
dontCareIf : 0, netHandle, &listenComm)); + pthread_t thread; + pthread_create(&thread, NULL, bootstrapRoot, listenComm); return ncclSuccess; } -ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) { - static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId"); - extId* id = (extId*)out; +ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) { + static_assert(sizeof(ncclNetHandle_t) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId"); + memset(id, 0, sizeof(ncclUniqueId)); + ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id; char* env = getenv("NCCL_COMM_ID"); if (env) { - if (bootstrapNetCreateHandle(&id->extHandleRoot, env) != 0) { + if (bootstrapNetCreateHandle(netHandle, env) != 0) { WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>"); return ncclInvalidArgument; } - id->pid = -1; } else { - id->pid = getpid(); - NCCLCHECK(bootstrapCreateRoot(out, false)); + NCCLCHECK(bootstrapCreateRoot(id, false)); } return ncclSuccess; @@ -286,9 +268,9 @@ struct extState { int dev; }; -ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) { - struct extId* id = (struct extId*)commId; - bool idFromEnv = id->pid < 0; +ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) { + ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id; + bool idFromEnv = getenv("NCCL_COMM_ID") != NULL; struct extState* state; NCCLCHECK(ncclCalloc(&state, 1)); state->rank = rank; @@ -303,8 +285,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co void *tmpSendComm, *tmpRecvComm; // Pass the remote address to listen via info if (idFromEnv) { - memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t)); - memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t)); + memcpy(&info.extHandleListen, netHandle, sizeof(ncclNetHandle_t)); + memcpy(&info.extHandleListenRoot, 
netHandle, sizeof(ncclNetHandle_t)); } // listen will return the local address via info (specify interface type 'findSubnetIf') state->dev = idFromEnv ? findSubnetIf : 0; @@ -323,7 +305,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co } // send info on my listening socket to root - NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm)); + NCCLCHECK(bootstrapNetConnect(state->dev, netHandle, &tmpSendComm)); NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info))); NCCLCHECK(bootstrapNetCloseSend(tmpSendComm)); @@ -334,7 +316,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm)); NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot)); - NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm)); + NCCLCHECK(bootstrapNetConnect(state->dev, &extHandleNext, &state->extBstrapRingSendComm)); // Accept the connect request from the previous rank in the AllGather ring NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm)); @@ -377,7 +359,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) { struct extState* state = (struct extState*)commState; void* tmpSendComm; - NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm)); + NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles+peer, &tmpSendComm)); NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int))); NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size)); NCCLCHECK(bootstrapNetCloseSend(tmpSendComm)); @@ -465,3 +447,13 @@ ncclResult_t bootstrapClose(void* commState) { return ncclSuccess; } + +ncclResult_t bootstrapAbort(void* commState) { + struct extState* state = (struct extState*)commState; + bootstrapNetCloseListen(state->extBstrapListenComm); + 
bootstrapNetCloseSend(state->extBstrapRingSendComm); + bootstrapNetCloseRecv(state->extBstrapRingRecvComm); + free(state->peerBstrapHandles); + free(state); + return ncclSuccess; +} diff --git a/src/collectives/all_reduce.cc b/src/collectives/all_reduce.cc index 921f2de..7796d5b 100644 --- a/src/collectives/all_reduce.cc +++ b/src/collectives/all_reduce.cc @@ -5,7 +5,6 @@ ************************************************************************/ #include "enqueue.h" -#include "collectives.h" NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile index 0ee587b..001059c 100644 --- a/src/collectives/device/Makefile +++ b/src/collectives/device/Makefile @@ -68,4 +68,4 @@ $(DEVOBJ) : $(LIBOBJ) $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ clean: - rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(STATICLIB) test + rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB) diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index 8e78730..0ad5ba9 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -11,7 +11,7 @@ template<int UNROLL, class FUNC, typename T> __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int nthreads = blockDim.x - 1; + const int nthreads = args->nThreads-WARP_SIZE; const int bid = args->bid; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; @@ -19,15 +19,15 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) { const ssize_t size = args->N; const int nranks = comm->nRanks; const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); - const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS; + const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS; 
const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC> - prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount); + ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, FUNC> + prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); @@ -129,3 +129,67 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) { template<int UNUSED, class FUNC, typename T> __device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { } + +#include "prims_ll128.h" +template<int UNUSED, class FUNC, typename T> +__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int bid = args->bid; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; + //const int rank = comm->rank; + const int nranks = comm->nRanks; + ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. 
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2; + + const ssize_t loopSize = args->nChannels*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize); + + ssize_t chunkOffset = gridOffset + bid*chunkSize; + + /////////////// begin AllGather steps /////////////// + ssize_t offset; + int nelem = min(chunkSize, size-chunkOffset); + int rankDest; + + // step 0: push data to next GPU + rankDest = ring->devUserRanks[0]; + offset = chunkOffset + rankDest * size; + + if (thisInput + chunkOffset == thisOutput + offset) { // In place + LLprims.send(thisInput+chunkOffset, nelem); + } else { + LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem); + } + + // k-2 steps: copy to next GPU + for (int j=1; j<nranks-1; ++j) { + rankDest = ring->devUserRanks[nranks-j]; + offset = chunkOffset + rankDest * size; + + LLprims.recvCopySend(thisOutput+offset, nelem); + } + + // step k-1: final store + rankDest = ring->devUserRanks[1]; + offset = chunkOffset + rankDest * size; + + LLprims.recv(thisOutput+offset, nelem); + } +} + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index 9b058cc..2449c2b 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -11,7 +11,7 @@ template<int UNROLL, class FUNC, typename T> __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int nthreads = blockDim.x - 1; + const int nthreads = args->nThreads-WARP_SIZE; const int bid = 
args->bid; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; @@ -27,7 +27,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { T * __restrict__ thisOutput = (T*)args->ThisOutput; ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC> - prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount); + prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) { int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels)); @@ -85,23 +85,28 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { template<int UNROLL, class FUNC, typename T> __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int nthreads = blockDim.x - 1; + const int nthreads = args->nThreads-WARP_SIZE; const int bid = args->bid; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; - struct ncclTree* tree = &channel->tree; const ssize_t size = args->N; const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); - const int chunkSize = args->lastChunkSize; + int chunkSize = args->lastChunkSize; + const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T); const ssize_t loopSize = args->nChannels*chunkSize; + if (loopSize > size) { + chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize; + } + // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; do { + struct ncclTree* tree = &channel->treeUp; // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) - ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, 
stepSize, channel, comm, args->opCount); + ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { // Up ssize_t offset = gridOffset + bid*chunkSize; @@ -117,8 +122,9 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) { } while(0); do { + struct ncclTree* tree = &channel->treeDn; // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) - ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount); + ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { // Down ssize_t offset = gridOffset + bid*chunkSize; @@ -149,6 +155,8 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { //const int rank = comm->rank; const int nranks = comm->nRanks; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); + const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T); + const ssize_t loopSize = args->nChannels*nranks*chunkSize; // Compute pointers @@ -156,10 +164,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { T * __restrict__ thisOutput = (T*)args->ThisOutput; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - if (size-gridOffset < loopSize) { - chunkSize = args->lastChunkSize; - } - ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize; + chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize); /////////////// begin AllReduce steps /////////////// ssize_t offset; @@ -168,7 +173,7 @@ __device__ void ncclAllReduceRingLLKernel(struct 
CollectiveArgs* args) { // step 0: push data to next GPU slice = ring->devUserRanks[nranks-1]; - offset = chunkOffset + slice * chunkSize; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.send(thisInput+offset, nelem); @@ -176,7 +181,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { // k-2 steps: reduce and copy to next GPU for (int j=2; j<nranks; ++j) { slice = ring->devUserRanks[nranks-j]; - offset = chunkOffset + slice * chunkSize; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.recvReduceSend(thisInput+offset, nelem); @@ -185,7 +190,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { // step k-1: reduce this buffer and data, which will produce the final // result that we store in this data and push to the next GPU slice = ring->devUserRanks[0]; - offset = chunkOffset + slice * chunkSize; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem); @@ -193,7 +198,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { // k-2 steps: copy to next GPU for (int j=1; j<nranks-1; ++j) { slice = ring->devUserRanks[nranks-j]; - offset = chunkOffset + slice * chunkSize; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.recvCopySend(thisOutput+offset, nelem); @@ -201,7 +206,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { // Make final copy from buffer to dest. slice = ring->devUserRanks[1]; - offset = chunkOffset + slice * chunkSize; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); // Here we need to copy from buffer to this output. 
@@ -216,16 +221,21 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { const int bid = args->bid; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; - struct ncclTree* tree = &channel->tree; const ssize_t size = args->N; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); + const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T); const ssize_t loopSize = args->nChannels*chunkSize; + if (loopSize > size) { + chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize; + } + // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; do { + struct ncclTree* tree = &channel->treeUp; // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { @@ -243,6 +253,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { } while(0); do { + struct ncclTree* tree = &channel->treeDn; // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { @@ -259,3 +270,141 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { } } while(0); } + +#include "prims_ll128.h" +template<int UNUSED, class FUNC, typename T> +__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int bid = args->bid; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + 
ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; + //const int rank = comm->rank; + const int nranks = comm->nRanks; + ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. + const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2; + + const ssize_t loopSize = args->nChannels*nranks*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize); + + /////////////// begin AllReduce steps /////////////// + ssize_t offset; + int nelem; + int slice; + + // step 0: push data to next GPU + slice = ring->devUserRanks[nranks-1]; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + nelem = min(chunkSize, size-offset); + + LLprims.send(thisInput+offset, nelem); + + // k-2 steps: reduce and copy to next GPU + for (int j=2; j<nranks; ++j) { + slice = ring->devUserRanks[nranks-j]; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + nelem = min(chunkSize, size-offset); + + LLprims.recvReduceSend(thisInput+offset, nelem); + } + + // step k-1: reduce this buffer and data, which will produce the final + // result that we store in this data and push to the next GPU + slice = ring->devUserRanks[0]; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + nelem = min(chunkSize, size-offset); + + LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem); + + // k-2 steps: copy to next GPU + for (int 
j=1; j<nranks-1; ++j) { + slice = ring->devUserRanks[nranks-j]; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + nelem = min(chunkSize, size-offset); + + LLprims.recvCopySend(thisOutput+offset, nelem); + } + + // Make final copy from buffer to dest. + slice = ring->devUserRanks[1]; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + nelem = min(chunkSize, size-offset); + + // Here we need to copy from buffer to this output. + LLprims.recv(thisOutput+offset, nelem); + } +} + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int nthreads = args->nThreads; + const int bid = args->bid; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclTree* treeUp = &channel->treeUp; + struct ncclTree* treeDn = &channel->treeDn; + const ssize_t size = args->N; + ssize_t chunkSize = args->lastChunkSize; + const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8; + const ssize_t loopSize = args->nChannels*chunkSize; + int nthreadsSplit = NCCL_LL128_SPLIT(nthreads); + + if (loopSize > size) { + chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize; + } + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + if (treeUp->up == -1) { + // ReduceAndBroadcast : max number of recv is 3, max number of send is 3 + ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem); + } 
+ } else { + if (tid < nthreadsSplit) { + // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) + ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Up + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (treeUp->down[0] == -1) { + LLprims.send(thisInput+offset, nelem); + } else { + LLprims.recvReduceSend(thisInput+offset, nelem); + } + } + } else { + // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) + ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Down + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (treeDn->down[0] == -1) { + LLprims.recv(thisOutput+offset, nelem); + } else { + LLprims.recvCopySend(thisOutput+offset, nelem); + } + } + } + } +} diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index ae8667f..de8b989 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -11,7 +11,7 @@ template<int UNROLL, class FUNC, typename T> __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int nthreads = blockDim.x - 1; + const int nthreads = args->nThreads-WARP_SIZE; const int bid = args->bid; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; @@ -29,7 +29,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) { T * __restrict__ thisOutput = (T*)args->ThisOutput; ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC> - 
prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); + prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); @@ -100,3 +100,51 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) { template<int UNUSED, class FUNC, typename T> __device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { } + +#include "prims_ll128.h" +template<int UNUSED, class FUNC, typename T> +__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int bid = args->bid; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; + const int rank = ring->devUserRanks[0]; + const int nextRank = ring->devUserRanks[1]; + const int root = args->root; + + ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + + const ssize_t loopSize = args->nChannels*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize); + ssize_t offset = gridOffset + bid*chunkSize; + + int nelem = min(chunkSize, size-offset); + if (rank == root) { + if 
(thisInput == thisOutput) { + LLprims.send(thisInput+offset, nelem); + } else { + LLprims.copySend(thisInput + offset, thisOutput + offset, nelem); + } + } else if (nextRank == root) { + LLprims.recv(thisOutput + offset, nelem); + } else { + LLprims.recvCopySend(thisOutput + offset, nelem); + } + } +} + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index 8c336bf..46eb9f5 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -7,9 +7,8 @@ #ifndef NCCL_DEVICE_COMMON_H_ #define NCCL_DEVICE_COMMON_H_ -#include "../collectives.h" +#include "collectives.h" #include "devcomm.h" -#include "nccl.h" // Exit If Abort Barrier across CTA: make sure all threads exit consistently // Each thread sets a predicate to true if abort == 1 @@ -31,17 +30,19 @@ extern __device__ ncclKern_t ncclFuncs[]; static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) { int* d = (int*)dst; int* s = (int*)src; - // When aggregation is effective, if some threads have aborted inside the LL kernel, - // make sure the rest of the threads abort as well - exitIfAbortBarrier(0); for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o]; - __syncthreads(); } -static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) { +static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm) { + // Check whether the last operation was aborted and make sure all threads exit + int abort = tid == 0 ? 
*(comm->abortFlag) : 0; + exitIfAbortBarrier(abort); load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid); + __syncthreads(); if (tid == 0) hostColl->active = 0; } +extern __device__ volatile uint64_t* ncclShmem; + /* Functions for aggregation case */ #define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \ @@ -51,10 +52,11 @@ __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \ #if NCCL_OP == 0 /* Kernels with the first operation inlined */ #define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \ -__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ int tid = threadIdx.x; \ int bid = blockIdx.x; \ + __shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \ + ncclShmem = shmem; \ __shared__ struct ncclColl localColl; \ \ struct ncclDevComm* comm = firstColl.args.comm; \ @@ -65,7 +67,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ c = &firstColl; \ } else { \ c = &localColl; \ - load_coll(c, channel->devCollectives+channel->collFifoHead, tid); \ + load_coll(c, channel->devCollectives+channel->collFifoHead, tid, comm); \ } \ while (1) { \ if (tid < c->args.nThreads) { \ @@ -84,7 +86,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ \ /* Load next collective operation*/ \ c = &localColl; /* for bid 0 */ \ - load_coll(c, channel->devCollectives+nextIndex, tid); \ + load_coll(c, channel->devCollectives+nextIndex, tid, comm); \ } \ } #else @@ -93,13 +95,14 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ // Only generate inline kernels for LL #define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \ - IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \ - IMPL_COLL_KERN(coll##LL, op, 
ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \ + IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) \ + IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ + IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, al, NCCL_PROTO_LL)) \ #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \ - IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \ - IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1) + IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \ + IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) #if NCCL_TYPE == 0 #define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index 435a598..aa1e936 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -263,8 +263,6 @@ __device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthread } } -#define WARP_SIZE 32 - template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS> __device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t, int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS], diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index 010c454..034fe96 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -8,13 +8,16 @@ #include "collectives.h" #include "common.h" +__device__ volatile uint64_t* ncclShmem; + #define NCCL_FUNC5(coll, op, dtype) \ - NCCL_COLL_NAME(coll, op, dtype), \ - NCCL_COLL_NAME(coll##LL, op, dtype) + NCCL_COLL_NAME(coll##LL, op, dtype), \ + NCCL_COLL_NAME(coll##LL128, op, dtype), \ + NCCL_COLL_NAME(coll, op, dtype) #define NCCL_FUNC4(coll, op, dtype) \ - 
NCCL_FUNC5(coll##Ring, op, dtype), \ - NCCL_FUNC5(coll##Tree, op, dtype) + NCCL_FUNC5(coll##Tree, op, dtype), \ + NCCL_FUNC5(coll##Ring, op, dtype) // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(coll, op) \ @@ -50,7 +53,7 @@ NCCL_FUNCS3B(coll, copy), \ NCCL_FUNCS3B(coll, copy) -// Must be consistent with ncclColl_t +// Must be consistent with ncclFunc_t #define NCCL_FUNCS() { \ NCCL_FUNCS2B(ncclBroadcast), \ NCCL_FUNCS2A(ncclReduce), \ @@ -59,7 +62,7 @@ NCCL_FUNCS2A(ncclAllReduce) } // Must be consistent with the ncclFuncSet enum -__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = { +__device__ ncclKern_t ncclFuncs[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { // Don't try to initialize the host shadow copy of this device-side global // variable. There is no host pointer to a device-side function, which // confuses clang. This will be fixed in the next clang release. diff --git a/src/collectives/device/op128.h b/src/collectives/device/op128.h new file mode 100644 index 0000000..9405dc2 --- /dev/null +++ b/src/collectives/device/op128.h @@ -0,0 +1,36 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef OP128_H_ +#define OP128_H_ + +inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) { + asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" + : "=l"(v0), "=l"(v1) : "l"(ptr)); +} + +inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) { + asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};" + :: "l"(v0), "l"(v1), "l"(ptr)); +} + +inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) { + uint64_t* shmemAsmPtr; + asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr)); + return shmemAsmPtr; +} + +inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) { + asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];" + : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr)); +} + +inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) { + asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};" + :: "l"(v0), "l"(v1), "l"(shmemAsmPtr)); +} + +#endif diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index 7beeaf4..aa3d20d 100644 --- a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -37,15 +37,27 @@ class ncclPrimitives { private: const int tid; const int nthreads; + const int wid; + const int stepSize; int nrecv = 0; int nsend = 0; - const int stepSize; - struct ncclConnInfo* recvConn[NRECV]; - struct ncclConnInfo* sendConn[NSEND]; - volatile uint64_t* waitPtr; + struct ncclConnInfo* recvConn = NULL; + volatile uint64_t* recvConnHeadPtr = NULL; + uint64_t recvConnHead; + volatile uint64_t* recvConnTailPtr = NULL; + uint64_t recvConnTail; + uint64_t recvConnTailCache; // Cache last seen value + + struct ncclConnInfo* sendConn = NULL; + volatile int* sendConnFifoPtr = NULL; + volatile uint64_t* sendConnTailPtr = NULL; + uint64_t 
sendConnTail; + volatile uint64_t* sendConnHeadPtr = NULL; + uint64_t sendConnHead; + uint64_t sendConnHeadCache; // Cache last seen value + uint64_t recvStep[NRECV]; uint64_t sendStep[NSEND]; - uint64_t sendConnHead[NSEND]; const T* recvDirectBuff[NRECV]; T* sendDirectBuff[NSEND]; const T* recvBuff[NRECV]; @@ -60,15 +72,18 @@ class ncclPrimitives { inline __device__ void barrier() { asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); } + inline __device__ void subBarrier() { + asm volatile ("bar.sync 2, %0;" :: "r"(nthreads-WARP_SIZE)); + } uint32_t mismatch = 0; const uint64_t opCount; - inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) { + inline __device__ void checkMismatch(struct ncclConnInfo* conn) { if (mismatch) { // In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch *(comm->fatalDevError) = ncclDevAssertedMismatch; - } else if (remoteOpCount && *remoteOpCount > opCount) { + } else if (conn && *conn->opCountRem > opCount) { mismatch += 1; } } @@ -76,49 +91,55 @@ class ncclPrimitives { uint32_t spins = 0; uint32_t abort = 0; - inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) { + inline __device__ int checkAbort(int i, int send) { spins++; - if (spins == SPINS_BEFORE_CHECK_ABORT) { + if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) { abort = *(comm->abortFlag); - checkMismatch(remoteOpCount); + if (wid == i) checkMismatch(send ? 
sendConn : recvConn); spins = 0; } return abort; } - inline __device__ void waitRecv(int i) { + inline __device__ void waitSend(int nbytes) { spins = 0; mismatch = 0; - recvStep[i] += SLICESTEPS; - if (tid == i) { - while (*(waitPtr) < recvStep[i]) { - if (checkAbort(recvConn[i]->opCountRem)) break; + if (sendConnHeadPtr) { + while (sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS) { + sendConnHeadCache = *sendConnHeadPtr; + if (checkAbort(wid, 1)) break; + } + if (sendConnFifoPtr) { + sendConnFifoPtr[sendConnHead%NCCL_STEPS] = nbytes; } + sendConnHead += SLICESTEPS; } } - inline __device__ void waitSend(int i) { + inline __device__ void waitRecv() { spins = 0; mismatch = 0; - sendStep[i] += SLICESTEPS; - if (tid == WARP_SIZE+i) { - while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) { - sendConnHead[i] = *waitPtr; - if (checkAbort(sendConn[i]->opCountRem)) break; + if (recvConnTailPtr) { + while (recvConnTailCache < recvConnTail + SLICESTEPS) { + recvConnTailCache = *recvConnTailPtr; + if (checkAbort(wid, 0)) break; } + recvConnTail += SLICESTEPS; } } - inline __device__ void postRecv(int i) { - *(recvConn[i]->head) = recvStep[i] += SLICESTEPS; + inline __device__ void incRecv(int i) { + recvStep[i] += SLICESTEPS; } - - inline __device__ void postSend(int i) { - *(sendConn[i]->tail) = sendStep[i] += SLICESTEPS; + inline __device__ void postRecv() { + if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += SLICESTEPS; } - inline __device__ void postSendSize(int i, int size) { - if (sendConn[i]->fifo) sendConn[i]->fifo[sendStep[i]%NCCL_STEPS] = size; + inline __device__ void incSend(int i) { + sendStep[i] += SLICESTEPS; + } + inline __device__ void postSend() { + if (sendConnTailPtr) *sendConnTailPtr = sendConnTail += SLICESTEPS; } template <int DIRECTRECV> @@ -131,11 +152,22 @@ class ncclPrimitives { return DIRECTSEND && sendDirectBuff[i] ? 
sendDirectBuff[i]+directOffset : sendPtr(i); } + template <int DIRECTRECV> + inline __device__ int directRecvInc(int i, int directInc, int sliceInc) { + return DIRECTRECV && recvDirectBuff[i] ? directInc : sliceInc; + } + + template <int DIRECTSEND> + inline __device__ int directSendInc(int i, int directInc, int sliceInc) { + return DIRECTSEND && sendDirectBuff[i] ? directInc : sliceInc; + } + template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST> inline __device__ void GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) { int offset = 0; - int sliceSize = stepSize * SLICESTEPS; + int sliceSize = stepSize*SLICESTEPS; + int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32); const T* srcs[RECV*NRECV+SRC]; srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset); @@ -151,101 +183,126 @@ class ncclPrimitives { for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset); } - #pragma unroll 1 + bool syncThread = tid >= nthreads-WARP_SIZE; + + #pragma unroll for (int slice=0; slice<SLICESPERCHUNK; ++slice) { - int realSize = max(0, min(sliceSize, nelem-offset)); - if (tid < nthreads) { - FOR_SEND(waitSend); - FOR_RECV(waitRecv); + int realSize = max(0, min(dataSize, nelem-offset)); + if (!syncThread) { + if (SEND) waitSend(realSize*sizeof(T)); + if (RECV) waitRecv(); if (realSize > 0) { - barrier(); + subBarrier(); if (DIRECTRECV && recvDirectBuff[0]) { // We can only have one direct receive. 
Since srcs[0] == dstPtr+offset, skip one copy if (SEND) { - ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize); + ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads-WARP_SIZE, 1, srcs, nsend, dsts+1, realSize); } } else { - ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize); + ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads-WARP_SIZE, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize); } } - exitIfAbortBarrier(abort); - } else { - exitIfAbortBarrier(abort); - FOR_SEND(postSendSize, realSize*sizeof(T)); - if (SEND) __threadfence_system(); - FOR_SEND(postSend); - FOR_RECV(postRecv); } - for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize; - for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize; - offset += sliceSize; + barrier(); + FOR_SEND(incSend); + FOR_RECV(incRecv); + if (syncThread) { + if (SEND) { + if (realSize > 0 && wid == 0) __threadfence_system(); + __syncwarp(); + postSend(); + } + if (RECV) postRecv(); + } + srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize); + for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize; + dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize); + for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize); + offset += realSize; } } __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) { - recvConn[i] = conn; - recvBuff[i] = (const T*)recvConn[i]->buff; - recvStep[i] = recvConn[i]->step; + recvBuff[i] = (const T*)conn->buff; + recvStep[i] = conn->step; recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS); - // Return credits in case we rounded up. 
- if (tid == nthreads) *recvConn[i]->head = recvStep[i]; - if (tid == i) { - waitPtr = recvConn[i]->tail; - *(recvConn[i]->opCountLoc) = opCount; - } recvDirectBuff[i] = NULL; - if (directBuff && recvConn[i]->direct) { + if (directBuff && conn->direct) { recvDirectBuff[i] = directBuff; - if (tid == 0) *recvConn[i]->ptrExchange = directBuff; + if (tid == 0) *conn->ptrExchange = directBuff; } + if (wid == i) recvConn = conn; + if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up nrecv++; } + __device__ __forceinline__ void loadRecvSync() { + if (tid >= WARP_SIZE && tid < 2*WARP_SIZE && wid<nrecv) { + recvConnTailPtr = recvConn->tail; + recvConnTailCache = *recvConnTailPtr; + } + if (tid >= nthreads-WARP_SIZE && wid < nrecv) { + recvConnHeadPtr = recvConn->head; + // Return credits in case we rounded up. + *recvConnHeadPtr = recvConnHead; + // Update opCount in case we skipped some operations + *(recvConn->opCountLoc) = opCount; + } + } __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) { - sendConn[i] = conn; - sendBuff[i] = (T*)sendConn[i]->buff; - sendStep[i] = sendConn[i]->step; + sendBuff[i] = (T*)conn->buff; + sendStep[i] = conn->step; sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS); - if (tid == WARP_SIZE+i) { - waitPtr = sendConn[i]->head; - sendConnHead[i] = *waitPtr; - *(sendConn[i]->opCountLoc) = opCount; - } sendDirectBuff[i] = NULL; - if (directBuff && sendConn[i]->direct) { - void* volatile* ptr = sendConn[i]->ptrExchange; + if (directBuff && conn->direct) { + void* volatile* ptr = conn->ptrExchange; while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL); - __syncthreads(); + barrier(); if (tid == 0) *ptr = NULL; } + if (wid == i) sendConn = conn; + if (wid == i) sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up nsend++; } + __device__ __forceinline__ void loadSendSync() { + if (tid < nsend) { + sendConnHeadPtr = 
sendConn->head; + sendConnHeadCache = *sendConnHeadPtr; + sendConnFifoPtr = sendConn->fifo; + *(sendConn->opCountLoc) = opCount; + } + if (tid >= nthreads-WARP_SIZE && wid<nsend) { + sendConnTailPtr = sendConn->tail; + } + } - __device__ __forceinline__ void saveRecvConn(int i) { - if (tid == i) { - recvConn[i]->step = recvStep[i]; + __device__ __forceinline__ void saveRecvSync() { + if (tid >= nthreads-WARP_SIZE && wid < nrecv) { + recvConn->step = recvConnHead; + *(recvConn->opCountLoc) = opCount+1; __threadfence_system(); - *(recvConn[i]->opCountLoc) += 1; } } - __device__ __forceinline__ void saveSendConn(int i) { - if (tid == WARP_SIZE+i) { - sendConn[i]->step = sendStep[i]; + __device__ __forceinline__ void saveSendSync() { + if (tid < nsend) { + sendConn->step = sendConnHead; + *(sendConn->opCountLoc) = opCount+1; __threadfence_system(); - *(sendConn[i]->opCountLoc) += 1; } } public: __device__ __forceinline__ ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) - : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) { - // Make sure step is updated before we read it - __syncthreads(); + : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize), opCount(opCount) { + // Make sure step is updated before we read it. + barrier(); for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff); for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff); + loadRecvSync(); + loadSendSync(); } __device__ __forceinline__ void @@ -305,267 +362,13 @@ class ncclPrimitives { } __device__ __forceinline__ ~ncclPrimitives() { - // Save steps for next collective. Have thread 0 do it to be compatible - // with the way LL works. 
- for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i); - for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i); + // Save steps for the next operation + saveRecvSync(); + saveSendSync(); } }; -template <typename T, class FUNC, int NRECV, int NSEND> -class ncclLLPrimitives { - private: - const int tid; - const int nthreads; - int nrecv = 0; - int nsend = 0; - struct ncclConnInfo* recvConn[NRECV]; - struct ncclConnInfo* sendConn[NSEND]; - volatile uint64_t* waitPtr; - volatile uint64_t* postPtr; - volatile int* fifoPtr; - uint64_t recvStep[NRECV]; - uint64_t sendStep[NSEND]; - uint64_t sendConnHead; - union ncclLLFifoLine* recvBuff[NRECV]; - union ncclLLFifoLine* sendBuff[NSEND]; - struct ncclDevComm* comm; - - inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } - inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } - inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } - inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } - inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); } - inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } - - // Exit If Abort Barrier : make sure all threads exit consistently - // Each thread sets a predicate to true if val == 1 - // all CTA's threads enter the barrier and do a popc on their predicates being True - // If any of the thread's predicate was True, all the threads call exit() - inline __device__ void exitIfAbortLocalBarrier() { - uint32_t popc; - asm ("{"); - asm volatile (" .reg .pred barr_pred;"); - asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort)); - asm volatile (" bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads)); - asm ("}"); - if (popc) { - // Make sure threads not participating in the operation get the abort and all threads exit - exitIfAbortBarrier(1); - } - } - 
- inline __device__ void barrier() { - asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); - } - - uint32_t mismatch = 0; - const uint64_t opCount; +#include "prims_ll.h" +//#include "prims_ll128.h" - inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) { - if (mismatch > 20) { - // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch - // Note that we are not using _threadfence_system in LL so the error cannot be asserted - *(comm->fatalDevError) = ncclDevSuspectedMismatch; - } else if (remoteOpCount && *remoteOpCount > opCount) { - mismatch += 1; - } - } - - uint32_t spins = 0; - uint32_t abort = 0; - - inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) { - spins++; - if (spins == SPINS_BEFORE_CHECK_ABORT) { - abort = *(comm->abortFlag); - checkMismatch(remoteOpCount); - spins = 0; - } - return abort; - } - - inline __device__ void waitSend(int i, int nbytes) { - spins = 0; - mismatch = 0; - if (tid == WARP_SIZE+i) { - while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) { - sendConnHead = *waitPtr; - if (checkAbort(sendConn[i]->opCountRem)) break; - } - if (fifoPtr) { - int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes; - fifoPtr[sendStep[i]%NCCL_STEPS] = size; - } - } - } - - inline __device__ void postRecv(int i) { - recvStep[i]++; - if (tid == i) *postPtr = recvStep[i]; - } - - inline __device__ void postSend(int i, int offset) { - // LL Cleanup : write all flags in the slice to make sure we don't have - // data corruption when flag loops over. 
- if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) { - for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i)); - } - sendStep[i]++; - } - - __device__ uint64_t readLL(int i, int offset) { - union ncclLLFifoLine* src = recvPtr(i) + offset; - uint32_t flag = recvFlag(i); - uint32_t data1, flag1, data2, flag2; - spins = 0; - mismatch = 0; - do { - asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); - if (checkAbort(recvConn[i]->opCountRem)) break; - } while ((flag1 != flag) || (flag2 != flag)); - uint64_t val64 = data1 + (((uint64_t)data2) << 32); - return val64; - } - - __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { - asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); - } - - // Using memcpy handles misaligned pointers. - __device__ uint64_t readAL(uint64_t* src) { - uint64_t val; - memcpy((char*)&val, (char*)src, sizeof(uint64_t)); - return val; - } - - __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) { - memcpy((char*)dst, (char*)&val, nbytes); - } - - template <int RECV, int SEND, int SRC, int DST> - __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) { - uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T); - FOR_SEND(waitSend, nbytes*2); - barrier(); - uint32_t npack = DIVUP(nbytes, sizeof(uint64_t)); - uint64_t* srcPack = (uint64_t*)srcPtr; - uint64_t* dstPack = (uint64_t*)dstPtr; - int offset = tid; - // Do multiples of 64 bits - #pragma unroll 2 - for (; offset<npack; offset+=nthreads) { - // Recv : local, then intra-node, then inter-node - uint64_t val = SRC ? 
readAL(srcPack+offset) : readLL(0, offset); - if (RECV) { - if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val); - for (int i=1; i<NRECV && i<nrecv; i++) { - val = MULTI<FUNC, T>()(readLL(i, offset), val); - } - } - - // Send : inter-node, then intra-node, then local - if (SEND) { - for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i)); - storeLL(sendPtr(0)+offset, val, sendFlag(0)); - } - if (DST) { - if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) { - // Last incomplete word - storeAL(dstPack+offset, val, nbytes & 0x7); - } else { - storeAL(dstPack+offset, val, sizeof(uint64_t)); - } - } - } - exitIfAbortLocalBarrier(); - FOR_RECV(postRecv); - FOR_SEND(postSend, offset); - } - - __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { - recvConn[i] = conn; - recvBuff[i] = recvConn[i]->llBuff; - recvStep[i] = recvConn[i]->step; - if (tid == i) { - postPtr = recvConn[i]->head; - *(recvConn[i]->opCountLoc) = opCount; - } - nrecv++; - } - - __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { - sendConn[i] = conn; - sendBuff[i] = sendConn[i]->llBuff; - sendStep[i] = sendConn[i]->step; - if (tid == WARP_SIZE+i) { - waitPtr = sendConn[i]->head; - fifoPtr = sendConn[i]->fifo; - sendConnHead = *waitPtr; - *(sendConn[i]->opCountLoc) = opCount; - } - nsend++; - } - - __device__ __forceinline__ void saveRecvConn(int i) { - if (tid == i) { - recvConn[i]->step = recvStep[i]; - *(recvConn[i]->opCountLoc) += 1; - __threadfence_block(); - } - } - - __device__ __forceinline__ void saveSendConn(int i) { - if (tid == WARP_SIZE+i) { - sendConn[i]->step = sendStep[i]; - *(sendConn[i]->opCountLoc) += 1; - __threadfence_block(); - } - } - - public: - __device__ __forceinline__ - ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) - : comm(comm), tid(tid), nthreads(nthreads), 
opCount(opCount) { - // Make sure step is updated before we read it. - barrier(); - - for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i); - for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i); - } - - __device__ void send(const T* src, int nelem) { - return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem); - } - - __device__ void recv(T* dst, int nelem) { - return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem); - } - - __device__ void recvReduceSend(const T* src, int nelem) { - return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem); - } - - __device__ void recvReduceCopy(const T* src, T* dst, int nelem) { - return LLGenericOp<1, 0, 1, 1>(src, dst, nelem); - } - - __device__ void copySend(const T* src, T* dst, int nelem) { - return LLGenericOp<0, 1, 1, 1>(src, dst, nelem); - } - - __device__ void recvCopySend(T* dst, int nelem) { - return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem); - } - - __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) { - return LLGenericOp<1, 1, 1, 1>(src, dst, nelem); - } - - __device__ __forceinline__ ~ncclLLPrimitives() { - // Save steps for the next operation - for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i); - for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i); - } -}; #endif diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h new file mode 100644 index 0000000..f919493 --- /dev/null +++ b/src/collectives/device/prims_ll.h @@ -0,0 +1,259 @@ +template <typename T, class FUNC, int NRECV, int NSEND> +class ncclLLPrimitives { + private: + const int tid; + const int nthreads; + const int wid; + int nrecv = 0; + int nsend = 0; + struct ncclConnInfo* recvConn = NULL; + volatile uint64_t* recvConnHeadPtr = NULL; + uint64_t recvConnHead; + + struct ncclConnInfo* sendConn = NULL; + volatile int* sendConnFifoPtr = NULL; + volatile uint64_t* sendConnHeadPtr = NULL; + uint64_t sendConnHead; + 
uint64_t sendConnHeadCache; // Cache last seen value + + uint64_t recvStep[NRECV]; + uint64_t sendStep[NSEND]; + union ncclLLFifoLine* recvBuff[NRECV]; + union ncclLLFifoLine* sendBuff[NSEND]; + struct ncclDevComm* comm; + + inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } + inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } + inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } + inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } + inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); } + inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } + + inline __device__ void barrier() { + asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); + } + + uint32_t mismatch = 0; + const uint64_t opCount; + + inline __device__ void checkMismatch(struct ncclConnInfo* conn) { + if (mismatch > 20) { + // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch + // Note that we are not using _threadfence_system in LL so the error cannot be asserted + *(comm->fatalDevError) = ncclDevSuspectedMismatch; + } else if (conn && *conn->opCountRem > opCount) { + mismatch += 1; + } + } + + uint32_t spins = 0; + uint32_t abort = 0; + + inline __device__ int checkAbort(int i, int send) { + spins++; + if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) { + abort = *(comm->abortFlag); + if (wid == i) checkMismatch(send ? 
sendConn : recvConn); + spins = 0; + } + return abort; + } + + inline __device__ void waitSend(int nbytes) { + spins = 0; + mismatch = 0; + if (sendConnHeadPtr) { + while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { + sendConnHeadCache = *sendConnHeadPtr; + if (checkAbort(wid, 1)) break; + } + if (sendConnFifoPtr) { + int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes; + sendConnFifoPtr[sendConnHead%NCCL_STEPS] = size; + } + sendConnHead += 1; + } + barrier(); + } + + inline __device__ void incRecv(int i) { + recvStep[i] += 1; + } + inline __device__ void postRecv() { + barrier(); + if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1; + } + + inline __device__ void incSend(int i, int offset) { + // LL Cleanup : write all flags in the slice to make sure we don't have + // data corruption when flag loops over. + if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) { + for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i)); + } + sendStep[i]++; + } + + __device__ uint64_t readLL(int i, int offset) { + union ncclLLFifoLine* src = recvPtr(i) + offset; + uint32_t flag = recvFlag(i); + uint32_t data1, flag1, data2, flag2; + spins = 0; + mismatch = 0; + do { + asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); + if (checkAbort(i, 0)) break; + } while ((flag1 != flag) || (flag2 != flag)); + uint64_t val64 = data1 + (((uint64_t)data2) << 32); + return val64; + } + + __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { + asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); + } + + // Using memcpy handles misaligned pointers. 
+ __device__ uint64_t readAL(uint64_t* src) { + uint64_t val; + memcpy((char*)&val, (char*)src, sizeof(uint64_t)); + return val; + } + + __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) { + memcpy((char*)dst, (char*)&val, nbytes); + } + + template <int RECV, int SEND, int SRC, int DST> + __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) { + uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T); + uint32_t npack = DIVUP(nbytes, sizeof(uint64_t)); + uint64_t* srcPack = (uint64_t*)srcPtr; + uint64_t* dstPack = (uint64_t*)dstPtr; + int offset = tid; + + // Always waitSend in case of cleanup + if (SEND) waitSend(npack*sizeof(union ncclLLFifoLine)); + + // Do multiples of 64 bits + #pragma unroll 2 + for (; offset<npack; offset+=nthreads) { + // Recv : local, then intra-node, then inter-node + uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset); + if (RECV) { + if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val); + for (int i=1; i<NRECV && i<nrecv; i++) { + val = MULTI<FUNC, T>()(readLL(i, offset), val); + } + } + + // Send : inter-node, then intra-node, then local + if (SEND) { + for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i)); + storeLL(sendPtr(0)+offset, val, sendFlag(0)); + } + if (DST) { + if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) { + // Last incomplete word + storeAL(dstPack+offset, val, nbytes & 0x7); + } else { + storeAL(dstPack+offset, val, sizeof(uint64_t)); + } + } + } + FOR_RECV(incRecv); if (RECV) postRecv(); + FOR_SEND(incSend, offset); + } + + __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { + recvBuff[i] = conn->llBuff; + recvStep[i] = conn->step; + if (wid == i) recvConn = conn; + nrecv++; + } + __device__ __forceinline__ void loadRecvSync() { + if (tid >= nthreads-WARP_SIZE && wid < nrecv) { + recvConnHeadPtr = recvConn->head; + recvConnHead = recvConn->step; + // Update opCount in case we skipped some operations + 
*(recvConn->opCountLoc) = opCount; + } + } + + __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { + sendBuff[i] = conn->llBuff; + sendStep[i] = conn->step; + if (wid == i) sendConn = conn; + nsend++; + } + __device__ __forceinline__ void loadSendSync() { + if (tid < nsend) { + sendConnHeadPtr = sendConn->head; + sendConnHeadCache = *sendConnHeadPtr; + sendConnHead = sendConn->step; + sendConnFifoPtr = sendConn->fifo; + *(sendConn->opCountLoc) = opCount; + } + } + + __device__ __forceinline__ void saveRecvSync() { + if (tid >= nthreads-WARP_SIZE && wid < nrecv) { + recvConn->step = recvConnHead; + *(recvConn->opCountLoc) = opCount+1; + __threadfence_block(); + } + } + + __device__ __forceinline__ void saveSendSync() { + if (tid < nsend) { + sendConn->step = sendConnHead; + *(sendConn->opCountLoc) = opCount+1; + __threadfence_block(); + } + } + + public: + __device__ __forceinline__ + ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), opCount(opCount) { + // Make sure step is updated before we read it. 
+ barrier(); + + for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i); + for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i); + loadRecvSync(); + loadSendSync(); + } + + __device__ void send(const T* src, int nelem) { + return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recv(T* dst, int nelem) { + return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceSend(const T* src, int nelem) { + return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recvReduceCopy(const T* src, T* dst, int nelem) { + return LLGenericOp<1, 0, 1, 1>(src, dst, nelem); + } + + __device__ void copySend(const T* src, T* dst, int nelem) { + return LLGenericOp<0, 1, 1, 1>(src, dst, nelem); + } + + __device__ void recvCopySend(T* dst, int nelem) { + return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) { + return LLGenericOp<1, 1, 1, 1>(src, dst, nelem); + } + + __device__ __forceinline__ ~ncclLLPrimitives() { + // Save steps for the next operation + saveRecvSync(); + saveSendSync(); + } +}; diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h new file mode 100644 index 0000000..40a8cff --- /dev/null +++ b/src/collectives/device/prims_ll128.h @@ -0,0 +1,410 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "op128.h" + +#define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1) + +template <typename T, class FUNC, int NRECV, int NSEND> +class ncclLL128Primitives { + private: + const int tid; + const int nthreads; + const int wid; + const int warp; + const bool flagThread; + int nrecv = 0; + int nsend = 0; + struct ncclConnInfo* recvConn = NULL; + volatile uint64_t* recvConnHeadPtr = NULL; + uint64_t recvConnHead; + + struct ncclConnInfo* sendConn = NULL; + volatile int* sendConnFifoPtr = NULL; + volatile uint64_t* sendConnTailPtr = NULL; + uint64_t sendConnTail; + volatile uint64_t* sendConnHeadPtr = NULL; + uint64_t sendConnHead; + uint64_t sendConnHeadCache; // Cache last seen value + + uint64_t recvStep[NRECV]; + uint64_t sendStep[NSEND]; + uint64_t* recvBuff[NRECV]; + uint64_t* sendBuff[NSEND]; + struct ncclDevComm* comm; + + volatile uint64_t* shmem; + + inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; } + inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; } + inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } + inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } + inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; } + inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; } + + inline __device__ void barrier() { + if (NSEND>NRECV) { + asm volatile ("bar.sync 2, %0;" :: "r"(nthreads)); + } else { + asm volatile ("bar.sync 3, %0;" :: "r"(nthreads)); + } + } + + uint32_t mismatch = 0; + const uint64_t opCount; + + inline __device__ void checkMismatch(struct ncclConnInfo* conn) { + if (mismatch > 20) { + // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch + // Note 
that we are not using _threadfence_system in LL so the error cannot be asserted + *(comm->fatalDevError) = ncclDevSuspectedMismatch; + } else if (conn && *conn->opCountRem > opCount) { + mismatch += 1; + } + } + + uint32_t spins = 0; + uint32_t abort = 0; + + inline __device__ int checkAbort(int i, int send) { + spins++; + if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) { + abort = *(comm->abortFlag); + if (wid == i) checkMismatch(send ? sendConn : recvConn); + spins = 0; + } + return abort; + } + + inline __device__ void waitSend(int nbytes) { + spins = 0; + mismatch = 0; + if (sendConnHeadPtr) { + while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { + sendConnHeadCache = *sendConnHeadPtr; + if (checkAbort(wid, 1)) break; + } + if (sendConnFifoPtr) { + sendConnFifoPtr[sendStep[wid]%NCCL_STEPS] = nbytes; + } + sendConnHead += 1; + } + } + + inline __device__ void incRecv(int i) { + recvStep[i] += 1; + } + inline __device__ void postRecv() { + if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1; + } + + inline __device__ void incSend(int i) { + sendStep[i] += 1; + } + inline __device__ void postSend() { + if (sendConnTailPtr) { __threadfence(); *sendConnTailPtr = sendConnTail += 1; } + } + + template <int ELEMS_PER_THREAD> + inline __device__ void loadSrcToShmem128(int maxOffset, const uint64_t* src64Ptr) { +#if 0 + uint64_t v[ELEMS_PER_THREAD]; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + if (u*WARP_SIZE < maxOffset) load128(src64Ptr+u*WARP_SIZE, v[u], v[u+1]); + } + uint64_t* shmemAsmPtr = shmemCvtPtr(shmem); + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + storeShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]); + } +#else + uint64_t* shmemAsmPtr = shmemCvtPtr(shmem); + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + if (u*WARP_SIZE < maxOffset) { + uint64_t v0, v1; + load128(src64Ptr+u*WARP_SIZE, v0, v1); + storeShmem128(shmemAsmPtr+u*WARP_SIZE, v0, v1); + } + } +#endif + } + + inline __device__ void 
loadSrcToShmem(int start, int end, const T* srcPtr) { + T* shmemPtr = (T*)(shmem-2*wid); + for (int offset = start+wid; offset < end; offset += WARP_SIZE) { + shmemPtr[offset] = srcPtr[offset]; + } + } + + template <int ELEMS_PER_THREAD> + inline __device__ void storeShmemToDst128(int maxOffset, uint64_t* dst64Ptr) { + uint64_t v[ELEMS_PER_THREAD]; + uint64_t* shmemAsmPtr = shmemCvtPtr(shmem); + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]); + } + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]); + } + } + + inline __device__ void storeShmemToDst(int start, int end, T* dstPtr) { + T* shmemPtr = (T*)(shmem-2*wid); + for (int offset = start+wid; offset < end; offset += WARP_SIZE) { + dstPtr[offset] = shmemPtr[offset]; + } + } + + #define WARP_MASK 0xffffffff + + template <int ELEMS_PER_THREAD, int RECV, int SEND, int SRC, int DST> + __device__ __forceinline__ void recvReduceSendCopy(int ll128Offset) { + uint64_t v[ELEMS_PER_THREAD]; + + /************* Data Loading : SHMEM -> REG **************/ + if (SRC) { + volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + v[u] = shmem64Ptr[u*(WARP_SIZE-2)]; + if (!flagThread) v[u+1] = shmem64Ptr[u*(WARP_SIZE-2)+1]; + } + } + /*********** End Data Loading : SHMEM -> REG ************/ + + /************************ Recv **************************/ + if (RECV) { + uint64_t flag = recvFlag(0); + uint64_t* ptr = recvPtr(0)+ll128Offset; + bool needReload; + uint64_t v0, v1; + do { + needReload = false; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + load128(ptr+u*WARP_SIZE, v0, v1); + needReload |= flagThread && (v1 != flag); + } + } while (__any_sync(WARP_MASK, needReload) && checkAbort(0, 0) == 0); + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + load128(ptr+u*WARP_SIZE, v0, 
v1); + v[u] = SRC ? MULTI<FUNC, T>()(v0, v[u]) : v0; + v[u+1] = SRC ? MULTI<FUNC, T>()(v1, v[u+1]) : v1; + } + + for (int i=1; i<NRECV && i<nrecv; i++) { + uint64_t flag = recvFlag(i); + uint64_t* ptr = recvPtr(i)+ll128Offset; + uint64_t v0, v1; + do { + needReload = false; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + load128(ptr+u*WARP_SIZE, v0, v1); + needReload |= flagThread && (v1 != flag); + } + } while (__any_sync(WARP_MASK, needReload) && checkAbort(i, 0) == 0); + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + load128(ptr+u*WARP_SIZE, v0, v1); + v[u] = MULTI<FUNC, T>()(v0, v[u]); + v[u+1] = MULTI<FUNC, T>()(v1, v[u+1]); + } + } + } + /********************** End Recv ************************/ + + /************************ Send **************************/ + if (SEND) { + for (int i=1; i<NSEND && i<nsend; i++) { + int flag = sendFlag(i); + uint64_t* ptr = sendPtr(i)+ll128Offset; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]); + } + } + int flag = sendFlag(0); + uint64_t* ptr = sendPtr(0)+ll128Offset; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + store128(ptr+u*WARP_SIZE, v[u], flagThread ? 
flag : v[u+1]); + } + } + /********************** End Send ************************/ + + /************* Data Storing : REG -> SHMEM **************/ + if (DST) { + volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + shmem64Ptr[u*(WARP_SIZE-2)] = v[u]; + if (!flagThread) shmem64Ptr[u*(WARP_SIZE-2)+1] = v[u+1]; + } + } + /*********** End data Storing : REG -> SHMEM ************/ + } + + #define LL128INC (WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD) + #define ELEMINC (LL128INC-(LL128INC/NCCL_LL128_LINEELEMS)) + + template <int RECV, int SEND, int SRC, int DST> + __device__ void GenericOp(const T* srcPtr, T* dstPtr, int nelem) { + if (nelem <= 0) { + // Don't move any data but still increase steps and sync with prev/next + if (SEND) waitSend(0); + FOR_SEND(incSend); if (SEND) postSend(); + FOR_RECV(incRecv); if (RECV) postRecv(); + return; + } + const int nelem64 = ((nelem*sizeof(T))/(2*sizeof(uint64_t)))*2; + const uint64_t* src64Ptr = ((uint64_t*)srcPtr); + uint64_t* dst64Ptr = ((uint64_t*)dstPtr); + + int ll128Offset = LL128INC*warp+2*wid; + int elemOffset = ELEMINC*warp; + const int nwarps = nthreads/WARP_SIZE; + + if (SEND) waitSend(DIVUP(nelem*sizeof(T), ELEMINC*sizeof(uint64_t))*LL128INC*sizeof(uint64_t)); + barrier(); + + while (elemOffset*(sizeof(uint64_t)/sizeof(T)) < nelem) { + const int maxOffset128 = min(nelem64-elemOffset, (int)ELEMINC); + const int maxOffset = min(nelem-(elemOffset*((int)(sizeof(uint64_t)/sizeof(T)))), (int)(ELEMINC*(sizeof(uint64_t)/sizeof(T)))); + if (SRC) { + int done = 0; + if ((((uint64_t)srcPtr)&0xf) == 0) { + loadSrcToShmem128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, src64Ptr+elemOffset+2*wid); + done = maxOffset128*(sizeof(uint64_t)/sizeof(T)); + } + loadSrcToShmem(done, maxOffset, (T*)(src64Ptr+elemOffset)); + } + __syncwarp(); + recvReduceSendCopy<NCCL_LL128_SHMEM_ELEMS_PER_THREAD, RECV, SEND, SRC, DST>(ll128Offset); + __syncwarp(); + 
if (DST) { + int done = 0; + if ((((uint64_t)dstPtr)&0xf) == 0) { + storeShmemToDst128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, dst64Ptr+elemOffset+2*wid); + done = maxOffset128*(sizeof(uint64_t)/sizeof(T)); + } + storeShmemToDst(done, maxOffset, (T*)(dst64Ptr+elemOffset)); + } + __syncwarp(); + ll128Offset += LL128INC*nwarps; + elemOffset += ELEMINC*nwarps; + } + + barrier(); + FOR_SEND(incSend); if (SEND) postSend(); + FOR_RECV(incRecv); if (RECV) postRecv(); + } + + __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { + recvBuff[i] = conn->ll128Buff; + recvStep[i] = conn->step; + if (wid == i) recvConn = conn; + nrecv++; + } + __device__ __forceinline__ void loadRecvSync() { + if (tid >= nthreads-WARP_SIZE && wid < nrecv) { + recvConnHeadPtr = recvConn->head; + recvConnHead = recvConn->step; + // Update opCount in case we skipped some operations + *(recvConn->opCountLoc) = opCount; + } + } + + __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { + sendBuff[i] = conn->ll128Buff; + sendStep[i] = conn->step; + if (wid == i) sendConn = conn; + nsend++; + } + __device__ __forceinline__ void loadSendSync() { + if (tid < nsend) { + sendConnHeadPtr = sendConn->head; + sendConnHeadCache = *sendConnHeadPtr; + sendConnHead = sendConn->step; + sendConnFifoPtr = sendConn->fifo; + *(sendConn->opCountLoc) = opCount; + } + if (tid >= nthreads-WARP_SIZE && wid<nsend) { + if (sendConn->fifo) { + sendConnTailPtr = sendConn->tail; + sendConnTail = sendConn->step; + } + } + } + + __device__ __forceinline__ void saveRecvSync() { + if (tid >= nthreads-WARP_SIZE && wid < nrecv) { + recvConn->step = recvConnHead; + *(recvConn->opCountLoc) = opCount+1; + __threadfence_block(); + } + } + + __device__ __forceinline__ void saveSendSync() { + if (tid < nsend) { + sendConn->step = sendConnHead; + *(sendConn->opCountLoc) = opCount+1; + __threadfence_block(); + } + } + + public: + __device__ __forceinline__ + 
ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), opCount(opCount), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) { + // Make sure step is updated before we read it. + barrier(); + + for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i); + for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i); + loadRecvSync(); + loadSendSync(); + } + + __device__ void send(const T* src, int nelem) { + return GenericOp<0, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recv(T* dst, int nelem) { + return GenericOp<1, 0, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceSend(const T* src, int nelem) { + return GenericOp<1, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recvReduceCopy(const T* src, T* dst, int nelem) { + return GenericOp<1, 0, 1, 1>(src, dst, nelem); + } + + __device__ void copySend(const T* src, T* dst, int nelem) { + return GenericOp<0, 1, 1, 1>(src, dst, nelem); + } + + __device__ void recvCopySend(T* dst, int nelem) { + return GenericOp<1, 1, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) { + return GenericOp<1, 1, 1, 1>(src, dst, nelem); + } + + __device__ __forceinline__ ~ncclLL128Primitives() { + // Save steps for the next operation + saveRecvSync(); + saveSendSync(); + } +}; diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index d2d5d3b..0680abe 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -11,7 +11,7 @@ template<int UNROLL, class FUNC, typename T> __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - 
const int nthreads = blockDim.x - 1; + const int nthreads = args->nThreads-WARP_SIZE; const int bid = args->bid; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; @@ -30,7 +30,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) { T * __restrict__ thisOutput = (T*)args->ThisOutput; ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC> - prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); + prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); @@ -93,3 +93,48 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) { template<int UNUSED, class FUNC, typename T> __device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { } + +#include "prims_ll128.h" +template<int UNUSED, class FUNC, typename T> +__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int bid = args->bid; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; + const int rank = comm->rank; + const int nranks = comm->nRanks; + const int prevRank = ring->devUserRanks[nranks-1]; + const int root = args->root; + + ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + + const ssize_t loopSize = 
args->nChannels*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize); + ssize_t offset = gridOffset + bid*chunkSize; + + int nelem = min(chunkSize, size-offset); + if (prevRank == root) { + LLprims.send(thisInput+offset, nelem); + } else if (rank == root) { + LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); + } else { + LLprims.recvReduceSend(thisInput+offset, nelem); + } + } +} + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index 09ba56e..1985148 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -11,7 +11,7 @@ template<int UNROLL, class FUNC, typename T> __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int nthreads = blockDim.x - 1; + const int nthreads = args->nThreads-WARP_SIZE; const int bid = args->bid; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; @@ -19,7 +19,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) { const ssize_t size = args->N; const int nranks = comm->nRanks; const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); - const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS; + const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS; const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; // Compute pointers @@ -27,7 +27,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) { T * __restrict__ thisOutput = (T*)args->ThisOutput; ncclPrimitives<UNROLL, 
REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC> - prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); + prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); @@ -121,3 +121,64 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) { template<int UNUSED, class FUNC, typename T> __device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { } + +#include "prims_ll128.h" +template<int UNUSED, class FUNC, typename T> +__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int bid = args->bid; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; + //const int rank = comm->rank; + const int nranks = comm->nRanks; + ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. 
+ const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2; + + const ssize_t loopSize = args->nChannels*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize); + + ssize_t chunkOffset = gridOffset + bid*chunkSize; + + /////////////// begin ReduceScatter steps /////////////// + ssize_t offset; + int nelem = min(chunkSize, size-chunkOffset); + int rankDest; + + // step 0: push data to next GPU + rankDest = ring->devUserRanks[nranks-1]; + offset = chunkOffset + rankDest * size; + + LLprims.send(thisInput+offset, nelem); + + // k-2 steps: reduce and copy to next GPU + for (int j=2; j<nranks; ++j) { + rankDest = ring->devUserRanks[nranks-j]; + offset = chunkOffset + rankDest * size; + + LLprims.recvReduceSend(thisInput+offset, nelem); + } + + // step k-1: reduce this buffer and data, which will produce the final + // result that we store in this data + rankDest = ring->devUserRanks[0]; + offset = chunkOffset + rankDest * size; + + LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem); + } +} + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { } diff --git a/src/debug.cc b/src/debug.cc new file mode 100644 index 0000000..03a77ae --- /dev/null +++ b/src/debug.cc @@ -0,0 +1,169 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "nccl_net.h" +#include <stdlib.h> +#include <stdarg.h> + +int ncclDebugLevel = -1; +thread_local int ncclDebugNoWarn = 0; +uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT +FILE *ncclDebugFile = stdout; +pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; + +void ncclDebugInit() { + pthread_mutex_lock(&ncclDebugLock); + if (ncclDebugLevel != -1) return; + const char* nccl_debug = getenv("NCCL_DEBUG"); + if (nccl_debug == NULL) { + ncclDebugLevel = NCCL_LOG_NONE; + } else if (strcasecmp(nccl_debug, "VERSION") == 0) { + ncclDebugLevel = NCCL_LOG_VERSION; + } else if (strcasecmp(nccl_debug, "WARN") == 0) { + ncclDebugLevel = NCCL_LOG_WARN; + } else if (strcasecmp(nccl_debug, "INFO") == 0) { + ncclDebugLevel = NCCL_LOG_INFO; + } else if (strcasecmp(nccl_debug, "ABORT") == 0) { + ncclDebugLevel = NCCL_LOG_ABORT; + } else if (strcasecmp(nccl_debug, "TRACE") == 0) { + ncclDebugLevel = NCCL_LOG_TRACE; + } + + /* Parse the NCCL_DEBUG_SUBSYS env var + * This can be a comma separated list such as INIT,COLL + * or ^INIT,COLL etc + */ + char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS"); + if (ncclDebugSubsysEnv != NULL) { + int invert = 0; + if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; } + ncclDebugMask = invert ? 
~0ULL : 0ULL; + char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv); + char *subsys = strtok(ncclDebugSubsys, ","); + while (subsys != NULL) { + uint64_t mask = 0; + if (strcasecmp(subsys, "INIT") == 0) { + mask = NCCL_INIT; + } else if (strcasecmp(subsys, "COLL") == 0) { + mask = NCCL_COLL; + } else if (strcasecmp(subsys, "P2P") == 0) { + mask = NCCL_P2P; + } else if (strcasecmp(subsys, "SHM") == 0) { + mask = NCCL_SHM; + } else if (strcasecmp(subsys, "NET") == 0) { + mask = NCCL_NET; + } else if (strcasecmp(subsys, "GRAPH") == 0) { + mask = NCCL_GRAPH; + } else if (strcasecmp(subsys, "TUNING") == 0) { + mask = NCCL_TUNING; + } else if (strcasecmp(subsys, "ALL") == 0) { + mask = NCCL_ALL; + } + if (mask) { + if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask; + } + subsys = strtok(NULL, ","); + } + free(ncclDebugSubsys); + } + + /* Parse and expand the NCCL_DEBUG_FILE path and + * then create the debug file. But don't bother unless the + * NCCL_DEBUG level is > VERSION + */ + const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE"); + if (ncclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) { + int c = 0; + char debugFn[PATH_MAX+1] = ""; + char *dfn = debugFn; + while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) { + if (ncclDebugFileEnv[c++] != '%') { + *dfn++ = ncclDebugFileEnv[c-1]; + continue; + } + switch (ncclDebugFileEnv[c++]) { + case '%': // Double % + *dfn++ = '%'; + break; + case 'h': // %h = hostname + char hostname[1024]; + getHostName(hostname, 1024, '.'); + dfn += snprintf(dfn, PATH_MAX, "%s", hostname); + break; + case 'p': // %p = pid + dfn += snprintf(dfn, PATH_MAX, "%d", getpid()); + break; + default: // Echo everything we don't understand + *dfn++ = '%'; + *dfn++ = ncclDebugFileEnv[c-1]; + break; + } + } + *dfn = '\0'; + if (debugFn[0] != '\0') { + FILE *file = fopen(debugFn, "w"); + if (file != NULL) { + INFO(NCCL_ALL,"DEBUG file is '%s'", debugFn); + ncclDebugFile = file; + } + } + } + +#ifdef ENABLE_TRACE + ncclEpoch = 
std::chrono::high_resolution_clock::now(); +#endif + pthread_mutex_unlock(&ncclDebugLock); +} + +/* Common logging function used by the INFO, WARN and TRACE macros + * Also exported to the dynamically loadable Net transport modules so + * they can share the debugging mechanisms and output files + */ +void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) { + if (ncclDebugLevel == -1) ncclDebugInit(); + if (ncclDebugNoWarn == 1 && level == NCCL_LOG_WARN) level = NCCL_LOG_INFO; + + char hostname[1024]; + getHostName(hostname, 1024, '.'); + int cudaDev; + cudaGetDevice(&cudaDev); + + char buffer[1024]; + size_t len = 0; + pthread_mutex_lock(&ncclDebugLock); + if (ncclDebugNoWarn && ncclDebugLevel == NCCL_LOG_WARN) printf("WARN -> INFO\n"); + if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN) + len = snprintf(buffer, sizeof(buffer), + "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line); + else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask)) + len = snprintf(buffer, sizeof(buffer), + "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); +#ifdef ENABLE_TRACE + else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) { + auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; + double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000; + len = snprintf(buffer, sizeof(buffer), + "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line); + } +#endif + if (len) { + va_list vargs; + va_start(vargs, fmt); + (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); + va_end(vargs); + fprintf(ncclDebugFile,"%s\n", buffer); + fflush(ncclDebugFile); + } + pthread_mutex_unlock(&ncclDebugLock); + + // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort() + if 
(level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) { + fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", + hostname, getpid(), gettid(), cudaDev, filefunc, line); + abort(); + } +} diff --git a/src/enqueue.cc b/src/enqueue.cc index b485634..2239865 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -5,19 +5,17 @@ ************************************************************************/ #include "enqueue.h" -#include "checks.h" -#include "param.h" - -#include "collectives/collectives.h" +#include "argcheck.h" // Only generate inline kernels for LL #define NCCL_FUNC5(coll, op, dtype) \ (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \ + (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \ (void*)NCCL_KERN_NAME(coll##LL, op, dtype) #define NCCL_FUNC4(coll, op, dtype) \ - (void*)NCCL_FUNC5(coll##Ring, op, dtype), \ - (void*)NCCL_FUNC5(coll##Tree, op, dtype) + (void*)NCCL_FUNC5(coll##Tree, op, dtype), \ + (void*)NCCL_FUNC5(coll##Ring, op, dtype) // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(coll, op) \ @@ -54,7 +52,7 @@ NCCL_FUNCS3B(coll, copy) // Must be consistent with the ncclFuncSet enum -static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = { +static void* const ncclKerns[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { NCCL_FUNCS2B(ncclBroadcast), NCCL_FUNCS2A(ncclReduce), NCCL_FUNCS2B(ncclAllGather), @@ -207,6 +205,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { channel->collCount = 0; } params->gridDim.x = params->blockDim.x = 0; + comm->lastOpCount = comm->opCount; NCCLCHECK(transportStartProxy(comm)); return ncclSuccess; } @@ -228,20 +227,70 @@ ncclResult_t ncclEnqueueEvents(ncclComm_t comm) { /* Enqueueing system : computation of kernel and proxy operations parameters */ /*****************************************************************************/ -static ncclResult_t getPatternInfo(struct ncclInfo* info) { - if (info->coll == ncclCollBroadcast) info->pattern = 
ncclPatternPipelineFrom; - else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo; - else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing; - else if (info->coll == ncclCollAllReduce) { - if (info->nBytes <= info->comm->treeThreshold) - info->pattern = ncclPatternTreeUpDown; - else - info->pattern = ncclPatternRingTwice; +// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction +// factor is not ideal but works quite well. Powers of two, 64 B to 1 GB. +static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = { + { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .5, .5, .6, .7, .8, .9, .9, 1.0, 1.0, 1.0 }, + { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .8, .7, .7, .7, .6, .6, .7, .7, .8, .8, .9, .9, 1.0 }, + { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .5, .5, .6, .6, .7, .8, .9 } +}; + +static ncclResult_t getAlgoInfo(struct ncclInfo* info) { + struct ncclComm* comm = info->comm; + float minTime = 3600000.0; // Hopefully no operation will take an hour to complete. + // Find algorithm / protocol. 
+ info->algorithm = -1; + info->protocol = -1; + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + float bw = comm->bandwidths[info->coll][a][p]; + if (bw == 0) continue; + int logSize = log2i(info->nBytes>>6); + if (a == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[p][logSize]; + float time = comm->latencies[info->coll][a][p] + (info->nBytes) / (1000 * bw); + if (time < minTime) { + info->algorithm = a; + info->protocol = p; + minTime = time; + } + } } - else { - WARN("Unknown collective %d", info->coll); + if (info->algorithm == -1 || info->protocol == -1) { + WARN("Error : no algorithm/protocol available"); return ncclInternalError; } + //if (comm->rank == 0) INFO(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %d", info->nBytes, info->algorithm, info->protocol, minTime); + TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); + + int nc = comm->nChannels; + int nt = comm->maxThreads[info->protocol]; + int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol]; + while (info->nBytes < nc*nt*threadThreshold) { + if (nc >= 2) nc--; + else if ((nt % 128) == 0) nt/=2; + else break; + } + if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync + info->nChannels = nc; + info->nThreads = nt; + return ncclSuccess; +} + +static ncclResult_t getPatternInfo(struct ncclInfo* info) { + switch (info->coll) { + case ncclCollBroadcast: + info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break; + case ncclCollReduce: + info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break; + case ncclCollReduceScatter: + case ncclCollAllGather: + info->pattern = ncclPatternRing; break; + case ncclCollAllReduce: + info->pattern = info->algorithm == NCCL_ALGO_TREE ? 
ncclPatternTreeUpDown : ncclPatternRingTwice; break; + default: + WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm); + return ncclInternalError; + } return ncclSuccess; } @@ -264,40 +313,9 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { return ncclSuccess; } -static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) { - // Compute thresholds and limits that users can override - ssize_t perThreadLLThreshold = std::min<ssize_t>(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD); - int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads); - - // First compute nThreads - int nt = NCCL_LL_MIN_NTHREADS; - while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2; - - // Then compute nChannels - int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold); - if (nc == 0) nc = 1; - if (nc > info->comm->nChannels) nc = info->comm->nChannels; - - // Check if we have a fixed LL threshold, otherwise compute it. - int perThreadThreshold = info->comm->threadThreshold; - if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4; - ssize_t llThreshold = info->comm->llThreshold >= 0 ? 
- info->comm->llThreshold : - nc*nt*info->nchunksPerLoop*perThreadThreshold; - - if (info->nBytes <= llThreshold) { - *llMode = 1; - *nChannels = nc; - *nThreads = nt; - } else { - *llMode = 0; - *nChannels = info->comm->nChannels; - *nThreads = info->comm->nThreads+1; - } -} - static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) { // Set nstepsPerLoop and nchunksPerLoop + NCCLCHECK(getAlgoInfo(info)); NCCLCHECK(getPatternInfo(info)); NCCLCHECK(getLoopInfo(info)); @@ -307,48 +325,52 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo coll->args.ThisOutput = info->recvbuff; coll->args.comm = info->comm->devComm; coll->args.opCount = info->comm->opCount; + coll->args.nChannels = info->nChannels; + coll->args.nThreads = info->nThreads; - // Compute llMode, nChannels, nThreads - int llMode; - getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode); - - int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0; - coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode); + coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol); - int stepSize = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS; - int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps; - int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps; + int stepSize = (info->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : info->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS; + int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; + int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? 
info->sliceSteps : 1; int chunkSize = stepSize*chunkSteps; // Compute lastChunkSize - if (treeMode == 1 && llMode == 0) { + if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) { if (info->pattern == ncclPatternTreeUpDown) { // Optimize chunkSize / nSteps - while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2; - while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2; - while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2; + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*8 && chunkSize > 131072) chunkSize /= 2; + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*4 && chunkSize > 65536) chunkSize /= 2; + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2; } // Use lastChunkSize as chunkSize coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); - } else if (llMode == 1) { + } else if (info->protocol == NCCL_PROTO_LL) { int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t); - const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; - coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop); - ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t)); + const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; + coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop); + ALIGN_SIZE(coll->args.lastChunkSize, info->nThreads*sizeof(uint64_t)); coll->args.lastChunkSize /= ncclTypeSize(info->datatype); + } else if (info->algorithm == NCCL_ALGO_TREE && 
info->protocol == NCCL_PROTO_LL128) { + int nstepsInter = 1+log2i(info->comm->nNodes); + while (info->nBytes / (info->nChannels*chunkSize) < nstepsInter*4 && chunkSize > 32768) chunkSize /= 2; + // Use lastChunkSize as chunkSize + coll->args.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype)); } // Compute nSteps for proxies - size_t nBytes = llMode ? info->nBytes*2 : info->nBytes; - - int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize))); + int chunkEffectiveSize = chunkSize; + if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2; + if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS; + //if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol); + int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize))); proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps; proxyArgs->sliceSteps = sliceSteps; proxyArgs->chunkSteps = chunkSteps; - proxyArgs->llMode = llMode; + proxyArgs->protocol = info->protocol; proxyArgs->opCount = info->comm->opCount; - TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p", - coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads, + TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p", + coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads, nLoops, proxyArgs->nsteps, info->comm); return ncclSuccess; } @@ -401,7 +423,7 @@ static 
ncclResult_t saveKernel(struct ncclInfo* info) { channel->collFifoTail = opIndex; channel->collCount++; } - /*if (llMode == 0)*/ info->comm->opCount++; + info->comm->opCount++; return ncclSuccess; } diff --git a/src/graph/connect.cc b/src/graph/connect.cc new file mode 100644 index 0000000..af481d2 --- /dev/null +++ b/src/graph/connect.cc @@ -0,0 +1,268 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "graph.h" +#include "trees.h" +#include "rings.h" + +/******************************************************************/ +/********************* Internode connection ***********************/ +/******************************************************************/ + +ncclResult_t ncclTopoPreset(struct ncclComm* comm, + struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, + struct ncclTopoRanks* topoRanks) { + int rank = comm->rank; + int localRanks = comm->localRanks; + int nChannels = comm->nChannels; + + for (int c=0; c<nChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + channel->ring.prev = channel->ring.next = -1; + channel->treeUp.up = -1; + for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1; + channel->treeDn.up = -1; + for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1; + + int* ringIntra = ringGraph->intra+c*localRanks; + int* treeIntra = treeGraph->intra+c*localRanks; + + for (int i=0; i<localRanks; i++) { + if (ringIntra[i] == rank) { + topoRanks->ringRecv[c] = ringIntra[0]; + topoRanks->ringSend[c] = ringIntra[localRanks-1]; + channel->ring.prev = (i == 0) ? -1 : ringIntra[i-1]; + channel->ring.next = (i == localRanks-1) ? 
-1 : ringIntra[i+1]; + } + if (treeIntra[i] == rank) { + int recvIndex = 0, sendIndex = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; + int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks; + + // Tree loop always flows in the same direction. Other trees are symmetric, i.e. + // up/down go in reverse directions + int sym = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP ? 0 : 1; + + // Down tree is common + topoRanks->treeDnRecv[c] = treeIntra[recvIndex]; + topoRanks->treeDnSend[c] = treeIntra[sendIndex]; + channel->treeDn.up = treeIntra[prev]; + channel->treeDn.down[0] = treeIntra[next]; + // Up tree depends on the pattern + topoRanks->treeUpRecv[c] = sym ? topoRanks->treeDnSend[c] : topoRanks->treeDnRecv[c]; + topoRanks->treeUpSend[c] = sym ? topoRanks->treeDnRecv[c] : topoRanks->treeDnSend[c]; + channel->treeUp.down[0] = sym ? channel->treeDn.down[0] : channel->treeDn.up ; + channel->treeUp.up = sym ? channel->treeDn.up : channel->treeDn.down[0]; + } + } + topoRanks->ringPrev[c] = channel->ring.prev; + topoRanks->ringNext[c] = channel->ring.next; + } + // Duplicate channels rings/trees + struct ncclChannel* channel0 = comm->channels; + struct ncclChannel* channel1 = channel0+nChannels; + memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel)); + return ncclSuccess; +} + +static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext, int* firstRanks) { + int nChannels = comm->nChannels; + int nNodes = comm->nNodes; + for (int c=0; c<nChannels; c++) { + int* recv = ringRecv+c*comm->nRanks; + int* send = ringSend+c*comm->nRanks; + int* prev = ringPrev+c*comm->nRanks; + int* next = ringNext+c*comm->nRanks; + struct ncclChannel* channel0 = comm->channels+c; + struct ncclChannel* channel1 = channel0+nChannels; + for (int n=0; n<nNodes; n++) { + int recvRank = recv[firstRanks[n]]; + int prevSendRank = send[firstRanks[(n-1+nNodes)%nNodes]]; + prev[recvRank] = prevSendRank; + if 
(comm->rank == recvRank) { + channel0->ring.prev = prevSendRank; + channel1->ring.prev = prevSendRank; + } + int sendRank = send[firstRanks[n]]; + int nextRecvRank = recv[firstRanks[(n+1)%nNodes]]; + next[sendRank] = nextRecvRank; + if (comm->rank == sendRank) { + channel0->ring.next = nextRecvRank; + channel1->ring.next = nextRecvRank; + } + } + TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next); + TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next); + } + return ncclSuccess; +} + +static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstRanks) { + for (int n=0; n<nNodes; n++) indexes[n] = ranks[firstRanks[n]]; + return ncclSuccess; +} + +static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) { + if (u0 != -1) tree0->up = indexes[u0]; + if (u1 != -1) tree1->up = indexes[u1]; + return ncclSuccess; +} + +static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) { + int x = 0; + if (down[x] >= 0) x++; + if (down[x] >= 0) { + WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]); + return ncclInternalError; + } + if (r0 != -1) down[x++] = indexes[r0]; + if (r1 != -1) down[x++] = indexes[r1]; + return ncclSuccess; +} + +static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) { + NCCLCHECK(addRanksDown(tree0->down, indexes, d0_0, d0_1)); + NCCLCHECK(addRanksDown(tree1->down, indexes, d1_0, d1_1)); + return ncclSuccess; +} + +static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) { + if (tree->down[0] == upRank) tree->down[0] = -1; + if (rank == upRank) tree->up = -1; + return ncclSuccess; +} + +static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* 
firstRanks) { + const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node; + int* indexesSend, *indexesRecv; + NCCLCHECK(ncclCalloc(&indexesSend, nNodes)); + NCCLCHECK(ncclCalloc(&indexesRecv, nNodes)); + + // Compute tree depth. Not an exact value but a good approximation in most + // cases + int depth = comm->nRanks/nNodes - 1 + log2i(nNodes); + + int u0, d0_0, d0_1, u1, d1_0, d1_1; + NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1)); + for (int c=0; c<nChannels; c++) { + struct ncclChannel* channel0 = comm->channels+c; + struct ncclChannel* channel1 = channel0+nChannels; + NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks)); + NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks)); + NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node])); + NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node])); + int root = indexesSend[node]; + if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1)); + if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1)); + NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks)); + NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks)); + NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node])); + NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? 
root : indexesRecv[node])); + if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1)); + if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1)); + TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c, channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]); + TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]); + TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c, channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]); + TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]); + channel0->treeUp.depth = channel1->treeUp.depth = depth; + } + free(indexesSend); + free(indexesRecv); + return ncclSuccess; +} + +// Legacy naming +NCCL_PARAM(MinNrings, "MIN_NRINGS", -2); +NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2); +// New naming +NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2); +NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2); + +int ncclMinNchannels() { + int minNchannels = 0; + if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings(); + if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels(); + if (minNchannels > MAXCHANNELS) { + WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS); + minNchannels = MAXCHANNELS; + } + if (minNchannels < 0) minNchannels = 0; + return minNchannels; +} +int ncclMaxNchannels() { + int maxNchannels = MAXCHANNELS; + if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings(); + if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels(); + if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS; + if (maxNchannels < 1) { + 
WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels); + maxNchannels = 1; + } + return maxNchannels; +} + +ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings) { + // Gather data from all ranks + int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend; + int nranks = comm->nRanks; + int nChannels = comm->nChannels; + NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS)); + for (int i=0; i<nranks; i++) { + for (int c=0; c<nChannels;c++) { + ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c]; + ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c]; + ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c]; + ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c]; + treeUpRecv[c*nranks+i] = allTopoRanks[i]->treeUpRecv[c]; + treeUpSend[c*nranks+i] = allTopoRanks[i]->treeUpSend[c]; + treeDnRecv[c*nranks+i] = allTopoRanks[i]->treeDnRecv[c]; + treeDnSend[c*nranks+i] = allTopoRanks[i]->treeDnSend[c]; + } + } + + // Connect rings and trees. This should also duplicate the channels. 
+ NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks)); + NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks)); + + // Duplicate ringPrev/ringNext for ncclBuildRing + memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); + memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int)); + + // Duplication should be complete now + nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2); + + // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS. + // We permit combining max, then min, to only use the first channels, then duplicate them. + nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels); + int c; + for (c=nChannels; c<ncclMinNchannels(); c++) { + memcpy(ringPrev+c*nranks, ringPrev+(c-nChannels)*nranks, nranks*sizeof(int)); + memcpy(ringNext+c*nranks, ringNext+(c-nChannels)*nranks, nranks*sizeof(int)); + memcpy(comm->channels+c, comm->channels+c-nChannels, sizeof(struct ncclChannel)); + } + nChannels = comm->nChannels = c; + + // Create rings array and check all is fine + NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext)); + + free(ringRecv); + free(ringSend); + free(ringPrev); + free(ringNext); + free(treeUpRecv); + free(treeUpSend); + free(treeDnRecv); + free(treeDnSend); + + return ncclSuccess; +} diff --git a/src/graph/paths.cc b/src/graph/paths.cc new file mode 100644 index 0000000..ce1772c --- /dev/null +++ b/src/graph/paths.cc @@ -0,0 +1,363 @@ +/************************************************************************* + * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "graph.h" +#include "topo.h" +#include "comm.h" +#include "net.h" + +// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths + +struct ncclTopoNodeList { + struct ncclTopoNode* list[NCCL_TOPO_MAX_NODES]; + int count; +}; + +static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* node, int t, int64_t id, struct ncclTopoLinkList** path) { + for (int i=0; i<system->nodes[t].count; i++) { + if (system->nodes[t].nodes[i].id == id) { + *path = node->paths[t]+i; + return ncclSuccess; + } + } + WARN("Could not find node of type %d id %lx\n", t, id); + return ncclInternalError; +} + +static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) { + if (baseNode->paths[baseNode->type] == NULL) { + NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count)); + } + + // breadth-first search to set all paths to that node in the system + struct ncclTopoNodeList nodeList; + struct ncclTopoNodeList nextNodeList; + nodeList.count = 1; nodeList.list[0] = baseNode; + nextNodeList.count = 0; + struct ncclTopoLinkList* basePath; + NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath)); + basePath->count = 0; + basePath->width = LOC_WIDTH; + basePath->type = LINK_LOC; + + while (nodeList.count) { + nextNodeList.count = 0; + for (int n=0; n<nodeList.count; n++) { + struct ncclTopoNode* node = nodeList.list[n]; + struct ncclTopoLinkList* path; + NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path)); + for (int l=0; l<node->nlinks; l++) { + struct ncclTopoLink* link = node->links+l; + struct ncclTopoNode* remNode = link->remNode; + if (remNode->paths[baseNode->type] == NULL) { + NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count)); + } + struct ncclTopoLinkList* remPath; + 
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath)); + int width = std::min(path->width, link->width); + if (remPath->width < width) { + // Find reverse link + for (int l=0; l<remNode->nlinks; l++) { + if (remNode->links[l].remNode == node) { + remPath->list[0] = remNode->links+l; + break; + } + } + if (remPath->list[0] == NULL) { + WARN("Failed to find reverse path from remNode id %d type %d nlinks %d to node id %d type %d", + remNode->id, remNode->type, remNode->nlinks, node->id, node->type); + return ncclInternalError; + } + // Copy the rest of the path + for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i]; + remPath->count = path->count + 1; + remPath->width = width; + + // Consider the path is QPI when going through the CPU + // Also don't consider LINK_NET as we only care about the NIC->GPU path. + int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type; + remPath->type = std::max(path->type, type); + + // Add to the list for the next iteration if not already in the list + // Disallow GPUs as intermediate steps for now + if (remNode->type != GPU) { + int i; + for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break; + if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode; + } + } + } + } + memcpy(&nodeList, &nextNodeList, sizeof(nodeList)); + } + return ncclSuccess; +} + +static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* node) { + char line[1024]; +#ifdef ENABLE_TRACE + INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id); +#else + sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id); + int offset = strlen(line); +#endif + for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) { + if (node->paths[t] == NULL) continue; + for (int n = 0; n<system->nodes[t].count; n++) { +#ifdef ENABLE_TRACE + line[0] = 0; + int offset = 0; + for (int i=0; i<node->paths[t][n].count; i++) { + struct 
ncclTopoLink* link = node->paths[t][n].list[i]; + struct ncclTopoNode* remNode = link->remNode; + sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id); + offset = strlen(line); + } + INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width); +#else + sprintf(line+offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type); + offset = strlen(line); +#endif + } + } +#ifndef ENABLE_TRACE + INFO(NCCL_GRAPH, "%s", line); +#endif +} + +ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) { + for (int i=0; i<system->nodes[GPU].count; i++) { + printNodePaths(system, system->nodes[GPU].nodes+i); + } + for (int i=0; i<system->nodes[NET].count; i++) { + printNodePaths(system, system->nodes[NET].nodes+i); + } + return ncclSuccess; +} + +static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) { + // Find the closest CPU to a GPU + int minHops = 0; + int localCpu = -1; + struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU]; + for (int c=0; c<system->nodes[CPU].count; c++) { + int hops = paths[c].count; + if (minHops == 0 || hops < minHops) { + localCpu = c; + minHops = hops; + } + } + if (localCpu == -1) { + WARN("Error : could not find CPU close to GPU %d", gpu); + return ncclInternalError; + } + *retCpu = localCpu; + return ncclSuccess; +} + +static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int i1, int t2, int i2) { + struct ncclTopoNode* cpuNode = system->nodes[CPU].nodes+c; + struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1; + + int l=0; + // Node 1 -> CPU + for (int i=0; i<srcNode->paths[CPU][c].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[CPU][c].list[i]; + // CPU -> Node 2 + for (int i=0; i<cpuNode->paths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i]; + + // Update path 
characteristics + srcNode->paths[t2][i2].count = l; + srcNode->paths[t2][i2].type = LINK_QPI; + srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width); + return ncclSuccess; +} + +// Remove/free paths for a given type +static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) { + for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) { + for (int n=0; n<system->nodes[t].count; n++) { + struct ncclTopoNode* node = system->nodes[t].nodes+n; + free(node->paths[nodeType]); + node->paths[nodeType] = NULL; + } + } +} + +ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) { + // Precompute paths between GPUs/NICs. + + // Remove everything in case we're re-computing + for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t); + + // Set direct paths from/to CPUs. We need them in many cases. + for (int c=0; c<system->nodes[CPU].count; c++) { + NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system)); + } + + // Set direct paths from/to GPUs. 
+ for (int g=0; g<system->nodes[GPU].count; g++) { + // Compute paths to GPU g + NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system)); + + if (peerInfos == NULL) continue; + // Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM + struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank; + for (int p=0; p<system->nodes[GPU].count; p++) { + if (p == g) continue; + struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank; + int p2p; + NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo)); + if (p2p == 0) { + int shm; + NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo)); + if (shm == 1) { + // We cannot use GPU Direct, so we need all traffic to go through a CPU + int cpu; + NCCLCHECK(getLocalCpu(system, g, &cpu)); + NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g)); + } else { + // We cannot communicate with that peer. + system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; + } + } + } + } + + // Set direct paths from/to NICs. 
+ for (int n=0; n<system->nodes[NET].count; n++) { + struct ncclTopoNode* netNode = system->nodes[NET].nodes+n; + NCCLCHECK(ncclTopoSetPaths(netNode, system)); + + if (peerInfos == NULL) continue; + for (int g=0; g<system->nodes[GPU].count; g++) { + if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) == 0) { + // We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths + // to go through a CPU + int localCpu; + NCCLCHECK(getLocalCpu(system, g, &localCpu)); + NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g)); + NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n)); + } + } + } + + return ncclSuccess; +} + +ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) { + int *domains; + int64_t *ids; + NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count)); + NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count)); + int myDomain = 0; + for (int g=0; g<system->nodes[GPU].count; g++) { + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + domains[g] = g; + ids[g] = gpu->id; + for (int p=0; p<g; p++) { + if (gpu->paths[GPU][p].count > 0) { + domains[g] = std::min(domains[g], domains[p]); + } + } + if (gpu->rank == comm->rank) myDomain = domains[g]; + } + + int ngpus = system->nodes[GPU].count; + for (int i=0; i<ngpus; i++) { + if (domains[i] == myDomain) continue; + struct ncclTopoNode* gpu = NULL; + int g; + for (g=0; g<system->nodes[GPU].count /* This one varies over the loops */; g++) { + gpu = system->nodes[GPU].nodes+g; + if (gpu->id == ids[i]) break; else gpu=NULL; + } + if (gpu == NULL) { + WARN("Could not find id %lx", ids[i]); + free(domains); + free(ids); + return ncclInternalError; + } + + // Remove GPUs I can't access (even indirectly) from my view of the node + for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) { + for (int n=0; n<system->nodes[t].count; n++) { + struct ncclTopoNode* node = system->nodes[t].nodes+n; + if (node == gpu) continue; + for (int l=0; l<node->nlinks; l++) { + while 
(l<node->nlinks && node->links[l].remNode == gpu) { + if (l<node->nlinks-1) + memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink)); + node->nlinks--; + } + if (l<node->nlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) { + node->links[l].remNode--; + } + } + } + } + if (g != system->nodes[GPU].count-1) + memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode)); + system->nodes[GPU].count--; + } + + comm->localRanks = system->nodes[GPU].count; + if (system->nodes[GPU].count == comm->nRanks) { + // Trim network + ncclTopoRemovePathType(system, NET); + system->nodes[NET].count = 0; + } + free(domains); + free(ids); + return ncclSuccess; +} + +static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) { + int nvlSpeed = 0; + int nvlPeers = 0; + int pciSpeed = 0; + for (int l=0; l<node->nlinks; l++) { + if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width; + if (node->links[l].remNode->type == GPU) nvlPeers++; else nvlPeers = 2; + if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width; + } + *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed)); + return ncclSuccess; +} + +ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) { + // Compute max speed to try to accelerate the search. 
+ system->maxSpeed = LOC_WIDTH; + + for (int g=0; g<system->nodes[GPU].count; g++) { + NCCLCHECK(getGpuSpeed(system->nodes[GPU].nodes+g, &system->maxSpeed)); + } + if (system->nodes[NET].count) { + // Try to assign one NIC per GPU + int netMaxSpeed = 0; + int netMaxSpeedCount = 0; + for (int n=0; n<system->nodes[NET].count; n++) { + int maxSpeed = 0; + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + for (int g=0; g<system->nodes[GPU].count; g++) { + maxSpeed = std::max(maxSpeed, net->paths[GPU][g].width); + } + if (maxSpeed > netMaxSpeed) { + netMaxSpeed = maxSpeed; + netMaxSpeedCount = 1; + } else if (maxSpeed == netMaxSpeed) { + netMaxSpeedCount++; + } + } + system->maxSpeed = std::min(system->maxSpeed, netMaxSpeedCount*NET_WIDTH); + } + return ncclSuccess; +} + +void ncclTopoFree(struct ncclTopoSystem* system) { + for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t); + free(system); +} diff --git a/src/graph/rings.cc b/src/graph/rings.cc new file mode 100644 index 0000000..5aacbb5 --- /dev/null +++ b/src/graph/rings.cc @@ -0,0 +1,57 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" + +#define MAXWIDTH 20 +#define PREFIXLEN 15 +#define STRLENGTH (PREFIXLEN+5*MAXWIDTH) +void dumpLine(int* values, int nranks, const char* prefix) { + int prefixlen = strlen(prefix); + char line[STRLENGTH+1]; + line[STRLENGTH] = '\0'; + memset(line, ' ', STRLENGTH); + strncpy(line, prefix, PREFIXLEN); + for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]); + INFO(NCCL_INIT,"%s", line); +} + +ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { + for (int r=0; r<nrings; r++) { + char prefix[30]; + /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r); + dumpLine(prev+r*nranks, nranks, prefix); + sprintf(prefix, "[%d] Channel %d Next : ", rank, r); + dumpLine(next+r*nranks, nranks, prefix);*/ + + int current = rank; + for (int i=0; i<nranks; i++) { + rings[r*nranks+i] = current; + current = next[r*nranks+current]; + } + sprintf(prefix, "Channel %02d/%02d : ", r, nrings); + if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix); + if (current != rank) { + WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank); + return ncclInternalError; + } + // Check that all ranks are there + for (int i=0; i<nranks; i++) { + int found = 0; + for (int j=0; j<nranks; j++) { + if (rings[r*nranks+j] == i) { + found = 1; + break; + } + } + if (found == 0) { + WARN("Error : ring %d does not contain rank %d", r, i); + return ncclInternalError; + } + } + } + return ncclSuccess; +} diff --git a/src/graph/rings.h b/src/graph/rings.h new file mode 100644 index 0000000..c52b1ca --- /dev/null +++ b/src/graph/rings.h @@ -0,0 +1,7 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next); diff --git a/src/graph/search.cc b/src/graph/search.cc new file mode 100644 index 0000000..3a8b4e7 --- /dev/null +++ b/src/graph/search.cc @@ -0,0 +1,594 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "graph.h" +#include "topo.h" + +static ncclResult_t ncclTopoFollowPath(struct ncclTopoGraph* graph, struct ncclTopoLinkList* path, struct ncclTopoNode** node, int width, int typeSave) { + if (path->count == 0) return ncclSuccess; + + *node = NULL; + if (width > 0) { + if (path->type > graph->type) return ncclSuccess; + graph->type = std::max(graph->type, path->type); + graph->nHops += path->count; + } else { + graph->type = typeSave; + graph->nHops -= path->count; + } + + for (int i=0; i<path->count; i++) { + if (path->list[i]->width < width) { + // Can't follow this path, rewind and exit + for (int j=0; j<i; j++) path->list[j]->width += width; + return ncclSuccess; + } + path->list[i]->width -= width; + } + *node = path->list[path->count-1]->remNode; + return ncclSuccess; +} + +static int gpuPciWidth(struct ncclTopoNode* gpu) { + for (int l=0; l<gpu->nlinks; l++) { + struct ncclTopoLink* gpuLink = gpu->links+l; + if (gpuLink->type != LINK_PCI) continue; + struct ncclTopoNode* pci = gpuLink->remNode; + for (int l=0; l<pci->nlinks; l++) { + struct ncclTopoLink* pciLink = pci->links+l; + if (pciLink->remNode != gpu) continue; + return std::min(gpuLink->width, pciLink->width); + } + } + return -1; +} + +/* Choose the order in which we try next GPUs. 
This is critical for the search + to quickly converge to the best solution even if it eventually times out. */ +struct ncclGpuScore { + int g; // Retain the index + int startIndex; // Least important + int intraNhops; + int intraWidth; + int interNhops; + int interPciWidth; + int interWidth; // Most important +}; + +static int cmpScore(const void * g1, const void * g2) { + struct ncclGpuScore *s1 = (struct ncclGpuScore*)g1; + struct ncclGpuScore *s2 = (struct ncclGpuScore*)g2; + int d; + if ((d = (s2->interWidth - s1->interWidth))) return d; + if ((d = (s2->interPciWidth - s1->interPciWidth))) return d; + if ((d = (s1->interNhops - s2->interNhops))) return d; + if ((d = (s2->intraWidth - s1->intraWidth))) return d; + if ((d = (s1->intraNhops - s2->intraNhops))) return d; + return s1->startIndex - s2->startIndex; +} + +static int cmpIntraScores(struct ncclGpuScore* scores, int count) { + int intraWidth = scores[0].intraWidth; + int intraNhops = scores[0].intraNhops; + for (int i=1; i<count; i++) { + if (scores[i].intraWidth != intraWidth || scores[i].intraNhops != intraNhops) return 1; + } + return 0; +} + +static ncclResult_t getNetPaths(struct ncclTopoSystem* system, const uint64_t flag, struct ncclTopoLinkList** netPaths) { + for (int n=0; n<system->nodes[NET].count; n++) { + if (system->nodes[NET].nodes[n].used & flag) { + *netPaths=system->nodes[NET].nodes[n].paths[GPU]; + return ncclSuccess; + } + } + return ncclInternalError; +} + +ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* gpu, int* next, int* countPtr, int sortNet) { + const uint64_t flag = 1ULL<<(graph->nChannels); + int ngpus = system->nodes[GPU].count; + struct ncclTopoLinkList* paths = gpu->paths[GPU]; + struct ncclTopoLinkList* netPaths = NULL; + if (sortNet) NCCLCHECK(getNetPaths(system, flag, &netPaths)); + + struct ncclGpuScore scores[NCCL_TOPO_MAX_NODES]; + memset(scores, 0, ngpus*sizeof(struct ncclGpuScore)); + int start 
= gpu-system->nodes[GPU].nodes; + int count = 0; + for (int i=1; i<ngpus; i++) { + int g = (start+i)%ngpus; + if (paths[g].count == 0) continue; // There is no path to that GPU + if (system->nodes[GPU].nodes[g].used & flag) continue; + scores[count].g = g; + scores[count].startIndex = i; + scores[count].intraNhops = paths[g].count; + scores[count].intraWidth = paths[g].width; + if (netPaths) { + scores[count].interNhops = netPaths[g].count; + scores[count].interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g); + scores[count].interWidth = netPaths[g].width; + } + count++; + } + + // Sort GPUs + qsort(scores, count, sizeof(struct ncclGpuScore), cmpScore); + + // Check if all have the same intra-node score in which case we go reverse for sortNet = -1 + if (sortNet == -1 && cmpIntraScores(scores, count) == 0) { + for (int i=0; i<count; i++) next[i] = scores[count-1-i].g; + } else { + for (int i=0; i<count; i++) next[i] = scores[i].g; + } + *countPtr = count; + return ncclSuccess; +} + +ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time); + +#define NCCL_SEARCH_TIMEOUT (1ULL<<20) // This should keep all searches within a second or so.
+ +#define FORCED_ORDER_PCI 1 +#define FORCED_ORDER_REPLAY 2 + +ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int step, int* g) { + *g = -1; + if (graph->nChannels == 0) return ncclInternalError; + int ngpus = system->nodes[GPU].count; + int nextRank = graph->intra[(graph->nChannels-1)*ngpus+step+1]; + for (int i=0; i<ngpus; i++) if (system->nodes[GPU].nodes[i].rank == nextRank) { + *g = i; + return ncclSuccess; + } + if (*g == -1) return ncclInternalError; + return ncclSuccess; +} + +ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time); + +ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoLinkList* paths, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time, int g, int speed) { + int typeSave = graph->type; + const uint64_t flag = 1ULL<<(graph->nChannels); + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, speed, typeSave)); + if (gpu) { + gpu->used ^= flag; + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, maxSpeed, time)); + gpu->used ^= flag; + if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, -speed, typeSave)); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) { + // 0. When we are trying to increase speedIntra, do not copy if the solution has less channels + // since it would likely impact the rings algorithms too. + if (graph->speedIntra > graph->speedInter && graph->nChannels < refGraph->nChannels) return ncclSuccess; + + // 1. 
Try to get better bandwidth + if (graph->nChannels*graph->speedIntra < refGraph->nChannels*refGraph->speedIntra) return ncclSuccess; + if (graph->nChannels*graph->speedIntra > refGraph->nChannels*refGraph->speedIntra) { + *copy = 1; + return ncclSuccess; + } + // 2. Give an advantage when all channels are the same + if (graph->nChannels > 1 && graph->sameChannels && refGraph->sameChannels == 0) { + *copy = 1; + return ncclSuccess; + } + // 3. Less hops + if (graph->nHops < refGraph->nHops) *copy = 1; + return ncclSuccess; +} + +ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time) { + if ((*time) <= 0) return ncclSuccess; + (*time)--; + + int ngpus = system->nodes[GPU].count; + if (step == ngpus) { + // Determine whether we found a better solution or not + int copy = 0; + int sameChannels = graph->sameChannels; + if (graph->nChannels > 0) { + int* intra = graph->intra+graph->nChannels*ngpus; + for (int g=0; g<ngpus; g++) if (intra[g] != intra[g-ngpus]) graph->sameChannels = 0; + } + graph->nChannels++; + NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, &copy)); + if (copy) { + memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph)); + if (graph->nChannels*graph->speedIntra == maxSpeed) *time = -1; + } + if (graph->nChannels < MAXCHANNELS/2) { + NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, maxSpeed, time)); + } + graph->nChannels--; + graph->sameChannels = sameChannels; + return ncclSuccess; + } + graph->intra[graph->nChannels*ngpus+step] = gpu->rank; + if (step == backToNet) { + // first get back to NIC + if (system->nodes[NET].count) { + int maxWidth = 0; + struct ncclTopoLinkList* paths = gpu->paths[NET]; + for (int n=0; n<system->nodes[NET].count; n++) { + if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue; + maxWidth
= std::max(paths[n].width, maxWidth); + } + for (int n=0; n<system->nodes[NET].count; n++) { + if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue; + if (paths[n].width == maxWidth) { + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + int typeSave = graph->type; + NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, graph->speedInter, typeSave)); + if (net) { + graph->inter[graph->nChannels*2+1] = net->id; + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, maxSpeed, time)); + NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, -graph->speedInter, typeSave)); + } + } + } + } + } else if (step < system->nodes[GPU].count-1) { + // Go to next GPU + struct ncclTopoLinkList* paths = gpu->paths[GPU]; + int next[NCCL_TOPO_MAX_NODES]; + int count; + if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order + next[0] = step+1; + count = 1; + } else if (forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order + NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next)); + count = 1; + } else { // Normal search + NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 )); + } + for (int i=0; i<count; i++) { + int g = next[i]; + int nvlink = graph->nvlink; + graph->nvlink &= paths[g].type <= LINK_NVL ? 
1 : 0; + int speed = graph->speedIntra; + if (paths[g].type == LINK_QPI) speed = INTEL_P2P_OVERHEAD(speed); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, step+1, backToNet, backToFirstRank, forcedOrder, maxSpeed, time, g, speed)); + graph->nvlink = nvlink; + } + } else if (step == backToFirstRank) { + // Find first GPU and loop back to it + int g; + int rank = graph->intra[graph->nChannels*ngpus]; + for (g=0; g<ngpus; g++) { + if (system->nodes[GPU].nodes[g].rank == rank) break; + } + if (g == ngpus) { + WARN("Could not find GPU with rank %d\n", rank); + return ncclInternalError; + } + struct ncclTopoLinkList* paths = gpu->paths[GPU]; + struct ncclTopoNode* firstGpu = system->nodes[GPU].nodes+g; + int typeSave = graph->type; + NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, graph->speedIntra, typeSave)); + if (firstGpu) { + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, maxSpeed, time)); + NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, -graph->speedIntra, typeSave)); + } + } else { + // Next path + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, maxSpeed, time)); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int maxSpeed, int* time) { + const uint64_t flag = 1ULL<<(graph->nChannels); + const int speed = graph->speedInter; + for (int n=0; n<system->nodes[NET].count; n++) { + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + struct ncclTopoNode* gpu; + if (net->used == 0) { + graph->inter[graph->nChannels*2] = net->id; + for (int i=0; i<system->nodes[NET].count; i++) { + if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag; + } + struct ncclTopoLinkList* paths = net->paths[GPU]; + + // First try the PCI order to set a reference + 
NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, speed)); + // Then try to replay the last channel + if (graph->nChannels > 0) { + int g; + NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, g, speed)); + } + + // Then try the most local GPUs + int maxWidth = 0, minHops = 0xfffffff; + for (int g=0; g<system->nodes[GPU].count; g++) { + if (paths[g].width > maxWidth) { + maxWidth = paths[g].width; + minHops = paths[g].count; + } else if (paths[g].width == maxWidth && paths[g].count < minHops) { + minHops = paths[g].count; + } + } + if (maxWidth >= speed) { + // In the first loop, avoid using GPUs in both directions between channels (one channel + // sending from that GPU and one channel receiving to that GPU), since that usually leads + // to lower BW. + for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) { + for (int g=0; g<system->nodes[GPU].count; g++) { + if (paths[g].width == maxWidth && paths[g].count == minHops) { + gpu = system->nodes[GPU].nodes+g; + int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1; + if (tryGpuBidir == gpuUsed) { + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, 0, maxSpeed, time, g, speed)); + } + } + } + } + } + for (int i=0; i<system->nodes[NET].count; i++) { + if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag; + } + } + } + return ncclSuccess; +} + +/* Search Patterns + * + * Intra-node + * Ring : GPU a -> GPU b -> .. -> GPU x -> GPU a + * (=Split Tree Loop) + * Tree : GPU a -> GPU b -> .. -> GPU x + * (=Split Tree) + * + * Inter-node + * Ring : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic) + * Tree : NET n -> GPU a -> GPU b -> .. 
-> GPU x + * `--> NET n (or m if crossNic) + * Split Tree : NET n -> GPU a -> GPU b -> .. -> GPU x + * `--> NET n (or m if crossNic) + * Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a + * `--> NET n (or m if crossNic) + */ +ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) { + if (system->nodes[NET].count) { + if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1; + else if (pattern == NCCL_TOPO_PATTERN_TREE) *backToNet = 0; + else *backToNet = 1; + if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1; + else *backToFirstRank = -1; + } else { + *backToNet = -1; + if (pattern == NCCL_TOPO_PATTERN_RING || pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1; + else *backToFirstRank = -1; + } + return ncclSuccess; +} + +ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time) { + int backToNet, backToFirstRank; + NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank)); + if (system->nodes[NET].count) { + // Start from NET + ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, maxSpeed, time); + } else { + // Start from GPU 0 + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, graph->speedIntra)); + if (graph->nChannels > 0) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, 0, graph->speedIntra)); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, 0, maxSpeed, time, 0, graph->speedIntra)); + } + return ncclSuccess; +} + +/* Parse user defined rings. 
Format is like : + * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0" + * Rings with a non-matching number of ranks are ignored so we can provide + * rings for multiple cases. + */ +#define MAX_ENV_RANKS 512 +static ncclResult_t parseGraph(const char* str, int* nChannelsRet, int ngpus, int* channels) { + int ranks[MAX_ENV_RANKS]; + int nChannels = 0; + int rank = 0; + int offset = 0; + int status = 0; // 0 : between numbers, 1 : inside number + do { + int digit = str[offset] - '0'; + if (digit >= 0 && digit <= 9) { + if (status == 0) { + ranks[rank] = digit; + status = 1; + } else { + ranks[rank] = ranks[rank]*10+digit; + } + } else { + if (status == 1) { + rank++; + if (rank == MAX_ENV_RANKS) goto end; + } + status = 0; + if (str[offset] == '|' || str[offset] == '\0') { + // Ignore if ngpus doesn't match + if (rank != ngpus) goto newchannel; + + for (int r=0; r<ngpus; r++) { + int rank = ranks[r]; + // Ignore if ranks are out of bounds + if (rank < 0 || rank >= ngpus) goto newchannel; + // Ignore if ranks are duplicate + for (int i=0; i<r; i++) + if (ranks[i] == rank) goto newchannel; + + channels[nChannels*ngpus+r] = rank; + } + nChannels++; +newchannel: + rank = 0; + } + } + } while (str[offset++] != 0); +end: + *nChannelsRet = nChannels; + return ncclSuccess; +} + +ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { + int ngpus = system->nodes[GPU].count; + int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 
1 : 0; + graph->speedIntra = graph->speedInter = 0; + if (graph->crossNic == 2) graph->crossNic = 0; + graph->nvlink = 0; + graph->type = LINK_LOC; + graph->nChannels = 0; + graph->sameChannels = 1; + + char* str = getenv("NCCL_GRAPH"); + if (str) { + NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra)); + for (int i=0; i<graph->nChannels*ngpus; i++) { + // Translate gpu numbers into ranks + graph->intra[i] = system->nodes[GPU].nodes[graph->intra[i]].rank; + } + // TODO : let user specify NICs + graph->inter[0] = graph->inter[1] = 0; + graph->speedIntra = graph->speedInter = PCI_WIDTH+2; + graph->nvlink = 0; + if (graph->pattern == NCCL_TOPO_PATTERN_RING) { + // Reverse the loop + for (int c=0; c<graph->nChannels; c++) { + for (int i=0; i<=ngpus/2; i++) { + int tmp = graph->intra[ngpus*c+i]; + graph->intra[ngpus*c+i] = graph->intra[ngpus*c+(ngpus-i)%ngpus]; + graph->intra[ngpus*c+ngpus-i] = tmp; + } + } + } + if (graph->nChannels) return ncclSuccess; + } + + if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; + + struct ncclTopoGraph tmpGraph; + memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); + int bestSpeed = 0; + + // First try crossnic, then decrease speed and finally increase speedIntra. + tmpGraph.speedIntra = tmpGraph.speedInter = system->maxWidth; + int maxSpeed = system->maxSpeed; + tmpGraph.pattern = graph->pattern; + +search: + int time = NCCL_SEARCH_TIMEOUT; + tmpGraph.nvlink = 1; + tmpGraph.nChannels = 0; + tmpGraph.sameChannels = 1; + NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, maxSpeed, &time)); +#if 0 + printf("Pattern %d, crossNic %d, Speed %d/%d, type %d -> nChannels %dx%d/%d %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.type, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? 
"TIMEOUT" : ""); + for (int c=0; c<graph->nChannels; c++) { + printf("%2d : ", c); + for (int g=0; g<ngpus; g++) { + printf("%d ", graph->intra[c*ngpus+g]); + } + printf("\n"); + } +#endif + if (time == -1) goto done; + // We already have a solution and we timed out so lower speed will just timeout as well + if (time == 0 && graph->nChannels > 0) goto done; + if ((graph->nChannels > 0) && (bestSpeed == 0)) bestSpeed = graph->speedIntra; + + if (tmpGraph.speedIntra == tmpGraph.speedInter) { + // First pass, we don't have a solution yet ; try to go slower. + + // Try a simpler tree + if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) { + tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE; + goto search; + } + if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) { + tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE; + goto search; + } + tmpGraph.pattern = graph->pattern; + + if (tmpGraph.type < LINK_QPI) { + tmpGraph.type += 1; + goto search; + } + tmpGraph.type = graph->type; + + if (crossNic && tmpGraph.crossNic == 0) { + // Try again with crossNic if permitted + tmpGraph.crossNic = crossNic; + goto search; + } + tmpGraph.crossNic = graph->crossNic; + + // Try to reduce speed per channel + tmpGraph.speedIntra = tmpGraph.speedInter -= 3; + if (tmpGraph.speedIntra >= bestSpeed/2 && tmpGraph.speedIntra >= 3) goto search; + } + +done: + // We have a solution now. 
See if we can increase speedIntra + if (tmpGraph.speedIntra == tmpGraph.speedInter) { + time = -1; + memcpy(&tmpGraph, graph, sizeof(tmpGraph)); + } + if (time != 0 && tmpGraph.pattern != NCCL_TOPO_PATTERN_RING && tmpGraph.speedIntra == graph->speedIntra) { + // Try to increase the intra speed only but keeping nChannels the same + tmpGraph.speedIntra += 3; + maxSpeed = tmpGraph.speedIntra * graph->nChannels; + if (tmpGraph.speedIntra <= tmpGraph.speedInter*2) goto search; + } + + if (graph->nChannels == 0) { + WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern); + for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].rank; + graph->inter[0] = graph->inter[1] = 0; + graph->speedIntra = graph->speedInter = 3; + graph->nvlink = 0; + graph->nChannels = 1; + } + return ncclSuccess; +} + +ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { + INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %d/%d, nvlink %d, type %d, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, graph->nvlink, graph->type, graph->sameChannels); + int ngpus = system->nodes[GPU].count; + + char line[1024]; + for (int c=0; c<graph->nChannels; c++) { + sprintf(line, "%2d :", c); + int offset = strlen(line); + if (system->nodes[NET].count > 0) { + sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c]); + offset = strlen(line); + } + for (int i=0; i<ngpus; i++) { + sprintf(line+offset, " %s/%d", topoNodeTypeStr[GPU], graph->intra[ngpus*c+i]); + offset = strlen(line); + } + if (system->nodes[NET].count > 0) { + sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c+1]); + offset = strlen(line); + } + INFO(NCCL_GRAPH, "%s", line); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* dev) { + *dev = 
graph->inter[(channelId%graph->nChannels)*2+dir]; + return ncclSuccess; +} diff --git a/src/graph/topo.cc b/src/graph/topo.cc new file mode 100644 index 0000000..a1b3209 --- /dev/null +++ b/src/graph/topo.cc @@ -0,0 +1,641 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "graph.h" +#include "topo.h" +#include "comm.h" +#include "nvmlwrap.h" +#include "net.h" +#include <sys/stat.h> +#include <fcntl.h> + +#define BUSID_SIZE (sizeof("0000:00:00.0")) +#define BUSID_REDUCED_SIZE (sizeof("0000:00")) + +const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" }; + +const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; +const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" }; + +/******************************************************************/ +/******************* Graph Creation Functions *********************/ +/******************************************************************/ +static int getNumaId(char *path) { + char npath[PATH_MAX]; + snprintf(npath, PATH_MAX, "%s/numa_node", path); + npath[PATH_MAX-1] = '\0'; + + int numaId = -1; + FILE *file = fopen(npath, "r"); + if (file == NULL) return -1; + if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; } + fclose(file); + + return numaId; +} + +static ncclResult_t getPciPath(char* busId, char** path) { + for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]); + char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0"; + memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1); + memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1); + *path = realpath(busPath, NULL); + if (*path == NULL) { + WARN("Could not find real path of %s", busPath); + 
return ncclSystemError; + } + return ncclSuccess; +} + +// Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000. +ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) { + char* str = path+offset; + // Remove trailing "/" + if (*str == '/') str--; + // Find next / + while (*str != '/') str--; + str++; + NCCLCHECK(busIdToInt64(str, id)); + return ncclSuccess; +} + +static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) { + *index = -1; + for (int i=0; i<system->nodes[GPU].count; i++) { + if (system->nodes[GPU].nodes[i].id == id) { + *index = i; + } + } + return ncclSuccess; +} + + +static ncclResult_t getPath(int64_t id, char** path) { + char busId[] = "0000:00:00.0"; + NCCLCHECK(int64ToBusId(id, busId)); + NCCLCHECK(getPciPath(busId, path)); + return ncclSuccess; +} + +ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) { + char busId[BUSID_SIZE]; + CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev)); + NCCLCHECK(getPciPath(busId, path)); + return ncclSuccess; +} + + +int interCpuWidth = 0; +int cpuPciWidth = 0; + +static ncclResult_t getCpuWidths() { + // Check if already detected + if (interCpuWidth + cpuPciWidth) return ncclSuccess; + + // Defaults + char cpu[256]; + sprintf(cpu, "Generic"); + cpuPciWidth = interCpuWidth = PCI_WIDTH; + +#ifdef __PPC__ + sprintf(cpu, "ppc64"); + interCpuWidth = P9_WIDTH; +#endif +#ifdef __x86_64__ + sprintf(cpu, "x86_64"); + union { + struct { + // CPUID 0 String register order + uint32_t ebx; + uint32_t edx; + uint32_t ecx; + }; + char vendor[12]; + } cpuid0; + + asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0)); + if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel"); + + if (strcmp(cpu, "Intel") == 0) { + union { + struct { + int steppingId:4; + int model:4; + int familyId:4; + int processorType:2; + int resv0:2; + int extModelId:4; + int 
modelId:8; + int resv1:4; + }; + uint32_t val; + } cpuid1; + asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1)); + if (cpuid1.familyId == 6 && cpuid1.modelId >= 0x55) { // Skylake + sprintf(cpu, "Intel/Skylake (or later)"); + interCpuWidth = SKL_QPI_WIDTH; + } else { + interCpuWidth = QPI_WIDTH; + } + } +#endif + INFO(NCCL_GRAPH, "%s CPU (PCI %d, InterCpu %d)", cpu, cpuPciWidth, interCpuWidth); + return ncclSuccess; +} + +static ncclResult_t ncclTopoGetInterCpuWidth(int* width) { + NCCLCHECK(getCpuWidths()); + *width = interCpuWidth; + return ncclSuccess; +} +static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) { + NCCLCHECK(getCpuWidths()); + *width = cpuPciWidth; + return ncclSuccess; +} +static ncclResult_t ncclTopoGetPciWidth(int* width) { + *width = PCI_WIDTH; + return ncclSuccess; +} +static ncclResult_t ncclTopoGetNetWidth(int* width) { + *width = NET_WIDTH; + return ncclSuccess; +} + +enum ncclNvLinkDeviceType { + ncclNvLinkDeviceUnknown, + ncclNvLinkDeviceGpu, + ncclNvLinkDeviceSwitch, + ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea) +}; + +static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) { + char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class"; + memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1); + char* rPath = realpath(classPath, NULL); + int fd; + if ((fd = open(rPath, O_RDONLY)) == -1) { + // Could not find device. It might be because we're in a VM and + // we don't see the whole machine. This is handled silently so + // we don't want to print an INFO error. 
+ TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno)); + return ncclSystemError; + } + free(rPath); + char pciClass[9]; + strncpy(pciClass, "0x000000", 9); + int len; + SYSCHECKVAL(read(fd, pciClass, 8), "read", len); + SYSCHECK(close(fd), "close"); + if (strcmp(pciClass, "0x068000") == 0) { + // PCI device is of type "Bridge / Other Bridge Device" (NVswitch) + *type = ncclNvLinkDeviceSwitch; + } else if (strcmp(pciClass, "0x068001") == 0) { + // PCI device is of type "Bridge: IBM Device 04ea" + *type = ncclNvLinkDeviceBridge; + } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla) + || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce) + *type = ncclNvLinkDeviceGpu; + } else { + *type = ncclNvLinkDeviceUnknown; + } + return ncclSuccess; +} + +ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) { + struct ncclTopoNode* cpuNode = NULL; + for (int c=0; c<system->nodes[CPU].count; c++) { + if (system->nodes[CPU].nodes[c].id == numaId) cpuNode = system->nodes[CPU].nodes+c; + } + if (cpuNode == NULL) { // Create CPU + NCCLCHECK(ncclTopoCreateNode(system, &cpuNode, CPU, numaId)); + } + NCCLCHECK(ncclTopoConnectNodes(node, cpuNode, linkType, linkWidth)); + NCCLCHECK(ncclTopoConnectNodes(cpuNode, node, linkType, linkWidth)); + return ncclSuccess; +} + +ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) { + struct ncclTopoNode* nvsNode = NULL; + + int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH; + for (int g=0; g<system->nodes[GPU].count; g++) { + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + int cudaMajor, cudaMinor; + NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor)); + int maxNvLinks, width; + if (cudaMajor < 6) { + maxNvLinks = 0; + width = 0; + } else if (cudaMajor == 6) { + maxNvLinks = 4; + width = PASCAL_NVLINK_WIDTH; + } else { + maxNvLinks = 6; + 
width = VOLTA_NVLINK_WIDTH; + } + + int nvlinks = 0; + for (int l=0; l<maxNvLinks; ++l) { + // Check whether we can use this NVLink for P2P + unsigned canP2P; + if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDevs[g], l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue; + + // Make sure the Nvlink is up. The previous call should have trained the link. + nvmlEnableState_t isActive; + if ((wrapNvmlDeviceGetNvLinkState(nvmlDevs[g], l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue; + + // Try to figure out what's on the other side of the NVLink + nvmlPciInfo_t remoteProc; + if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevs[g], l, &remoteProc) != ncclSuccess) continue; + + // Make a lower case copy of the bus ID for calling ncclDeviceType + // PCI system path is in lower case + char* p = remoteProc.busId; + char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) { + lowerId[c] = tolower(p[c]); + if (p[c] == 0) break; + } + + enum ncclNvLinkDeviceType type; + NCCLCHECK(ncclDeviceType(lowerId, &type)); + if (type == ncclNvLinkDeviceGpu) { + int64_t remoteId; + NCCLCHECK(busIdToInt64(lowerId, &remoteId)); + int peer; + NCCLCHECK(idToIndex(system, remoteId, &peer)); + if (peer != -1) { + NCCLCHECK(ncclTopoConnectNodes(gpu, system->nodes[GPU].nodes+peer, LINK_NVL, width)); + nvlinks++; + } + } else if (type == ncclNvLinkDeviceBridge) { + // Nvlink between GPU and CPU (PPC) + // Since the remote bridge does not have a valid numa_node, assume we + // are connected to the closest CPU. + char* path; + NCCLCHECK(getPath(gpu->id, &path)); + int numaId = getNumaId(path); + free(path); + NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width)); + nvlinks++; + } else { // Nvswitch + if (type == ncclNvLinkDeviceUnknown) { + // The NVLink is up but we couldn't find the PCI device on the other + // side. Assume it's an NVswitch outside a VM. 
+ if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId); + } + if (nvsNode == NULL) { // Create nvswitch + NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0)); + } + NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH)); + NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH)); + nvlinks++; + } + } + minNvlinks = std::min(minNvlinks, nvlinks); + minWidth = std::min(minWidth, width); + } + int pciWidth; + NCCLCHECK(ncclTopoGetPciWidth(&pciWidth)); + system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth; + system->maxWidth = minNvlinks ? minWidth : pciWidth; + return ncclSuccess; +} + +ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) { + struct ncclTopoNode* lastNode = endNode; + int pciWidth; + NCCLCHECK(ncclTopoGetPciWidth(&pciWidth)); + // Find intermediate PCI switches + int slashCount = 0; + int offsetRC = 0; + while (offsetRC < strlen(path)) { + if (path[offsetRC] == '/') slashCount++; + if (slashCount == 4) break; + offsetRC++; + } + int offset = strlen(path); + slashCount = 0; + while (--offset > offsetRC) { + if (path[offset] == '/') { + slashCount++; + // Find if already existing + if ((slashCount%2) == 0) { + int64_t pciId; + NCCLCHECK(pciPathToInt64(path, offset, offsetRC, &pciId)); + for (int p=0; p<system->nodes[PCI].count; p++) { + if (system->nodes[PCI].nodes[p].id == pciId) { + // Found our PCI switch. 
Attach and stop since the rest should already + // be connected + NCCLCHECK(ncclTopoConnectNodes(system->nodes[PCI].nodes+p, lastNode, LINK_PCI, pciWidth)); + NCCLCHECK(ncclTopoConnectNodes(lastNode, system->nodes[PCI].nodes+p, LINK_PCI, pciWidth)); + return ncclSuccess; + } + } + struct ncclTopoNode* pciNode; + NCCLCHECK(ncclTopoCreateNode(system, &pciNode, PCI, pciId)); + NCCLCHECK(ncclTopoConnectNodes(pciNode, lastNode, LINK_PCI, pciWidth)); + NCCLCHECK(ncclTopoConnectNodes(lastNode, pciNode, LINK_PCI, pciWidth)); + lastNode = pciNode; + } + } + } + // Then attach to a CPU node + int numaId = getNumaId(path); + int width; + NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width)); + NCCLCHECK(ncclTopoConnectCpu(system, numaId, lastNode, LINK_PCI, width)); + return ncclSuccess; +} + +// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports. +#include <glob.h> +#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid" +uint64_t getIbGuid(char* path) { + uint64_t guid = 0ULL; + char guidPath[PATH_MAX]; + snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path); + // PATH has a wildcard in it so use glob() + glob_t globbuf; + glob(guidPath, 0, NULL, &globbuf); + if (globbuf.gl_pathc > 0) + strncpy(guidPath, globbuf.gl_pathv[0], PATH_MAX); + globfree(&globbuf); + guidPath[PATH_MAX-1] = '\0'; + FILE *file = fopen(guidPath, "r"); + if (file != NULL) { + uint64_t a, b, c, d; + if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) != EOF) { + guid = (a << 48) + (b << 32) + (c<<16) + d; + TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid); + } + fclose(file); + } + return guid; +} + +struct netInfo { + char* path; + int64_t nic; + uint64_t asic; + int port; + int net; +}; + +ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) { + for (int n=0; n<ndev; n++) { + struct netInfo* info = netInfos+n; + uint64_t ibGuid; + info->nic = n; + info->asic = n; + info->port = 0; + info->net = n; + if (info->path && (ibGuid = getIbGuid(info->path)) 
!= 0) { + info->asic = ibGuid; + + // Ignore PCI subdevice when computing the ID to merge multi-port cards + // and make them use the same PCI link. + char* path = strdup(info->path); + path[strlen(path)-1]='0'; + NCCLCHECK(pciPathToInt64(path, strlen(path), 0, &info->nic)); + free(path); + + // Same PCI path -> different ports of the same NIC + for (int i=0; i<n; i++) if (netInfos[i].nic == info->nic) info->port++; + + // Same GUID -> same network links as the other NIC + for (int i=0; i<n; i++) if (netInfos[i].asic == info->asic && netInfos[i].port == info->port) info->net = netInfos[i].net; + } + INFO(NCCL_GRAPH, "%s -> %x/%lx/%d/%d", info->path, info->nic, info->asic, info->port, info->net); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) { + for (int g=0; g<system->nodes[GPU].count; g++) { + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + char* path; + NCCLCHECK(getPath(gpu->id, &path)); + NCCLCHECK(ncclTopoCreatePciPath(system, gpu, path)); + free(path); + } + + // Connect the NICs + int netDevCount; + NCCLCHECK(ncclNetDevices(&netDevCount)); + int netWidth; + NCCLCHECK(ncclTopoGetNetWidth(&netWidth)); + + struct netInfo* netInfos; + NCCLCHECK(ncclCalloc(&netInfos, netDevCount)); + + for (int n=0; n<netDevCount; n++) { + ncclResult_t res = ncclNetPciPath(n, &netInfos[n].path); + if (res != ncclSuccess) netInfos[n].path = NULL; + } + + NCCLCHECK(ncclTopoComputeNetInfo(netInfos, netDevCount)); + + for (int n=0; n<netDevCount; n++) { + struct netInfo* info = netInfos+n; + // Create NIC and attach it to the PCI tree + struct ncclTopoNode* nicNode = NULL; + for (int i=0; i<system->nodes[NIC].count; i++) { + if (system->nodes[NIC].nodes[i].id == info->nic) { + nicNode = system->nodes[NIC].nodes+i; + break; + } + } + if (!nicNode) { + NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic)); + if (info->path) { + // Create the PCI path + NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path)); + } 
else { + // This is probably a virtual NIC. Just attach it directly to CPU 0 + int width; + NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width)); + NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width)); + } + } + free(info->path); + + // Create the network side + struct ncclTopoNode* netNode; + NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, n)); + + // Use rank to store the net information + netNode->rank = info->net; + + NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth)); + NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth)); + } + free(netInfos); + + // And connect all CPU nodes together + for (int n=0; n<system->nodes[CPU].count; n++) { + for (int p=0; p<system->nodes[CPU].count; p++) { + if (n == p) continue; + int width; + NCCLCHECK(ncclTopoGetInterCpuWidth(&width)); + NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width)); + } + } + return ncclSuccess; +} + +static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) { + if (node->type == GPU) { + sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank); + } else { + sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id); + } + INFO(NCCL_GRAPH, "%s", line); + for (int i=0; i<offset; i++) line[i] = ' '; + + for (int l=0; l<node->nlinks; l++) { + struct ncclTopoLink* link = node->links+l; + if (link->type == LINK_LOC) continue; + if (link->remNode != prevNode) { + sprintf(line+offset, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width); + int nextOffset = strlen(line); + if (link->type == LINK_PCI) { + NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset)); + } else { + if (link->remNode->type == NET) { + sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->rank); + } else { + sprintf(line+nextOffset, "%s/%lX", 
topoNodeTypeStr[link->remNode->type], link->remNode->id); + } + INFO(NCCL_GRAPH, "%s", line); + } + } + } + return ncclSuccess; +} + +ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) { + INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed); + char line[1024]; + for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0)); + INFO(NCCL_GRAPH, "=========================================="); + NCCLCHECK(ncclTopoPrintPaths(s)); + return ncclSuccess; +} + +static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) { + // Shift all links to have upLink as last link + if (upNode) { + int l=0; + while (node->links[l].remNode != upNode) l++; + struct ncclTopoLink upLink; + memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink)); + while (node->links[l+1].remNode) { + memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink)); + l++; + } + memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink)); + } + + // Recursively sort the PCI tree + for (int l=0; l<node->nlinks; l++) { + struct ncclTopoLink* link = node->links+l; + if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node)); + } + return ncclSuccess; +} + +// We want the graph to be organized to ease/accelerate traversal : +// 1. NVLinks (already the case) +// 2. PCI down +// 3. PCI up +// 4. 
QPI (already the case) +ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) { + for (int n=0; n<system->nodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL)); + return ncclSuccess; +} + +ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { + struct ncclTopoSystem* s; + NCCLCHECK(ncclCalloc(&s, 1)); + nvmlDevice_t* nvmlDevs; + int g = 0; + NCCLCHECK(ncclCalloc(&nvmlDevs, comm->nRanks)); + for (int r=0; r<comm->nRanks; r++) { + if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) { + // Consider the GPU as outside of our node if we can't see it through NVML. + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId)); + if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+g) != ncclSuccess) continue; + g++; + struct ncclTopoNode* gpuNode; + NCCLCHECK(ncclTopoCreateNode(s, &gpuNode, GPU, comm->peerInfo[r].busId)); + gpuNode->rank = r; + } + } + + NCCLCHECK(ncclTopoConnectNVLink(nvmlDevs, s)); + NCCLCHECK(ncclTopoConnectPCI(s)); + + free(nvmlDevs); + NCCLCHECK(ncclTopoSortSystem(s)); + *system = s; + return ncclSuccess; +} + +ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink) { + int g1, g2; + NCCLCHECK(idToIndex(system, busId1, &g1)); + NCCLCHECK(idToIndex(system, busId2, &g2)); + *nvlink = g1 != -1 && g2 != -1 && system->nodes[GPU].nodes[g1].paths[GPU][g2].type == LINK_NVL; + return ncclSuccess; +} + +ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) { + int g; + NCCLCHECK(idToIndex(system, busId, &g)); + for (int i=0; i<system->nodes[GPU].count; i++) { + if (i == g) continue; + if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) { + *nvlink = 1; + return ncclSuccess; + } + } + *nvlink = 0; + return ncclSuccess; +} + +static int pathDistance(struct ncclTopoLinkList* links) { + int distance = PATH_PIX; + if (links->count > 
2) distance = PATH_PXB; + for (int l=0; l<links->count; l++) { + // PHB if we go through 1 CPU, SYS if we go through 2 CPUs + if (links->list[l]->remNode->type == CPU) distance = (distance == PATH_PHB) ? PATH_SYS : PATH_PHB; + } + return distance; +} + +ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance) { + int g1, g2; + NCCLCHECK(idToIndex(system, busId1, &g1)); + NCCLCHECK(idToIndex(system, busId2, &g2)); + *distance = pathDistance(system->nodes[GPU].nodes[g1].paths[GPU]+g2); + return ncclSuccess; +} + +ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) { + int g; + NCCLCHECK(idToIndex(system, busId, &g)); + *distance = pathDistance(system->nodes[GPU].nodes[g].paths[NET]+netDev); + return ncclSuccess; +} + +ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count) { + *count = system->nodes[CPU].count; + return ncclSuccess; +} diff --git a/src/graph/topo.h b/src/graph/topo.h new file mode 100644 index 0000000..6b8a2f9 --- /dev/null +++ b/src/graph/topo.h @@ -0,0 +1,138 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TOPO_H_ +#define NCCL_TOPO_H_ + +#include "graph.h" +#include "core.h" + +#define LOC_WIDTH 5000 +#define PASCAL_NVLINK_WIDTH 18 +#define VOLTA_NVLINK_WIDTH 21 +#define PCI_WIDTH 12 // PCI Gen3 x16 +#define QPI_WIDTH 8 +#define SKL_QPI_WIDTH 12 +#define P9_WIDTH 32 +#define NET_WIDTH 12 // 100Gbit + +// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, to GPU +// to GPU traffic consumed more PCI bandwidth. 
+#define INTEL_P2P(speed) (speed*9/12) +#define INTEL_P2P_OVERHEAD(speed) (speed*12/9) + +#define NCCL_TOPO_NODE_TYPES 6 +#define GPU 0 +#define PCI 1 +#define NVS 2 +#define CPU 3 // Actually NUMA domains +#define NIC 4 +#define NET 5 +extern const char* topoNodeTypeStr[]; + +#define LINK_LOC 0 +#define LINK_NVL 1 +#define LINK_PCI 2 +#define LINK_QPI 3 +#define LINK_NET 4 +extern const char* topoLinkTypeStr[]; + +struct ncclTopoNode; +struct ncclTopoLink { + int type; + int width; + struct ncclTopoNode* remNode; +}; +#define NCCL_TOPO_MAX_LINKS 32 +#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES) +#define SELECT_PATH 1 +#define SELECT_LAST 2 + +#define NET_GDR_MASK 0x70000000 + +struct ncclTopoLinkList { + struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS]; + int count; + int width; + int type; +}; + +struct ncclTopoNode { + int type; + int64_t id; + int rank; + int nlinks; + struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS]; + // Pre-computed paths to GPUs and NICs + struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES]; + // Used during search + uint64_t used; +}; + +struct ncclTopoNodeSet { + int count; + struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES]; +}; + +struct ncclTopoSystem { + struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]; + int maxSpeed; + int maxWidth; + int searchInitDone; +}; + +static ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) { + for (int i=0; i<system->nodes[type].count; i++) { + if (system->nodes[type].nodes[i].id == id) { + *node = system->nodes[type].nodes+i; + return ncclSuccess; + } + } + if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) { + WARN("Error : tried to create too many nodes of type %d\n", type); + return ncclInternalError; + } + struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count; + system->nodes[type].count++; + n->type = type; + n->id = id; + if (type == GPU) { + // Create link to itself (used in some corner 
cases) + n->nlinks=1; + n->links[0].type = LINK_LOC; + n->links[0].remNode = n; + n->links[0].width = LOC_WIDTH; + } + *node = n; + return ncclSuccess; +} + +static ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, int width) { + // Aggregate links into higher width for NVLink + struct ncclTopoLink* link; + for (link = node->links; link->remNode; link++) { + if (link->remNode == remNode && link->type == type) break; + } + if (link->remNode == NULL) node->nlinks++; + link->type = type; + link->remNode = remNode; + link->width += width; + + // Sort links in BW descending order + struct ncclTopoLink linkSave; + memcpy(&linkSave, link, sizeof(struct ncclTopoLink)); + while (link != node->links) { + if ((link-1)->width >= linkSave.width) break; + memcpy(link, link-1, sizeof(struct ncclTopoLink)); + link--; + } + memcpy(link, &linkSave, sizeof(struct ncclTopoLink)); + return ncclSuccess; +} + +ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); + +#endif diff --git a/src/misc/trees.cc b/src/graph/trees.cc index f672abe..722e61b 100644 --- a/src/misc/trees.cc +++ b/src/graph/trees.cc @@ -4,9 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "net.h" -#include "param.h" +#include "nccl.h" #define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank) diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc new file mode 100644 index 0000000..89a97a3 --- /dev/null +++ b/src/graph/tuning.cc @@ -0,0 +1,212 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "devcomm.h" +#include "comm.h" +#include "topo.h" + +NCCL_PARAM(Nthreads, "NTHREADS", -2); +NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2); + +static int getNthreads(const char* name, int env, int min, int max, int def) { + int nt = env; + if (nt > 0) { + if (nt % WARP_SIZE != 0) { + WARN("Invalid %s %d (must be a multiple of %d)", name, nt, WARP_SIZE); + nt = max; + } else if (nt > max) { + WARN("Invalid %s %d (maximum %d).", name, nt, max); + nt = max; + } else if (nt < min) { + WARN("Invalid %s %d (minimum %d).", name, nt, min); + nt = min; + } + } else { + nt = def; + } + return nt; +} + +ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) { + int def, set; + if (str[0] == '^') { + def = 1; set = 0; str++; + } else { + def = 0; set = 1; + } + for (int i=0; i<nelems; i++) list[i] = def; + char* tokStr = strdup(str); + char* tmpStr; + char* token = strtok_r(tokStr, ",", &tmpStr); + while (token) { + for (int i=0; i<nelems; i++) + if (strcasecmp(token, elems[i]) == 0) list[i] = set; + token = strtok_r(NULL, ",", &tmpStr); + } + free(tokStr); + return ncclSuccess; +} + +static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" }; +static const char* ncclAlgoStr[] = { "Tree", "Ring" }; +static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" }; + +// Latencies in us, Bandwidths in GB/s +// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } +static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 3.6, 8.4 } }; + +// NVLink, PCI, Network +#define NCCL_HW_NVLINK 0 +#define NCCL_HW_PCI 1 +#define NCCL_HW_NET 2 +// Tree/Simple is the latency a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network). 
+static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = +{ /* NVLINK */ + { /* Tree (LL/LL128/Simple)*/ { .5, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { .4, 2.5, 5.7 } }, + /* PCI */ + { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 } }, + /* NET */ + { /* Tree (LL/LL128/Simple)*/ { 5.0, 7.5, 50 }, /* Ring (LL/LL128/Simple)*/ { .9, 2.5, 6.6 } } +}; + +// LL128 max BW for the different collectives +static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 }; + +ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph) { + int simpleDefaultThreads = (treeGraph->speedIntra*treeGraph->nChannels <= 12) ? 256 : NCCL_MAX_NTHREADS; + comm->maxThreads[NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads); + comm->maxThreads[NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS); + comm->maxThreads[NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS); + + INFO(NCCL_INIT, "Threads per block : %d/%d/%d", comm->maxThreads[NCCL_PROTO_LL], comm->maxThreads[NCCL_PROTO_LL128], comm->maxThreads[NCCL_PROTO_SIMPLE]); + + if (comm->nRanks <= 1) return ncclSuccess; + + struct ncclTopoGraph* graphs[2] = { treeGraph, ringGraph }; + int intraHw[2], hw[2]; + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->nvlink ? NCCL_HW_NVLINK : NCCL_HW_PCI; + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET; + + for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) { + int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) : + coll == ncclCollReduceScatter || coll == ncclCollAllGather ? 
comm->nRanks-1 : + comm->nRanks; + + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + if (coll != ncclCollAllReduce && a == NCCL_ALGO_TREE) continue; + + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + int speed = comm->nNodes <= 2 ? graphs[a]->speedIntra : graphs[a]->speedInter; + float busBw = graphs[a]->nChannels * speed * 1.0; + + // Various model refinements + if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/4.0; + if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw*120.0/128.0, ll128MaxBw[coll]); + if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, comm->nNodes > 1 ? 70.0 : 90.0); + if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.0; + if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0; + + // Convert bus BW to algorithm BW + float ratio = a == NCCL_ALGO_TREE ? .5 : (1.0 * comm->nRanks) / nsteps; + comm->bandwidths[coll][a][p] = busBw * ratio; + + comm->latencies[coll][a][p] = baseLat[a][p]; + if (a == NCCL_ALGO_RING) { + float lat = hwLat[hw[a]][a][p]; + if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) { + if (ringGraph->sameChannels) { + comm->latencies[coll][a][p] += lat; + } else { + if (p == NCCL_PROTO_SIMPLE) lat = hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling + comm->latencies[coll][a][p] += nsteps*lat; + } + } else { + comm->latencies[coll][a][p] += nsteps*lat; + } + } else { + float intraLat = hwLat[intraHw[a]][a][p]; + float interLat = hwLat[NCCL_HW_NET][a][p]; + comm->latencies[coll][a][p] += + 2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat); + } + } + } + } + + // Protocols/Algorithms enable/disable, and user overrides. + // All are enabled except ll128 which is enabled by default only in certain cases. 
+ int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; + int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1 }; + + const char *protoStr = getenv("NCCL_PROTO"); + if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable)); + const char *algoStr = getenv("NCCL_ALGO"); + if (algoStr) NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable)); + + for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + int pEnable = protoEnable[p]; + if (pEnable == 2 && p == NCCL_PROTO_LL128) { + // Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption. + pEnable = (graphs[a]->type <= LINK_PCI) && graphs[a]->nvlink && minCompCap == 70 && maxCompCap == 70 ? 1 : 0; + } + if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; + } + + if (comm->rank == 0) { + char line[1024]; + int offset = 0; + sprintf(line, "Latency/AlgBw |"); + offset = strlen(line); + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + sprintf(line+offset, " %4s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]); + offset = strlen(line); + } + } + INFO(NCCL_TUNING, "%s", line); + for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) { + sprintf(line, "%13s |", ncclFuncStr[c]); + offset = strlen(line); + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + sprintf(line+offset, "%7.1f/%5.1f|", comm->latencies[c][a][p], comm->bandwidths[c][a][p]); + offset = strlen(line); + } + } + INFO(NCCL_TUNING, "%s", line); + } + } + + // Set per-thread amount of work before we increase nThreads and nChannels + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + comm->threadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD; + comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD; + comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD; + } + 
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= comm->nRanks; + + // Override defaults with user env + char* str = getenv("NCCL_THREAD_THRESHOLDS"); + if (str) { + ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { -2 }; + sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2); + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p]; + } + } + } + + INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld", + comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL], + comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128], + comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], + comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL], + comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128], + comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]); + return ncclSuccess; +} diff --git a/src/misc/group.cc b/src/group.cc index 7bc64cd..9bf8ac9 100644 --- a/src/misc/group.cc +++ b/src/group.cc @@ -51,11 +51,6 @@ struct ncclAsyncArgs { thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS]; -ncclResult_t ncclSetDevice(int cudaDev) { - CUDACHECK(cudaSetDevice(cudaDev)); - return ncclSuccess; -} - #define CHECK(a) do { \ if ((args->ret = (a)) != ncclSuccess) { \ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ @@ -65,15 +60,14 @@ ncclResult_t ncclSetDevice(int cudaDev) { void* ncclAsyncThreadMain(void* args_) { struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_; - CHECK(ncclSetDevice(args->init.cudaDev)); - CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank)); + CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev)); return args; } -ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) { +ncclResult_t 
ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev) { if (ncclGroupIndex >= MAX_ASYNC_OPS) { WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS); - return ncclAsyncErrCheck(ncclInternalError); + return ncclAsyncErrCheck(ncclInvalidUsage); } int index = ncclGroupIndex++; struct ncclAsyncArgs* args = ncclGroupArgs+index; @@ -84,8 +78,6 @@ ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm args->init.ndev = ndev; memcpy(&args->init.commId, &commId, sizeof(commId)); args->init.myrank = myrank; - // We need to use threads for Init - pthread_create(ncclGroupThreads+index, NULL, ncclAsyncThreadMain, args); return ncclSuccess; } @@ -97,7 +89,7 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) { } if (ncclGroupIndex >= MAX_ASYNC_OPS) { WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS); - return ncclAsyncErrCheck(ncclInternalError); + return ncclAsyncErrCheck(ncclInvalidUsage); } ncclGroupIndex++; args->funcType = ASYNC_FUNC_COLL; @@ -124,6 +116,14 @@ ncclResult_t ncclGroupEnd() { ncclResult_t ret = ncclGroupError; if (ret != ncclSuccess) goto group_cleanup; + /* Launch async ncclCommInitRank */ + for (int i=0; i<ncclGroupIndex; i++) { + struct ncclAsyncArgs* args = ncclGroupArgs+i; + if (args->funcType == ASYNC_FUNC_INIT) { + pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args); + } + } + /* Collectives are done in three steps : * 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative] * 2. Barrier Wait. 
No CUDA call is permitted @@ -166,8 +166,8 @@ ncclResult_t ncclGroupEnd() { if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) { int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL); if (err == EBUSY) continue; - if (err != 0) { ret = ncclSystemError; goto end; } - if (args->ret != ncclSuccess) { ret = args->ret; goto end; } + if (err != 0) ret = ncclSystemError; + if (args->ret != ncclSuccess) ret = args->ret; doneArray[i] = 1; done--; } @@ -175,20 +175,47 @@ ncclResult_t ncclGroupEnd() { } goto end; group_cleanup: - // At least one call in the group failed. Since we want to make that group - // an atomic operation, we need to cancel all operations. - for (int i=0; i<ncclGroupIndex; i++) { - struct ncclComm* comm = ncclGroupArgs[i].coll.comm; - for (int c=0; c<comm->nChannels; c++) { - struct ncclChannel* channel = comm->channels+c; - for (int i=0; i<channel->collCount; i++) { - channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0; + if (ret != ncclSuccess) { + // At least one call in the group failed. Since we want to make that group + // an atomic operation, we need to cancel all operations. 
+ for (int i=0; i<ncclGroupIndex; i++) { + struct ncclAsyncArgs* args = ncclGroupArgs+i; + if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) { + if (args->init.newcomm) NCCLCHECK(ncclCommDestroy(*args->init.newcomm)); + *args->init.newcomm = NULL; + } else { + struct ncclComm* comm = args->coll.comm; + for (int c=0; c<comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + for (int i=0; i<channel->collCount; i++) { + channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0; + } + channel->collFifoTail = channel->collStart; + channel->collCount = 0; + } + /* Cancel all proxy ops : mark them as ncclProxyOpNone and they should be freed later on */ + struct ncclProxyState* state = &comm->proxyState; + struct ncclProxyArgs *op, *start; + pthread_mutex_lock(&state->mutex); + op = start = state->ops; + while (op) { + if (op->opCount >= comm->lastOpCount) op->state = ncclProxyOpNone; + struct ncclProxyArgs* peerOp = op->nextPeer; + while (peerOp) { + if (peerOp->opCount >= comm->lastOpCount) peerOp->state = ncclProxyOpNone; + peerOp = peerOp->nextPeer; + } + op = op->next; + if (op == start) break; + } + comm->opCount = comm->lastOpCount; + pthread_cond_signal(&state->cond); + pthread_mutex_unlock(&state->mutex); + + comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0; + comm->userStreamSet = false; } - channel->collFifoTail = channel->collStart; - channel->collCount = 0; } - comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0; - comm->userStreamSet = false; } end: ncclGroupError = ncclSuccess; diff --git a/src/include/argcheck.h b/src/include/argcheck.h index 0d6cca7..8d8b74e 100644 --- a/src/include/argcheck.h +++ b/src/include/argcheck.h @@ -8,6 +8,7 @@ #define NCCL_ARGCHECK_H_ #include "core.h" +#include "info.h" ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); ncclResult_t ArgsCheck(struct ncclInfo* info); diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 
dacbc7c..a7d6be9 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -17,4 +17,5 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size); ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size); ncclResult_t bootstrapClose(void* commState); +ncclResult_t bootstrapAbort(void* commState); #endif diff --git a/src/include/channel.h b/src/include/channel.h index c01d942..e2da325 100644 --- a/src/include/channel.h +++ b/src/include/channel.h @@ -6,7 +6,7 @@ #ifndef NCCL_CHANNEL_H_ #define NCCL_CHANNEL_H_ -#include "core.h" +#include "comm.h" ncclResult_t initChannel(struct ncclComm* comm, int channelid); ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks); diff --git a/src/collectives/collectives.h b/src/include/collectives.h index 73fe7d5..69c8e74 100644 --- a/src/collectives/collectives.h +++ b/src/include/collectives.h @@ -7,7 +7,10 @@ #ifndef NCCL_COLLECTIVES_H_ #define NCCL_COLLECTIVES_H_ -#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll)) +#include "core.h" +#include "info.h" + +#define FUNC_INDEX(coll, redop, dtype, al, pr) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)) #define NCCL_COLL_NAME(coll, op, dtype) \ coll##_##op##_##dtype @@ -22,7 +25,8 @@ #define DECL_COLL4(coll, op, dtype) \ DECL_COLL5(coll, op, dtype) \ - DECL_COLL5(coll##LL, op, dtype) + DECL_COLL5(coll##LL, op, dtype) \ + DECL_COLL5(coll##LL128, op, dtype) #define DECL_COLL3(coll, op, dtype) \ DECL_COLL4(coll##Ring, op, dtype) \ diff --git a/src/include/comm.h b/src/include/comm.h index 3b2a85d..7164dc0 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -7,6 +7,8 @@ #ifndef NCCL_COMM_H_ #define NCCL_COMM_H_ +#include "transport.h" + #if CUDART_VERSION < 9000 struct cudaLaunchParams { void *func; @@ -18,13 
+20,17 @@ struct cudaLaunchParams { }; #endif -#define MAXCHANNELS 16 #define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ #define CACHE_LINE_SIZE 128 #define MEM_ALIGN 4096 #define CUDA_IPC_MIN 2097152UL +// Channels / LL tuning +#define NCCL_LL_THREAD_THRESHOLD 8 +#define NCCL_LL128_THREAD_THRESHOLD 8 +#define NCCL_SIMPLE_THREAD_THRESHOLD 64 + struct ncclSendMem { union { struct { @@ -50,6 +56,7 @@ struct ncclRecvMem { char pad4[MEM_ALIGN]; }; ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; + uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS]; char buff[1]; // Actually larger than that }; @@ -57,13 +64,18 @@ struct ncclComm { struct ncclChannel channels[MAXCHANNELS]; struct ncclPeerInfo* peerInfo; + struct ncclTopoSystem* topo; void* bootstrap; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator int cudaDev; // my cuda device index - int nvmlDev; // my NVML device number + int64_t busId; // my PCI bus ID in int format + + int node; + int nNodes; + int localRanks; enum { GROUP, PARALLEL } launchMode; cudaStream_t userStream; @@ -74,17 +86,19 @@ struct ncclComm { // Counter to make sure collectives match (needed for bcast/reduce // where syncs are not symmetric). 
uint64_t opCount; + uint64_t lastOpCount; // Channels for collectives int nChannels; - int nThreads; - // Low-latency algorithm threshold - ssize_t llThreshold; - ssize_t threadThreshold; + // Only nvlink is used for inter-GPU communication + int nvlink; - // Tree algorithm threshold - ssize_t treeThreshold; + // Algorithm/Protocols thresholds + ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + int maxThreads[NCCL_NUM_PROTOCOLS]; // An internal CUDA stream for NCCL kernel CGMD launches int groupCudaStream; diff --git a/src/include/core.h b/src/include/core.h index 8a08b91..250f43b 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -8,19 +8,11 @@ #define NCCL_CORE_H_ #include <pthread.h> -#include <algorithm> -#include "nccl.h" -#include "debug.h" -#include "checks.h" -#include "alloc.h" -#include "transport.h" -#include "devcomm.h" -#include "comm.h" -#include "info.h" -#include "argcheck.h" -#include <cstdio> #include <unistd.h> #include <stdlib.h> +#include <stdint.h> +#include <algorithm> // For std::min/std::max +#include "nccl.h" #ifdef PROFAPI #define NCCL_API(ret, func, args...) 
\ @@ -38,10 +30,6 @@ ret func(args) #endif // end PROFAPI -int ncclCudaCompCap(); -ncclResult_t ncclNvlinkGpu(int* nvlink); -int64_t ncclTreeThreshold(); - static __inline__ int ncclTypeSize(ncclDataType_t type) { switch (type) { case ncclInt8: @@ -62,4 +50,22 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) { } } +#define NCCL_NUM_FUNCTIONS 5 +typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t; + +#define NCCL_NUM_ALGORITHMS 2 // Tree/Ring +#define NCCL_ALGO_TREE 0 +#define NCCL_ALGO_RING 1 + +#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 +#define NCCL_PROTO_LL 0 +#define NCCL_PROTO_LL128 1 +#define NCCL_PROTO_SIMPLE 2 + +#include "debug.h" +#include "checks.h" +#include "alloc.h" +#include "utils.h" +#include "param.h" + #endif // end include guard diff --git a/src/include/debug.h b/src/include/debug.h index c3e8fa0..89b6e42 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -7,15 +7,14 @@ #ifndef NCCL_DEBUG_H_ #define NCCL_DEBUG_H_ -#include <pthread.h> +#include "core.h" + #include <stdio.h> #include <chrono> -#include <unistd.h> #include <sys/syscall.h> #include <limits.h> #include <string.h> -#include "nccl.h" #include "nccl_net.h" #define gettid() (pid_t) syscall(SYS_gettid) @@ -25,9 +24,16 @@ extern uint64_t ncclDebugMask; extern pthread_mutex_t ncclDebugOutputLock; extern FILE *ncclDebugFile; extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); -extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev); -extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...); +void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...); + +// Let code temporarily downgrade WARN into INFO +extern thread_local int ncclDebugNoWarn; +#define NOWARN(a, ret) do { \ + ncclDebugNoWarn = 1; \ + ret = a; \ + ncclDebugNoWarn = 0; \ +} 
while (0) #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) @@ -39,101 +45,4 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch; #define TRACE(...) #endif -#include <stdlib.h> - -static inline void initDebug() { - const char* nccl_debug = getenv("NCCL_DEBUG"); - if (nccl_debug == NULL) { - ncclDebugLevel = NCCL_LOG_NONE; - } else if (strcasecmp(nccl_debug, "VERSION") == 0) { - ncclDebugLevel = NCCL_LOG_VERSION; - } else if (strcasecmp(nccl_debug, "WARN") == 0) { - ncclDebugLevel = NCCL_LOG_WARN; - } else if (strcasecmp(nccl_debug, "INFO") == 0) { - ncclDebugLevel = NCCL_LOG_INFO; - } else if (strcasecmp(nccl_debug, "ABORT") == 0) { - ncclDebugLevel = NCCL_LOG_ABORT; - } else if (strcasecmp(nccl_debug, "TRACE") == 0) { - ncclDebugLevel = NCCL_LOG_TRACE; - } - - /* Parse the NCCL_DEBUG_SUBSYS env var - * This can be a comma separated list such as INIT,COLL - * or ^INIT,COLL etc - */ - char* nccl_debug_subsys = getenv("NCCL_DEBUG_SUBSYS"); - if (nccl_debug_subsys != NULL) { - char *subsys = strtok(nccl_debug_subsys, ","); - while (subsys != NULL) { - int invert = 0; - uint64_t mask = 0; - if (subsys[0] == '^') { invert = 1; subsys++; } - if (strcasecmp(subsys, "INIT") == 0) { - mask = NCCL_INIT; - } else if (strcasecmp(subsys, "COLL") == 0) { - mask = NCCL_COLL; - } else if (strcasecmp(subsys, "P2P") == 0) { - mask = NCCL_P2P; - } else if (strcasecmp(subsys, "SHM") == 0) { - mask = NCCL_SHM; - } else if (strcasecmp(subsys, "NET") == 0) { - mask = NCCL_NET; - } else if (strcasecmp(subsys, "ALL") == 0) { - mask = NCCL_ALL; - } - if (mask) { - if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask; - } - subsys = strtok(NULL, ","); - } - } - - /* Parse and expand the NCCL_DEBUG_FILE path and - * then create the debug file. 
But don't bother unless the - * NCCL_DEBUG level is > VERSION - */ - const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE"); - if (ncclDebugLevel > NCCL_LOG_VERSION && nccl_debug_file != NULL) { - int c = 0; - char debug_fn[PATH_MAX+1] = ""; - char *dfn = debug_fn; - while (nccl_debug_file[c] != '\0' && c < PATH_MAX) { - if (nccl_debug_file[c++] != '%') { - *dfn++ = nccl_debug_file[c-1]; - continue; - } - switch (nccl_debug_file[c++]) { - case '%': // Double % - *dfn++ = '%'; - break; - case 'h': // %h = hostname - char hostname[1024]; - getHostName(hostname, 1024, '.'); - dfn += snprintf(dfn, PATH_MAX, "%s", hostname); - break; - case 'p': // %p = pid - dfn += snprintf(dfn, PATH_MAX, "%d", getpid()); - break; - default: // Echo everything we don't understand - *dfn++ = '%'; - *dfn++ = nccl_debug_file[c-1]; - break; - } - } - *dfn = '\0'; - if (debug_fn[0] != '\0') { - FILE *file = fopen(debug_fn, "w"); - if (file != NULL) { - INFO(NCCL_ALL,"DEBUG file is '%s'", debug_fn); - ncclDebugFile = file; - } - } - } - pthread_mutex_init(&ncclDebugOutputLock, NULL); - -#ifdef ENABLE_TRACE - ncclEpoch = std::chrono::high_resolution_clock::now(); -#endif -} - #endif diff --git a/src/include/devcomm.h b/src/include/devcomm.h index 0a2ef96..46d236b 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -13,8 +13,6 @@ #define NCCL_MAX_OPS 2048 #define NCCL_STEPS 8 -typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t; - #define DIVUP(x, y) \ (((x)+(y)-1)/(y)) #define ROUNDUP(x, y) \ @@ -38,16 +36,18 @@ union ncclLLFifoLine { int4 i4; }; -#define MAXTHREADS 256 -#define NCCL_LL_MAX_NTHREADS MAXTHREADS -#define NUM_LINES_PER_THREAD 8 -#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) +#define WARP_SIZE 32 +#define MAXCHANNELS 32 +#define NCCL_MAX_NTHREADS 512 +#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS +#define NCCL_LL_LINES_PER_THREAD 8 +#define 
NCCL_LL_SLICE_LINES (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) #define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS) #define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine)) -#ifdef DEBUG_LL -#define NCCL_LL_CLEAN_MASK 0x00000ff8 -#define NCCL_LL_FLAG_MAX 0x00001000 -#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX)) +#ifdef TEST_LL_CLEANUP +#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup +#define NCCL_LL_FLAG_MAX 0x100 +#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX)) #else #define NCCL_LL_CLEAN_MASK 0x7ffffff8 #define NCCL_LL_FLAG(a) ((uint32_t)(a)) @@ -55,6 +55,24 @@ union ncclLLFifoLine { // Make sure the clean mask will last for at least NCCL_NSTEPS static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value"); +#define NCCL_LL128_LINESIZE 128 +#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t)) +#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1) + +#define NCCL_LL128_MAX_NTHREADS 640 +#define NCCL_LL128_ELEMS_PER_THREAD 120 + +// Receiving from up to 3 sources is more compute intensive than sending +// to 3 dests. Use 70% for reduce and 30% for bcast. 
+#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32) + +#define NCCL_LL128_SLICE_ELEMS (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) +#define NCCL_LL128_BUFF_ELEMS (NCCL_LL128_SLICE_ELEMS*NCCL_STEPS) +#define NCCL_LL128_BUFF_SIZE (NCCL_LL128_BUFF_ELEMS*sizeof(uint64_t)) + +#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 +#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) + struct ncclConnInfo { // Regular comm mechanism char *buff; // Local for recv, remote for send @@ -73,6 +91,9 @@ struct ncclConnInfo { // Low latency mechanism union ncclLLFifoLine *llBuff; // Local for recv, remote for send uint64_t llLastCleaning; + + // High bandwidth, low latency protocol + uint64_t* ll128Buff; // Local for recv, remote for send }; struct ncclConnector { @@ -148,7 +169,8 @@ struct ncclChannel { union { struct { struct ncclRing ring; - struct ncclTree tree; + struct ncclTree treeUp; + struct ncclTree treeDn; int id; int nthreads; @@ -171,8 +193,6 @@ struct ncclChannel { }; static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); -#define MAXCHANNELS 16 - typedef enum { ncclDevSuccess, ncclDevAssertedMismatch, diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 3b7a18c..cea486e 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -7,14 +7,9 @@ #ifndef NCCL_ENQUEUE_H_ #define NCCL_ENQUEUE_H_ -#include "core.h" +#include "comm.h" #include "group.h" - -// Channels / LL tuning -#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings -#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL -#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs -#define NCCL_LL_MIN_NTHREADS 64 +#include "collectives.h" ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast); diff --git a/src/include/graph.h b/src/include/graph.h 
new file mode 100644 index 0000000..3c8ba19 --- /dev/null +++ b/src/include/graph.h @@ -0,0 +1,94 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_GRAPH_H_ +#define NCCL_GRAPH_H_ + +#include "nccl.h" +#include "devcomm.h" +#include <limits.h> +#include <stdlib.h> +#include <ctype.h> +#include <stdio.h> + +enum ncclPathDist { + PATH_PIX = 0, + PATH_PXB = 1, + PATH_PHB = 2, + PATH_NODE = 3, + PATH_SYS = 4, + PATH_ARRAY_SIZE = 5 +}; + +extern const char* pathDists[PATH_ARRAY_SIZE]; + +ncclResult_t ncclTopoCudaPath(int cudaDev, char** path); + +struct ncclTopoSystem; +// Build the topology +ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system); +ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); +ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); + +ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info); +void ncclTopoFree(struct ncclTopoSystem* system); +ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); +ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system); + +// Query topology +ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink); +ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink); +ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance); +ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* net); +ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance); +ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count); + +#define NCCL_TOPO_MAX_NODES 256 + 
+#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction +#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Split tree (send/recv from different ranks) flowing in both directions +#define NCCL_TOPO_PATTERN_TREE 3 // Simple tree (send/recv from same rank) flowing in both directions +#define NCCL_TOPO_PATTERN_RING 4 // Ring +struct ncclTopoGraph { + // Input / output + int pattern; + int crossNic; + // Output + int nChannels; + int speedIntra; + int speedInter; + int type; + int nvlink; + int sameChannels; + int nHops; + int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; + int inter[MAXCHANNELS*2]; +}; +ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); + +ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); + +struct ncclTopoRanks { + int ringRecv[MAXCHANNELS]; + int ringSend[MAXCHANNELS]; + int ringPrev[MAXCHANNELS]; + int ringNext[MAXCHANNELS]; + int treeUpRecv[MAXCHANNELS]; + int treeUpSend[MAXCHANNELS]; + int treeDnRecv[MAXCHANNELS]; + int treeDnSend[MAXCHANNELS]; +}; + +ncclResult_t ncclTopoPreset(struct ncclComm* comm, + struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, + struct ncclTopoRanks* topoRanks); + +ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, + struct ncclTopoRanks** allTopoRanks, int* rings); + +ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph); + +#endif diff --git a/src/include/group.h b/src/include/group.h index 76da30f..239b05f 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -8,14 +8,14 @@ #define NCCL_GROUP_H_ #include "nccl.h" -#include "core.h" +#include "comm.h" bool ncclAsyncMode(); ncclResult_t ncclAsyncErrCheck(ncclResult_t ret); -typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank); +typedef 
ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); -ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank); +ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); diff --git a/src/include/info.h b/src/include/info.h index 401298a..9461759 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -8,6 +8,7 @@ #define NCCL_INFO_H_ #include "nccl.h" +#include "core.h" typedef enum { ncclPatternRing, @@ -21,7 +22,7 @@ typedef enum { // Used to pass NCCL call information between functions struct ncclInfo { - ncclColl_t coll; + ncclFunc_t coll; const char* opName; // NCCL Coll Args const void* sendbuff; @@ -36,7 +37,11 @@ struct ncclInfo { int chunkSteps; int sliceSteps; // Computed later + int algorithm; + int protocol; ncclPattern_t pattern; + int nChannels; + int nThreads; size_t nBytes; int nstepsPerLoop; int nchunksPerLoop; diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index 797c759..d6ae9f8 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -15,7 +15,7 @@ #define NCCL_PTR_CUDA 0x2 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_ALL=~0} ncclDebugLogSubSys; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); diff --git a/src/include/net.h b/src/include/net.h index 
950b5e5..3d37c8c 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -17,7 +17,6 @@ typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; static const char* ncclNetName() { return ncclNet->name; } static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; } static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; } -static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; } static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } @@ -31,6 +30,37 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; } +#define GPU_BUF_SIZE (2*1024*1024) +static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { + int support; + NCCLCHECK(ncclNet->ptrSupport(dev, &support)); + *supportedTypes = support & ~NCCL_PTR_CUDA; + // The network supports GPU Direct RDMA ; verify the GPU supports it as well. 
+ if (support & NCCL_PTR_CUDA) { + void *lComm = NULL, *sComm = NULL, *rComm = NULL; + ncclNetHandle_t handle; + void* gpuPtr = NULL; + void* mHandle = NULL; + ncclResult_t res; + NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), res, cleanup); + NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), res, cleanup); + NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), res, cleanup); + CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), res, cleanup); + NOWARN(ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res); + if (res != ncclSuccess) goto cleanup; + NCCLCHECKGOTO(ncclNetDeregMr(sComm, mHandle), res, cleanup); + NCCLCHECKGOTO(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res, cleanup); + NCCLCHECKGOTO(ncclNetDeregMr(rComm, mHandle), res, cleanup); + *supportedTypes |= NCCL_PTR_CUDA; +cleanup: + if (gpuPtr) cudaFree(gpuPtr); + if (rComm) ncclNetCloseRecv(rComm); + if (sComm) ncclNetCloseSend(sComm); + if (lComm) ncclNetCloseListen(lComm); + } + return ncclSuccess; +} + extern ncclNet_t ncclNetIb; extern ncclNet_t ncclNetSocket; diff --git a/src/include/nvlink.h b/src/include/nvlink.h deleted file mode 100644 index 8a0f99e..0000000 --- a/src/include/nvlink.h +++ /dev/null @@ -1,133 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NVLINK_H_ -#define NCCL_NVLINK_H_ - -#include <sys/stat.h> -#include <fcntl.h> -#include "nvmlwrap.h" -#include "topo.h" - -#define CONNECT_NVLINK 0x10 -#define CONNECT_NVSWITCH 0x100 - -enum ncclNvLinkDeviceType { - ncclNvLinkDeviceGpu, - ncclNvLinkDeviceSwitch, - ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea) -}; - -static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) { - char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class"; - memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1); - char* rPath = realpath(classPath, NULL); - int fd; - if ((fd = open(rPath, O_RDONLY)) == -1) { - // Could not find device. It might be because we're in a VM and - // we don't see the whole machine. This is handled silently so - // we don't want to print an INFO error. - TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno)); - return ncclSystemError; - } - free(rPath); - char pciClass[9]; - strncpy(pciClass, "0x000000", 9); - int len; - SYSCHECKVAL(read(fd, pciClass, 8), "read", len); - SYSCHECK(close(fd), "close"); - if (strcmp(pciClass, "0x068000") == 0) { - // PCI device is of type "Bridge / Other Bridge Device" (NVswitch) - *type = ncclNvLinkDeviceSwitch; - } else if (strcmp(pciClass, "0x068001") == 0) { - // PCI device is of type "Bridge: IBM Device 04ea" - *type = ncclNvLinkDeviceBridge; - } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla) - || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce) - *type = ncclNvLinkDeviceGpu; - } else { - // Ignore if we don't know what's on the other side. 
- return ncclSystemError; - } - return ncclSuccess; -} - -/* Get the maximum number of NVLinks based on the GPU generation */ -static ncclResult_t getMaxNvlinks(int* maxLinks) { - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - int ccMajor; - CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev)); - // 6 for Volta, 4 for Pascal - *maxLinks = (ccMajor > 6) ? 6 : 4; - // INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks); - return ncclSuccess; -} - -static int getNvlinkGpu(const char* busId1, const char* busId2) { - // Determine if that connection is through NVLink - int links = 0; - int nvswitch_links = 0; - int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4; - nvmlDevice_t nvmlDev; - ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId1, &nvmlDev); - if (res != ncclSuccess) return 0; - - for(int l=0; l<maxNvLinks; ++l) { - // Check whether we can use this NVLink for P2P - unsigned canP2P; - if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue; - - // Make sure the Nvlink is up. The previous call should have trained the link. 
- nvmlEnableState_t isActive; - if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue; - - // Try to figure out what's on the other side of the NVLink - nvmlPciInfo_t remoteProc; - if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue; - - // Old versions of NVML return a lowercase PCI ID - char* p = remoteProc.busId; - for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) { - if (p[c] == 0) break; - p[c] = toupper(p[c]); - } - - if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) { - links++; - } else { - // Make a lower case copy of the bus ID for calling ncclDeviceType - // PCI system path is in lower case - char* p = remoteProc.busId; - char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) { - if (p[c] == 0) break; - lowerId[c] = tolower(p[c]); - } - - // Determine if the remote side is NVswitch or a GPU - enum ncclNvLinkDeviceType type; - ncclResult_t ret = ncclDeviceType(lowerId, &type); - if (ret == ncclSuccess) { - if (type == ncclNvLinkDeviceSwitch) { - //TODO: we are making an assumption that all GPUs are connected to this switch - //This assumption may change for future architectures - nvswitch_links++; - } else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) { - links++; - } - } else { - // The NVLink is up but we couldn't find the PCI device on the other - // side. Assume it's an NVswitch outside a VM. - if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch"); - nvswitch_links++; - } - } - } - return nvswitch_links ? 
CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links; -} - -#endif diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index f658279..01bbb7f 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -9,18 +9,31 @@ #include "nccl.h" -//#define NVML_DIRECT 1 -#ifdef NVML_DIRECT -#include "nvml.h" +// The NVML library doesn't appear to be thread safe +#include <pthread.h> +extern pthread_mutex_t nvmlLock; +#define NVMLLOCK() pthread_mutex_lock(&nvmlLock) +#define NVMLUNLOCK() pthread_mutex_unlock(&nvmlLock) + +#define NVMLLOCKCALL(cmd, ret) do { \ + NVMLLOCK(); \ + ret = cmd; \ + NVMLUNLOCK(); \ +} while(false) #define NVMLCHECK(cmd) do { \ - nvmlReturn_t e = cmd; \ + nvmlReturn_t e; \ + NVMLLOCKCALL(cmd, e); \ if( e != NVML_SUCCESS ) { \ WARN("NVML failure '%s'", nvmlErrorString(e)); \ return ncclSystemError; \ } \ } while(false) +//#define NVML_DIRECT 1 +#ifdef NVML_DIRECT +#include "nvml.h" + static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; } static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; } static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; } @@ -57,6 +70,10 @@ static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned i NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber)); return ncclSuccess; } +static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { + NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor)); + return ncclSuccess; +} #else // Dynamically handle dependencies on NVML @@ -139,6 +156,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber); +ncclResult_t 
wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor); #endif // NVML_DIRECT diff --git a/src/include/rings.h b/src/include/rings.h deleted file mode 100644 index 9701f84..0000000 --- a/src/include/rings.h +++ /dev/null @@ -1,17 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_RINGS_H_ -#define NCCL_RINGS_H_ - -static int getDefaultThreads() { - // On Kepler, rings are doubled later. - return ncclCudaCompCap() == 3 ? 128 : 256; -} - -ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut); - -#endif diff --git a/src/include/socket.h b/src/include/socket.h index b4f09b9..96bf5db 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -66,7 +66,9 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre #endif struct netIf userIfs[MAX_IFS]; bool searchNot = prefixList && prefixList[0] == '^'; + if (searchNot) prefixList++; bool searchExact = prefixList && prefixList[0] == '='; + if (searchExact) prefixList++; int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); int found = 0; @@ -118,17 +120,17 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre return found; } -static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) { +static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) { /* Check family first */ int family = local_if.ifa_addr->sa_family; - if (family != remote.sa.sa_family) { + if (family != remote->sa.sa_family) { return false; } if (family == AF_INET) { struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr); struct sockaddr_in* mask = (struct 
sockaddr_in*)(local_if.ifa_netmask); - struct sockaddr_in& remote_addr = remote.sin; + struct sockaddr_in& remote_addr = remote->sin; struct in_addr local_subnet, remote_subnet; local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr; remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr; @@ -136,7 +138,7 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) { } else if (family == AF_INET6) { struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr); struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask); - struct sockaddr_in6& remote_addr = remote.sin6; + struct sockaddr_in6& remote_addr = remote->sin6; struct in6_addr& local_in6 = local_addr->sin6_addr; struct in6_addr& mask_in6 = mask->sin6_addr; struct in6_addr& remote_in6 = remote_addr.sin6_addr; @@ -161,7 +163,7 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) { } } -static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) { +static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) { #ifdef ENABLE_TRACE char line[1024]; #endif @@ -189,13 +191,13 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd // Store the interface name strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); - TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a)); + TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr->sa), line_a)); found++; if (found == maxIfs) break; } if (found == 0) { - 
WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr.sa), line_a)); + WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr->sa), line_a)); } freeifaddrs(interfaces); return found; @@ -300,7 +302,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam // Try to find interface that is in the same subnet as the IP in comm id union socketAddress idAddr; GetSocketAddrFromString(&idAddr, commId); - nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, idAddr, ifNameMaxSize, maxIfs); + nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs); } } // Then look for anything else (but not docker or lo) @@ -387,7 +389,7 @@ retry: if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) { if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { - INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); + if (refused_retries % 1000 == 0) INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); usleep(SLEEP_INT); goto retry; } diff --git a/src/include/topo.h b/src/include/topo.h deleted file mode 100644 index 69cd100..0000000 --- a/src/include/topo.h +++ /dev/null @@ -1,45 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TOPO_H_ -#define NCCL_TOPO_H_ - -#include "nccl.h" -#include <limits.h> -#include <stdlib.h> -#include <ctype.h> -#include <stdio.h> - -ncclResult_t getCudaPath(int cudaDev, char** path); - -static int getNumaId(char *path) { - char npath[PATH_MAX]; - snprintf(npath, PATH_MAX, "%s/numa_node", path); - npath[PATH_MAX-1] = '\0'; - - int numaId = -1; - FILE *file = fopen(npath, "r"); - if (file == NULL) return -1; - if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; } - fclose(file); - - return numaId; -} - -enum ncclPathDist { - PATH_PIX = 0, - PATH_PXB = 1, - PATH_PHB = 2, - PATH_NODE = 3, - PATH_SYS = 4, - PATH_ARRAY_SIZE = 5 -}; - -extern const char* pathDists[PATH_ARRAY_SIZE]; - -int pciDistance(char* path1, char* path2); - -#endif diff --git a/src/include/transport.h b/src/include/transport.h index 91628f6..8f9bf0e 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -7,12 +7,15 @@ #ifndef NCCL_TRANSPORT_H_ #define NCCL_TRANSPORT_H_ -#include "nccl.h" #include "devcomm.h" -#include <stdint.h> +#include "graph.h" #include "nvmlwrap.h" +#include "core.h" #define NTRANSPORTS 3 +#define TRANSPORT_P2P 0 +#define TRANSPORT_SHM 1 +#define TRANSPORT_NET 2 extern struct ncclTransport ncclTransports[]; @@ -24,15 +27,13 @@ struct ncclComm; struct ncclPeerInfo { int rank; int cudaDev; - int nvmlDev; + int gdrSupport; uint64_t hostHash; uint64_t pidHash; - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + dev_t shmDev; + int64_t busId; }; -// Used to hold the transport connection values -typedef int64_t ncclTvalue_t; - #define CONNECT_SIZE 128 struct ncclConnect { char data[CONNECT_SIZE]; @@ -51,7 +52,7 @@ struct ncclProxyArgs { int chunkSteps; int nsteps; uint64_t opCount; - int llMode; + int protocol; int state; // add component before this line -- it is left out during initialization // Internal state 
@@ -78,7 +79,7 @@ struct ncclProxyState { }; struct ncclTransportComm { - ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId); + ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId); ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*); ncclResult_t (*free)(void*); ncclResult_t (*proxy)(struct ncclProxyArgs*); @@ -86,8 +87,7 @@ struct ncclTransportComm { struct ncclTransport { const char name[4]; - ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*); - ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*); + ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); struct ncclTransportComm send; struct ncclTransportComm recv; }; diff --git a/src/include/utils.h b/src/include/utils.h index 5acccc2..266abca 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -10,6 +10,14 @@ #include "nccl.h" #include <stdint.h> +int ncclCudaCompCap(); + +// PCI Bus ID <-> int64 conversion functions +ncclResult_t int64ToBusId(int64_t id, char* busId); +ncclResult_t busIdToInt64(char* busId, int64_t* id); + +ncclResult_t getBusId(int cudaDev, int64_t *busId); + ncclResult_t getHostName(char* hostname, int maxlen, const char delim); uint64_t getHash(const char* string, int n); uint64_t getHostHash(); @@ -23,4 +31,10 @@ struct netIf { int parseStringList(const char* string, struct netIf* ifList, int maxList); bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); +static long log2i(long n) { + long l = 0; + while (n>>=1) l++; + return l; +} + #endif diff --git a/src/init.cc b/src/init.cc index 706d3a6..1ee2a73 100644 --- a/src/init.cc +++ 
b/src/init.cc @@ -5,44 +5,29 @@ ************************************************************************/ #include "nccl.h" -#include "core.h" #include "channel.h" -#include "param.h" #include "nvmlwrap.h" -#include "rings.h" -#include "trees.h" #include "bootstrap.h" #include "transport.h" #include "group.h" -#include "utils.h" #include "net.h" -#include "checks.h" #include "enqueue.h" -#include "topo.h" -#include "nvlink.h" +#include "graph.h" +#include "argcheck.h" #include "cpuset.h" -#include <stdio.h> -#include <stdlib.h> -#include <sys/mman.h> -#include <sys/stat.h> -#include <sys/types.h> #include <sched.h> #include <fcntl.h> -#include <unistd.h> -#include <cuda_runtime.h> #include <string.h> #include <errno.h> #include <assert.h> #include <dlfcn.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> #define STR2(v) #v #define STR(v) STR2(v) -int ncclDebugLevel; -uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT -pthread_mutex_t ncclDebugOutputLock; -FILE *ncclDebugFile = stdout; - #ifdef ENABLE_TRACE std::chrono::high_resolution_clock::time_point ncclEpoch; #endif @@ -59,34 +44,6 @@ NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); ncclNet_t* ncclNet = NULL; -// We define this as weak to let tests redefine their own -#pragma weak ncclNvlinkGpu -ncclResult_t ncclNvlinkGpu(int* nvlink) { - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev)); - *nvlink = getNvlinkGpu(busId, NULL); - return ncclSuccess; -} -// We define this as weak to let tests redefine their own -#pragma weak ncclCudaCompCap -int ncclCudaCompCap() { - int cudaDev; - if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0; - int ccMajor; - if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0; - return ccMajor; -} -int ncclCudaFullCompCap() { - int cudaDev; - if 
(cudaGetDevice(&cudaDev) != cudaSuccess) return 0; - int ccMajor, ccMinor; - if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0; - if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0; - return ccMajor*10+ccMinor; -} - // Returns ncclInternalError if anything fails, causing that network to be ignored. ncclResult_t initNet(ncclNet_t* net) { int ndev; @@ -103,7 +60,7 @@ ncclResult_t initNetPlugin(ncclNet_t** net) { // string, so checking errno doesn't hurt to try to provide a better // error message if (errno == ENOENT) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so)."); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so), using internal implementation"); } else { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); } @@ -138,22 +95,6 @@ ncclResult_t initNet() { return ncclSuccess; } -NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2); -NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2); -NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", -2); - -int ncclThreadThreshold(int minCompCap, int multiNode) { - int threshold = ncclParamThreadThreshold(); - if (threshold == -2) { // user has not set this env variable - threshold = (minCompCap <= 6) ? 
NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD; - // multiply by 2 if running on multiple nodes - if (multiNode) { - threshold *= 2; - } - } - return threshold; -} - pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; static bool initialized = false; static ncclResult_t ncclInit() { @@ -161,7 +102,6 @@ static ncclResult_t ncclInit() { pthread_mutex_lock(&initLock); if (!initialized) { initEnv(); - initDebug(); initNet(); initialized = true; } @@ -185,7 +125,7 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { // Prevent compiler from optimizing out these operations void __attribute__((optimize("O0"))) commPoison(ncclComm_t comm) { - comm->rank = comm->cudaDev = comm->nvmlDev = comm->nRanks = -1; + comm->rank = comm->cudaDev = comm->busId = comm->nRanks = -1; } static ncclResult_t commFree(ncclComm_t comm) { @@ -193,6 +133,7 @@ static ncclResult_t commFree(ncclComm_t comm) { return ncclSuccess; free(comm->peerInfo); + ncclTopoFree(comm->topo); if (comm->bootstrap) NCCLCHECK(bootstrapClose(comm->bootstrap)); @@ -251,12 +192,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { comm->rank = comm->hostDevComm.rank =rank; comm->nRanks = comm->hostDevComm.nRanks = ndev; cudaGetDevice(&comm->cudaDev); - getNvmlDevice(comm->cudaDev, &comm->nvmlDev); - TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev); + NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); + TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x", comm, rank, ndev, comm->cudaDev, comm->busId); comm->doneEvent = doneEvent; - comm->llThreshold = ncclParamLlThreshold(); - comm->treeThreshold = ncclParamTreeThreshold(); comm->checkPointers = ncclParamCheckPointers() == 1 ? 
true : false; #if CUDART_VERSION >= 9020 comm->groupCudaStream = ncclParamGroupCudaStream(); @@ -308,36 +247,41 @@ static void showVersion() { } } -static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) { - info->rank = rank; +static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { + info->rank = comm->rank; CUDACHECK(cudaGetDevice(&info->cudaDev)); - NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev)) info->hostHash=getHostHash()+commHash; info->pidHash=getPidHash()+commHash; - // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the - // cudaDev is a CUDA runtime dev number which could be different from the - // NVML device number. Then we get the busID from NVML to be sure it is - // consistent with NVML remote PCI bus Ids. - CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev)); - nvmlDevice_t nvmlDevice; - NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice)); - nvmlPciInfo_t pciInfo; - NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo)); - strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE); + // Get the device MAJOR:MINOR of /dev/shm so we can use that + // information to decide whether we can use SHM for inter-process + // communication in a container environment + struct stat statbuf; + SYSCHECK(stat("/dev/shm", &statbuf), "stat"); + info->shmDev = statbuf.st_dev; + + info->busId = comm->busId; + int netDevs; + + NCCLCHECK(ncclNetDevices(&netDevs)); + for (int n=0; n<netDevs; n++) { + int ptrSupport; + NCCLCHECK(ncclNetPtrSupport(n, &ptrSupport)); + if (ptrSupport & NCCL_PTR_CUDA) info->gdrSupport |= (1 << n); + } return ncclSuccess; } template <int type> -static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) { +static ncclResult_t 
selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) { for (int t=0; t<NTRANSPORTS; t++) { struct ncclTransport *transport = ncclTransports+t; struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv; - ncclTvalue_t ret = 0; - NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo)); - if (ret > 0) { + int ret = 0; + NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo)); + if (ret) { connector->transportComm = transportComm; - NCCLCHECK(transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId)); + NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId)); return ncclSuccess; } } @@ -345,44 +289,11 @@ static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeer return ncclInternalError; } -static int log2(int n) { - int l = 0; - while (n>>=1) l++; - return l; -} - -static ncclResult_t ncclTreeThreshold(int nnodes, int nranks, int nChannels, ssize_t *treeThreshold) { - int nvlink; - NCCLCHECK(ncclNvlinkGpu(&nvlink)); - float ringbw = nvlink ? 
5000*nChannels : 5000; // approx, in MB/s or B/us - float ringlatinter = 6; - float treelatintra = 4; - float treelatinter = 15; - float treebw; - if (!nvlink) { - treebw = ringbw * 2 / 3; - } else { - treebw = ringbw * 3 / 4; - if (nnodes == 2) treebw *= 2; - } - float ringlat = ringlatinter*(nranks-1); - float treelat = treelatinter*log2(nnodes)+treelatintra*(nranks/nnodes-1); - if (nnodes < 2 || ringlat <= treelat) - *treeThreshold = 0; - else if (treebw > ringbw) - *treeThreshold = 0x7fffffffffffffff; - else - *treeThreshold = (ssize_t)(((ringbw*treebw/(ringbw-treebw)))*(ringlat-treelat)); - return ncclSuccess; -} - -static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks, int* treeMasters) { +static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) { TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); NCCLCHECK(initChannel(comm, channelId)); - struct ncclChannel* channel = comm->channels+channelId; - struct ncclRing* ring = &channel->ring; - + struct ncclRing* ring = &comm->channels[channelId].ring; // Reorganize ranks to start with rank. 
int shift; for (shift = 0; shift<nranks; shift++) { @@ -393,172 +304,6 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, for (int i=0; i<nranks; i++) { ring->userRanks[i] = ringRanks[(i+shift)%nranks]; } - int prev = ring->prev = ring->userRanks[nranks-1]; - int next = ring->next = ring->userRanks[1]; - - struct ncclTree* tree = &channel->tree; - tree->up = -1; - tree->down[0] = tree->down[1] = tree->down[2] = -1; - - // - // Find per-node masters and connect them via a binary tree - // - - int nMasters = 0; - for (int r=0; r<nranks; r++) nMasters += treeMasters[r]; - if (nMasters == 0) { - nMasters = 1; - treeMasters[0] = 1; - } - - if (comm->treeThreshold == -2) - NCCLCHECK(ncclTreeThreshold(nMasters, comm->nRanks, comm->nChannels, &comm->treeThreshold)); - - if (comm->treeThreshold > 0) { - // Compute tree depth. Not an exact value but a good approximation in most - // cases and consistent across nodes - tree->depth = nranks/nMasters + log2(nMasters); - - // Find my master : go backwards in the ring to find my root - int master = 0; - for (int i = 0; i<nranks; i++) { - int r = ring->userRanks[(nranks-i)%nranks]; - if (treeMasters[r]) { - master = r; - break; - } - } - - int* ranks; - NCCLCHECK(ncclCalloc(&ranks, nMasters)); - int i = 0, masterIndex = -1; - // Build binary tree - for (int r=0; r<nranks; r++) { - // Create index table - if (r == master) masterIndex = i; - if (treeMasters[r]) ranks[i++] = r; - } - int btreeUp, btreeDown0, btreeDown1; - int u0, d0_0, d0_1, u1, d1_0, d1_1; - NCCLCHECK(ncclGetDtree(nMasters, masterIndex, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1)); - if (channelId < DIVUP(comm->nChannels, 2)) { - btreeUp = u0; btreeDown0 = d0_0; btreeDown1 = d0_1; - } else { - btreeUp = u1; btreeDown0 = d1_0; btreeDown1 = d1_1; - } - - // - // Now build the full tree, combining the intra-node ring and the - // inter-node binary tree. 
- // - - if (rank == master) { - int nDown = 0; - if (btreeUp != -1) tree->up = ranks[btreeUp]; - if (treeMasters[next] == 0) tree->down[nDown++] = next; - if (btreeDown0 != -1) tree->down[nDown++] = ranks[btreeDown0]; - if (btreeDown1 != -1) tree->down[nDown++] = ranks[btreeDown1]; - } else { - tree->up = prev; - if (treeMasters[next] == 0) tree->down[0] = next; - } - free(ranks); - } - - TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); - return ncclSuccess; -} - -static ncclResult_t fillConnect(struct ncclPeerInfo* peerInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) { - for (int r=0; r<nranks; r++) { - connectTransport[r] = -1; - for (int t=0; t<NTRANSPORTS; t++) { - NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, peerInfo+rank, peerInfo+r)); - if (connectValue[r] > 0) { - connectTransport[r] = t; - break; - } - } - } - return ncclSuccess; -} - -#define MAXWIDTH 20 -#define PREFIXLEN 15 -#define STRLENGTH (PREFIXLEN+5*MAXWIDTH) -void dumpMatrix(int* connectMatrix, int nranks) { - char line[STRLENGTH+1]; - line[STRLENGTH] = '\0'; - memset(line, ' ', STRLENGTH); - for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j); - INFO(NCCL_INIT,"%s", line); - for (int i=0; i<nranks; i++) { - memset(line, ' ', STRLENGTH); - sprintf(line, "%3d ", i); - for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]); - INFO(NCCL_INIT,"%s", line); - } -} - -void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) { - char line[STRLENGTH+1]; - line[STRLENGTH] = '\0'; - memset(line, ' ', STRLENGTH); - for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4d", j); - INFO(NCCL_INIT,"%s", line); - for (int i=0; i<nranks; i++) { - memset(line, ' ', STRLENGTH); - sprintf(line, "%3d ", i); - for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4o", (int)connectMatrix[i*nranks+j]); - INFO(NCCL_INIT,"%s", line); - } -} - - -void dumpLine(int* values, int 
nranks, const char* prefix) { - int prefixlen = strlen(prefix); - char line[STRLENGTH+1]; - line[STRLENGTH] = '\0'; - memset(line, ' ', STRLENGTH); - strncpy(line, prefix, PREFIXLEN); - for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]); - INFO(NCCL_INIT,"%s", line); -} - -static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { - for (int r=0; r<nrings; r++) { - char prefix[30]; - /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r); - dumpLine(prev+r*nranks, nranks, prefix); - sprintf(prefix, "[%d] Channel %d Next : ", rank, r); - dumpLine(next+r*nranks, nranks, prefix);*/ - - int current = rank; - for (int i=0; i<nranks; i++) { - rings[r*nranks+i] = current; - current = next[r*nranks+current]; - } - sprintf(prefix, "Channel %02d : ", r); - if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix); - if (current != rank) { - WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank); - return ncclInternalError; - } - // Check that all ranks are there - for (int i=0; i<nranks; i++) { - int found = 0; - for (int j=0; j<nranks; j++) { - if (rings[r*nranks+j] == i) { - found = 1; - break; - } - } - if (found == 0) { - WARN("Error : ring %d does not contain rank %d", r, i); - return ncclInternalError; - } - } - } return ncclSuccess; } @@ -599,7 +344,7 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct comm->intraCGMode = CGMode; int* CC; NCCLCHECK(ncclCalloc(&CC, 1)); - *CC = ncclCudaFullCompCap(); + *CC = ncclCudaCompCap(); comm->intraCC = CC; } else { comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier); @@ -622,7 +367,7 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct if (comm->launchMode == ncclComm::GROUP) { CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking)); #if CUDART_VERSION >= 9000 - if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) 
{ + if (*comm->intraCC && (ncclCudaCompCap() == *comm->intraCC)) { // Check whether the GPU supports Cooperative Group Multi Device Launch (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev); } @@ -636,7 +381,7 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct return ncclSuccess; } -static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { +static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */ struct ncclConnect connect; @@ -647,7 +392,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, conn = &channel->peers[peer].recv; if (conn->connected) { ++nSkippedRecv; continue; } memset(&connect, 0, sizeof(connect)); - NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); } for (int i=0; i<nsend; i++) { @@ -656,7 +401,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, conn = &channel->peers[peer].send; if (conn->connected) { ++nSkippedSend; continue; } memset(&connect, 0, sizeof(connect)); - NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, 
sizeof(struct ncclConnect))); } for (int i=0; i<nsend; i++) { @@ -683,6 +428,8 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, return ncclSuccess; } +NCCL_PARAM(CrossNic, "CROSS_NIC", 2); + static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) { // We use 3 AllGathers // 1. { peerInfo, comm } @@ -703,145 +450,177 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); allGather1Data[rank].comm = comm; - NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash)); + struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo; + NCCLCHECK(fillInfo(comm, myInfo, commHash)); NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data))); NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks)); for (int i = 0; i < nranks; i++) { memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo)); + if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) { + WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId); + return ncclInvalidUsage; + } } // AllGather1 data is used again below // AllGather1 - end - // AllGather2 - begin - size_t allGather2DataRowSize = sizeof(int)*nranks + sizeof(ncclTvalue_t)*nranks; - void *allGather2Data; - NCCLCHECK(ncclCalloc((char **)&allGather2Data, allGather2DataRowSize*nranks)); - int *myTransportRow = (int *)((char *)allGather2Data + allGather2DataRowSize*rank); - ncclTvalue_t *myValueRow = (ncclTvalue_t *)(myTransportRow + nranks); - - NCCLCHECK(fillConnect(comm->peerInfo, nranks, rank, myTransportRow, myValueRow)); - NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather2Data, allGather2DataRowSize)); - - int* connectTransport; - ncclTvalue_t* connectValue; - NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks)); - NCCLCHECK(ncclCalloc(&connectValue, 
nranks*nranks)); - for (int i = 0; i < nranks; i++) { - memcpy(connectTransport + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize, sizeof(int)*nranks); - memcpy(connectValue + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize + nranks*sizeof(int), sizeof(ncclTvalue_t)*nranks); - } - free(allGather2Data); - // AllGather2 - end - - //if (rank == 0) dumpMatrix(connectTransport, nranks); - //if (rank == 0) dumpMatrixTvalue(connectValue, nranks); - - // Get my rings - int nrings; - int* prev, *next, *treeIn, *treeOut; - NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS)); - comm->nThreads = getDefaultThreads(); - NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut)); - TRACE(NCCL_INIT, "rank %d nranks %d - BUILD %d RINGS", rank, nranks, nrings); - assert(nrings <= MAXCHANNELS); - free(connectTransport); - free(connectValue); + // Topo detection / System graph creation + NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo)); + // Compute paths between GPUs and NICs + NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); + // Remove inaccessible GPUs and unused NICs + NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm)); + // Recompute paths after trimming + NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); + // Compute max speed to accelerate search + NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo)); + // Print final topology + NCCLCHECK(ncclTopoPrint(comm->topo)); + + // Get rings and trees + struct ncclTopoGraph treeGraph; + treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE; + treeGraph.crossNic = ncclParamCrossNic(); + // We communicate only half the data between node with trees on 2 nodes. 
+ NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph)); + NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph)); + struct ncclTopoGraph ringGraph; + ringGraph.pattern = NCCL_TOPO_PATTERN_RING; + ringGraph.crossNic = ncclParamCrossNic(); + NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph)); + NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph)); // AllGather3 - begin + struct { - int nThreads; - int nrings; int cudaCompCap; - int prev[MAXCHANNELS]; - int next[MAXCHANNELS]; + int fullCudaCompCap; + int nvlink; + int nChannels; + struct { + int sameChannels; + int speedIntra; + int speedInter; + int nvlink; + } tree; + struct { + int sameChannels; + int speedIntra; + int speedInter; + int nvlink; + } ring; + struct ncclTopoRanks topoRanks; } *allGather3Data; NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); - allGather3Data[rank].nThreads = comm->nThreads; - allGather3Data[rank].nrings = nrings; allGather3Data[rank].cudaCompCap = ncclCudaCompCap(); - for (int r=0; r<nrings; r++) { - allGather3Data[rank].prev[r] = *(prev+r*nranks+rank); - allGather3Data[rank].next[r] = *(next+r*nranks+rank); - } + allGather3Data[rank].nvlink = treeGraph.nvlink; + allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); + allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; + allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra; + allGather3Data[rank].tree.speedInter = treeGraph.speedInter; + allGather3Data[rank].tree.nvlink = treeGraph.nvlink; + allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels; + allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra; + allGather3Data[rank].ring.speedInter = ringGraph.speedInter; + allGather3Data[rank].ring.nvlink = ringGraph.nvlink; + + NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks)); + NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data))); - // Find max nThreads - for (int i=0; i<nranks; i++) - 
comm->nThreads = std::max(allGather3Data[i].nThreads, comm->nThreads); + // Determine nNodes, firstRanks, ... + int* nodesFirstRank; + NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks)); + for (int i=0; i<nranks; i++) { + int node = -1; + int firstRank = allGather3Data[i].topoRanks.ringRecv[0]; + for (int n=0; n<comm->nNodes; n++) { + if (nodesFirstRank[n] == firstRank) node = n; + } + if (node == -1) { + node = comm->nNodes++; + nodesFirstRank[node] = firstRank; + } + if (i == comm->rank) comm->node = node; + } // Determine the minimum CUDA Compute capability of all GPUs int myCompCap = allGather3Data[rank].cudaCompCap; - int minCompCap = myCompCap; - for (int i = 0; i < nranks; i++) + int minCompCap = myCompCap, maxCompCap = myCompCap; + for (int i = 0; i < nranks; i++) { minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap); + maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap); + } - // Determine thread threshold across all GPUs - int nnodes = 0; - for (int r=0; r<nranks; r++) nnodes += treeIn[r]; - comm->threadThreshold = ncclThreadThreshold(minCompCap, nnodes); + comm->nvlink = 1; + for (int i = 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink; - // Find min nrings across ranks - for (int i=0; i<nranks; i++) - nrings = std::min(allGather3Data[i].nrings, nrings); - comm->nChannels = nrings; + int nChannelsOrig = comm->nChannels; + struct ncclTopoRanks** allTopoRanks; + NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks)); + for (int i=0; i<nranks; i++) { + allTopoRanks[i] = &allGather3Data[i].topoRanks; + // Make sure we align all ranks so that the tuning is consistent across ranks + treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels); + treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); + treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra); + treeGraph.speedInter = 
std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter); + treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink); + ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); + ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra); + ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter); + ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink); + } - // Unpack the per ring prev/next arrays - for (int i = 0; i < nranks; i++) { - for (int r = 0; r < nrings; r++) { - prev[r*nranks+i] = allGather3Data[i].prev[r]; - next[r*nranks+i] = allGather3Data[i].next[r]; - } + if (comm->nChannels < nChannelsOrig) { + // We started duplicating channels during Preset(), so we need to move the + // duplicated channels since we have removed some. + for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel)); } - free(allGather3Data); - // AllGather3 - end int *rings; NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); - NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next)); - free(prev); - free(next); - TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d RINGS", rank, nranks, nrings); + + NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings)); + + free(allTopoRanks); + free(nodesFirstRank); + free(allGather3Data); + + // AllGather3 - end + + TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels); + + NCCLCHECK(ncclSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph)); + + char line[1024]; + line[0]='\0'; + for (int c=0; c<comm->nChannels; c++) { + struct ncclTree* treeUp = &comm->channels[c].treeUp; + struct ncclTree* treeDn = &comm->channels[c].treeDn; + snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d", + c, treeUp->down[0], 
treeUp->down[1], treeUp->down[2], rank, treeUp->up, + treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]); + } + line[1023] = '\0'; + INFO(NCCL_INIT, "Trees%s", line); // Connect with prev/next for each ring struct ncclConnect *connect; NCCLCHECK(ncclCalloc(&connect, 2)); - for (int r=0; r<nrings; r++) { - struct ncclChannel* channel = comm->channels+r; - NCCLCHECK(setupChannel(comm, r, rank, nranks, rings+r*nranks, treeIn+r*nranks)); - NCCLCHECK(p2pSetup(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next)); - NCCLCHECK(p2pSetup(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up)); - NCCLCHECK(p2pSetup(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down)); - } - if (comm->treeThreshold > 0) { - char line[1024]; - line[0]='\0'; - for (int c=0; c<nrings; c++) { - struct ncclTree* tree = &comm->channels[c].tree; - snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d/%d/%d", - c, tree->up, rank, tree->down[0], tree->down[1], tree->down[2]); - } - line[1023] = '\0'; - INFO(NCCL_INIT, "Trees%s", line); - } - if (rank == 0) { - char treeline[64]; - snprintf(treeline, 64, "enabled up to size %ld", comm->treeThreshold); - INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees %s", comm->nThreads, minCompCap, - comm->treeThreshold == 0 ? "disabled" : - comm->treeThreshold == 0x7fffffffffffffff ? 
"enabled for all sizes" : - treeline); - } - - TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, nrings); + for (int c=0; c<comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks)); + if (comm->nRanks == 1) continue; + NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next)); + NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up)); + NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down)); + } + TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); free(connect); free(rings); - free(treeIn); - free(treeOut); // Compute intra ranks (using AllGather1 data) int intraRank0 = -1, intraRank = -1, intraRanks = 0; @@ -865,7 +644,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Done with AllGather1 data free(allGather1Data); - if (nnodes) NCCLCHECK(transportCreateProxy(comm)); + if (comm->nNodes) NCCLCHECK(transportCreateProxy(comm)); TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); return ncclSuccess; @@ -874,7 +653,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) { CPU_ZERO_S(sizeof(cpu_set_t), mask); char* cudaPath; - NCCLCHECK(getCudaPath(cudaDev, &cudaPath)); + NCCLCHECK(ncclTopoCudaPath(cudaDev, &cudaPath)); char path[PATH_MAX]; strncpy(path, cudaPath, PATH_MAX-1); snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus"); @@ -937,7 +716,7 @@ static ncclResult_t setCpuAffinity(int cudaDev) { return ncclSuccess; } -ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { +ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId 
commId, int myrank, int cudaDev) { cpu_set_t affinitySave; sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); @@ -945,8 +724,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId NCCLCHECK(wrapNvmlInit()); // Make sure all host memory allocation are close to the GPU - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); + CUDACHECK(cudaSetDevice(cudaDev)); NCCLCHECK(setCpuAffinity(cudaDev)); ncclResult_t res; @@ -957,218 +735,73 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup); - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->nvmlDev); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId); return ncclSuccess; cleanup: + if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap); *newcomm = NULL; sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); return res; } -NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); -ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { +static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) { + ncclResult_t res; char* env = getenv("NCCL_COMM_ID"); if (env && myrank == 0) { - NCCLCHECK(bootstrapCreateRoot(&commId, true)); + NCCLCHECKGOTO(bootstrapCreateRoot(&commId, true), res, end); } - NCCLCHECK(ncclInit()); + NCCLCHECKGOTO(ncclInit(), res, end); if (myrank == 0) showVersion(); // Make sure the CUDA runtime is initialized. 
- CUDACHECK(cudaFree(NULL)); + CUDACHECKGOTO(cudaFree(NULL), res, end); - NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm")); + NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, end); if (nranks < 1 || myrank < 0 || myrank >= nranks) { WARN("Invalid rank requested : %d/%d", myrank, nranks); - return ncclInvalidArgument; + res = ncclInvalidArgument; + goto end; } if (ncclAsyncMode()) { - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank); + NCCLCHECKGOTO(ncclAsyncInit(ncclCommInitRankSync, newcomm, nranks, commId, myrank, cudaDev), res, end); } else { - return ncclCommInitRankSync(newcomm, nranks, commId, myrank); + NCCLCHECKGOTO(ncclCommInitRankSync(newcomm, nranks, commId, myrank, cudaDev), res, end); } +end: + if (ncclAsyncMode()) return ncclAsyncErrCheck(res); + else return res; } -static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) { - struct ncclPeerInfo* allInfo; - NCCLCHECK(ncclCalloc(&allInfo, nranks)); - for (int rank=0; rank<nranks; rank++) { - CUDACHECK(cudaSetDevice(devs[rank])); - NCCLCHECK(fillInfo(allInfo+rank, rank, 0)); - } - - int* connectTransport; - ncclTvalue_t* connectValue; - NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks)); - NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks)); - for (int rank=0; rank<nranks; rank++) - NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank)); - - int* prev, *prevFinal, *next, *nextFinal, *treeIn, *treeOut; - NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS)); - int nrings = MAXCHANNELS; - int nthreads=0; - int myCompCap = ncclCudaCompCap(); - 
int minCompCap = myCompCap; - for (int rank=0; rank<nranks; rank++) { - CUDACHECK(cudaSetDevice(devs[rank])); - int nringsRank; - int nthreadsRank = getDefaultThreads(); - myCompCap = ncclCudaCompCap(); - NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut)); - nrings = std::min(nrings, nringsRank); - nthreads = std::max(nthreads, nthreadsRank); - minCompCap = std::min(minCompCap, myCompCap); - for (int ring=0; ring<nrings; ring++) { - int index = ring*nranks+rank; - prevFinal[index] = prev[index]; - nextFinal[index] = next[index]; - } - } - free(connectTransport); - free(connectValue); - free(prev); - free(next); - - INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees disabled", nthreads, minCompCap); - - int* rings; - NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); - NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal)); - free(prevFinal); - free(nextFinal); - - // Determine thread threshold across all GPUs - int threadThreshold = ncclThreadThreshold(minCompCap, 0); - - for (int rank=0; rank<nranks; rank++) { - comms[rank]->nChannels = nrings; - comms[rank]->nThreads = nthreads; - comms[rank]->threadThreshold = threadThreshold; - } - - struct ncclConnect* connect; - NCCLCHECK(ncclCalloc(&connect, 2*nranks)); - for (int r=0; r<nrings; r++) { - int* ringRanks = rings+r*nranks; - for (int rank=0; rank<nranks; rank++) { - CUDACHECK(cudaSetDevice(devs[rank])); - struct ncclChannel* channel = comms[rank]->channels+r; - struct ncclRing *ring = &channel->ring; - NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn)); - // Make sure we don't use trees, we cannot use them with initAll - comms[rank]->treeThreshold = 0; - int prev = channel->ring.prev = ring->userRanks[nranks-1]; - int next = channel->ring.next = ring->userRanks[1]; - struct ncclConnector* recv = &channel->peers[prev].recv; - struct ncclConnector* send = &channel->peers[next].send; - 
NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+rank*2+0, recv, channel->buffSize, channel->id)); - NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id)); - } - for (int rank=0; rank<nranks; rank++) { - CUDACHECK(cudaSetDevice(devs[rank])); - struct ncclChannel* channel = comms[rank]->channels+r; - struct ncclRing *ring = &channel->ring; - struct ncclConnector* recv = &channel->peers[ring->prev].recv; - struct ncclConnector* send = &channel->peers[ring->next].send; - NCCLCHECK(recv->transportComm->connect(connect+ring->prev*2+1, recv)); - NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send)); - } - } - free(connect); - free(allInfo); - free(rings); - free(treeIn); - free(treeOut); +NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); +ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev)); return ncclSuccess; } - NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { - NCCLCHECK(ncclInit()); - NCCLCHECK(wrapNvmlSymbols()); - NCCLCHECK(wrapNvmlInit()); - showVersion(); - - INFO(NCCL_INIT,"nranks %d", ndev); - NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms")); - if (ndev < 1) { + if (ndev < 0) { WARN("Invalid device count requested : %d", ndev); return ncclInvalidArgument; } - ncclResult_t res; - int savedDevice; - int rank, cudaDev; - ncclComm_t comm = NULL; - int* ncclDevList = NULL; - NCCLCHECK(ncclCalloc(&ncclDevList, ndev)); + ncclUniqueId uniqueId; + NCCLCHECK(ncclGetUniqueId(&uniqueId)); + NCCLCHECK(ncclGroupStart()); for (int i=0; i<ndev; i++) { - ncclDevList[i] = devlist ? 
devlist[i] : i; - } - - CUDACHECKGOTO(cudaGetDevice(&savedDevice), res, cleanup); - - for(rank=0; rank<ndev; ++rank) - comms[rank] = NULL; - - cpu_set_t affinitySave; - sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); - - for (rank=0; rank<ndev; ++rank) { - cudaDev = ncclDevList[rank]; - CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup); - - NCCLCHECK(setCpuAffinity(cudaDev)); - - NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup); - comms[rank] = comm; - - NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup); - } - - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - - NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup); - - for(rank=0; rank<ndev; ++rank) { - cudaDev = ncclDevList[rank]; - CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup); - NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup); + // Ignore return codes .. we need to call ncclGroupEnd to clean up anyway + ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? 
devlist[i] : i); } - - res = ncclSuccess; - goto final; - -cleanup: - for(rank=0; rank<ndev; ++rank) { - if(comms[rank] != NULL) { - commFree(comms[rank]); - } - } - -final: - free(ncclDevList); - if(wrapNvmlShutdown() != ncclSuccess) - INFO(NCCL_INIT,"NCCL did not shutdown nvml properly"); - cudaSetDevice(savedDevice); - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - return res; + NCCLCHECK(ncclGroupEnd()); + return ncclSuccess; } - static ncclResult_t commDestroy(ncclComm_t comm) { int savedDevice; #ifdef ENABLE_TRACE @@ -1200,10 +833,10 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; - TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d nvmlDev %d", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev); + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %x", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId); // Try and prevent a double free of the comm struct (user error) - if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->nvmlDev == -1) { + if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) { WARN("comm %p has already been destroyed", comm); return ncclInvalidArgument; } diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc index 364f041..67931f8 100644 --- a/src/misc/argcheck.cc +++ b/src/misc/argcheck.cc @@ -5,6 +5,7 @@ ************************************************************************/ #include "argcheck.h" +#include "comm.h" static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { cudaPointerAttributes attr; diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc index fbe481f..34ed0aa 100644 --- a/src/misc/nvmlwrap.cc +++ b/src/misc/nvmlwrap.cc @@ -16,6 +16,7 @@ static nvmlReturn_t (*nvmlInternalInit)(void); static nvmlReturn_t (*nvmlInternalShutdown)(void); static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, 
nvmlDevice_t* device); static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index); +static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device); static const char* (*nvmlInternalErrorString)(nvmlReturn_t r); static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci); @@ -23,7 +24,10 @@ static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t dev static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber); +static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor); +// Used to make the NVML library calls thread safe +pthread_mutex_t nvmlLock = PTHREAD_MUTEX_INITIALIZER; ncclResult_t wrapNvmlSymbols(void) { if (nvmlState == nvmlInitialized) @@ -70,12 +74,14 @@ ncclResult_t wrapNvmlSymbols(void) { LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown); LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId); LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex); + LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByIndex", nvmlInternalDeviceGetHandleByIndex); LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString); LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo); LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber); LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState); LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo); LOAD_SYM_OPTIONAL(nvmlhandle, 
"nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability); + LOAD_SYM(nvmlhandle, "nvmlDeviceGetCudaComputeCapability", nvmlInternalDeviceGetCudaComputeCapability); nvmlState = nvmlInitialized; return ncclSuccess; @@ -85,6 +91,7 @@ teardown: nvmlInternalShutdown = NULL; nvmlInternalDeviceGetHandleByPciBusId = NULL; nvmlInternalDeviceGetIndex = NULL; + nvmlInternalDeviceGetHandleByIndex = NULL; nvmlInternalDeviceGetPciInfo = NULL; nvmlInternalDeviceGetMinorNumber = NULL; nvmlInternalDeviceGetNvLinkState = NULL; @@ -130,7 +137,8 @@ ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_ WARN("lib wrapper not initialized."); return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device), ret); if (ret != NVML_SUCCESS) { WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ", nvmlInternalErrorString(ret)); @@ -144,7 +152,8 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { WARN("lib wrapper not initialized."); return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetIndex(device, index); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetIndex(device, index), ret); if (ret != NVML_SUCCESS) { WARN("nvmlDeviceGetIndex() failed: %s ", nvmlInternalErrorString(ret)); @@ -153,12 +162,28 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { return ncclSuccess; } +ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) { + if (nvmlInternalDeviceGetHandleByIndex == NULL) { + WARN("lib wrapper not initialized."); + return ncclInternalError; + } + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetHandleByIndex(index, device), ret); + if (ret != NVML_SUCCESS) { + WARN("nvmlDeviceGetHandleByIndex() failed: %s ", + nvmlInternalErrorString(ret)); + return ncclSystemError; + } + return ncclSuccess; +} + ncclResult_t 
wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) { if (nvmlInternalDeviceGetPciInfo == NULL) { WARN("lib wrapper not initialized."); return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetPciInfo(device, pci), ret); if (ret != NVML_SUCCESS) { WARN("nvmlDeviceGetPciInfo() failed: %s ", nvmlInternalErrorString(ret)); @@ -172,7 +197,8 @@ ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* min WARN("lib wrapper not initialized."); return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetMinorNumber(device, minorNumber); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetMinorNumber(device, minorNumber), ret); if (ret != NVML_SUCCESS) { WARN("nvmlDeviceGetMinorNumber() failed: %s ", nvmlInternalErrorString(ret)); @@ -186,7 +212,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link /* Do not warn, this symbol is optional. */ return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkState(device, link, isActive), ret); if (ret != NVML_SUCCESS) { if (ret != NVML_ERROR_NOT_SUPPORTED) INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ", @@ -201,7 +228,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned /* Do not warn, this symbol is optional. */ return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci), ret); if (ret != NVML_SUCCESS) { if (ret != NVML_ERROR_NOT_SUPPORTED) INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ", @@ -217,7 +245,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int /* Do not warn, this symbol is optional. 
*/ return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult), ret); if (ret != NVML_SUCCESS) { if (ret != NVML_ERROR_NOT_SUPPORTED) INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ", @@ -226,4 +255,19 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int } return ncclSuccess; } + +ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { + if (nvmlInternalDeviceGetNvLinkCapability == NULL) { + WARN("lib wrapper not initialized."); + return ncclInternalError; + } + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetCudaComputeCapability(device, major, minor), ret); + if (ret != NVML_SUCCESS) { + WARN("nvmlDeviceGetCudaComputeCapability() failed: %s ", + nvmlInternalErrorString(ret)); + return ncclSystemError; + } + return ncclSuccess; +} #endif diff --git a/src/misc/rings.cc b/src/misc/rings.cc deleted file mode 100644 index 7e1fc1b..0000000 --- a/src/misc/rings.cc +++ /dev/null @@ -1,391 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "param.h" - -#define NCCL_MAX_SCORE 7 - -/* Parse user defined rings. Format is like : - * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0" - * Rings with a non-matching number of ranks are ignored so we can provide - * rings for multiple cases. 
- */ -#define MAX_ENV_RANKS 512 -static ncclResult_t parseRings(const char* str, int* nringsRet, int nranks, int* prev, int* next) { - int ranks[MAX_ENV_RANKS]; - int nrings = 0; - int rank = 0; - int offset = 0; - int status = 0; // 0 : between numbers, 1 : inside number - do { - int digit = str[offset] - '0'; - if (digit >= 0 && digit <= 9) { - if (status == 0) { - ranks[rank] = digit; - status = 1; - } else { - ranks[rank] = ranks[rank]*10+digit; - } - } else { - if (status == 1) { - rank++; - if (rank == MAX_ENV_RANKS) goto end; - } - status = 0; - if (str[offset] == '|' || str[offset] == '\0') { - int prevRank = ranks[rank-1]; - // Ignore rings if nranks doesn't match - if (rank != nranks) goto newring; - - for (int r=0; r<nranks; r++) { - int rank = ranks[r]; - // Ignore rings with ranks out of bounds - if (rank < 0 || rank >= nranks) goto newring; - // Ignore rings with duplicate ranks - for (int i=0; i<r; i++) - if (ranks[i] == rank) goto newring; - - next[nrings*nranks+prevRank] = rank; - prev[nrings*nranks+rank] = prevRank; - prevRank = rank; - } - nrings++; -newring: - rank = 0; - } - } - } while (str[offset++] != 0); -end: - *nringsRet = nrings; - return ncclSuccess; -} - -/* - * Ring creation algorithm - * - * First, we establish hierarchical coordinates depending on the way ranks can - * communicate. After fillCoords, we have for each rank a unique 3-int array - * { node, pci_domain, rank } corresponding to the three transports : - * { 2[NET], 1[SHM], 0[P2P] }. - * Also, we renumber ranks (to indexes) based on their growing coordinates. - * - * Then, we ask transports to connect groups together. We start with net, then - * shm, then p2p. We maintain two arrays, prev and next, where values are equal - * to -1 when ranks are not yet connected, and a rank otherwise. 
We never - * connect ranks outside our group, meaning that on 4 nodes of 2 sockets of 4 - * ranks, if we are rank 13, we should see something like (provided we have a - * single net interface, hence a single ring) : - * - * Connecting all nodes <13> - * 2[NET] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 -1 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1 - * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 -1 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0 - * - * Connecting P2P domains with shared memory <13> - * 1[SHM] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1 - * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0 - * - * Connecting ranks (only inside the P2P domain) <13> - * 0[P2P] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 12 13 14 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1 - * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 13 14 15 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0 - * - * Hence, when we ask a transport to connect groups, we provide it with a subview of the ranks (except for net - * which always sees the full world). That way, P2P can bruteforce all combinations inside the node without - * risking to explode in terms of combinations, and we scale better. - * - * Finally, we loop over Network scores to try to create rings with high scores (=locality) and decrease until - * we get at least one ring. 
- */ - -static void recIsConnected(int rank, int* connected, int nranks, int* matrix, int transport) { - connected[rank] = 1; - for (int r=0; r<nranks; r++) { - if (connected[r] == 0 && matrix[rank*nranks+r] == transport) { - recIsConnected(r, connected, nranks, matrix, transport); - } - } -} - -static void isConnected(int rank, int* connected, int nranks, int* matrix, int transport) { - for (int r=0; r<nranks; r++) connected[r] = 0; - recIsConnected(rank, connected, nranks, matrix, transport); -} - -#define NEW_IDX(rank) do { \ - rankToIdx[rank] = idx; \ - idxToRank[idx] = rank; \ - for (int t=0; t<NTRANSPORTS; t++) coords[rank*NTRANSPORTS+t] = current[t]; \ - idx++; \ -} while (0) - -int findConnected(int rank, int* matrix, int nranks, int transport, int* coords) { - for (int r=0; r<nranks; r++) { - if (coords[r*NTRANSPORTS] == -1 && matrix[rank*nranks+r] == transport) return r; - } - return -1; -} - -static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankToIdx, int* idxToRank) { - int current[NTRANSPORTS]; - int* p2pConnected; - NCCLCHECK(ncclCalloc(&p2pConnected, nranks)); - for (int i=0; i<NTRANSPORTS; i++) current[i] = 0; - int curRank = 0, idx = 0; - while (1) { - // P2P is handled separately as there is no level below it and we need to - // cover the case of being connected to another GPU indirectly. - // So we detect all GPUs in the same P2P domain once and add them all at - // once. - isConnected(curRank, p2pConnected, nranks, matrix, 0); - for (int r=0; r<nranks; r++) { - if (p2pConnected[r]) { - NEW_IDX(r); - curRank = r; - current[0]++; - } - } - current[0] = 0; - - if (idx == nranks) { - free(p2pConnected); - return ncclSuccess; - } - - // Find next group, either connected through SHM or NET. 
- int rank; - int transport = 1; - while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) { - current[transport] = 0; - transport++; - if (transport == NTRANSPORTS) { - WARN("Error : Could not find transport to connect next group\n"); - free(p2pConnected); - return ncclInternalError; } - } - curRank = rank; - current[transport]++; - } -} - -#ifdef __PPC__ -// Make the default NCCL_MIN_NRINGS=4 for IBM/Power nodes -#define DEFAULT_MIN_NRINGS 4 -#else -#define DEFAULT_MIN_NRINGS 0 -#endif -NCCL_PARAM(MinNrings, "MIN_NRINGS", DEFAULT_MIN_NRINGS); -NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0); - -/* Users can force the number of threads with an environment variable */ -NCCL_PARAM(Nthreads, "NTHREADS", -2); -ncclResult_t getEnvThreads(int* nthreads) { - int64_t nt = ncclParamNthreads(); - if (nt != -2) - *nthreads = nt; - return ncclSuccess; -} - -static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) { - if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS; - for (int r=nrings; r<newNrings; r++) { - for (int i=0; i<nranks; i++) { - a[r*nranks+i] = a[(r-nrings)*nranks+i]; - b[r*nranks+i] = b[(r-nrings)*nranks+i]; - c[r*nranks+i] = c[(r-nrings)*nranks+i]; - d[r*nranks+i] = d[(r-nrings)*nranks+i]; - } - } - return newNrings; -} -/* Main ring creation function */ -ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) { - *nrings = 0; - - if (nranks == 1) return ncclSuccess; - - char* str = getenv("NCCL_RINGS"); - if (str && strlen(str)>0) { - int ret = parseRings(str, nrings, nranks, prev, next); - if (ret == ncclSuccess && *nrings > 0) { - if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings); - NCCLCHECK(getEnvThreads(nthreads)); - for (int r = 0; r<*nrings; r++) { - for (int i = 0; i<nranks; i++) { - if (transports[i*nranks+prev[r*nranks+i]] == 2) treeIn[r*nranks+i] = 1; - if 
(transports[i*nranks+next[r*nranks+i]] == 2) treeOut[r*nranks+i] = 1; - } - } - return ncclSuccess; - } - if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring"); - *nrings = 0; - } - - // Compute hierarchical topology groups, indexes, and rank<->index tables - int* coords, *globalIdxToRank, *globalRankToIdx; - NCCLCHECK(ncclCalloc(&coords, nranks*NTRANSPORTS)); - for (int i=0; i<nranks*NTRANSPORTS; i++) coords[i] = -1; - NCCLCHECK(ncclCalloc(&globalIdxToRank, nranks)); - NCCLCHECK(ncclCalloc(&globalRankToIdx, nranks)); - - NCCLCHECK(fillCoords(nranks, transports, coords, globalRankToIdx, globalIdxToRank)); - - // Start with a high score, then decrease until we find rings - int minScore = NCCL_MAX_SCORE; - int nringsTmp; - int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups; - NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&idxToRank, nranks)); - NCCLCHECK(ncclCalloc(&rankToIdx, nranks)); - NCCLCHECK(ncclCalloc(&groups, nranks)); - NCCLCHECK(ncclCalloc(&subgroups, nranks)); - - int nThreads; - do { - nThreads = *nthreads; - for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1; - nringsTmp = MAXCHANNELS; - // Loop over transports to connect groups - for (int t=NTRANSPORTS-1; t>=0; t--) { - for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1; - - int nidx = 0; - for (int i=0; i<nranks; i++) { - // Extract only ranks in the same local area as rank - // We need to extract them in the topological order, hence we iterate over indexes, not ranks - int r = globalIdxToRank[i]; - int sameLocal = 1; - for (int tr = NTRANSPORTS-1; tr > t; tr--) if (coords[r*NTRANSPORTS+tr] != coords[rank*NTRANSPORTS+tr]) sameLocal = 0; - if (!sameLocal) continue; - - groups[nidx] = coords[r*NTRANSPORTS+t]; - subgroups[nidx] = t ? 
coords[r*NTRANSPORTS+t-1] : nidx; - rankToIdx[r] = nidx; - idxToRank[nidx] = r; - nidx++; - } - - int ngroups = groups[nidx-1] + 1; // Coords should be ordered - - ncclTvalue_t* subvalues; - int *subprev, *subnext; - NCCLCHECK(ncclCalloc(&subvalues, nidx*nidx)); - NCCLCHECK(ncclCalloc(&subprev, nidx*nringsTmp)); - NCCLCHECK(ncclCalloc(&subnext, nidx*nringsTmp)); - if (ngroups > 1) { - /* Extract subvalues */ - for (int i=0; i<nidx; i++) { - for (int j=0; j<nidx; j++) { - if (transports[idxToRank[i]*nranks+idxToRank[j]] == t) - subvalues[i*nidx+j] = values[idxToRank[i]*nranks+idxToRank[j]]; - else - subvalues[i*nidx+j] = 0; - } - } - /* Extract subprev/subnext */ - for (int i=0; i<nidx*nringsTmp; i++) { - subprev[i] = subnext[i] = -1; - } - for (int r=0; r<nringsTmp; r++) { - int start = -1, end = -1; - for (int i=0; i<nranks; i++) { - if (rankToIdx[i] == -1) continue; - if (prevTmp[r*nranks+i] != -1) start = i; - if (nextTmp[r*nranks+i] != -1) end = i; - } - if (start != -1 && end != -1) { - subprev[r*nidx+rankToIdx[start]] = rankToIdx[end]; - subnext[r*nidx+rankToIdx[end]] = rankToIdx[start]; - } - } - /* Get rings */ - NCCLCHECK(ncclTransports[t].getRings(nidx, groups, subgroups, subvalues, &nringsTmp, subprev, subnext, minScore, &nThreads)); - /* Merge subprev/subnext into prev/next */ - for (int r=0; r<nringsTmp; r++) { - for (int i=0; i<nidx; i++) { - if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]]; - if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]]; - if (t == NTRANSPORTS-1) { - // Save node-level masters for trees - treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1; - treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 
0 : 1; - } - } - } - //for (int r=0; r<nringsTmp; r++) { - //printf("[%d] [%d] [%d] [%d] Prev ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", prevTmp[r*nranks+i]); printf("\n"); - //printf("[%d] [%d] [%d] [%d] Next ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", nextTmp[r*nranks+i]); printf("\n"); - //} - } - free(subvalues); - free(subprev); - free(subnext); - if (nringsTmp == 0) break; - } - minScore--; - if (nringsTmp > *nrings) { - *nrings = nringsTmp; - for (int i=0; i<nranks*(*nrings); i++) { - prev[i] = prevTmp[i]; - next[i] = nextTmp[i]; - } - } - } while (nringsTmp == 0 && minScore); - - free(coords); - free(globalRankToIdx); - free(globalIdxToRank); - free(prevTmp); - free(nextTmp); - free(idxToRank); - free(rankToIdx); - free(groups); - free(subgroups); - - *nthreads = nThreads; - - /* Duplicate the rings in case of multinode+NVLink */ - int nnodes = 0; - for (int r=0; r<nranks; r++) nnodes += treeIn[r]; - int nvlink; - NCCLCHECK(ncclNvlinkGpu(&nvlink)); - if (nnodes > 1 && nvlink) { - *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut); - } - - if (*nrings == 0) { - WARN("Could not create rings, falling back on simple ring"); - *nrings = 1; - prev[rank] = (rank-1+nranks) % nranks; - next[rank] = (rank+1)%nranks; - } - - int maxNrings = ncclParamMaxNrings(); - int minNrings = ncclParamMinNrings(); - if (maxNrings > 0 && minNrings > maxNrings) { - if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS"); - minNrings = 0; - } - if (minNrings > MAXCHANNELS) { - if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS); - minNrings = MAXCHANNELS; - } - if (maxNrings > 0 && maxNrings <= *nrings) { - if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings); - *nrings = maxNrings; - } else { - int defaultMinNrings = 
ncclCudaCompCap() == 3 ? 2 : 1; - if (minNrings < defaultMinNrings) minNrings = defaultMinNrings; - if (minNrings > 0 && minNrings > *nrings) { - if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings); - *nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut); - } - } - - NCCLCHECK(getEnvThreads(nthreads)); - return ncclSuccess; -} diff --git a/src/misc/topo.cc b/src/misc/topo.cc deleted file mode 100644 index 3f5bdf9..0000000 --- a/src/misc/topo.cc +++ /dev/null @@ -1,57 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "topo.h" - -#define BUSID_SIZE (sizeof("0000:00:00.0")) -#define BUSID_REDUCED_SIZE (sizeof("0000:00")) - -ncclResult_t getCudaPath(int cudaDev, char** path) { - char busId[BUSID_SIZE]; - CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev)); - for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]); - char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0"; - memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1); - memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1); - *path = realpath(busPath, NULL); - if (*path == NULL) { - WARN("Could not find real path of %s", busPath); - return ncclSystemError; - } - return ncclSuccess; -} - -const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" }; - -int pciDistance(char* path1, char* path2) { - int score = 0; - int depth = 0; - int same = 1; - for (int i=0; i<strlen(path1); i++) { - if (path1[i] != path2[i]) same = 0; - if (path1[i] == '/') { - depth++; - if (same == 1) score++; - } - } - if (score <= 3) { -#ifdef __PPC__ - // NUMA distance detection and PATH_SYS not supported on 
IBM/Power nodes - // nodes currently - return PATH_NODE; -#else - /* Split the former PATH_SOC distance into PATH_NODE and PATH_SYS based on numaId */ - int numaId1 = getNumaId(path1); - int numaId2 = getNumaId(path2); - TRACE(NCCL_INIT, "depth %d score %d path1 %s numaId %d path2 %s numaId %d", depth, score, path1, numaId1, path2, numaId2); - return ((numaId1 == numaId2) ? PATH_NODE : PATH_SYS); -#endif - } - if (score == 4) return PATH_PHB; - if (score == depth-1) return PATH_PIX; - return PATH_PXB; -} diff --git a/src/misc/utils.cc b/src/misc/utils.cc index da99774..5158529 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -5,27 +5,53 @@ ************************************************************************/ #include "utils.h" -#include "debug.h" -#include "nccl_net.h" -#include <unistd.h> -#include <string.h> -#include <stdarg.h> +#include "core.h" #include "nvmlwrap.h" -#include "core.h" -// Convert a logical cudaDev index to the NVML device minor number -ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) { - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - nvmlDevice_t nvmlDevice; - unsigned int dev; - *nvmlDev = -1; - CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev)); - NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice)); - NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev)); +// Get current Compute Capability +int ncclCudaCompCap() { + int cudaDev; + if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0; + int ccMajor, ccMinor; + if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0; + if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0; + return ccMajor*10+ccMinor; +} - *nvmlDev = dev; +ncclResult_t int64ToBusId(int64_t id, char* busId) { + sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf)); + return ncclSuccess; +} +ncclResult_t 
busIdToInt64(char* busId, int64_t* id) { + const int size = strlen(busId); + char* hexStr; + NCCLCHECK(ncclCalloc(&hexStr, size)); + int hexOffset = 0; + for (int i=0; i<size; i++) { + char c = busId[i]; + if (c == '.' || c == ':') continue; + if ((c >= '0' && c <= '9') || + (c >= 'A' && c <= 'F') || + (c >= 'a' && c <= 'f')) { + hexStr[hexOffset++] = busId[i]; + } else break; + } + hexStr[hexOffset] = '\0'; + *id = strtol(hexStr, NULL, 16); + free(hexStr); + return ncclSuccess; +} + +// Convert a logical cudaDev index to the NVML device minor number +ncclResult_t getBusId(int cudaDev, int64_t *busId) { + // On most systems, the PCI bus ID comes back as in the 0000:00:00.0 + // format. Still need to allocate proper space in case PCI domain goes + // higher. + char busIdStr[] = "00000000:00:00.0"; + CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev)); + NCCLCHECK(busIdToInt64(busIdStr, busId)); return ncclSuccess; } @@ -40,53 +66,6 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) { return ncclSuccess; } -/* Common logging function used by the INFO, WARN and TRACE macros - * Also exported to the dynamically loadable Net transport modules so - * they can share the debugging mechanisms and output files - */ -void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) 
{ - if (ncclDebugLevel <= NCCL_LOG_NONE) return; - - char hostname[1024]; - getHostName(hostname, 1024, '.'); - int cudaDev; - cudaGetDevice(&cudaDev); - - char buffer[1024]; - size_t len = 0; - pthread_mutex_lock(&ncclDebugOutputLock); - if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN) - len = snprintf(buffer, sizeof(buffer), - "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line); - else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask)) - len = snprintf(buffer, sizeof(buffer), - "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); -#ifdef ENABLE_TRACE - else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) { - auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; - double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000; - len = snprintf(buffer, sizeof(buffer), - "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line); - } -#endif - if (len) { - va_list vargs; - va_start(vargs, fmt); - (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); - va_end(vargs); - fprintf(ncclDebugFile,"%s\n", buffer); - fflush(ncclDebugFile); - } - pthread_mutex_unlock(&ncclDebugOutputLock); - - // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort() - if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) { - fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", - hostname, getpid(), gettid(), cudaDev, filefunc, line); - abort(); - } -} - uint64_t getHash(const char* string, int n) { // Based on DJB2, result = result * 33 + char uint64_t result = 5381; @@ -100,27 +79,39 @@ uint64_t getHash(const char* string, int n) { * that will be unique for both bare-metal and container instances * Equivalent of a hash of; * - * $(hostname) $(readlink /proc/self/ns/uts) $(readlink /proc/self/ns/mnt) + * 
$(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + * This string can be overridden by using the NCCL_HOSTID env var. */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" uint64_t getHostHash(void) { - char uname[1024]; - // Start off with the full hostname - (void) getHostName(uname, sizeof(uname), '\0'); - int offset = strlen(uname); - int len; - // $(readlink /proc/self/ns/uts) - len = readlink("/proc/self/ns/uts", uname+offset, sizeof(uname)-1-offset); - if (len < 0) len = 0; - offset += len; - // $(readlink /proc/self/ns/mnt) - len = readlink("/proc/self/ns/mnt", uname+offset, sizeof(uname)-1-offset); - if (len < 0) len = 0; - offset += len; - // Trailing '\0' - uname[offset]='\0'; - TRACE(NCCL_INIT,"unique hostname '%s'", uname); + char hostHash[1024]; + char *hostId; + + // Fall back is the full hostname if something fails + (void) getHostName(hostHash, sizeof(hostHash), '\0'); + int offset = strlen(hostHash); + + if ((hostId = getenv("NCCL_HOSTID")) != NULL) { + strncpy(hostHash, hostId, sizeof(hostHash)); + } else { + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + } + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + TRACE(NCCL_INIT,"unique hostname '%s'", hostHash); - return getHash(uname, strlen(uname)); + return getHash(hostHash, strlen(hostHash)); } /* Generate a hash of the unique identifying string for this process @@ -147,8 +138,6 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) { if (!string) return 0; const char* ptr = string; - // Ignore "^" or "=" prefix, will be detected outside of this function - if (ptr[0] == '^' || ptr[0] == '=') ptr++; int ifNum = 0; int ifC = 0; diff --git a/src/nccl.h.in b/src/nccl.h.in index 985274e..f07e0a4 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -41,7 +41,7 @@ typedef enum { ncclSuccess 
= 0, * This integer is coded with the MAJOR, MINOR and PATCH level of the * NCCL library */ -ncclResult_t ncclGetVersion(int *version); +ncclResult_t ncclGetVersion(int *version); ncclResult_t pncclGetVersion(int *version); /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be @@ -244,7 +244,8 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou * Start a group call. All subsequent calls to NCCL may not block due to * inter-CPU synchronization. */ -ncclResult_t ncclGroupStart(); +ncclResult_t ncclGroupStart(); +ncclResult_t pncclGroupStart(); /* * Group End @@ -252,7 +253,8 @@ ncclResult_t ncclGroupStart(); * End a group call. Wait for all calls since ncclGroupStart to complete * before returning. */ -ncclResult_t ncclGroupEnd(); +ncclResult_t ncclGroupEnd(); +ncclResult_t pncclGroupEnd(); #ifdef __cplusplus } // end extern "C" diff --git a/src/transport.cc b/src/transport.cc index 1436a5b..4059849 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -4,7 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "comm.h" +#include "info.h" extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; @@ -119,13 +120,13 @@ ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int r } if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) { // Tree up - struct ncclTree* tree = &args->channel->tree; + struct ncclTree* tree = &args->channel->treeUp; for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args)); NCCLCHECK(SaveProxy<proxySend>(tree->up, args)); } if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) { // Tree down - struct ncclTree* tree = &args->channel->tree; + struct ncclTree* tree = &args->channel->treeDn; for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], 
args)); NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args)); } @@ -157,7 +158,9 @@ void* persistentThread(void *comm_) { } } while (op == NULL); op->idle = 0; - if (op->state != ncclProxyOpNone) ret = op->progress(op); + // opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started + // yet and might be cancelled before they even start. Hold on on those. + if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op); if (ret != ncclSuccess) { comm->fatalError = ret; INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); diff --git a/src/transport/net.cc b/src/transport/net.cc index d9559eb..87fc9ce 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -4,39 +4,9 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "transport.h" -#include "nvmlwrap.h" +#include "comm.h" #include "net.h" -#include "param.h" -#include "topo.h" -#include <cuda_runtime.h> -#include <assert.h> - -#define NET_MAX_IFS 16 -#define NET_MAX_GPUS 32 - -// Cache GPU-NIC distances to avoid re-computing them -#define NET_TVALUE_UNKNOWN 0ULL -static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN }; -static int ncclNetNDev; - -// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit) -#define NET_BITS_PER_IF 3 -#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1) -static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t"); -static ncclTvalue_t getTvalue(short* distances, int ndev) { - ncclTvalue_t tvalue = 0; - for (int d=0; d<ndev; d++) { - ncclTvalue_t score = 1 + PATH_SYS - distances[d]; - // Keep 3 bits of score info per dev - tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d)); - } - return tvalue; -} -static int getScore(ncclTvalue_t tvalue, int dev) { - return (tvalue >> (dev*NET_BITS_PER_IF)) & 
NET_BITS_PER_IF_MASK; -} +#include "graph.h" struct netConnectInfo { ncclNetHandle_t netHandle; @@ -53,6 +23,7 @@ struct netSendResources { int buffSize; void* mhandle; void* llMhandle; + void* ll128Mhandle; struct ncclRecvMem* devRecvMem; uint64_t step; uint64_t llLastCleaning; @@ -70,228 +41,61 @@ struct netRecvResources { int buffSize; void* mhandle; void* llMhandle; + void* ll128Mhandle; struct ncclRecvMem* devRecvMem; uint64_t step; uint64_t llLastCleaning; }; -static ncclResult_t netDistance(int cudaDev, int dev, short* distance) { - char* cudaPath = NULL; - char* nicPath = NULL; - ncclResult_t err; - NCCLCHECK(getCudaPath(cudaDev, &cudaPath)); - err = ncclNetPciPath(dev, &nicPath); - *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SYS : pciDistance(nicPath, cudaPath); - if (nicPath) free(nicPath); - if (cudaPath) free(cudaPath); - return ncclSuccess; -} - -static ncclResult_t netDevices(int* ndev, short** distances) { - NCCLCHECK(ncclNetDevices(ndev)); - if (*ndev == 0) { - WARN("Error : Network returned 0 device"); - return ncclSystemError; - } - if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS; - - *distances = (short*)malloc(*ndev*sizeof(short)); - if (*distances == NULL) return ncclSystemError; - - // Find distance with current GPU - int cudaDev, nvmlDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev)) - char line[1024]; - sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName()); - for (int d=0; d<*ndev; d++) { - NCCLCHECK(netDistance(cudaDev, d, *distances+d)); - sprintf(line+strlen(line), " %s", pathDists[(*distances)[d]]); - } - INFO(NCCL_INIT|NCCL_NET, "%s", line); - return ncclSuccess; -} - -/* Determine if we can communicate with the peer */ -ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - ret[0] = ncclNetTvalues[cudaDev]; - if (ret[0] == 
NET_TVALUE_UNKNOWN) { - if (cudaDev >= NET_MAX_GPUS) { - WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS); - return ncclInternalError; - } - int nDev; - short* distances; - NCCLCHECK(netDevices(&nDev, &distances)); - ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev); - ncclNetNDev = nDev; - free(distances); - } +/* Determine if two peers can communicate with NET */ +ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { + *ret = 1; return ncclSuccess; } -static inline int groupBestStart(int nranks, int* groups, int group, ncclTvalue_t* values, int card, int minScore) { - int bestRank = -1; - int bestScore = 0; - for (int rank=0; rank<nranks; rank++) { - if (groups[rank] != group) continue; - for (int i=0; i<nranks; i++) { - ncclTvalue_t netValue = values[rank*nranks+i]; - if (netValue != 0) { - ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK; - if (score >= minScore && score > bestScore) { - bestScore = score; - bestRank = rank; - } - // All other values should be the same, stop here for this rank - break; - } - } - } - return bestRank; -} -static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroups, int startSubGroup, int startRank, ncclTvalue_t* values, int card, int minScore) { - // For the last rank, we don't need the absolute best score, just to be within minScore. 
- for (int rank=nranks-1; rank>=0; rank--) { - if (groups[rank] != group) continue; - if (startSubGroup != -1 && startSubGroup == subgroups[rank]) continue; - if (startRank == rank) continue; - for (int i=0; i<nranks; i++) { - ncclTvalue_t netValue = values[rank*nranks+i]; - if (netValue != 0) { - ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK; - if (score >= minScore) { - return rank; - } - // All other values should be the same, stop here for this rank - break; - } - } - } - return -1; -} - -ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) { - int nGroups = groups[nranks-1] + 1; - int *cardUsed, *starts, *ends; - NCCLCHECK(ncclCalloc(&cardUsed, NET_MAX_IFS*nGroups)); - NCCLCHECK(ncclCalloc(&starts, nGroups)); - NCCLCHECK(ncclCalloc(&ends, nGroups)); - - for (int ring = 0; ring<*nringsRet; ring++) { - for (int group = 0; group<nGroups; group++) { - int nranksInGroup = 0; - int nsubGroups = 0; - for (int rank=0; rank<nranks; rank++) - if (groups[rank] == group) { - nranksInGroup++; - nsubGroups = std::max(subgroups[rank], nsubGroups); - } - starts[group] = ends[group] = -1; - // Receive on the rank closest to the NIC - for (int card=0; card<NET_MAX_IFS; card++) { - if (cardUsed[group*NET_MAX_IFS+card] == 1) continue; - int start = groupBestStart(nranks, groups, group, values, card, minScore); - // Send from any rank, but best on a different subgroup and close to the NIC also. - int end = (nranksInGroup == 1) ? start - : groupBestEnd(nranks, groups, group, subgroups, nsubGroups ? 
subgroups[start] : -1, start, values, card, minScore); - //printf("Ring %d, Minscore %d, Card %d, group %d, start = %d, end = %d\n", ring, minScore, card, group, start, end); - if (start != -1 && end != -1) { - cardUsed[group*NET_MAX_IFS+card] = 1; - starts[group] = start; - ends[group] = end; - break; - } - } - if (starts[group] == -1 || ends[group] == -1) { - *nringsRet = ring; - goto done; - } - } - // Link groups together - for (int group = 0; group<nGroups; group++) { - int nextGroup = (group+1)%nGroups; - next[ring*nranks+ends[group]] = starts[nextGroup]; - prev[ring*nranks+starts[nextGroup]] = ends[group]; - } - } -done: - free(cardUsed); - free(starts); - free(ends); - return ncclSuccess; -} - -int getDev(int cudaDev, int ringId) { - ncclTvalue_t tvalues = ncclNetTvalues[cudaDev]; - - int dev = 0; - int maxScore = 0; - for (int d=0; d<ncclNetNDev; d++) if (getScore(tvalues,d) > maxScore) maxScore = getScore(tvalues,d); - int skip = ringId+1; - while (skip) { - for (int d=0; d<ncclNetNDev; d++) { - if (getScore(tvalues, d) == maxScore) { - skip--; - if (skip == 0) { dev = d; goto end; } - } - } - } -end: - return dev; -} - NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB); -static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) { +static ncclResult_t netGetGdrSupport(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr) { *useGdr = 0; - int cudaDev, nvmlDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev)) - if (read) { // For reads (sends) only enable under certain conditions int gdrReadParam = ncclParamNetGdrRead(); if (gdrReadParam == 0) return ncclSuccess; if (gdrReadParam < 0) { int nvlink; - NCCLCHECK(ncclNvlinkGpu(&nvlink)); + NCCLCHECK(ncclTopoHasNvlink(topo, busId, &nvlink)); if (!nvlink) return ncclSuccess; } } // Check if we are close enough that it makes sense to enable GDR int netGdrLevel = ncclParamNetGdrLevel(); - short 
distance; - NCCLCHECK(netDistance(cudaDev, dev, &distance)); + int distance; + NCCLCHECK(ncclTopoNetDistance(topo, busId, netDev, &distance)); if (distance >= netGdrLevel) { - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), busId, netDev, distance, netGdrLevel); return ncclSuccess; } // Finally, check if the NIC supports it int flags; - NCCLCHECK(ncclNetPtrSupport(dev, &flags)); + NCCLCHECK(ncclNetPtrSupport(netDev, &flags)); if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess; *useGdr = 1; - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d < %d), read %d", ncclNetName(), busId, netDev, distance, netGdrLevel, read); return ncclSuccess; } /* Determine if we will use this transport for this peer and return connect * information for this peer */ -ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { +ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { struct netSendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - resources->netDev = getDev(cudaDev, channelId); - NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr)); + NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev)); + NCCLCHECK(netGetGdrSupport(topo, 
myInfo->busId, resources->netDev, 1, &resources->useGdr)); int sendSize = sizeof(struct ncclSendMem); NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); @@ -303,20 +107,18 @@ ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize)); resources->buffSize = buffSize; - INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev, + INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev, resources->useGdr ? "/GDRDMA" : ""); return ncclSuccess; } -ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { +ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { struct netRecvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - resources->netDev = getDev(cudaDev, channelId); - NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr)); + NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev)); + NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr)); int sendSize = sizeof(struct ncclSendMem); NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); @@ -328,7 +130,7 @@ ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer 
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize)); resources->buffSize = buffSize; - INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev, + INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev, resources->useGdr ? "/GDRDMA" : ""); struct netConnectInfo* info = (struct netConnectInfo*) connectInfo; NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm)); @@ -343,6 +145,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem; send->conn.buff = recvMem->buff; send->conn.llBuff = resources->devHostRecvMem->llBuff; + send->conn.ll128Buff = recvMem->ll128Buff; // Head/Tail/Opcount/Fifos are always on host send->conn.tail = &resources->devHostRecvMem->tail; @@ -360,6 +163,8 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle)); NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff, NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle)); + NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle)); return ncclSuccess; } @@ -373,6 +178,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto struct ncclRecvMem* recvMem = resources->useGdr ? 
resources->devRecvMem : resources->devHostRecvMem; recv->conn.buff = recvMem->buff; recv->conn.llBuff = recvMem->llBuff; + recv->conn.ll128Buff = recvMem->ll128Buff; // Head/Tail/Opcount are always on host recv->conn.tail = &resources->devHostRecvMem->tail; @@ -388,6 +194,8 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle)); NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE, resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle)); + NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle)); return ncclSuccess; } @@ -397,6 +205,7 @@ ncclResult_t netSendFree(void* transportResources) { NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle)); NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle)); + NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->ll128Mhandle)); NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); if (resources->useGdr) CUDACHECK(cudaFree(resources->devRecvMem)); @@ -410,6 +219,7 @@ ncclResult_t netRecvFree(void* transportResources) { NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle)); NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle)); + NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->ll128Mhandle)); NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); if (resources->useGdr) CUDACHECK(cudaFree(resources->devRecvMem)); @@ -437,7 +247,39 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) { volatile int* sizesFifo = resources->hostRecvMem->sizesFifo; volatile uint64_t* recvTail = &resources->hostRecvMem->tail; - if 
(args->llMode) { + if (args->protocol == NCCL_PROTO_LL128) { + int stepSize = NCCL_LL128_BUFF_SIZE/NCCL_STEPS; + if (args->tail < *recvTail) { + int buffSlot = args->tail%NCCL_STEPS; + if (sizesFifo[buffSlot] != -1) { + struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; + char* localBuff = (char*)localMem->ll128Buff; + int ready = resources->useGdr; + if (!ready) { + // When data is in sysmem, we need to wait until all flags are correct since the GPU only + // called threadfence() + uint64_t flag = args->tail + 1; + int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS); + volatile uint64_t* lines = (volatile uint64_t*)(localBuff+buffSlot*stepSize); + ready = 1; + for (int i=0; i<nFifoLines; i++) { + if (lines[i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS] != flag) { ready = 0; break; } + } + } + if (ready) { + // Send through network + NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], resources->ll128Mhandle, args->requests+buffSlot)); + if (args->requests[buffSlot] != NULL) { + sizesFifo[buffSlot] = -1; + // Make sure size is reset to zero before we update the head. + __sync_synchronize(); + args->tail += args->sliceSteps; + args->idle = 0; + } + } + } + } + } else if (args->protocol == NCCL_PROTO_LL) { int buffSlot = args->tail%NCCL_STEPS; int size = sizesFifo[buffSlot]; if (size != -1) { @@ -463,17 +305,19 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { } } } else if (args->tail < *recvTail) { - struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; int stepSize = args->channel->buffSize/NCCL_STEPS; + struct ncclRecvMem* localMem = resources->useGdr ? 
resources->devRecvMem : resources->hostRecvMem; // Send through network int buffSlot = args->tail%NCCL_STEPS; - NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot)); - if (args->requests[buffSlot] != NULL) { - sizesFifo[buffSlot] = -1; - // Make sure size is reset to zero before we update the head. - __sync_synchronize(); - args->tail += args->sliceSteps; - args->idle = 0; + if (sizesFifo[buffSlot] != -1) { + NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot)); + if (args->requests[buffSlot] != NULL) { + sizesFifo[buffSlot] = -1; + // Make sure size is reset to zero before we update the head. + __sync_synchronize(); + args->tail += args->sliceSteps; + args->idle = 0; + } } } } @@ -512,11 +356,11 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { } if (args->state == ncclProxyOpProgress) { args->idle = 1; - int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS; + int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : args->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS; if (args->head < args->end) { struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; - char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff; - void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle; + char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)localMem->llBuff : args->protocol == NCCL_PROTO_LL128 ? (char*)localMem->ll128Buff : localMem->buff; + void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : args->protocol == NCCL_PROTO_LL128 ? 
resources->ll128Mhandle : resources->mhandle; volatile uint64_t* sendHead = &resources->hostSendMem->head; if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) { int buffSlot = args->tail%NCCL_STEPS; @@ -533,7 +377,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size)); if (done) { args->head += args->sliceSteps; - if (args->llMode == 0) { + if (args->protocol == NCCL_PROTO_SIMPLE) { if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle); resources->hostRecvMem->tail = args->head; } @@ -553,7 +397,6 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { struct ncclTransport netTransport = { "NET", netCanConnect, - netGetRings, { netSendSetup, netSendConnect, netSendFree, netSendProxy }, { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy } }; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index c8eb6d5..0d5307c 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -8,7 +8,7 @@ #include "core.h" #include "socket.h" #include "net.h" -#include "topo.h" +#include "graph.h" #include "utils.h" #include "param.h" @@ -107,7 +107,9 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { char* userIbEnv = getenv("NCCL_IB_HCA"); struct netIf userIfs[MAX_IB_DEVS]; bool searchNot = userIbEnv && userIbEnv[0] == '^'; + if (searchNot) userIbEnv++; bool searchExact = userIbEnv && userIbEnv[0] == '='; + if (searchExact) userIbEnv++; int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS); if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError; @@ -199,32 +201,14 @@ ncclResult_t ncclIbGdrSupport(int ibDev) { moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 
0 : 1; } if (moduleLoaded == 0) return ncclSystemError; - ncclResult_t ret = ncclSystemError; - void* ptr; - if (cudaMalloc(&ptr, sizeof(int)) == cudaSuccess) { - struct ibv_mr* mr; - struct ibv_pd* pd; - if (wrap_ibv_alloc_pd(&pd, ncclIbDevs[ibDev].context) == ncclSuccess) { - if ((mr = wrap_direct_ibv_reg_mr(pd, ptr, sizeof(int), IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)) != NULL) { - ret = ncclSuccess; - wrap_ibv_dereg_mr(mr); - } - wrap_ibv_dealloc_pd(pd); - } - cudaFree(ptr); - } - return ret; + return ncclSuccess; } ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) { *supportedTypes = NCCL_PTR_HOST; - int cudaDev, nvmlDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev)) - if (ncclIbGdrSupport(dev) != ncclSuccess) { - INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d '%s' (no module or not supported by GPU)", cudaDev, nvmlDev, dev, ncclIbDevs[dev].devName); + INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName); return ncclSuccess; } *supportedTypes |= NCCL_PTR_CUDA; diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index ab5e8ec..1b1fc4f 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "nccl.h" +#include "comm.h" #include "core.h" #include "socket.h" #include "net.h" @@ -108,6 +108,7 @@ struct ncclSocketRequest { void* data; int size; int ctrlFd; + int offset; int used; struct ncclSocketComm* comm; struct ncclSocketTask* tasks[MAX_SOCKETS]; @@ -193,7 +194,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) { } if (nThreads == -2 || nSocksPerThread == -2) { // Auto-detection - int autoNt=1, autoNs=1; + int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads char 
vendorPath[PATH_MAX]; snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE); char* rPath = realpath(vendorPath, NULL); @@ -213,6 +214,9 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) { if (strcmp(vendor, "0x1d0f") == 0) { // AWS autoNt = 2; autoNs = 8; + } else if (strcmp(vendor, "0x1ae0") == 0) { // GCP + autoNt = 4; + autoNs = 1; } end: if (nThreads == -2) nThreads = autoNt; @@ -226,7 +230,7 @@ end: } *ns = nSocks; *nt = nThreads; - INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread); + if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread); return ncclSuccess; } @@ -379,31 +383,45 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) { return ncclInternalError; } r->size = data; + r->offset = 0; r->used = 2; // done exchanging size // divide into subtasks - int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks)); int chunkOffset = 0, i = 0; - while (chunkOffset < r->size) { - int chunkSize = std::min(taskSize, r->size-chunkOffset); - NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); - chunkOffset += chunkSize; + if (r->comm->nSocks > 0) { + int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks)); + while (chunkOffset < r->size) { + int chunkSize = std::min(taskSize, r->size-chunkOffset); + NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); + chunkOffset += chunkSize; + } } r->nSubs = i; } if (r->used == 2) { // already exchanged size - int nCompleted = 0; - for (int i=0; i<r->nSubs; i++) { - struct ncclSocketTask* sub = r->tasks[i]; - if (sub->result != ncclSuccess) return sub->result; - if (sub->offset == sub->size) nCompleted++; - } - if (nCompleted == r->nSubs) { - if (size) *size = r->size; - *done = 1; - r->used = 0; + if (r->nSubs > 
0) { + int nCompleted = 0; for (int i=0; i<r->nSubs; i++) { struct ncclSocketTask* sub = r->tasks[i]; - sub->used = 0; + if (sub->result != ncclSuccess) return sub->result; + if (sub->offset == sub->size) nCompleted++; + } + if (nCompleted == r->nSubs) { + if (size) *size = r->size; + *done = 1; + r->used = 0; + for (int i=0; i<r->nSubs; i++) { + struct ncclSocketTask* sub = r->tasks[i]; + sub->used = 0; + } + } + } else { // progress request using main thread + if (r->offset < r->size) { + NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->data, r->size, &r->offset)); + } + if (r->offset == r->size) { + if (size) *size = r->size; + *done = 1; + r->used = 0; } } } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 62bd725..0cc92f3 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -4,15 +4,9 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "comm.h" +#include "graph.h" #include "utils.h" -#include "topo.h" -#include "transport.h" -#include "param.h" -#include <unistd.h> -#include <cuda_runtime.h> -#include <ctype.h> -#include "nvlink.h" struct p2pConnectInfo { int direct; @@ -38,419 +32,91 @@ NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2); NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2); /* Convert a PCI busId string into a local cudaDev device index (cf. 
CUDA_VISIBLE_DEVICES) */ -static int busIdToCudaDev(const char* busId) { +static int busIdToCudaDev(int64_t busId) { int ndev; if (cudaGetDeviceCount(&ndev) != cudaSuccess) return -1; for (int i = 0; i < ndev; i++) { - char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - if (cudaDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess) + char devBusIdStr[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + if (cudaDeviceGetPCIBusId(devBusIdStr, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess) return -1; - if (strcmp(busId, devBusId) == 0) { - return i; - } + int64_t devBusId; + NCCLCHECK(busIdToInt64(devBusIdStr, &devBusId)); + if (busId == devBusId) return i; } // BusId was not found in our locally visible CUDA devices return -1; } -/* Determine if we can communicate with the peer through p2p */ -ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { - // Do not use P2P across root complexes by default (provided CUDA permits it) - int p2pLevel = PATH_NODE; +/* Determine if two peers can communicate through p2p */ +ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { + int cpuCount; + NCCLCHECK(ncclTopoCpuCount(topo, &cpuCount)); + // Do not use P2P across sockets by default (provided CUDA permits it). + // When we are on a single socket, don't even use P2P through the CPU as + // it should be able to sustain two flows to sysmem faster than PCI P2P. + int p2pLevel = cpuCount == 1 ? PATH_PHB : PATH_NODE; if (ncclParamP2pDisable() == 1) p2pLevel = 0; if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel(); + // Disable P2P *ret = 0; if (p2pLevel == 0) return ncclSuccess; // Rule out different nodes - if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess; + if (info1->hostHash != info2->hostHash) return ncclSuccess; // Convert the peer's busId into a local cudaDev index (cf. 
CUDA_VISIBLE_DEVICES) - int peerCudaDev = busIdToCudaDev(peerInfo->busId); - if (peerCudaDev == -1) { + int cudaDev1 = busIdToCudaDev(info1->busId); + int cudaDev2 = busIdToCudaDev(info2->busId); + if (cudaDev1 == -1 || cudaDev2 == -1) { // Peer's CUDA device is not visible in this process #if CUDART_VERSION >= 10010 // But in CUDA 10.1 we can still communicate with 'invisible' devices - TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId); + TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %lx and %lx", info1->busId, info2->busId); // Check for NVLink/NVswitch including P2P access - int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId); - if (nvlinkp2p > 0) { - *ret = nvlinkp2p; + int nvlink; + NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink)); + if (nvlink > 0) { + *ret = 1; return ncclSuccess; } #endif return ncclSuccess; } - TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev); + TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%lx] and [%d=%lx]", cudaDev1, info1->busId, cudaDev2, info2->busId); // Do not detect topology if we're on the same GPU. Note this is not really supported. 
- if (myInfo->cudaDev == peerCudaDev) { - *ret = 1 + PATH_SYS; + if (cudaDev1 == cudaDev2) { + *ret = 1; return ncclSuccess; } // See if CUDA can do P2P int p2p; - if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != cudaSuccess) { - INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)", - myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev); + if (cudaDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != cudaSuccess) { + INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)", + cudaDev1, info1->busId, cudaDev2, info2->busId); return ncclSuccess; } if (p2p == 0) return ncclSuccess; // Check for NVLink/NVswitch - int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId); - if (nvlinkp2p > 0) { - *ret = nvlinkp2p; + int nvlink; + NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink)); + if (nvlink > 0) { + *ret = 1; return ncclSuccess; } // Finally compute the PCI distance and compare with the p2pLevel. 
- char* myPath; - char* peerPath; - ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath); - ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath); - if (err1 == ncclSuccess && err2 == ncclSuccess) { - int distance = pciDistance(myPath, peerPath); - if (distance < p2pLevel) { - *ret = 1 + PATH_SYS - distance; - } + int distance; + NCCLCHECK(ncclTopoGpuDistance(topo, info1->busId, info2->busId, &distance)); + if (distance < p2pLevel) { + *ret = 1; } - if (err1 == ncclSuccess) free(myPath); - if (err2 == ncclSuccess) free(peerPath); - return ncclSuccess; -} - -#define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway -#define MAXGPUS_PCI 64 - -static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) { - int nrings = 0; - ncclTvalue_t* line = matrix+current*n; - inTheRing[current] = 1; - int currentStep = (currentRing+1)*n-remaining; - rings[currentStep-1] = current; - if (remaining == 0) { - int looprank = rings[currentRing*n]; - if (line[looprank] > 0) { - if (currentRing+1 == nRingsMax) { - nrings = 1; - } else { - line[looprank]--; - for (int i=0; i<n; i++) inTheRing[i] = 0; - if (connect) { - // First two slots are already set and we need to respect those constraints - inTheRing[rings[currentStep]] = 1; - nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, rings[currentStep+1], n-2, connect); - } else { - rings[(currentRing+1)*n] = 0; - nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, 0, n-1, connect); - } - line[looprank]++; - for (int i=0; i<n; i++) inTheRing[i] = 1; - } - } - } else { - int ringsSave[MAXCHANNELS*MAXGPUS_NVLINKP2P]; - int maxStep = 0; - for (int i=0; i<n; i++) { - if (inTheRing[i] == 0 && line[i] > 0) { - line[i]--; - int nr = computeRingsRec(matrix, n, rings, currentRing, nRingsMax, inTheRing, i, remaining-1, connect); - if (nr > nrings) { - nrings = nr; - 
maxStep = (nr+currentRing)*n; - ringsSave[currentStep] = i; - // Save the rest of the rings - for (int r=currentStep+1; r<maxStep; r++) { - ringsSave[r] = rings[r]; - } - if (nrings + currentRing == nRingsMax) { - // We found an optimal solution. Let's stop there. - break; - } - } - line[i]++; - } - } - for (int r=currentStep; r<maxStep; r++) { - rings[r] = ringsSave[r]; - } - } - inTheRing[current] = 0; - return nrings; -} - -static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) { - if (nrings == 0) return 0; - // Copy rings by dup times - if (newNrings > MAXCHANNELS) { - newNrings = MAXCHANNELS; - } - for (int r=nrings; r<newNrings; r++) { - for (int i=0; i<nranks; i++) rings[r*nranks+i] = rings[(r%nrings)*nranks+i]; - } - return newNrings; -} - -int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nringsMax, int connect) { - int* inTheRing = (int*)malloc(sizeof(int)*nranks); - if (inTheRing == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*nranks); return 0; } - for (int i=0; i<nranks; i++) inTheRing[i] = 0; - int nrings; - if (connect) { - inTheRing[rings[0]] = 1; - nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, rings[1], nranks-2, connect); - } else { - rings[0] = 0; - nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, 0, nranks-1, connect); - } - free(inTheRing); - return nrings; -} - -static inline int findConnect(int nranks, int* ranks) { - for (int i = 0; i<nranks; i++) { - if (ranks[i] != -1) return i; - } - return -1; -} - -int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) { - if (nrings == 0) return 0; - if (nrings > MAXCHANNELS) { - WARN("Max rings reached, limiting to %d", MAXCHANNELS); - nrings = MAXCHANNELS; - } - // Find existing constraints / connections - int connect = 0; - for (int r=0; r<nrings; r++) { - int start = findConnect(nranks, prev+r*nranks); - 
int end = findConnect(nranks, next+r*nranks); - if (start != -1 && end != -1) { - rings[r*nranks] = end; - rings[r*nranks+1] = start; - connect = 1; - } - } - - // Compute rings - ncclTvalue_t* matrix = (ncclTvalue_t*)malloc(sizeof(ncclTvalue_t)*nranks*nranks); - if (matrix == NULL) { WARN("malloc of %ld bytes failed", sizeof(ncclTvalue_t)*nranks*nranks); return 0; } - for (int i=0; i<nranks; i++) for (int j=0; j<nranks; j++) - matrix[i*nranks+j] = oversubscribe ? values[i*nranks+j]/CONNECT_NVLINK*2 : values[i*nranks+j]/CONNECT_NVLINK ; - - int compNrings = p2pComputeRingsNvLink(matrix, nranks, rings, nrings, connect); - - free(matrix); - - if (oversubscribe || connect) return compNrings; - - if (compNrings && compNrings < nrings && nranks <= 4) { - // Try to oversubscribe to get a better result - int *rings2 = (int *)malloc(sizeof(int)*MAXCHANNELS*nranks); - if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXCHANNELS*nranks); return 0; } - for (int i=0; i<MAXCHANNELS*nranks; i++) rings2[i] = -1; - int nThreads = *nthreads; - int compNrings2 = p2pComputeRingsNvLink(values, nranks, rings2, nrings, prev, next, 1, &nThreads); - if (compNrings2 > compNrings*2) { - // Oversubscription worked. 
- for (int i=0; i<compNrings2*nranks; i++) rings[i] = rings2[i]; - compNrings = compNrings2; - } - free(rings2); - } - - // Duplicate the rings for direct NVLink - compNrings = copyRings(nranks, rings, compNrings, compNrings*2); - - return compNrings; -} - -int p2pComputeRingsSeqConnect(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) { - int nrings = nringsStart; - int connect = 0; - for (int r=0; r<nrings; r++) { - int start = findConnect(nranks, prev+r*nranks); - int end = findConnect(nranks, next+r*nranks); - if (start != -1 && end != -1) { - rings[r*nranks] = end; - rings[r*nranks+1] = start; - int cur = start; - for (int i=2; i<nranks; i++) { - int next = (cur+1) % nranks; - while (next == end || next == start) next = (next+1) % nranks; - if (values[cur*nranks+next] < minScore) { - return 0; - } - rings[r*nranks+i] = next; - cur = next; - } - connect = 1; - } else { - if (connect == 1 && r > 0) { - WARN("Connecting rings but did not find start/end for ring %d. 
Disabling other rings.", r); - return r; - } else { - return 0; - } - } - } - return nrings; -} - -int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) { - for (int r=0; r<nringsStart; r++) { - for (int i=0; i<nranks; i++) { - rings[r*nranks+i] = i; - } - } - return nringsStart; -} - -static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) { - for (int score = PATH_SYS+1; score >= minScore; score--) { - int best = -1; - int worst_end_score = PATH_SYS+2; // find the closest to rank, farthest from end - for (int n = 0; n < nranks; n++) { - if (inRing[n]) continue; - if (values[rank*nranks+n] == score) { - if (end == -1) return n; - if (values[end*nranks+n] < worst_end_score) { - best = n; - worst_end_score = values[end*nranks+n]; - } - } - } - if (best != -1) return best; - } - return -1; -} - -int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int minScore) { - int connect = 0; - for (int r=0; r<nrings; r++) { - int start = findConnect(nranks, prev+r*nranks); - int end = findConnect(nranks, next+r*nranks); - - int inRing[MAXGPUS_PCI]; - for (int i=0; i<nranks; i++) inRing[i] = 0; - - if (start == -1 && end == -1) { - if (connect == 1 && r > 0) { - WARN("Connecting ring %d : did not find start/end. Disabling other rings.", r); - return r; - } - end = 0; - inRing[end] = 1; - start = findClosestPci(values, inRing, end, -1, nranks, minScore); - if (start == -1) return r; - } else if (start == -1 || end == -1) { - WARN("Connecting ring %d : inconsistent start/end. 
Disabling other rings.", r); - return r; - } else { - connect = 1; - } - rings[r*nranks] = end; - rings[r*nranks+1] = start; - inRing[start] = inRing[end] = 1; - int cur = start; - for (int i=2; i<nranks; i++) { - int next = findClosestPci(values, inRing, cur, end, nranks, minScore); - if (next == -1) return r; - - inRing[next] = 1; - rings[r*nranks+i] = next; - cur = next; - } - // Check the loop is closing - inRing[end] = 0; - if (findClosestPci(values, inRing, cur, end, nranks, minScore) != end) return r; - - if (connect == 0) return 1; - } - return nrings; -} - -ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) { - if (*nringsRet == 0) return ncclSuccess; - int *rings; - NCCLCHECK(ncclCalloc(&rings, MAXCHANNELS*nranks)); - for (int i=0; i<MAXCHANNELS*nranks; i++) rings[i] = -1; - int nrings = *nringsRet; - - // NVswitch - int nvswitchLinks = 0; - int directLinks = 0; - for (int rank=0; rank<nranks; rank++) { - for (int j=1; j<nranks; j++) { - int i = (rank + j) % nranks; - ncclTvalue_t links = values[rank*nranks+i]/CONNECT_NVSWITCH; - if (j>1 && links != nvswitchLinks) { - WARN("Internal error : NVswitch links mismatch"); - return ncclInternalError; - } - nvswitchLinks = links; - } - } - if (nvswitchLinks) { - // NVSwitch : Connect existing rings - int nringsConnected = p2pComputeRingsSeqConnect(values, nranks, rings, nrings, prev, next, minScore, nthreads); - if (nringsConnected > 0) { - nrings = nringsConnected; - } else { - nrings = std::min(nrings, nvswitchLinks); // NVSwitch: Limit rings to number of NVLinks - // Or create new ones - nrings = p2pComputeRingsSeqNew(values, nranks, rings, nrings, prev, next, minScore, nthreads); - // And duplicate them - nrings = copyRings(nranks, rings, nrings, nrings*2); - } - goto end; - } - - // point-to-point NVLink - for (int rank=0; rank<nranks; rank++) { - int links = 0; - for (int i=0; i<nranks; i++) { - 
ncclTvalue_t val = values[rank*nranks+i]; - if (val >= CONNECT_NVSWITCH) continue; - links += val/CONNECT_NVLINK; - } - if (rank == 0) directLinks = links; - else directLinks = std::min(directLinks, links); - } - if (directLinks > 0) { - // NVLink : Connect rings or create new ones - if (nranks > MAXGPUS_NVLINKP2P) { - WARN("Recursive P2P computation cannot work for >8 GPUs"); - return ncclInternalError; - } - nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads); - goto end; - } - - // PCIe or QPI : Connect rings or create new ones - nrings = p2pComputeRingsPci(values, nranks, rings, *nringsRet, prev, next, minScore); - -end: - *nringsRet = nrings; - for (int ring = 0; ring<nrings; ring++) { - for (int index=0; index<nranks; index++) { - int prevIndex = (index - 1 + nranks) % nranks; - int nextIndex = (index + 1) % nranks; - int curRank = rings[ring*nranks+index]; - int prevRank = rings[ring*nranks+prevIndex]; - int nextRank = rings[ring*nranks+nextIndex]; - if (prev[ring*nranks+curRank] == -1) prev[ring*nranks+curRank] = prevRank; - if (next[ring*nranks+curRank] == -1) next[ring*nranks+curRank] = nextRank; - } - } - - free(rings); return ncclSuccess; } @@ -462,7 +128,7 @@ end: } while (0) /* Send: Create and return connect structures for this peer to connect to me */ -ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, +ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { struct p2pSendResources* resources; @@ -477,19 +143,20 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer info.direct = 1; info.directPtr = resources->devMem; if (myInfo->cudaDev == peerInfo->cudaDev) { - INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank); + 
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + return ncclInternalError; } else { // Enable P2P access cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0); if (err == cudaErrorPeerAccessAlreadyEnabled) { cudaGetLastError(); } else if (err != cudaSuccess) { - WARN("failed to peer with device %d(=%d): %d %s", - peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err)); + WARN("failed to peer with device %d(=%lx): %d %s", + peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err)); return ncclInternalError; } - INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer", - channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer", + channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); } } else { // Convert the peer's busId into a local cudaDev index (cf. 
CUDA_VISIBLE_DEVICES) @@ -498,12 +165,12 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer // Map IPC and enable P2P access cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem); if (err != cudaSuccess) { - WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s", - myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err)); + WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s", + myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err)); return ncclInternalError; } - INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC", - channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC", + channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); //TRACE_DUMP_IPC(&info.devIpc); } static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); @@ -512,7 +179,7 @@ ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer } /* Create and return connect structures for this peer to connect to me */ -ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, +ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) { struct p2pRecvResources* resources; @@ -534,11 +201,11 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer if (err == cudaErrorPeerAccessAlreadyEnabled) { cudaGetLastError(); } else if (err != cudaSuccess) { - WARN("failed to peer with device %d(=%d): %d %s", - peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err)); + WARN("failed to peer with device %d(=%lx): %d %s", + peerInfo->cudaDev, peerInfo->busId, err, 
cudaGetErrorString(err)); return ncclInternalError; } - TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); + TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); } } else { // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) @@ -547,11 +214,11 @@ ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer // Map IPC and enable P2P access cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem); if (err != cudaSuccess) { - WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s", - myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err)); + WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s", + myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err)); return ncclInternalError; } - TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); + TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); //TRACE_DUMP_IPC(&info.devIpc); } static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); @@ -580,6 +247,7 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC send->conn.buff = remDevMem->buff; send->conn.llBuff = remDevMem->llBuff; + send->conn.ll128Buff = remDevMem->ll128Buff; send->conn.tail = &remDevMem->tail; send->conn.opCountRem = &remDevMem->opCount; send->conn.head = &resources->devMem->head; @@ -610,6 +278,7 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto recv->conn.buff = resources->devMem->buff; recv->conn.llBuff = 
resources->devMem->llBuff; + recv->conn.ll128Buff = resources->devMem->ll128Buff; recv->conn.tail = &resources->devMem->tail; recv->conn.opCountLoc = &resources->devMem->opCount; recv->conn.head = &remDevMem->head; @@ -638,7 +307,6 @@ ncclResult_t p2pRecvFree(void* resources) { struct ncclTransport p2pTransport = { "P2P", p2pCanConnect, - p2pGetRings, { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL }, { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL } }; diff --git a/src/transport/shm.cc b/src/transport/shm.cc index 2ec5f23..60f16c8 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -4,13 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "utils.h" -#include "transport.h" -#include "param.h" +#include "comm.h" #include "shm.h" -#include <unistd.h> -#include <cuda_runtime.h> struct shmConnectInfo { uint64_t pidHash; @@ -40,98 +35,29 @@ struct shmRecvResources { NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0); -/* Determine if we can communicate with the peer */ -ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { - *ret = ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) ? 
0 : 1; - return ncclSuccess; -} +/* Determine two peers can communicate with SHM */ +ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { + *ret = 0; -static inline int groupFirst(int nranks, int* groups, int group, int rankToAvoid) { - for (int rank = 0; rank<nranks; rank++) { - if ((groups[rank] == group) && (rank != rankToAvoid)) return rank; - } - return -1; -} + if (ncclParamShmDisable() == 1) return ncclSuccess; -static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid) { - for (int rank = nranks-1; rank>=0; rank--) { - if ((groups[rank] == group) && (rank != rankToAvoid)) return rank; - } - return -1; -} + // Same host? + TRACE(NCCL_INIT|NCCL_SHM, "peer1 hostHash %lx peer2 hostHash %lx", info1->hostHash, info2->hostHash); + if (info1->hostHash != info2->hostHash) return ncclSuccess; + + // Common /dev/shm (between containers) ? + TRACE(NCCL_INIT|NCCL_SHM, "peer1 shmDev %lx peer2 shmDev %lx", info1->shmDev, info2->shmDev); + if (info1->shmDev != info2->shmDev) return ncclSuccess; + + *ret = 1; -#define MAXGROUPS 16 - -ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) { - if (*nringsRet == MAXCHANNELS) *nringsRet = 1; - int nGroups = groups[nranks-1] + 1; - int starts[MAXGROUPS]; - int ends[MAXGROUPS]; - for (int ring = 0; ring<*nringsRet; ring++) { - int startGroup = -1, endGroup = -1; - for (int group = 0; group<nGroups; group++) { - int start = -1; - int end = -1; - int nranksInGroup = 0; - for (int rank=0; rank<nranks; rank++) { - if (groups[rank] != group) continue; - nranksInGroup++; - if (prev[ring*nranks+rank] != -1) { - if (start != -1) { - WARN("Multiple starts found in group"); - } - start = rank; - startGroup = group; - } - if (next[ring*nranks+rank] != -1) { - if (end != -1) { - WARN("Multiple ends found in group"); - } - 
end = rank; - endGroup = group; - } - } - if (nranksInGroup == 1) { - start = end = groupFirst(nranks, groups, group, -1); - } else { - if (start == -1) - start = groupFirst(nranks, groups, group, end); - if (end == -1) - end = groupLast(nranks, groups, group, start); - } - if (start == -1 || end == -1) { - *nringsRet = ring; - return ncclSuccess; - } - starts[group] = start; - ends[group] = end; - } - if (endGroup == -1 || startGroup == -1) { - startGroup = 0; - endGroup = nGroups-1; - // Close the loop - next[ring*nranks+ends[endGroup]] = starts[startGroup]; - prev[ring*nranks+starts[startGroup]] = ends[endGroup]; - } - int group = startGroup; - for (int i=0; i<nGroups-2; i++) { - int nextGroup = (group+1)%nGroups; - if (nextGroup == endGroup) nextGroup = (nextGroup+1)%nGroups; - next[ring*nranks+ends[group]] = starts[nextGroup]; - prev[ring*nranks+starts[nextGroup]] = ends[group]; - group = nextGroup; - } - // Connect with the last - next[ring*nranks+ends[group]] = starts[endGroup]; - prev[ring*nranks+starts[endGroup]] = ends[group]; - } return ncclSuccess; } #define MAX_SHM_NAME_LEN 1024 /* Create and return connect structures for this peer to connect to me */ -ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { +ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { struct shmSendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); @@ -149,13 +75,13 @@ ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); - INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : 
%d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big"); memcpy(connectInfo, &info, sizeof(struct shmConnectInfo)); return ncclSuccess; } -ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { +ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { struct shmRecvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; @@ -194,6 +120,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto send->transportResources = resources; send->conn.buff = resources->devRemHostMem->buff; send->conn.llBuff = resources->devRemHostMem->llBuff; + send->conn.ll128Buff = resources->devRemHostMem->ll128Buff; send->conn.tail = &resources->devRemHostMem->tail; send->conn.opCountRem = &resources->devRemHostMem->opCount; @@ -218,6 +145,7 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto recv->conn.buff = resources->devHostMem->buff; recv->conn.llBuff = resources->devHostMem->llBuff; + recv->conn.ll128Buff = resources->devHostMem->ll128Buff; recv->conn.tail = &resources->devHostMem->tail; recv->conn.opCountLoc = &resources->devHostMem->opCount; return ncclSuccess; @@ -242,7 +170,6 @@ ncclResult_t shmRecvFree(void* transportResources) { struct ncclTransport shmTransport = { "SHM", shmCanConnect, - shmGetRings, { shmSendSetup, shmSendConnect, shmSendFree, NULL 
}, { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL } }; |