66 files changed, 3700 insertions, 3205 deletions
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 83a2a39..d0e2ca8 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -15,8 +15,7 @@ PROFAPI ?= 0
 NVCC = $(CUDA_HOME)/bin/nvcc

 CUDA_LIB ?= $(CUDA_HOME)/lib64
-CUDA_INC ?= $(CUDA_HOME)/include
-CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
 #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
 CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
 CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
@@ -36,14 +35,14 @@ CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70

 # Include Volta support if we're using CUDA9 or above
-ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0)
 NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
 else
 NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
 endif
 #$(info NVCC_GENCODE is ${NVCC_GENCODE})

-CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
+CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
 CXXFLAGS += -Wall -Wno-sign-compare
 NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
 # Use addprefix so that we can specify more than one path
diff --git a/makefiles/version.mk b/makefiles/version.mk
index f9cee6a..a8c6e3a 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
-NCCL_MINOR   := 3
-NCCL_PATCH   := 7
+NCCL_MINOR   := 4
+NCCL_PATCH   := 2
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in
index 65a2c60..f9d83a3 100644
--- a/pkg/redhat/nccl.spec.in
+++ b/pkg/redhat/nccl.spec.in
@@ -1,6 +1,6 @@
 Name:           libnccl
-Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
-Release:        ${pkg:Revision}
+Version:        ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
+Release:        ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
 Summary:        NVIDIA Collectives Communication Library (NCCL) Runtime

 Group:          Development/Libraries
diff --git a/pkg/srctxz/Makefile b/pkg/srctxz/Makefile
index 1cb7c06..ed677fe 100644
--- a/pkg/srctxz/Makefile
+++ b/pkg/srctxz/Makefile
@@ -36,4 +36,5 @@ $(TXZPREPDIR)/% : %.in
 	-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
 	-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
 	-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+	-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
 	$< > $@
diff --git a/pkg/srctxz/create_srctxz.sh.in b/pkg/srctxz/create_srctxz.sh.in
index 0b8e6d2..ae7d01f 100644
--- a/pkg/srctxz/create_srctxz.sh.in
+++ b/pkg/srctxz/create_srctxz.sh.in
@@ -25,8 +25,9 @@ NCCL_MAJOR=${nccl:Major}
 NCCL_MINOR=${nccl:Minor}
 NCCL_PATCH=${nccl:Patch}
 NCCL_SUFFIX=${nccl:Suffix}
+NCCL_BUILD=${pkg:Revision}

-NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}"
+NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"

 tar --exclude build \
     --exclude ".git*" \
diff --git a/src/Makefile b/src/Makefile
index 481000a..fe60b11 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -9,8 +9,8 @@ include ../makefiles/version.mk
 ##### src files
 INCEXPORTS  := nccl.h nccl_net.h
-LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
-		misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
+LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \
+		misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \
 		transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
 		collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
@@ -29,11 +29,10 @@ LIBTARGET  := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
 STATICLIBTARGET := $(STATICLIBNAME)
 LIBOBJ     := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
 DEPFILES   := $(LIBOBJ:%.o=%.d)
-LDFLAGS    += -L${CUDA_LIB} -lcudart_static -lrt
+LDFLAGS    += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
 DEVICELIB  := $(BUILDDIR)/obj/collectives/device/colldevice.a
-
 ##### rules

 build : lib staticlib
@@ -41,9 +40,12 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)

 staticlib : $(LIBDIR)/$(STATICLIBTARGET)

-devicelib: $(INCDIR)/nccl.h
+$(DEVICELIB): ALWAYS_REBUILD
 	$(MAKE) -C collectives/device

+# Empty target to force rebuild
+ALWAYS_REBUILD:
+
 -include $(DEPFILES)

 $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
@@ -59,14 +61,14 @@ $(INCDIR)/nccl.h : nccl.h.in
 	-e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
 	$< > $@

-$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib
+$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
 	@printf "Linking   %-35s > %s\n" $(LIBTARGET) $@
 	mkdir -p $(LIBDIR)
 	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
 	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
 	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)

-$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
+$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
 	@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
 	mkdir -p $(LIBDIR)
 	$(eval TMP := $(shell mktemp -d))
diff --git a/src/bootstrap.cu b/src/bootstrap.cu
index 13c6e92..6b1d573 100644
--- a/src/bootstrap.cu
+++ b/src/bootstrap.cu
@@ -15,27 +15,31 @@
 // Always use sockets for bootstrap
 ncclNet_t* ncclBootstrapNet = &ncclNetSocket;

-static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
-static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }

 // Additional sync functions based on async + test for bootstrap, using host ptrs.
-static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) {
-  void* request;
-  NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &request));
+static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
+  void* request, *mhandle;
+  NCCLCHECK(ncclBootstrapNet->regMr(sendComm, data, size, NCCL_PTR_HOST, &mhandle));
+  NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, mhandle, &request));
+  NCCLCHECK(ncclBootstrapNet->deregMr(sendComm, mhandle));
   int done = 0;
-  while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
+  while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL));
   return ncclSuccess;
 }
-static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) {
-  void* request;
-  NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &request));
+static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
+  void* request, *mhandle;
+  NCCLCHECK(ncclBootstrapNet->regMr(recvComm, data, size, NCCL_PTR_HOST, &mhandle));
+  NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, mhandle, &request));
+  NCCLCHECK(ncclBootstrapNet->deregMr(recvComm, mhandle));
   int done = 0;
-  while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
+  while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL));
   return ncclSuccess;
 }

@@ -51,8 +55,8 @@ struct extId {
 struct extInfo {
   int rank;
   int nranks;
-  ncclNetHandle_t extHandleListenFromRoot;
-  ncclNetHandle_t extHandleRing;
+  ncclNetHandle_t extHandleListenRoot;
+  ncclNetHandle_t extHandleListen;
 };

 #include <sys/resource.h>
@@ -68,28 +72,25 @@ static ncclResult_t setFilesLimit() {
 static void *bootstrapRoot(void* commId) {
   struct extInfo info;
   struct extId* id = (struct extId*)commId;
-  ncclNetHandle_t *extHandleBstrap = NULL; // for initial rank <-> root information exchange
-  ncclNetHandle_t *extHandleRing = NULL; // for bootstrap ring creation
+  ncclNetHandle_t *rankHandles = NULL;
+  ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
   ncclNetHandle_t zero = { 0 }; // for sanity checking
   void* tmpComm;
   ncclResult_t res;
   setFilesLimit();

+  TRACE(NCCL_INIT, "BEGIN");
   /* Receive addresses from all ranks */
   int nranks = 0, c = 0;
   do {
-    NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpComm), res, out);
-    NCCLCHECKGOTO(bootstrapRecv(tmpComm, &info, sizeof(info)), res, out);
-    NCCLCHECKGOTO(bootstrapCloseRecv(tmpComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
+    NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);

     if (c == 0) {
-      extHandleBstrap = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
-      extHandleRing = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
-      if (extHandleBstrap == NULL || extHandleRing == NULL) {
-        WARN("Bootstrap thread : failed to allocate memory");
-        goto out;
-      }
       nranks = info.nranks;
+      NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
+      NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
     }

     if (nranks != info.nranks) {
@@ -97,40 +98,43 @@ static void *bootstrapRoot(void* commId) {
       goto out;
     }

-    if (memcmp(&zero, &extHandleBstrap[info.rank], sizeof(ncclNetHandle_t)) != 0) {
+    if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
       WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
       goto out;
     }

-    // Save the connection handle for connecting back to the ranks
-    memcpy(&extHandleBstrap[info.rank], info.extHandleListenFromRoot, sizeof(ncclNetHandle_t));
-    // Save the connection handle for the AllGather ring
-    memcpy(&extHandleRing[info.rank], info.extHandleRing, sizeof(ncclNetHandle_t));
+    // Save the connection handle for that rank
+    memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
+    memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));

     ++c;
   } while (c < nranks);
+  TRACE(NCCL_INIT, "COLLECTED HANDLES");

   // Send the connect handle for the next rank in the AllGather ring
   for (int r=0; r<nranks; ++r) {
     int next = (r+1) % nranks;
     void *tmpSendComm;
-    NCCLCHECKGOTO(bootstrapConnect(0, extHandleBstrap[r], &tmpSendComm), res, out);
-    NCCLCHECKGOTO(bootstrapSend(tmpSendComm, &extHandleRing[next], sizeof(ncclNetHandle_t)), res, out);
-    NCCLCHECKGOTO(bootstrapCloseSend(tmpSendComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out);
+    NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
+    NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
   }
+  TRACE(NCCL_INIT, "SENT OUT HANDLES");

 out:
-  bootstrapCloseListen(id->extListenComm);
+  bootstrapNetCloseListen(id->extListenComm);
   free(commId);
-  free(extHandleBstrap);
-  free(extHandleRing);
+  if (rankHandles) free(rankHandles);
+  if (rankHandlesRoot) free(rankHandlesRoot);
+
+  TRACE(NCCL_INIT, "DONE");
   return NULL;
 }

 ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
   struct extId* id = (struct extId*)commId;
   id->hostHash = getHostHash();
-  NCCLCHECK(bootstrapListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
+  NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
   ncclUniqueId* threadIdCopy;
   NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
   memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
@@ -157,10 +161,18 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
   return ncclSuccess;
 }

+struct unexConn {
+  int peer;
+  void* comm;
+  struct unexConn* next;
+};
+
 struct extState {
+  void* extBstrapListenComm;
   void* extBstrapRingRecvComm;
   void* extBstrapRingSendComm;
-  ncclNetHandle_t extBstrapRootHandle;
+  ncclNetHandle_t* peerBstrapHandles;
+  struct unexConn* unexpectedConnections;
   int rank;
   int nranks;
   int dev;
@@ -174,39 +186,56 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
   state->rank = rank;
   state->nranks = nranks;
   *commState = state;
-  void* extBstrapRootListenComm; // comm on which we accept root's connections
+
+  TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);

   struct extInfo info = { 0 };
   info.rank = rank;
   info.nranks = nranks;
-  void *tmpSendComm, *extBstrapRingListenComm, *tmpRecvComm;
+  void *tmpSendComm, *tmpRecvComm;
   // Pass the remote address to listen via info
   if (idFromEnv) {
-    memcpy(&info.extHandleListenFromRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
-    memcpy(&info.extHandleRing, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+    memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+    memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
   }
   // listen will return the local address via info (specify interface type 'findSubnetIf')
   state->dev = idFromEnv ? findSubnetIf : 0;
-  NCCLCHECK(bootstrapListen(state->dev, &info.extHandleListenFromRoot, &extBstrapRootListenComm));
-  NCCLCHECK(bootstrapListen(state->dev, &info.extHandleRing, &extBstrapRingListenComm)); // AllGather Ring
+  void* extBstrapListenCommRoot;
+  NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm));
+  NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot));
+
+  // stagger connection times to avoid an overload of the root at very high rank counts
+  if (nranks > 128) {
+    long msec = rank;
+    struct timespec tv;
+    tv.tv_sec = msec / 1000;
+    tv.tv_nsec = 1000000 * (msec % 1000);
+    TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec);
+    (void) nanosleep(&tv, NULL);
+  }

-  memcpy(&state->extBstrapRootHandle, &id->extHandleRoot, sizeof(ncclNetHandle_t));
-  // send info on my listening sockets to root
-  NCCLCHECK(bootstrapConnect(state->dev, id->extHandleRoot, &tmpSendComm));
-  NCCLCHECK(bootstrapSend(tmpSendComm, &info, sizeof(info)));
-  NCCLCHECK(bootstrapCloseSend(tmpSendComm));
+  // send info on my listening socket to root
+  NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm));
+  NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
+  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));

   // get info on my "next" rank in the bootstrap ring from root
   ncclNetHandle_t extHandleNext;
-  NCCLCHECK(bootstrapAccept(extBstrapRootListenComm, &tmpRecvComm));
-  NCCLCHECK(bootstrapRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
-  NCCLCHECK(bootstrapCloseRecv(tmpRecvComm));
+  NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
+  NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
+  NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+  NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));

-  NCCLCHECK(bootstrapConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
+  NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
   // Accept the connect request from the previous rank in the AllGather ring
-  NCCLCHECK(bootstrapAccept(extBstrapRingListenComm, &state->extBstrapRingRecvComm));
-  NCCLCHECK(bootstrapCloseListen(extBstrapRingListenComm));
-  NCCLCHECK(bootstrapCloseListen(extBstrapRootListenComm));
+  NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
+
+  // AllGather all listen handlers
+  NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks));
+  memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t));
+  NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t)));
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);

   return ncclSuccess;
 }
@@ -224,25 +253,106 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
    * and send previous step's data from (rank-i) to right
    */
   for (int i=0; i<nranks-1; i++) {
-    int rslice = (rank - i - 1 + nranks) % nranks;
-    int sslice = (rank - i + nranks) % nranks;
+    size_t rslice = (rank - i - 1 + nranks) % nranks;
+    size_t sslice = (rank - i + nranks) % nranks;

     // Send slice to the right
-    NCCLCHECK(bootstrapSend(state->extBstrapRingSendComm, data+sslice*size, size));
+    NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, data+sslice*size, size));
     // Recv slice from the left
-    NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
+    NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
   }

   TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
   return ncclSuccess;
 }

-ncclResult_t bootstrapClose(void* commState) {
+ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
   struct extState* state = (struct extState*)commState;
+  void* tmpSendComm;
+  NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm));
+  NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
+  NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
+  NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
+  return ncclSuccess;
+}
+
+ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
+  // New unex
+  struct unexConn* unex;
+  NCCLCHECK(ncclCalloc(&unex, 1));
+  unex->peer = peer;
+  unex->comm = comm;
+
+  // Enqueue
+  struct unexConn* list = state->unexpectedConnections;
+  if (list == NULL) {
+    state->unexpectedConnections = unex;
+    return ncclSuccess;
+  }
+  while (list->next) list = list->next;
+  list->next = unex;
+  return ncclSuccess;
+}

-  NCCLCHECK(bootstrapCloseSend(state->extBstrapRingSendComm));
-  NCCLCHECK(bootstrapCloseRecv(state->extBstrapRingRecvComm));
+void* unexpectedDequeue(struct extState* state, int peer) {
+  struct unexConn* elem = state->unexpectedConnections;
+  struct unexConn* prev = NULL;
+  while (elem) {
+    if (elem->peer == peer) {
+      if (prev == NULL) {
+        state->unexpectedConnections = elem->next;
+      } else {
+        prev->next = elem->next;
+      }
+      void* comm = elem->comm;
+      free(elem);
+      return comm;
+    }
+    prev = elem;
+    elem = elem->next;
+  }
+  return NULL;
+}
+
+// We can't know who we'll receive from, so we need to receive everything at once
+ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
+  struct extState* state = (struct extState*)commState;
+
+  void* tmpRecvComm;
+
+  // Search unexpected connections first
+  if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) {
+    NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
+    NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+    return ncclSuccess;
+  }
+
+  // Then look for new connections
+  while (1) {
+    NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm));
+    int newPeer;
+    NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int)));
+    if (newPeer == peer) {
+      NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
+      NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+      return ncclSuccess;
+    }
+    // Unexpected connection. Save for later.
+    NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm));
+  }
+}
+
+ncclResult_t bootstrapClose(void* commState) {
+  struct extState* state = (struct extState*)commState;
+  if (state->unexpectedConnections != NULL) {
+    WARN("Unexpected connections are not empty.\n");
+    return ncclInternalError;
+  }
+  NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm));
+  NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm));
+  NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm));
+  free(state->peerBstrapHandles);

   free(state);

   return ncclSuccess;
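Editorial note: the new bootstrapSend/bootstrapRecv pair above implements out-of-band point-to-point messaging by prefixing every connection with the sender's rank; a receiver that accepts a connection from the "wrong" peer parks it on the unexpectedConnections list and drains it later. A minimal standalone C++ model of that demultiplexing idea follows (the types and names here are illustrative, not NCCL's; the real code parks an open socket, this sketch parks the payload):

    #include <cstdio>
    #include <list>
    #include <utility>

    struct Demux {
      std::list<std::pair<int,int>> parked; // (peer, payload) seen for other peers

      // Deliver the next message from `peer`, parking out-of-order arrivals.
      int recvFrom(int peer, const std::pair<int,int>* wire, int n, int* pos) {
        // Search "unexpected" arrivals first, as bootstrapRecv does.
        for (auto it = parked.begin(); it != parked.end(); ++it)
          if (it->first == peer) { int v = it->second; parked.erase(it); return v; }
        // Otherwise accept new arrivals until the right peer shows up.
        while (*pos < n) {
          std::pair<int,int> msg = wire[(*pos)++];
          if (msg.first == peer) return msg.second;
          parked.push_back(msg);   // unexpected connection: save for later
        }
        return -1; // nothing from this peer
      }
    };

    int main() {
      std::pair<int,int> wire[] = {{2, 20}, {1, 10}}; // rank 2 connects first
      Demux d; int pos = 0;
      printf("%d\n", d.recvFrom(1, wire, 2, &pos)); // 10 (rank 2's msg parked)
      printf("%d\n", d.recvFrom(2, wire, 2, &pos)); // 20 (taken from parked list)
    }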
diff --git a/src/channel.cu b/src/channel.cu
new file mode 100644
index 0000000..937e84e
--- /dev/null
+++ b/src/channel.cu
@@ -0,0 +1,51 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "channel.h"
+#include "param.h"
+
+NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
+
+ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
+  struct ncclChannel* channel = comm->channels+channelid;
+  channel->id = channelid;
+
+  // Setup intermediate buffering
+  channel->buffSize = ncclParamBuffsize();
+
+  // Ring index to user rank table.
+  NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
+
+  // Communication structures with peers.
+  NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
+  for (size_t i=0; i<comm->nRanks; ++i) {
+    channel->peers[i].send.comm = comm;
+    channel->peers[i].recv.comm = comm;
+  }
+
+  // Per-channel operation list.
+  NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
+
+  return ncclSuccess;
+}
+
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
+  // Operation list
+  NCCLCHECK(ncclCudaHostFree(channel->collectives));
+
+  // Free Ring index to rank tables
+  free(channel->ring.userRanks);
+  CUDACHECK(cudaFree(channel->ring.devUserRanks));
+
+  // Free transport proxy resources
+  for (int r=0; r<nRanks; r++) {
+    struct ncclPeer* peer = channel->peers+r;
+    if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
+    if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
+  }
+
+  return ncclSuccess;
+}
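Editorial note: NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES) above generates the ncclParamBuffsize() accessor used by initChannel, and its value can be overridden at runtime through the NCCL_BUFFSIZE environment variable. A simplified sketch of that pattern (the real macro lives in param.h and additionally handles thread-safe caching; the default value shown is illustrative, not taken from this diff):

    #include <cstdint>
    #include <cstdlib>

    // Read "NCCL_<NAME>" once, fall back to the compiled-in default, cache it.
    static int64_t paramValue(const char* env, int64_t deflt) {
      static int64_t cache = -2; // sentinel meaning "not parsed yet" (assumption of this sketch)
      if (cache == -2) {
        const char* str = getenv(env);
        cache = str ? strtoll(str, nullptr, 0) : deflt;
      }
      return cache;
    }

    int64_t ncclParamBuffsizeSketch() {
      return paramValue("NCCL_BUFFSIZE", 1 << 22); // default shown here is illustrative
    }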
diff --git a/src/collectives/all_gather.cu b/src/collectives/all_gather.cu
index 8dec28e..db21dee 100644
--- a/src/collectives/all_gather.cu
+++ b/src/collectives/all_gather.cu
@@ -4,29 +4,15 @@
  * See LICENSE.txt for license information
  ************************************************************************/

-#include "core.h"
-#include "common_coll.h"
 #include "enqueue.h"
 #include "collectives.h"

-ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
-    NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes*comm->nRanks, 1));
-  }
-  return ncclSuccess;
-}
-
 NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
     ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
     ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
-  return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype,
-      ncclSum, 0, comm, stream);
+  struct ncclInfo info = { ncclCollAllGather, "AllGather",
+    sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
+    ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
 }
diff --git a/src/collectives/all_reduce.cu b/src/collectives/all_reduce.cu
index cc14083..1492c90 100644
--- a/src/collectives/all_reduce.cu
+++ b/src/collectives/all_reduce.cu
@@ -4,29 +4,15 @@
  * See LICENSE.txt for license information
  ************************************************************************/

-#include "core.h"
-#include "common_coll.h"
 #include "enqueue.h"
 #include "collectives.h"

-ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, nbytes, proxyPatternRing, comm));
-    NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, comm->nRanks));
-  }
-  return ncclSuccess;
-}
-
 NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
 ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
-  return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype,
-      op, 0, comm, stream);
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+  struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
+    sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
+    ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
 }
diff --git a/src/collectives/broadcast.cu b/src/collectives/broadcast.cu
index 91ce905..6a3d0a8 100644
--- a/src/collectives/broadcast.cu
+++ b/src/collectives/broadcast.cu
@@ -4,39 +4,23 @@
  * See LICENSE.txt for license information
  ************************************************************************/

-#include "core.h"
-#include "common_coll.h"
 #include "enqueue.h"
 #include "collectives.h"

-ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
-    ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
-  size_t nbytes = count*ncclTypeSize(datatype);
-  INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
-  if (comm->nRanks == 1) {
-    if (sendbuff != recvbuff)
-      CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
-  } else {
-    NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, nbytes, proxyPatternFrom(root), comm));
-    NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes, 1));
-  }
-
-  return ncclSuccess;
+NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream) {
+  struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
+    sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
+    BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
+  return ncclEnqueueCheck(&info);
 }
-
 /* Deprecated original "in place" function, similar to MPI */
 NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream);
 ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
     ncclComm_t comm, cudaStream_t stream) {
-  return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype,
-      ncclSum, root, comm, stream);
+  return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
 }
-NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
-    ncclComm_t comm, cudaStream_t stream) {
-  return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype,
-      ncclSum, root, comm, stream);
-}
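Editorial note: with the per-collective *Func wrappers gone, every public entry point now just packs its arguments into an ncclInfo and defers to ncclEnqueueCheck, and the deprecated ncclBcast is a thin in-place call into ncclBroadcast. Caller-side, the equivalence looks like this (a usage sketch against the public API; error handling elided):

    #include <nccl.h>
    #include <cuda_runtime.h>

    // In-place broadcast: passing the same pointer for send and recv is the
    // documented equivalent of the legacy ncclBcast entry point.
    void bcastInPlace(float* buf, size_t count, int root,
                      ncclComm_t comm, cudaStream_t stream) {
      ncclBroadcast(buf, buf, count, ncclFloat, root, comm, stream);
    }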
diff --git a/src/collectives/collectives.h b/src/collectives/collectives.h
index 4a5cb7a..e6b19cb 100644
--- a/src/collectives/collectives.h
+++ b/src/collectives/collectives.h
@@ -7,9 +7,7 @@
 #ifndef NCCL_COLLECTIVES_H_
 #define NCCL_COLLECTIVES_H_

-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
-
-#define FUNC_INDEX(coll, redop, dtype, ll) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
+#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll))

 #define NCCL_COLL_NAME(coll, op, dtype) \
   coll##_##op##_##dtype
@@ -18,13 +16,17 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
   coll##Kernel_##op##_##dtype

 /* Declare all collective operations */
-#define DECL_COLL4(coll, op, dtype) \
+#define DECL_COLL5(coll, op, dtype) \
   extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
-  extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll); \
+  extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \
+
+#define DECL_COLL4(coll, op, dtype) \
+  DECL_COLL5(coll, op, dtype) \
+  DECL_COLL5(coll##LL, op, dtype)

 #define DECL_COLL3(coll, op, dtype) \
-  DECL_COLL4(coll##LL, op, dtype) \
-  DECL_COLL4(coll, op, dtype)
+  DECL_COLL4(coll##Ring, op, dtype) \
+  DECL_COLL4(coll##Tree, op, dtype)

 #define DECL_COLL2(coll, op) \
   DECL_COLL3(coll, op, i8) \
@@ -52,15 +54,16 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed

 DECL_ALL_COLLS

-#define ALLREDUCE_SUBSTEPS 2
-#define ALLREDUCE_BUFCHUNKS 2
-#define ALLGATHER_SUBSTEPS 2
-#define ALLGATHER_BUFCHUNKS 2
-#define REDUCESCATTER_SUBSTEPS 2
-#define REDUCESCATTER_BUFCHUNKS 2
-#define BROADCAST_SUBSTEPS 8
-#define BROADCAST_BUFCHUNKS 2
-#define REDUCE_SUBSTEPS 8
-#define REDUCE_BUFCHUNKS 2
+// CHUNKSIZE must be a multiple of SLICESIZE
+#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
+#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
+#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
+#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
+#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
+#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
+#define BROADCAST_SLICESTEPS 1
+#define BROADCAST_CHUNKSTEPS 1
+#define REDUCE_SLICESTEPS 1
+#define REDUCE_CHUNKSTEPS 1

 #endif
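Editorial note: the new `al` argument to FUNC_INDEX doubles the kernel table, so each (collective, op, datatype) now selects among four variants: Ring or Tree algorithm, LL or regular protocol. A worked check of the index layout, assuming ncclNumOps == 4 and ncclNumTypes == 9 as in this release (the function below mirrors the macro, it is not NCCL code):

    // Mirror of FUNC_INDEX(coll, redop, dtype, ll, al) for a worked example.
    constexpr int numOps = 4, numTypes = 9;
    constexpr int funcIndex(int coll, int op, int type, int ll, int al) {
      return ((((coll * numOps + op) * numTypes + type) * 2 + al) * 2 + ll);
    }

    // ll is the fastest-varying bit; al strides by 2; dtype strides by 4.
    static_assert(funcIndex(0, 0, 0, 1, 0) - funcIndex(0, 0, 0, 0, 0) == 1, "ll adjacent");
    static_assert(funcIndex(0, 0, 0, 0, 1) - funcIndex(0, 0, 0, 0, 0) == 2, "al stride 2");
    static_assert(funcIndex(0, 0, 1, 0, 0) - funcIndex(0, 0, 0, 0, 0) == 4, "dtype stride 4");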
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
index e2bcd49..8e92596 100644
--- a/src/collectives/device/Makefile
+++ b/src/collectives/device/Makefile
@@ -12,18 +12,13 @@ OBJDIR := $(BUILDDIR)/obj/collectives/device

 LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu

-LIBOBJ     := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \
-              $(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \
-              $(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \
-              $(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \
-              $(OBJDIR)/functions.o
-
 LIBSRCFILES += functions.cu

 DEPFILES   := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
-DEPENDFILES := $(DEPFILES:%.d=%.dep)
+DEPENDFILES:= $(DEPFILES:%.d=%.dep)
 STATICLIB  := $(OBJDIR)/colldevice.a
 DEVOBJ     := $(OBJDIR)/devlink.o
+RULESFILE  := $(OBJDIR)/Makefile.rules

 NVCUFLAGS  += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden"

@@ -33,6 +28,16 @@ all: $(STATICLIB)
 # Dummy rule so that the extra dependency (%.dep) files are preserved by make
 all_deps: $(DEPENDFILES)

+# Auto-generating the rules per op/reduction/datatype/algorithm
+$(RULESFILE) :
+	@printf "Generating %-35s > %s\n" rules $@
+	@mkdir -p $(OBJDIR)
+	@./gen_rules.sh $(OBJDIR) > $@
+
+-include $(RULESFILE)
+
+LIBOBJ     := $(GENOBJS) $(OBJDIR)/functions.o
+
 -include $(DEPFILES)

 $(STATICLIB): $(LIBOBJ) $(DEVOBJ)
@@ -58,26 +63,6 @@ $(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
 	mkdir -p `dirname $@`
 	$(NVCC) $(NVCUFLAGS) -dc $< -o $@

-$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep
-	@printf "Compiling %-35s > %s\n" $< $@
-	mkdir -p `dirname $@`
-	$(NVCC) -DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@
-
 # ... and create the device-side linked object with all those.
 $(DEVOBJ) : $(LIBOBJ)
 	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu
index 0f572ce..530bf14 100644
--- a/src/collectives/device/all_gather.cu
+++ b/src/collectives/device/all_gather.cu
@@ -4,12 +4,8 @@
  * See LICENSE.txt for license information
  ************************************************************************/

-#include "common.h"
 #include "all_gather.h"
+#include "common.h"
 #include "collectives.h"

-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
-#endif
+IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index a30e575..36809c9 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -8,72 +8,35 @@
 #include "primitives.h"
 #include "collectives.h"

-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  __shared__ T* sharedNextOutput;
   struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  int prevdirect = ring->recv.conn.direct;
-  int nextdirect = ring->send.conn.direct;
-
-  WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
-  PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
-
-  typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
-
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
   const int nranks = comm->nRanks;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    *ring->recv.conn.opCount = args->opCount;
-    // Wait for next to be ready
-    WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-    waitOpCountNext.wait(args->opCount);
-    if (prevdirect) {
-      *ring->recv.conn.ptrExchange = args->ThisOutput;
-    }
-    if (nextdirect) {
-      void* volatile* ptr = &(ring->devMemSend->ptrExchange);
-      while (*ptr == nullptr);
-      sharedNextOutput = (T*)*ptr;
-      *ptr = nullptr;
-    }
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int poffset, noffset = 0;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;

   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*chunkSize;
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t chunkOffset = gridOffset + bid*realChunkSize;

     /////////////// begin AllGather steps ///////////////
     ssize_t offset;
-    int maxOffset = min(chunkSize, size-chunkOffset);
+    int nelem = min(realChunkSize, size-chunkOffset);
    int rankDest;

     // step 0: push data to next GPU
     rankDest = ring->devUserRanks[0];
     offset = chunkOffset + rankDest * size;

     if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      Prims::Copy(tid, nthreads,
-          thisInput + chunkOffset,
-          nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext,
-          postReadyToNext);
+      prims.directSend(thisInput+chunkOffset, offset, nelem);
     } else {
-      Prims::DoubleCopy(tid, nthreads,
-          thisInput + chunkOffset,
-          thisOutput + offset,
-          nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext,
-          postReadyToNext);
+      prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
     }
-    NEXT_STEP; // Increases step, poffset, noffset
-
     // k-2 steps: copy to next GPU
-    if (prevdirect) {
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ring->devUserRanks[nranks-j];
-        offset = chunkOffset + rankDest * size;
-
-        Prims::Copy(tid, nthreads,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-      Prims::Copy(tid, nthreads,
-          NULL,
-          NULL,
-          0, 0,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
-    } else {
-      for (int j=1; j<nranks-1; ++j) {
-        rankDest = ring->devUserRanks[nranks-j];
-        offset = chunkOffset + rankDest * size;
-
-        Prims::DoubleCopy(tid, nthreads,
-            prevInput + poffset,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-
-      // Make final copy from buffer to dest.
-      rankDest = ring->devUserRanks[1];
+    for (int j=1; j<nranks-1; ++j) {
+      rankDest = ring->devUserRanks[nranks-j];
       offset = chunkOffset + rankDest * size;

-      // Here we need to copy from buffer to this output.
-      Prims::Copy(tid, nthreads,
-          prevInput + poffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
+      prims.directRecvCopySend(thisOutput+offset, offset, nelem);
     }
-  }

-  if (tid == 0) {
-    waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS));
-    *ring->send.conn.head = 0ULL;
-    *ring->recv.conn.tail = 0ULL;
-    __threadfence_system();
-    *ring->recv.conn.opCount = args->opCount+1;
+    // Make final copy from buffer to dest.
+    rankDest = ring->devUserRanks[1];
+    offset = chunkOffset + rankDest * size;
+
+    // Final wait/copy.
+    prims.directRecv(thisOutput+offset, offset, nelem);
   }
 }

-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
-  poffset = noffset; \
-  pflag = nflag; \
-  noffset += NCCL_LL_SLICE_LINES; \
-  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
-  nflag++; \
-  step++;
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }

 template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int bid = args->bid;
-  const int llNthreads = args->nThreads;
+  const int nthreads = args->nThreads;
   struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
-  volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
-  volatile int * sizesFifo = ring->send.conn.llFifo;
-  uint64_t sendHead = sendHeadPtr[0];
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;

-  typedef LLPrimitives<T, FUNC> LL;
+  ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);

   const ssize_t size = args->N;
   //const int rank = comm->rank;
   const int nranks = comm->nRanks;
   ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
-  const ssize_t loopSize = args->nRings*chunkSize;
-
-  uint64_t step = ring->send.conn.llStep;
-  uint32_t pflag, nflag = step + 1;
-  int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+  const ssize_t loopSize = args->nChannels*chunkSize;

   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
-  union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
     if (size-gridOffset < loopSize) {
@@ -213,57 +98,34 @@ __device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {

     /////////////// begin AllGather steps ///////////////
     ssize_t offset;
-    int maxOffset = min(chunkSize, size-chunkOffset);
+    int nelem = min(chunkSize, size-chunkOffset);
     int rankDest;

     // step 0: push data to next GPU
     rankDest = ring->devUserRanks[0];
     offset = chunkOffset + rankDest * size;

-    WAIT_NEXT;
     if (thisInput + chunkOffset == thisOutput + offset) { // In place
-      LL::ReduceCopy(
-          thisInput + chunkOffset,
-          nextOutput + noffset,
-          maxOffset, nflag, llNthreads);
+      LLprims.send(thisInput+chunkOffset, nelem);
     } else {
-      LL::ReduceCopy(
-          thisInput + chunkOffset,
-          thisOutput + offset,
-          nextOutput + noffset,
-          maxOffset, nflag, llNthreads);
+      LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
     }
-    POST_SIZE;
-
-    NEXT_STEP_LL;

     // k-2 steps: copy to next GPU
     for (int j=1; j<nranks-1; ++j) {
       rankDest = ring->devUserRanks[nranks-j];
       offset = chunkOffset + rankDest * size;

-      WAIT_NEXT;
-      LL::ReduceCopy(
-          prevInput + poffset,
-          thisOutput + offset,
-          nextOutput + noffset,
-          maxOffset, pflag, nflag, llNthreads);
-      POST_SIZE;
-      ACK_PREV;
-
-      NEXT_STEP_LL;
+      LLprims.recvCopySend(thisOutput+offset, nelem);
     }

     // step k-1: final store
     rankDest = ring->devUserRanks[1];
     offset = chunkOffset + rankDest * size;

-    LL::ReduceCopy(
-        prevInput + poffset,
-        thisOutput + offset,
-        maxOffset, pflag, llNthreads);
-    ACK_PREV;
+    LLprims.recv(thisOutput+offset, nelem);
   }
-
-  FIFO_CLEANING_AND_SAVE_STEP(nflag);
 }
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
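Editorial note: the rewritten kernel keeps the classic ring all-gather schedule, now phrased through ncclPrimitives calls: rank r injects its own chunk (directSend or directCopySend), forwards each received chunk for nranks-2 steps (directRecvCopySend), and stores the last one (directRecv). A host-side model of that data movement, assuming nothing about the real primitives:

    #include <cassert>
    #include <vector>

    // CPU model of the ring all-gather schedule: at step j, rank r receives the
    // chunk that originated at rank (r - 1 - j) mod n and forwards it onward.
    int main() {
      const int n = 4;
      std::vector<std::vector<int>> out(n, std::vector<int>(n, -1));
      for (int r = 0; r < n; r++) out[r][r] = 100 + r;  // each rank's own chunk
      for (int j = 0; j < n - 1; j++) {
        for (int r = 0; r < n; r++) {
          int src = ((r - 1 - j) % n + n) % n;  // chunk arriving from the left
          out[r][src] = 100 + src;              // recv (+ copy, + send onward)
        }
      }
      // After n-1 steps every rank holds all n chunks.
      for (int r = 0; r < n; r++)
        for (int c = 0; c < n; c++) assert(out[r][c] == 100 + c);
    }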
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
index caa1479..aaa96b4 100644
--- a/src/collectives/device/all_reduce.cu
+++ b/src/collectives/device/all_reduce.cu
@@ -4,18 +4,8 @@
  * See LICENSE.txt for license information
  ************************************************************************/

-#include "common.h"
 #include "all_reduce.h"
+#include "common.h"
 #include "collectives.h"

-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum);
-#elif NCCL_OP == 1
-IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
-#elif NCCL_OP == 2
-IMPL_COLL2(ncclAllReduce, min, FuncMin, ncclCollAllReduce, ncclMin);
-#elif NCCL_OP == 3
-IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax);
-#endif
+IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index d7abc64..ea89a71 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -8,233 +8,152 @@
 #include "primitives.h"
 #include "collectives.h"

-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
-  step++; \
-  poffset = noffset; \
-  noffset += sliceSize; \
-  if (noffset == buffSize) noffset = 0;
-
 template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
   const int tid = threadIdx.x;
   const int nthreads = blockDim.x - 1;
   const int bid = args->bid;
-  __shared__ T* sharedNextOutput;
   struct ncclComm* comm = args->comm;
-  struct ncclRing* ring = comm->rings+blockIdx.x;
-  int prevdirect = ring->recv.conn.direct;
-  int nextdirect = ring->send.conn.direct;
-
-  WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
-  WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
-  PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
-  PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
-
-  typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
-
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclRing* ring = &channel->ring;
   const ssize_t size = args->N;
-  //const int rank = comm->rank;
   const int nranks = comm->nRanks;
-  const int buffSize = ring->buffSize / sizeof(T);
-  const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS;
-  const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
-  if (tid == 0) {
-    // Update in case we skipped some collectives
-    *ring->recv.conn.opCount = args->opCount;
-    // Wait for next to be ready
-    WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
-    waitOpCountNext.wait(args->opCount);
-    if (prevdirect) {
-      *ring->recv.conn.ptrExchange = args->ThisOutput;
-    }
-    if (nextdirect) {
-      void* volatile* ptr = &(ring->devMemSend->ptrExchange);
-      while (*ptr == nullptr);
-      sharedNextOutput = (T*)*ptr;
-      *ptr = nullptr;
-    }
-  }
-  __syncthreads();
-
-  uint64_t step = 0ULL;
-  int poffset, noffset = 0;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+  const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;

   // Compute pointers
   const T * __restrict__ thisInput = (const T*)args->ThisInput;
   T * __restrict__ thisOutput = (T*)args->ThisOutput;
-  T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
-  T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+  ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
+    prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);

   for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
-    int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings));
-    ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-    ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
+    int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
+    ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+    ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;

     /////////////// begin AllReduce steps ///////////////
     ssize_t offset;
-    int maxOffset;
+    int nelem;
     int slice;

     // step 0: push data to next GPU
     slice = ring->devUserRanks[nranks-1];
-    offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    offset = chunkOffset + slice * realChunkSize;
+    nelem = min(realChunkSize, size-offset);

-    Prims::Copy(tid, nthreads,
-        thisInput + offset,
-        nextOutput + noffset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext,
-        postReadyToNext);
-
-    NEXT_STEP; // Increases step, poffset, noffset
+    prims.send(thisInput+offset, nelem);

     // k-2 steps: reduce and copy to next GPU
     for (int j=2; j<nranks; ++j) {
       slice = ring->devUserRanks[nranks-j];
-      offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
-
-      Prims::Reduce(tid, nthreads,
-          prevInput + poffset,
-          thisInput + offset,
-          nextOutput + noffset,
-          sliceSize, maxOffset,
-          step,
-          waitDoneFromNext, waitReadyFromPrev,
-          postReadyToNext, postDoneToPrev);
-
-      NEXT_STEP;
+      offset = chunkOffset + slice * realChunkSize;
+      nelem = min(realChunkSize, size-offset);
+
+      prims.recvReduceSend(thisInput+offset, nelem);
     }

     // step k-1: reduce this buffer and data, which will produce the final
     // result that we store in this data and push to the next GPU
     slice = ring->devUserRanks[0];
-    offset = chunkOffset + slice * chunkSize;
-    maxOffset = min(chunkSize, size-offset);
+    offset = chunkOffset + slice * realChunkSize;
+    nelem = min(realChunkSize, size-offset);

-    Prims::ReduceCopy(tid, nthreads,
-        prevInput + poffset,
-        thisInput + offset,
-        nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-        thisOutput + offset,
-        sliceSize, maxOffset,
-        step,
-        waitDoneFromNext, waitReadyFromPrev,
-        postReadyToNext, postDoneToPrev);
-
-    NEXT_STEP;
+    prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);

     // k-2 steps: copy to next GPU
-    if (prevdirect) {
-      for (int j=1; j<nranks-1; ++j) {
-        slice = ring->devUserRanks[nranks - j];
-        offset = chunkOffset + slice * chunkSize;
-        maxOffset = min(chunkSize, size-offset);
-
-        Prims::Copy(tid, nthreads,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
-      Prims::Copy(tid, nthreads,
-          NULL,
-          NULL,
-          0, 0,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
-    } else {
-      for (int j=1; j<nranks-1; ++j) {
-        slice = ring->devUserRanks[nranks - j];
-        offset = chunkOffset + slice * chunkSize;
-        maxOffset = min(chunkSize, size-offset);
-
-        Prims::DoubleCopy(tid, nthreads,
-            prevInput + poffset,
-            thisOutput + offset,
-            nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
-            sliceSize, maxOffset,
-            step,
-            waitDoneFromNext, waitReadyFromPrev,
-            postReadyToNext, postDoneToPrev);
-
-        NEXT_STEP;
-      }
+    for (int j=1; j<nranks-1; ++j) {
+      slice = ring->devUserRanks[nranks-j];
+      offset = chunkOffset + slice * realChunkSize;
+      nelem = min(realChunkSize, size-offset);

-      // Make final copy from buffer to dest.
-      slice = ring->devUserRanks[1];
-      offset = chunkOffset + slice * chunkSize;
-      maxOffset = min(chunkSize, size-offset);
-
-      // Here we need to copy from buffer to this output.
-      Prims::Copy(tid, nthreads,
-          prevInput + poffset,
-          thisOutput + offset,
-          sliceSize, maxOffset,
-          step,
-          waitReadyFromPrev,
-          postDoneToPrev);
+      prims.directRecvCopySend(thisOutput+offset, offset, nelem);
     }
-  }

-  if (tid == 0) {
-    // Wait for next to have consumed all data before we reset the flag
-    waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS));
-    *ring->send.conn.head = 0ULL;
-    *ring->recv.conn.tail = 0ULL;
-    __threadfence_system();
-    *ring->recv.conn.opCount = args->opCount+1;
+    // Make final copy from buffer to dest.
+    slice = ring->devUserRanks[1];
+    offset = chunkOffset + slice * realChunkSize;
+    nelem = min(realChunkSize, size-offset);
+
+    // Final wait/copy.
+    prims.directRecv(thisOutput+offset, offset, nelem);
   }
 }

-#include "ll_kernel.h"
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
+  const int tid = threadIdx.x;
+  const int nthreads = blockDim.x - 1;
+  const int bid = args->bid;
+  struct ncclComm* comm = args->comm;
+  struct ncclChannel* channel = comm->channels+blockIdx.x;
+  struct ncclTree* tree = &channel->tree;
+  const ssize_t size = args->N;
+  const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+  const int chunkSize = args->lastChunkSize;
+  const ssize_t loopSize = args->nChannels*chunkSize;

-#define NEXT_STEP_LL \
-  poffset = noffset; \
-  pflag = nflag; \
-  noffset += NCCL_LL_SLICE_LINES; \
-  if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
-  nflag++; \
-  step++;
+  // Compute pointers
+  const T * __restrict__ thisInput = (const T*)args->ThisInput;
+  T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+  do {
+    // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+    ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Up
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      if (tree->up == -1) {
+        prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) {
+        prims.send(thisInput+offset, nelem);
+      } else {
+        prims.recvReduceSend(thisInput+offset, nelem);
+      }
+    }
+  } while(0);
+
+  do {
+    // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+    ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+      // Down
+      ssize_t offset = gridOffset + bid*chunkSize;
+      int nelem = min(chunkSize, size-offset);
+      if (tree->up == -1) {
+        prims.send(thisOutput+offset, nelem);
+      } else if (tree->down[0] == -1) {
+        prims.recv(thisOutput+offset, nelem);
+      } else {
+        prims.recvCopySend(thisOutput+offset, nelem);
+      }
+    }
+  } while(0);
+}
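Editorial note: ncclAllReduceTreeKernel splits allreduce into a reduce toward the tree root (leaves send, interior nodes recvReduceSend, the root recvReduceCopy) followed by a broadcast back down the same edges. A scalar host model of one chunk on an arbitrary example tree (the topology below is illustrative only):

    #include <cassert>
    #include <vector>

    // Tree: rank 0 is the root; 1 and 2 are children of 0; 3 is a child of 1.
    int main() {
      std::vector<int> up  = {-1, 0, 0, 1};  // parent of each rank
      std::vector<int> sum = {1, 2, 3, 4};   // local contributions
      // Reduce phase: children before parents (parent index < child index here).
      for (int r = 3; r >= 1; r--) sum[up[r]] += sum[r];
      // Broadcast phase: the root's total flows back down the same edges.
      std::vector<int> res(4);
      res[0] = sum[0];
      for (int r = 1; r < 4; r++) res[r] = res[up[r]];
      for (int r = 0; r < 4; r++) assert(res[r] == 1 + 2 + 3 + 4);
    }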
args->nChannels*nranks*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -244,89 +163,99 @@ __device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) { /////////////// begin AllReduce steps /////////////// ssize_t offset; - int maxOffset; + int nelem; int slice; // step 0: push data to next GPU slice = ring->devUserRanks[nranks-1]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - nextOutput + noffset, - maxOffset, nflag, llNthreads); - POST_SIZE; - - NEXT_STEP_LL; + LLprims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; j<nranks; ++j) { slice = ring->devUserRanks[nranks-j]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); - - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + nelem = min(chunkSize, size-offset); + + LLprims.recvReduceSend(thisInput+offset, nelem); } // step k-1: reduce this buffer and data, which will produce the final // result that we store in this data and push to the next GPU slice = ring->devUserRanks[0]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - thisOutput + offset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem); // k-2 steps: copy to next GPU for (int j=1; j<nranks-1; ++j) { - slice = ring->devUserRanks[nranks - j]; + slice = ring->devUserRanks[nranks-j]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); - - WAIT_NEXT; - LL::ReduceCopy( - prevInput + poffset, - thisOutput + offset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + nelem = min(chunkSize, size-offset); + + LLprims.recvCopySend(thisOutput+offset, nelem); } // Make final copy from buffer to dest. slice = ring->devUserRanks[1]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); // Here we need to copy from buffer to this output. 
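// Why chunkSize above is NCCL_LL_SLICE_LINES*sizeof(uint64_t)/sizeof(T):
// per storeLL below, each 16-byte ncclLLFifoLine interleaves two 4-byte
// data words with two 4-byte flags, i.e. 8 bytes of payload per line, so
// the wire size is twice the payload (hence the nbytes*2 passed to
// waitSend in LLGenericOp further down). A minimal sketch of the
// accounting, assuming an illustrative value for NCCL_LL_SLICE_LINES:
#include <cstdio>
#include <cstdint>

int main() {
  const int kSliceLines = 4096;                         // assumed, not NCCL's actual constant
  size_t payloadBytes = kSliceLines * sizeof(uint64_t); // user data per slice
  printf("chunk elements (float): %zu\n", payloadBytes / sizeof(float));
  printf("wire bytes per slice:   %d\n", kSliceLines * 16); // 2x the payload
  return 0;
}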
- LL::ReduceCopy( - prevInput + poffset, - thisOutput + offset, - maxOffset, pflag, llNthreads); - ACK_PREV; + LLprims.recv(thisOutput+offset, nelem); } +} - FIFO_CLEANING_AND_SAVE_STEP(nflag); +template<int UNUSED, class FUNC, typename T> +__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int nthreads = args->nThreads; + const int bid = args->bid; + struct ncclComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclTree* tree = &channel->tree; + const ssize_t size = args->N; + ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); + const ssize_t loopSize = args->nChannels*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + do { + // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) + ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Up + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + LLprims.send(thisInput+offset, nelem); + } else { + LLprims.recvReduceSend(thisInput+offset, nelem); + } + } + } while(0); + + do { + // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) + ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Down + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + LLprims.send(thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + LLprims.recv(thisOutput+offset, nelem); + } else { + LLprims.recvCopySend(thisOutput+offset, nelem); + } + } + } while(0); } diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu index 4125de4..b83ee70 100644 --- a/src/collectives/device/broadcast.cu +++ b/src/collectives/device/broadcast.cu @@ -4,12 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "common.h" #include "broadcast.h" +#include "common.h" #include "collectives.h" -#define UNROLL 4 - -#if NCCL_OP == 0 -IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8); -#endif +IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast); diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index c2f6d00..fb18312 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -8,174 +8,74 @@ #include "primitives.h" #include "collectives.h" -// Increase Step and boffset for buffer sync -#define NEXT_STEP \ - step++; \ - boffset += sliceSize; \ - if (boffset == buffSize) boffset = 0; - template<int UNROLL, class FUNC, typename T> -__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) { +__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - __shared__ T* sharedNextOutput; struct ncclComm* comm = args->comm; - struct ncclRing* ring = 
comm->rings+blockIdx.x; - int prevdirect = ring->recv.conn.direct; - int nextdirect = ring->send.conn.direct; - - WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0); - PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS); - - typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims; - + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / BROADCAST_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; const int rank = ring->devUserRanks[0]; const int nextRank = ring->devUserRanks[1]; const int root = args->root; - if (tid == 0) { - // Update in case we skipped some collectives - *ring->recv.conn.opCount = args->opCount; - if (nextRank != root) { - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - } - if (rank != root && prevdirect) { - *ring->recv.conn.ptrExchange = args->ThisOutput; - } - if (nextRank != root && nextdirect) { - void* volatile* ptr = &(ring->devMemSend->ptrExchange); - while (*ptr == nullptr); - sharedNextOutput = (T*)*ptr; - *ptr = nullptr; - } - } - __syncthreads(); - - uint64_t step = 0ULL; - int boffset = 0; - // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC> + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t offset = gridOffset + bid*realChunkSize; + int nelem = min(realChunkSize, size-offset); if (rank == root) { if (thisInput == thisOutput) { - Prims::Copy(tid, nthreads, - thisInput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.send(thisInput+offset, nelem); } else { - Prims::DoubleCopy(tid, nthreads, - thisInput + offset, - thisOutput + offset, - nextdirect ? 
(sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.copySend(thisInput+offset, thisOutput+offset, nelem); } } else if (nextRank == root) { - if (prevdirect) maxOffset = 0; // Only wait for signals - Prims::Copy(tid, nthreads, - prevInput + boffset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); + prims.recv(thisOutput+offset, nelem); } else { - if (prevdirect) { - Prims::Copy(tid, nthreads, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } else { - Prims::DoubleCopy(tid, nthreads, - prevInput + boffset, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } - } - NEXT_STEP; // Increases step, boffset - } - - if (tid == 0) { - if (nextRank != root) { - // Wait for next to have consumed data before resetting the flag - waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1)); - *ring->send.conn.head = 0ULL; + prims.recvCopySend(thisOutput+offset, nelem); } - *ring->recv.conn.tail = 0ULL; - __threadfence_system(); - *ring->recv.conn.opCount = args->opCount+1; } } -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - boffset += NCCL_LL_SLICE_LINES; \ - if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \ - flag++; \ - step++; +template<int UNROLL, class FUNC, typename T> +__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { } template<int UNUSED, class FUNC, typename T> -__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) { +__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; + const int nthreads = args->nThreads; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; - const int rank = comm->rank; - const int nextRank = ring->devUserRanks[1]; - const int root = args->root; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; - typedef LLPrimitives<T, FUNC> LL; + ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); const ssize_t size = args->N; - ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; + const int rank = ring->devUserRanks[0]; + const int nextRank = ring->devUserRanks[1]; + const int root = args->root; - uint64_t step = ring->send.conn.llStep; - uint32_t flag = step + 1; - int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; 
gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -183,46 +83,20 @@ __device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) { } ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int nelem = min(chunkSize, size-offset); if (rank == root) { - WAIT_NEXT; if (thisInput == thisOutput) { - LL::ReduceCopy( - thisInput + offset, - nextOutput + boffset, - maxOffset, flag, llNthreads); + LLprims.send(thisInput+offset, nelem); } else { - LL::ReduceCopy( - thisInput + offset, - thisOutput + offset, - nextOutput + boffset, - maxOffset, flag, llNthreads); + LLprims.copySend(thisInput + offset, thisOutput + offset, nelem); } - POST_SIZE; - NEXT_STEP_LL; } else if (nextRank == root) { - LL::ReduceCopy( - prevInput + boffset, - thisOutput + offset, - maxOffset, flag, llNthreads); - NEXT_STEP_LL; - ACK_PREV; + LLprims.recv(thisOutput + offset, nelem); } else { - WAIT_NEXT; - LL::ReduceCopy( - prevInput + boffset, - thisOutput + offset, - nextOutput + boffset, - maxOffset, flag, flag, llNthreads); - POST_SIZE; - NEXT_STEP_LL; - ACK_PREV; + LLprims.recvCopySend(thisOutput + offset, nelem); } } - - // We need everyone to acknowledge data even if they didn't receive anything - // so that the next collective can start right away. - ACK_PREV; - - FIFO_CLEANING_AND_SAVE_STEP(flag); } + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index c988913..e4aecbd 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -11,13 +11,29 @@ #include "core.h" #include "nccl.h" +// Exit If Abort Barrier across CTA: make sure all threads exit consistently +// Each thread sets a predicate to true if abort == 1 +// all CTA's threads enter the barrier and do a popc on their predicates being True +// If any of the thread's predicate was True, all the threads call exit() +static inline __device__ void exitIfAbortBarrier(int abort) { + uint32_t popc; + asm ("{"); + asm volatile (" .reg .pred barr_pred;"); + asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort)); + asm volatile (" bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc)); + asm ("}"); + if (popc) { asm volatile ("exit;"); } +} + typedef void(*ncclKern_t)(struct CollectiveArgs* args); extern __device__ ncclKern_t ncclFuncs[]; static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) { int* d = (int*)dst; int* s = (int*)src; - __syncthreads(); + // When aggregation is effective, if some threads have aborted inside the LL kernel, + // make sure the rest of the threads abort as well + exitIfAbortBarrier(0); for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o]; __syncthreads(); } @@ -27,12 +43,14 @@ static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* ho } /* Functions for aggregation case */ -#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \ +#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \ - coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(args); \ + coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \ } + +#if NCCL_OP == 0 /* Kernels with the first operation inlined */ -#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \ +#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \ __launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \ 
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ int tid = threadIdx.x; \ @@ -40,25 +58,25 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ __shared__ struct ncclColl localColl; \ \ struct ncclComm* comm = firstColl.args.comm; \ - struct ncclRing* ring = comm->rings+bid; \ + struct ncclChannel* channel = comm->channels+bid; \ struct ncclColl* c; \ if (bid == 0) { \ /* To optimize for latency, (only) the first operation is passed as argument.*/ \ c = &firstColl; \ } else { \ c = &localColl; \ - load_coll(c, ring->devCollectives+ring->collFifoHead, tid); \ + load_coll(c, channel->devCollectives+channel->collFifoHead, tid); \ } \ while (1) { \ - if (tid < c->nThreads) { \ + if (tid < c->args.nThreads) { \ if (c->funcIndex == fIndex) { \ - coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(&c->args); \ + coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \ } else { \ ncclFuncs[c->funcIndex](&c->args); \ } \ } \ int nextIndex = c->nextIndex; \ - if (tid == 0) ring->collFifoHead = nextIndex; \ + if (tid == 0) channel->collFifoHead = nextIndex; \ \ if (c->active == 2) { \ return; \ @@ -66,25 +84,75 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ \ /* Load next collective operation*/ \ c = &localColl; /* for bid 0 */ \ - load_coll(c, ring->devCollectives+nextIndex, tid); \ + load_coll(c, channel->devCollectives+nextIndex, tid); \ } \ } +#else +#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) +#endif + +// Only generate inline kernels for LL +#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \ + IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ + IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \ + IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \ #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \ - IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \ - IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \ - IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \ - IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0)) \ + IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \ + IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1) +#if NCCL_TYPE == 0 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) +#elif NCCL_TYPE == 1 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) +#elif NCCL_TYPE == 2 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) +#elif NCCL_TYPE == 3 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) +#elif NCCL_TYPE == 4 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) +#elif NCCL_TYPE == 5 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) +#elif NCCL_TYPE == 6 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) +#elif NCCL_TYPE == 7 +#define 
IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) +#elif NCCL_TYPE == 8 #define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ - IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \ - IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) \ - IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) \ - IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \ - IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) \ - IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \ - IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) \ - IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \ IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64) +#endif + +// Reduction define all functions +#if NCCL_OP == 0 +#define IMPL_COLL_R(collf, colln) \ + IMPL_COLL2(collf, sum, FuncSum, colln, ncclSum); +#elif NCCL_OP == 1 +#define IMPL_COLL_R(collf, colln) \ + IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd); +#elif NCCL_OP == 2 +#define IMPL_COLL_R(collf, colln) \ + IMPL_COLL2(collf, min, FuncMin, colln, ncclMin); +#elif NCCL_OP == 3 +#define IMPL_COLL_R(collf, colln) \ + IMPL_COLL2(collf, max, FuncMax, colln, ncclMax); +#endif + +// Copy primitives only define one +#if NCCL_OP == 0 && NCCL_TYPE == 0 +#define IMPL_COLL_C(collf, colln) \ + IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8); +#else +#define IMPL_COLL_C(collf, colln) +#endif + +#define COLL_UNROLL 4 #endif diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index 0eaa061..e1fb096 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -192,14 +192,6 @@ struct MULTI<FUNC, int64_t> { } }; -#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a)) - -template<typename T> -__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) { - size_t ptrval = reinterpret_cast<size_t>(ptr); - return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align)); -} - template<typename T> inline __device__ T vFetch(const volatile T* ptr) { return *ptr; @@ -236,25 +228,6 @@ void vStore<half>(volatile half* ptr, const half val) { } #endif -template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS> -__device__ inline void ReduceCopy( - const int tid, const int nthreads, - const volatile T * __restrict__ const src0, - const volatile T * __restrict__ const src1, - volatile T * __restrict__ const dest0, - volatile T * __restrict__ const dest1, const int N) { - for (int idx = tid; idx < N; idx += nthreads) { - T val = vFetch(src0+idx); - if (TWO_INPUTS) { - val = FUNC()(val, vFetch(src1+idx)); - } - vStore(dest0+idx, val); - if (TWO_OUTPUTS) { - vStore(dest1+idx, val); - } - } -} - typedef ulong2 Pack128; template<class FUNC, typename T> @@ -265,72 +238,111 @@ struct MULTI128 { } }; -inline __device__ void Fetch128(Pack128& v, Pack128* p) { +inline __device__ void Fetch128(Pack128& v, const Pack128* p) { asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory"); } inline __device__ void Store128(Pack128* p, Pack128& v) { asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory"); } +template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS> +__device__ __forceinline__ void ReduceCopyMulti(const 
int tid, const int nthreads, + int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS], + const int offset, const int N) { + for (int idx = offset+tid; idx < offset+N; idx += nthreads) { + T val = vFetch(srcs[0]+idx); + #pragma unroll + for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx)); + #pragma unroll 1 + for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx)); + + #pragma unroll + for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val); + #pragma unroll 1 + for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val); + } +} + #define WARP_SIZE 32 -template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL> -__device__ inline void ReduceCopy128b( const int w, const int nw, const int t, - Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1, - const int N) { - Pack128 t0[UNROLL]; - Pack128 t1[UNROLL]; - const Pack128* src0_end = src0 + N; + +template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS> +__device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t, + int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS], + const int elemOffset, const int Npack) { const int inc = nw * UNROLL * WARP_SIZE; - const int offset = w * UNROLL * WARP_SIZE + t; - src0 += offset; if (TWO_INPUTS) src1 += offset; - dest0 += offset; if (TWO_OUTPUTS) dest1 += offset; - - while (src0 < src0_end) { -#pragma unroll - for (int u = 0; u < UNROLL; ++u) { - Fetch128(t0[u], src0+u*WARP_SIZE); - if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE); + int offset = w * UNROLL * WARP_SIZE + t; + + const Pack128* srcs[MAXSRCS]; + for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset; + Pack128* dsts[MAXDSTS]; + for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset; + + while (offset < Npack) { + Pack128 vals[UNROLL]; + // Load and reduce + for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE); + + for (int i=1; i<MINSRCS; i++) { + Pack128 vals2[UNROLL]; + for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE); + for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]); } -#pragma unroll - for (int u = 0; u < UNROLL; ++u) { - if (TWO_INPUTS) MULTI128<FUNC, T>()(t0[u], t1[u]); - Store128(dest0+u*WARP_SIZE, t0[u]); - if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]); + #pragma unroll 1 + for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) { + Pack128 vals2[UNROLL]; + for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE); + for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]); } - src0 += inc; if (TWO_INPUTS) src1 += inc; - dest0 += inc; if (TWO_OUTPUTS) dest1 += inc; + + // Store + for (int i = 0; i < MINDSTS; i++) { + for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]); + } + #pragma unroll 1 + for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) { + for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]); + } + for (int i=0; i<MAXSRCS; i++) srcs[i] += inc; + for (int i=0; i<MAXDSTS; i++) dsts[i] += inc; + offset += inc; } } -template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1> -__device__ inline void ReduceOrCopy(const int tid, const int nthreads, - volatile T * __restrict__ dest0, volatile T * __restrict__ dest1, - const volatile T * __restrict__ src0, const volatile T * __restrict__ src1, +template <typename T> +__device__ int ptrAlign128(T* ptr) { return 
(uint64_t)ptr % alignof(Pack128); } + +// Try to limit consecutive load/stores to 8. +// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise +#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS))) + +template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS> +__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads, + int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS], int N) { int Nrem = N; if (Nrem <= 0) return; - int Npreamble = (Nrem<alignof(Pack128)) ? Nrem : AlignUp(dest0, alignof(Pack128)) - dest0; + int alignDiff = 0; + int align = ptrAlign128(srcs[0]); + #pragma unroll + for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i])); + for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i])); + #pragma unroll + for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i])); + for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i])); - // stage 0: check if we'll be able to use the fast, 128-bit aligned path. - // If not, we'll just use the slow preamble path for the whole operation - bool alignable = (((AlignUp(src0, alignof(Pack128)) == src0 + Npreamble)) && - (!HAS_DEST1 || (AlignUp(dest1, alignof(Pack128)) == dest1 + Npreamble)) && - (!HAS_SRC1 || (AlignUp(src1, alignof(Pack128)) == src1 + Npreamble))); - - if (!alignable) { - Npreamble = Nrem; - } + int Npreamble = alignDiff ? Nrem : + N < alignof(Pack128) ? N : + (alignof(Pack128) - align) % alignof(Pack128); // stage 1: preamble: handle any elements up to the point of everything coming // into alignment - ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Npreamble); - - Nrem -= Npreamble; - if (Nrem == 0) return; - - dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; } - src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; } + if (Npreamble) { + ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble); + Nrem -= Npreamble; + if (Nrem == 0) return; + } + int offset = Npreamble; // stage 2: fast path: use 128b loads/stores to do the bulk of the work, // assuming the pointers we have are all 128-bit alignable. 
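For intuition, here is a minimal sketch of how this staged split plays out for a concrete element count (plain C++; the values of N, the alignment preamble, and AUTOUNROLL are assumptions local to the example): the misaligned preamble is handled element-wise by ReduceCopyMulti, the bulk goes through the unrolled 128-bit path in whole AUTOUNROLL*WARP_SIZE groups of Pack128 loads, a second 128-bit pass without unrolling consumes the remaining full packs, and the scalar tail finishes whatever is left.

#include <cstdio>
#include <cstdint>

int main() {
  const int N = 100000;             // total elements (example value)
  const int packFactor = 16 / 4;    // sizeof(Pack128) / sizeof(float)
  const int WARP_SIZE = 32;
  const int AUTOUNROLL = 4;         // assumed; real value depends on MINSRCS+MINDSTS

  int Nrem = N;
  int Npreamble = 8;                // depends on pointer alignment; assumed here
  Nrem -= Npreamble;

  // stage 2a: round down to whole AUTOUNROLL*WARP_SIZE groups of packs
  int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
              * (AUTOUNROLL * WARP_SIZE);
  Nrem -= Npack2a * packFactor;

  // stage 2b: remaining full packs, no unrolling
  int Npack2b = Nrem / packFactor;
  Nrem -= Npack2b * packFactor;

  // stage 2c: scalar tail
  printf("preamble=%d elems, 2a=%d packs, 2b=%d packs, tail=%d elems\n",
         Npreamble, Npack2a, Npack2b, Nrem);
  return 0;
}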
@@ -338,35 +350,33 @@ __device__ inline void ReduceOrCopy(const int tid, const int nthreads, int nw = nthreads / WARP_SIZE; // Number of warps int t = tid % WARP_SIZE; // Thread (inside the warp) - const int PackFactor = sizeof(Pack128) / sizeof(T); + const int packFactor = sizeof(Pack128) / sizeof(T); // stage 2a: main loop - int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads)) - * (UNROLL * nthreads); // round down + int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE)) + * (AUTOUNROLL * WARP_SIZE); // round down + int Nelem2a = Npack2a * packFactor; - ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a); + ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a); - int Ndone2a = Nalign2a * PackFactor; - Nrem -= Ndone2a; + Nrem -= Nelem2a; if (Nrem == 0) return; - dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; } - src0 += Ndone2a; if (HAS_SRC1) { src1 += Ndone2a; } + offset += Nelem2a; // stage 2b: slightly less optimized for section when we don't have full - // UNROLLs + // unrolling - int Nalign2b = Nrem / PackFactor; + int Npack2b = Nrem / packFactor; + int Nelem2b = Npack2b * packFactor; - ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, 1>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b); + ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b); - int Ndone2b = Nalign2b * PackFactor; - Nrem -= Ndone2b; + Nrem -= Nelem2b; if (Nrem == 0) return; - dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; } - src0 += Ndone2b; if (HAS_SRC1) { src1 += Ndone2b; } + offset += Nelem2b; // stage 2c: tail - ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Nrem); + ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem); } #endif // COMMON_KERNEL_H_ diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index 1fb8108..ea06b68 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -8,9 +8,13 @@ #include "collectives.h" #include "common.h" -#define NCCL_FUNC4(coll, op, dtype) \ +#define NCCL_FUNC5(coll, op, dtype) \ NCCL_COLL_NAME(coll, op, dtype), \ - NCCL_COLL_NAME(coll##LL, op, dtype) \ + NCCL_COLL_NAME(coll##LL, op, dtype) + +#define NCCL_FUNC4(coll, op, dtype) \ + NCCL_FUNC5(coll##Ring, op, dtype), \ + NCCL_FUNC5(coll##Tree, op, dtype) // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(coll, op) \ @@ -55,7 +59,7 @@ NCCL_FUNCS2A(ncclAllReduce) } // Must be consistent with the ncclFuncSet enum -__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2] = { +__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = { // Don't try to initialize the host shadow copy of this device-side global // variable. There is no host pointer to a device-side function, which // confuses clang. This will be fixed in the next clang release. diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh new file mode 100755 index 0000000..3942c8c --- /dev/null +++ b/src/collectives/device/gen_rules.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +dir=$1 + +targets="GENOBJS := \\\\\n" + +for base in all_reduce all_gather broadcast reduce reduce_scatter; do + opn=0 + for op in sum prod min max; do + dtn=0 + for dt in i8 u8 i32 u32 i64 u64 f16 f32 f64; do + echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep" + echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o" + echo " mkdir -p ${dir}" + echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o" + echo "" + targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n" + dtn=$(($dtn + 1)) + done + opn=$(($opn + 1)) + done +done +echo -e "$targets" diff --git a/src/collectives/device/ll_kernel.h b/src/collectives/device/ll_kernel.h deleted file mode 100644 index 5ec3c9a..0000000 --- a/src/collectives/device/ll_kernel.h +++ /dev/null @@ -1,154 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_LL_KERNEL_H_ -#define NCCL_LL_KERNEL_H_ - -static __device__ uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) { - uint32_t data1, flag1, data2, flag2; - do { - asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); - } while ((flag1 != flag) || (flag2 != flag)); - uint64_t val64 = data1 + (((uint64_t)data2) << 32); - return val64; -} - -static __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { - asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); -} - -// Using memcpy handles misaligned pointers. -static __device__ uint64_t readAL(uint64_t* src) { - uint64_t val; - memcpy((char*)&val, (char*)src, sizeof(uint64_t)); - return val; -} -static __device__ void storeAL(uint64_t* dst, uint64_t val) { - memcpy((char*)dst, (char*)&val, sizeof(uint64_t)); -} - -template <typename T, class FUNC> -class LLPrimitives { - private: - template <int HAS_SRC1, int HAS_SRC2, int HAS_DST1, int HAS_DST2> - static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - if (size <= 0) return; - size_t size64 = size * sizeof(T) / sizeof(uint64_t); - uint64_t* src1A = (uint64_t*)src1; - uint64_t* dst1A = (uint64_t*)dst1; - int offset = threadIdx.x; - // Do multiples of 64 bits -#pragma unroll 1 - for (; offset < size64; offset += nthreads) { - uint64_t val; - if (HAS_SRC1) { - val = readAL(src1A+offset); - if (HAS_SRC2) val = MULTI<FUNC, T>()(readLL(src2+offset, iflag), val); - } else if (HAS_SRC2) { - val = readLL(src2+offset, iflag); - } - if (HAS_DST1) storeAL(dst1A+offset, val); - if (HAS_DST2) storeLL(dst2+offset, val, oflag); - } - // Finish last word - int sizeDone = size64*(sizeof(uint64_t)/sizeof(T)); - int sizeRem = size - sizeDone; - if (threadIdx.x == 0 && sizeRem) { - const T* src1B = src1 + sizeDone; - T* dst1B = dst1 + sizeDone; - - uint64_t lastVal; - T* vals = (T*)&lastVal; - - if (HAS_SRC2) { - uint64_t lastVal2 = readLL(src2+size64, iflag); - T* src2B = (T*)&lastVal2; - for (int offset = 0; offset < sizeRem; offset++) { - vals[offset] = HAS_SRC1 ? 
FUNC()(src2B[offset], src1B[offset]) : src2B[offset]; - } - } else if (HAS_SRC1) { - for (int offset = 0; offset < sizeRem; offset++) { - vals[offset] = src1B[offset]; - } - } - if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag); - if (HAS_DST1) { - for (int offset = 0; offset < sizeRem; offset++) { - dst1B[offset] = vals[offset]; - } - } - } - } - public: - static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads); - } - - static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) { - return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads); - } - - static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads); - } - - static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) { - return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads); - } - - static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads); - } - - static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads); - } - - static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, oflag, nthreads); - } -}; - -// Common macros - -#define STEP_TO_SLOT(step) \ - (step % NCCL_LL_CHUNKS) - -#define WAIT_NEXT \ - if (tid == 0) { \ - while (sendHead + NCCL_LL_CHUNKS <= step) { \ - sendHead = sendHeadPtr[0]; \ - } \ - } \ - asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)); - -#define POST_SIZE \ - if (tid == 0 && sizesFifo) sizesFifo[step % NCCL_LL_CHUNKS] = (maxOffset <= 0) ? 
-1 : (maxOffset*2*(int)sizeof(T)); - -#define ACK_PREV \ - asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)); \ - if (tid == 0) recvHeadPtr[0] = step; - -#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \ - if (step > ring->send.conn.llLastCleaning + NCCL_LL_CLEAN_FREQ) { \ - /* Reset all flags */ \ - static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \ - static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \ - const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \ - for (int i=0; i<NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*llNthreads); i++) { \ - prevInput[tid+i*llNthreads].i4 = resetLine.i4; \ - } \ - __threadfence_system(); \ - /* Restart from the same slot, only make sure sender waits for data to be reset */ \ - step += NCCL_LL_CHUNKS; \ - ACK_PREV; \ - while (sendHeadPtr[0] < step); \ - if (tid == 0) ring->send.conn.llLastCleaning = step; \ - } \ - ring->send.conn.llStep = step; \ -} while (0); - -#endif diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index e2baa4b..c5aaf54 100644 --- a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -9,218 +9,579 @@ #include <type_traits> #include "reduce_kernel.h" // for reduction funcs +#include "common.h" + +#define SPINS_BEFORE_CHECK_ABORT 1000000 + +// Unroll unconditionally the first send/recv since nsend/nrecv should be at +// least 1 if SEND/RECV is set. +#define FOR_SEND(func, ...) do { \ + if (SEND) { \ + /* Send to far first, then close */ \ + for (int i=1; i<NSEND && i<nsend; i++) func(i, ##__VA_ARGS__); \ + func(0, ##__VA_ARGS__); \ + } \ +} while (0) + +#define FOR_RECV(func, ...) do { \ + if (RECV) { \ + /* Recv from close first, then far */ \ + func(0, ##__VA_ARGS__); \ + for (int i=1; i<NRECV && i<nrecv; i++) func(i, ##__VA_ARGS__); \ + } \ +} while (0) +// Implementation of primitive types +template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, class FUNC> +class ncclPrimitives { + private: + const int tid; + const int nthreads; + int nrecv = 0; + int nsend = 0; + const int stepSize; + struct ncclConnInfo* recvConn[NRECV]; + struct ncclConnInfo* sendConn[NSEND]; + volatile uint64_t* waitPtr; + uint64_t recvStep[NRECV]; + uint64_t sendStep[NSEND]; + uint64_t sendConnHead[NSEND]; + const T* recvDirectBuff[NRECV]; + T* sendDirectBuff[NSEND]; + const T* recvBuff[NRECV]; + T* sendBuff[NSEND]; + struct ncclComm* comm; + + inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } + inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } + inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); } + inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); } + + inline __device__ void barrier() { + asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); + } -/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy. 
- * - * In order to reduce the reptetion of template arguments, the operations - * are bundled as static methods of the Primitives class. - * - * Each primitive operation copies/reduces a contiguous buffer and syncs - * an optional set of flags against a sub-step counter. The sync value is - * based on the step parameter. Sync flags must be of type WaitFlag or - * PostFlag. The primitive routines wait for all WaitFlag args to attain - * at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of - * corresponding substep by previous step) before executing the transfer. - * After each substep is transfered, all PostFlag arguments get updated to - * the value SUBSTEPS*step+substep+1. - */ - - -class WaitFlag { - volatile uint64_t * const flag; - const int shift; - public: - __device__ __forceinline__ - WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { } - __device__ __forceinline__ - void wait(uint64_t val) { while ((*flag + shift) < val) /*SPIN*/; } -}; + uint32_t mismatch = 0; + const uint64_t opCount; + inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) { + if (mismatch) { + // In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch + *(comm->fatalDevError) = ncclDevAssertedMismatch; + } else if (remoteOpCount && *remoteOpCount > opCount) { + mismatch += 1; + } + } + + uint32_t spins = 0; + uint32_t abort = 0; + + inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) { + spins++; + if (spins == SPINS_BEFORE_CHECK_ABORT) { + abort = *(comm->abortFlag); + checkMismatch(remoteOpCount); + spins = 0; + } + return abort; + } + + inline __device__ void waitRecv(int i) { + spins = 0; + mismatch = 0; + recvStep[i] += SLICESTEPS; + if (tid == i) { + while (*(waitPtr) < recvStep[i]) { + if (checkAbort(recvConn[i]->opCountRem)) break; + } + } + } + + inline __device__ void waitSend(int i) { + spins = 0; + mismatch = 0; + sendStep[i] += SLICESTEPS; + if (tid == WARP_SIZE+i) { + while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) { + sendConnHead[i] = *waitPtr; + if (checkAbort(sendConn[i]->opCountRem)) break; + } + } + } + + inline __device__ void postRecv(int i) { + *(recvConn[i]->head) = recvStep[i] += SLICESTEPS; + } + + inline __device__ void postSend(int i) { + *(sendConn[i]->tail) = sendStep[i] += SLICESTEPS; + } + + inline __device__ void postSendSize(int i, int size) { + if (sendConn[i]->fifo) sendConn[i]->fifo[sendStep[i]%NCCL_STEPS] = size; + } + + template <int DIRECTRECV> + inline __device__ const T* directRecvPtr(int i, int directOffset) { + return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i); + } + + template <int DIRECTSEND> + inline __device__ T* directSendPtr(int i, int directOffset) { + return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i); + } + + template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST> + inline __device__ void + GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) { + int offset = 0; + int sliceSize = stepSize * SLICESTEPS; + + const T* srcs[RECV*NRECV+SRC]; + srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset); + if (RECV) { + if (SRC) srcs[1] = recvPtr(0); + for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i); + } + + T* dsts[SEND*NSEND+DST]; + dsts[0] = DST ? 
dstPtr : directSendPtr<DIRECTSEND>(0, directOffset); + if (SEND) { + if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset); + for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset); + } + + #pragma unroll 1 + for (int slice=0; slice<SLICESPERCHUNK; ++slice) { + int realSize = max(0, min(sliceSize, nelem-offset)); + if (tid < nthreads) { + FOR_SEND(waitSend); + FOR_RECV(waitRecv); + if (realSize > 0) { + barrier(); + if (DIRECTRECV && recvDirectBuff[0]) { + // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy + if (SEND) { + ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize); + } + } else { + ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize); + } + } + exitIfAbortBarrier(abort); + } else { + exitIfAbortBarrier(abort); + FOR_SEND(postSendSize, realSize*sizeof(T)); + if (SEND) __threadfence_system(); + FOR_SEND(postSend); + FOR_RECV(postRecv); + } + for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize; + for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize; + offset += sliceSize; + } + } + + __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) { + recvConn[i] = conn; + recvBuff[i] = (const T*)recvConn[i]->buff; + recvStep[i] = recvConn[i]->step; + recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS); + // Return credits in case we rounded up. + if (tid == nthreads) *recvConn[i]->head = recvStep[i]; + if (tid == i) { + waitPtr = recvConn[i]->tail; + *(recvConn[i]->opCountLoc) = opCount; + } + recvDirectBuff[i] = NULL; + if (directBuff && recvConn[i]->direct) { + recvDirectBuff[i] = directBuff; + if (tid == 0) *recvConn[i]->ptrExchange = directBuff; + } + nrecv++; + } + + __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) { + sendConn[i] = conn; + sendBuff[i] = (T*)sendConn[i]->buff; + sendStep[i] = sendConn[i]->step; + sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS); + if (tid == WARP_SIZE+i) { + waitPtr = sendConn[i]->head; + sendConnHead[i] = *waitPtr; + *(sendConn[i]->opCountLoc) = opCount; + } + sendDirectBuff[i] = NULL; + if (directBuff && sendConn[i]->direct) { + void* volatile* ptr = sendConn[i]->ptrExchange; + while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL); + __syncthreads(); + if (tid == 0) *ptr = NULL; + } + nsend++; + } + + __device__ __forceinline__ void saveRecvConn(int i) { + if (tid == i) { + recvConn[i]->step = recvStep[i]; + __threadfence_system(); + *(recvConn[i]->opCountLoc) += 1; + } + } + + __device__ __forceinline__ void saveSendConn(int i) { + if (tid == WARP_SIZE+i) { + sendConn[i]->step = sendStep[i]; + __threadfence_system(); + *(sendConn[i]->opCountLoc) += 1; + } + } -class PostFlag { - volatile uint64_t * const flag; - const int shift; - volatile int * const fifo; - const int fifo_size; public: __device__ __forceinline__ - PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { } - __device__ __forceinline__ - void post(uint64_t val) { *flag = (val - shift); } - __device__ __forceinline__ - void postSize(uint64_t step, int size) { if (fifo != NULL) fifo[step%fifo_size] = size; }; -}; + ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct 
ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) { + // Make sure step is updated before we read it + __syncthreads(); + for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff); + for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff); + } -// Helper to check if any argument is of type T. -// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...) -template<typename T> __device__ __forceinline__ -bool AnyAre() { return false; } + __device__ __forceinline__ void + send(const T* src, int nelem) { + GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0); + } + __device__ __forceinline__ void + directSend(const T* src, int directOffset, int nelem) { + GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset); + } -template<typename T, typename FIRST_T, typename... TAIL_Ts> -__device__ __forceinline__ -bool AnyAre(FIRST_T first, TAIL_Ts... tail) { - return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...); -} + __device__ __forceinline__ void + recv(T* dst, int nelem) { + GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0); + } + __device__ __forceinline__ void + directRecv(T* dst, int directOffset, int nelem) { + GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset); + } + __device__ __forceinline__ void + copySend(const T* src, T* dst, int nelem) { + GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0); + } + __device__ __forceinline__ void + directCopySend(const T* src, T* dst, int directOffset, int nelem) { + GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset); + } -// Wait on all WaitFlags, ignore PostFlags -__device__ __forceinline__ -void WaitOnFlags(uint64_t val) { } + __device__ __forceinline__ void + recvCopySend(T* dst, int nelem) { + GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0); + } + __device__ __forceinline__ void + directRecvCopySend(T* dst, int directOffset, int nelem) { + GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset); + } -template <typename... TAIL_Ts> __device__ __forceinline__ -void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) { - flag.wait(val); - WaitOnFlags(val, tail...); -} + __device__ __forceinline__ void + recvReduceCopy(const T* src, T* dst, int nelem) { + GenericOp<0, 0, 1, 0, 1, 1>(src, dst, nelem, 0); + } -template <typename... TAIL_Ts> __device__ __forceinline__ -void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) { - WaitOnFlags(val, tail...); -} + __device__ __forceinline__ void + recvReduceSend(const T* src, int nelem) { + GenericOp<0, 0, 1, 1, 1, 0>(src, NULL, nelem, 0); + } + __device__ __forceinline__ void + recvReduceCopySend(const T* src, T* dst, int nelem) { + GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0); + } + __device__ __forceinline__ void + directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) { + // Direct is only for the send part + GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset); + } -// Post all PostFlags, ignore WaitFlags -__device__ __forceinline__ -void PostToFlags(uint64_t val) { } + __device__ __forceinline__ ~ncclPrimitives() { + // Save steps for next collective. Have thread 0 do it to be compatible + // with the way LL works. + for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i); + for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i); + } +}; -template <typename... 
TAIL_Ts> __device__ __forceinline__ -void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) { - PostToFlags(val, tail...); -} +template <typename T, class FUNC, int NRECV, int NSEND> +class ncclLLPrimitives { + private: + const int tid; + const int nthreads; + int nrecv = 0; + int nsend = 0; + struct ncclConnInfo* recvConn[NRECV]; + struct ncclConnInfo* sendConn[NSEND]; + volatile uint64_t* waitPtr; + volatile uint64_t* postPtr; + volatile int* fifoPtr; + uint64_t recvStep[NRECV]; + uint64_t sendStep[NSEND]; + uint64_t sendConnHead; + union ncclLLFifoLine* recvBuff[NRECV]; + union ncclLLFifoLine* sendBuff[NSEND]; + struct ncclComm* comm; + + inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } + inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } + inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } + inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } + inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; } + inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; } + + // Exit If Abort Barrier : make sure all threads exit consistently + // Each thread sets a predicate to true if val == 1 + // all CTA's threads enter the barrier and do a popc on their predicates being True + // If any of the thread's predicate was True, all the threads call exit() + inline __device__ void exitIfAbortLocalBarrier() { + uint32_t popc; + asm ("{"); + asm volatile (" .reg .pred barr_pred;"); + asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort)); + asm volatile (" bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads)); + asm ("}"); + if (popc) { + // Make sure threads not participating in the operation get the abort and all threads exit + exitIfAbortBarrier(1); + } + } + + inline __device__ void barrier() { + asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); + } + + uint32_t mismatch = 0; + const uint64_t opCount; + + inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) { + if (mismatch > 20) { + // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch + // Note that we are not using _threadfence_system in LL so the error cannot be asserted + *(comm->fatalDevError) = ncclDevSuspectedMismatch; + } else if (remoteOpCount && *remoteOpCount > opCount) { + mismatch += 1; + } + } -template <typename... TAIL_Ts> __device__ __forceinline__ -void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) { - flag.post(val); - PostToFlags(val, tail...); -} + uint32_t spins = 0; + uint32_t abort = 0; + inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) { + spins++; + if (spins == SPINS_BEFORE_CHECK_ABORT) { + abort = *(comm->abortFlag); + checkMismatch(remoteOpCount); + spins = 0; + } + return abort; + } -// Post sizes for PostFlags, ignore WaitFlags -__device__ __forceinline__ -void PostSizeToFlags(uint64_t step, int size) { } + inline __device__ void waitSend(int i, int nbytes) { + spins = 0; + mismatch = 0; + if (tid == WARP_SIZE+i) { + while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) { + sendConnHead = *waitPtr; + if (checkAbort(sendConn[i]->opCountRem)) break; + } + if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes; + } + } -template <typename... TAIL_Ts> __device__ __forceinline__ -void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... 
tail) { - PostSizeToFlags(step, size, tail...); -} + inline __device__ void postRecv(int i) { + recvStep[i]++; + if (tid == i) *postPtr = recvStep[i]; + } -template <typename... TAIL_Ts> __device__ __forceinline__ -void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) { - flag.postSize(step, size); - PostSizeToFlags(step, size, tail...); -} + inline __device__ void postSend(int i) { + sendStep[i]++; + } + __device__ uint64_t readLL(int i, int offset) { + union ncclLLFifoLine* src = recvPtr(i) + offset; + uint32_t flag = recvFlag(i); + uint32_t data1, flag1, data2, flag2; + spins = 0; + mismatch = 0; + do { + asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); + if (checkAbort(recvConn[i]->opCountRem)) break; + } while ((flag1 != flag) || (flag2 != flag)); + uint64_t val64 = data1 + (((uint64_t)data2) << 32); + return val64; + } -// Create pointer arithmetic syntax that doesn't break for std::nullptr_t -template <typename Tptr> __device__ __forceinline__ -Tptr ptradd(Tptr ptr, int i) { - return ptr + i; -} + __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { + asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); + } -__device__ __forceinline__ -std::nullptr_t ptradd(std::nullptr_t ptr, int i) { - return nullptr; -} + // Using memcpy handles misaligned pointers. + __device__ uint64_t readAL(uint64_t* src) { + uint64_t val; + memcpy((char*)&val, (char*)src, sizeof(uint64_t)); + return val; + } + __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) { + memcpy((char*)dst, (char*)&val, nbytes); + } -// Implementation of primitive types -template <int UNROLL, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> > -class Primitives { - private: - template <typename SRC2_T, // either T* or std::nullptr_t - typename DST2_T, // either T* or std::nullptr_t - typename... SYNC_Ts> // either WaitFunc or PostFunc - static __device__ __forceinline__ void - GenericOp(const int tid, const int nthreads, - const T* src1, - const SRC2_T src2, - T* dst1, - DST2_T dst2, - int len, int maxoffset, uint64_t step, SYNC_Ts... flags) { - - enum { noSrc2 = std::is_same<SRC2_T, std::nullptr_t>::value }; - enum { noDst2 = std::is_same<DST2_T, std::nullptr_t>::value }; - static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value, - "src2 must be of type T* or std::nullptr_t"); - static_assert(noDst2 || std::is_same<DST2_T, T*>::value, - "dst2 must be of type T* or std::nullptr_t"); - - using OpType = typename std::conditional<noSrc2, FuncSum<T>, REDOP>::type; - - int sliceSize = len / SUBSTEPS; - int sliceOffset = 0; - -#pragma unroll 1 - for (int sub=0; sub<SUBSTEPS; ++sub) { - int realSize = max(0, min(sliceSize, maxoffset-sliceOffset)); - if (tid < nthreads) { - if (AnyAre<WaitFlag>(flags...)) { - if (tid == 0) { - WaitOnFlags(SUBSTEPS*step + sub + 1, flags...); - } - asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); + template <int RECV, int SEND, int SRC, int DST> + __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) { + uint32_t nbytes = nelem < 0 ? 
0 : nelem*sizeof(T); + FOR_SEND(waitSend, nbytes*2); + barrier(); + uint32_t npack = DIVUP(nbytes, sizeof(uint64_t)); + uint64_t* srcPack = (uint64_t*)srcPtr; + uint64_t* dstPack = (uint64_t*)dstPtr; + // Do multiples of 64 bits + #pragma unroll 2 + for (int offset=tid; offset<npack; offset+=nthreads) { + // Recv : local, then intra-node, then inter-node + uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset); + if (RECV) { + if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val); + for (int i=1; i<NRECV && i<nrecv; i++) { + val = MULTI<FUNC, T>()(readLL(i, offset), val); } - ReduceOrCopy - < - UNROLL, - OpType, - T, - !std::is_same<DST2_T, std::nullptr_t>::value, // HAS_DEST1 - !std::is_same<SRC2_T, std::nullptr_t>::value // HAS_SRC1 - > - ( - tid, nthreads, - ptradd(dst1, sliceOffset), - ptradd(dst2, sliceOffset), - ptradd(src1, sliceOffset), - ptradd(src2, sliceOffset), - realSize - ); - if (AnyAre<PostFlag>(flags...)) { - __syncthreads(); + } + + // Send : inter-node, then intra-node, then local + if (SEND) { + for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i)); + storeLL(sendPtr(0)+offset, val, sendFlag(0)); + } + if (DST) { + if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) { + // Last incomplete word + storeAL(dstPack+offset, val, nbytes & 0x7); + } else { + storeAL(dstPack+offset, val, sizeof(uint64_t)); } - } else { - if (AnyAre<PostFlag>(flags...)) { - __syncthreads(); - PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...); - __threadfence_system(); - PostToFlags(SUBSTEPS*step + sub + 1, flags...); + } + } + exitIfAbortLocalBarrier(); + FOR_RECV(postRecv); + FOR_SEND(postSend); + } + + __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { + recvConn[i] = conn; + recvBuff[i] = recvConn[i]->llBuff; + recvStep[i] = recvConn[i]->step; + if (tid == i) { + postPtr = recvConn[i]->head; + *(recvConn[i]->opCountLoc) = opCount; + } + nrecv++; + } + + __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { + sendConn[i] = conn; + sendBuff[i] = sendConn[i]->llBuff; + sendStep[i] = sendConn[i]->step; + if (tid == WARP_SIZE+i) { + waitPtr = sendConn[i]->head; + fifoPtr = sendConn[i]->fifo; + sendConnHead = *waitPtr; + *(sendConn[i]->opCountLoc) = opCount; + } + nsend++; + } + + __device__ __forceinline__ void saveRecvConn(int i) { + if (tid == i) { + recvConn[i]->step = recvStep[i]; + *(recvConn[i]->opCountLoc) += 1; + __threadfence_block(); + } + } + + __device__ __forceinline__ void saveSendConn(int i) { + if (tid == WARP_SIZE+i) { + sendConn[i]->step = sendStep[i]; + *(sendConn[i]->opCountLoc) += 1; + __threadfence_block(); + } + } + + __device__ __forceinline__ void llSendCleaning(int i) { + if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) { + /* Reset all flags */ + static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); + static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); + for (int s=0; s<NCCL_STEPS; s++) { + waitSend(i, 0); + for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) { + const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) }; + sendPtr(i)[o].i4 = resetLine.i4; } } - sliceOffset += sliceSize; + if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i]; + } + } + + __device__ __forceinline__ void llRecvCleaning(int i) { + if (recvStep[i] > recvConn[i]->llLastCleaning + 
NCCL_LL_CLEAN_FREQ) { + recvStep[i] += NCCL_STEPS; + if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i]; } } public: - template <typename... SYNC_Ts> - static __device__ __forceinline__ void - Copy(const int tid, const int nthreads, const T* src, T* dst, - int len, int maxOffset, uint64_t step, SYNC_Ts... flags) { - GenericOp(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...); + __device__ __forceinline__ + ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) { + // Make sure step is updated before we read it. + barrier(); + + for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i); + for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i); } - template <typename... SYNC_Ts> - static __device__ __forceinline__ void - DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2, - int len, int maxOffset, uint64_t step, SYNC_Ts... flags) { - GenericOp(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...); + __device__ void send(const T* src, int nelem) { + return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem); } - template <typename... SYNC_Ts> - static __device__ __forceinline__ void - Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst, - int len, int maxOffset, uint64_t step, SYNC_Ts... flags) { - GenericOp(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...); + __device__ void recv(T* dst, int nelem) { + return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem); } - template <typename... SYNC_Ts> - static __device__ __forceinline__ void - ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2, - int len, int maxOffset, uint64_t step, SYNC_Ts... 
flags) { - GenericOp(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...); + __device__ void recvReduceSend(const T* src, int nelem) { + return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recvReduceCopy(const T* src, T* dst, int nelem) { + return LLGenericOp<1, 0, 1, 1>(src, dst, nelem); } -}; -#endif // end include guard + __device__ void copySend(const T* src, T* dst, int nelem) { + return LLGenericOp<0, 1, 1, 1>(src, dst, nelem); + } + + __device__ void recvCopySend(T* dst, int nelem) { + return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) { + return LLGenericOp<1, 1, 1, 1>(src, dst, nelem); + } + + __device__ __forceinline__ ~ncclLLPrimitives() { + for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i); + for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i); + // Save steps for the next operation + for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i); + for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i); + } +}; +#endif diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu index bd1d23c..1ef66d4 100644 --- a/src/collectives/device/reduce.cu +++ b/src/collectives/device/reduce.cu @@ -4,18 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "common.h" #include "reduce.h" +#include "common.h" #include "collectives.h" -#define UNROLL 4 - -#if NCCL_OP == 0 -IMPL_COLL2(ncclReduce, sum, FuncSum, ncclCollReduce, ncclSum); -#elif NCCL_OP == 1 -IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd); -#elif NCCL_OP == 2 -IMPL_COLL2(ncclReduce, min, FuncMin, ncclCollReduce, ncclMin); -#elif NCCL_OP == 3 -IMPL_COLL2(ncclReduce, max, FuncMax, ncclCollReduce, ncclMax); -#endif +IMPL_COLL_R(ncclReduce, ncclCollReduce); diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index f5694b1..302d053 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -8,143 +8,71 @@ #include "primitives.h" #include "collectives.h" -// Increase Step and boffset for buffer sync -#define NEXT_STEP \ - step++; \ - boffset += sliceSize; \ - if (boffset == buffSize) boffset = 0; - template<int UNROLL, class FUNC, typename T> -__device__ void ncclReduceKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - - WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0); - PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS); - - typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims; - + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; const int nranks = comm->nRanks; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / REDUCE_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * REDUCE_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; const int rank = 
ring->devUserRanks[0]; const int prevRank = ring->devUserRanks[nranks-1]; const int root = args->root; - if (tid == 0) { - // Update in case we skipped some collectives - *ring->recv.conn.opCount = args->opCount; - - if (rank != root) { - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - } - } - __syncthreads(); - - uint64_t step = 0ULL; - int boffset = 0; - // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC> + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t offset = gridOffset + bid*realChunkSize; + int nelem = min(realChunkSize, size-offset); if (prevRank == root) { - Prims::Copy(tid, nthreads, - thisInput + offset, - nextOutput + boffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.send(thisInput+offset, nelem); } else if (rank == root) { - Prims::Reduce(tid, nthreads, - prevInput + boffset, - thisInput + offset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); + prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); } else { - Prims::Reduce(tid, nthreads, - prevInput + boffset, - thisInput + offset, - nextOutput + boffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } - NEXT_STEP; // Increases step, boffset - } - - if (tid == 0) { - if (rank != root) { - // Wait for next to have consumed data before resetting the flag - waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1)); - *ring->send.conn.head = 0ULL; + prims.recvReduceSend(thisInput+offset, nelem); } - *ring->recv.conn.tail = 0ULL; - __threadfence_system(); - *ring->recv.conn.opCount = args->opCount+1; } } -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - boffset += NCCL_LL_SLICE_LINES; \ - if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \ - flag++; \ - step++; +template<int UNROLL, class FUNC, typename T> +__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { } template<int UNUSED, class FUNC, typename T> -__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; + const int nthreads = args->nThreads; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; - const int nranks = comm->nRanks; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = 
&channel->ring; + + ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; const int rank = comm->rank; + const int nranks = comm->nRanks; const int prevRank = ring->devUserRanks[nranks-1]; const int root = args->root; - typedef LLPrimitives<T, FUNC> LL; - - const ssize_t size = args->N; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t flag = step + 1; - int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -152,39 +80,16 @@ __device__ void ncclReduceLLKernel(struct CollectiveArgs* args) { } ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int nelem = min(chunkSize, size-offset); if (prevRank == root) { - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - nextOutput + boffset, - maxOffset, flag, llNthreads); - POST_SIZE; - NEXT_STEP_LL; + LLprims.send(thisInput+offset, nelem); } else if (rank == root) { - LL::ReduceCopy( - thisInput + offset, - prevInput + boffset, - thisOutput + offset, - maxOffset, flag, llNthreads); - NEXT_STEP_LL; - ACK_PREV; + LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); } else { - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + boffset, - nextOutput + boffset, - maxOffset, flag, flag, llNthreads); - POST_SIZE; - NEXT_STEP_LL; - ACK_PREV; + LLprims.recvReduceSend(thisInput+offset, nelem); } } - - // We need everyone to acknowledge data even if they didn't receive anything - // so that the next collective can start right away. - ACK_PREV; - - FIFO_CLEANING_AND_SAVE_STEP(flag); } + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/device/reduce_kernel.h b/src/collectives/device/reduce_kernel.h index 0cb8f13..0e90793 100644 --- a/src/collectives/device/reduce_kernel.h +++ b/src/collectives/device/reduce_kernel.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -46,30 +46,28 @@ struct FuncMin { } }; +#define MASK0 0x00ff00ff +#define MASK1 0xff00ff00 +static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) { + /* This can be used both for signed and unsigned 8-bit addition */ + const uint32_t x0 = x & MASK0; + const uint32_t x1 = x & MASK1; + const uint32_t y0 = y & MASK0; + const uint32_t y1 = y & MASK1; + const uint32_t r0 = (x0+y0); + const uint32_t r1 = (x1+y1); + return (r0 & MASK0) | (r1 & MASK1); +} + template<> struct FuncSum<int8_t> { - union converter { uint32_t storage; char4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vadd.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - cr.a.x = cx.a.x + cy.a.x; - cr.a.y = cx.a.y + cy.a.y; - cr.a.z = cx.a.z + cy.a.z; - cr.a.w = cx.a.w + cy.a.w; - return cr.storage; + return addChar4(x, y); #endif } __device__ int8_t operator()(const int8_t x, const int8_t y) const { @@ -78,28 +76,13 @@ struct FuncSum<int8_t> { }; template<> struct FuncSum<uint8_t> { - union converter { uint32_t storage; uchar4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vadd.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - "vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - cr.a.x = cx.a.x + cy.a.x; - cr.a.y = cx.a.y + cy.a.y; - cr.a.z = cx.a.z + cy.a.z; - cr.a.w = cx.a.w + cy.a.w; - return cr.storage; + return addChar4(x, y); #endif } __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const { @@ -109,22 +92,6 @@ struct FuncSum<uint8_t> { static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) { /* This can be used both for signed and unsigned 8-bit multiplication */ -#if (__CUDA_ARCH__ >= 300) - uint32_t rv; - asm("{ .reg .u32 t0, t1, t2, t3;\n\t" - " vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t" - " vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t" - " shl.b32 t3, t3, 16;\n\t" - " shl.b32 t2, t2, 16;\n\t" - " vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t" - " shl.b32 t1, t1, 8;\n\t" - " vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t" - " and.b32 t1, t1, 0xff00ff00;\n\t" - " and.b32 t0, t0, 0x00ff00ff;\n\t" - " or.b32 %0, t0, t1;\n\t" - "}" : "=r"(rv) : "r"(x), "r"(y)); - return rv; -#else union converter { uint32_t storage; char4 a; }; converter cx, cy, cr; cx.storage = x; @@ -134,7 +101,6 @@ static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) { cr.a.z = cx.a.z * cy.a.z; cr.a.w = cx.a.w * cy.a.w; return cr.storage; -#endif } template<> @@ -164,13 +130,6 @@ struct FuncMax<int8_t> { int32_t rv, z=0; 
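// The masked add above (addChar4) performs four independent byte additions in
// one 32-bit register: MASK0 isolates bytes 0 and 2, MASK1 isolates bytes 1
// and 3, so every byte being summed has a zeroed byte above it to absorb the
// carry, which the final mask discards. Worked example (values chosen purely
// for illustration):
//   x  = 0x01FF02FE, y = 0x01010101              // per-byte sums: FF,03,00,02
//   x0 = x & MASK0 = 0x00FF00FE, y0 = 0x00010001 -> r0 = 0x010000FF
//   r0 & MASK0 = 0x000000FF                      // byte2 wrapped; carry masked off
//   x1 = x & MASK1 = 0x01000200, y1 = 0x01000100 -> r1 = 0x02000300
//   r1 & MASK1 = 0x02000300
//   result = 0x000000FF | 0x02000300 = 0x020003FF, as expected
// On SM 3.x the vadd4/vmax4/vmin4 video instructions used in these #if
// branches do the same per-byte work in a single hardware instruction.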
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; @@ -194,13 +153,6 @@ struct FuncMax<uint8_t> { int32_t rv, z=0; asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; @@ -225,13 +177,6 @@ struct FuncMin<int8_t> { int32_t rv, z=0; asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; @@ -255,13 +200,6 @@ struct FuncMin<uint8_t> { int32_t rv, z=0; asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu index b16053c..10857ed 100644 --- a/src/collectives/device/reduce_scatter.cu +++ b/src/collectives/device/reduce_scatter.cu @@ -4,18 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "common.h" #include "reduce_scatter.h" +#include "common.h" #include "collectives.h" -#define UNROLL 4 - -#if NCCL_OP == 0 -IMPL_COLL2(ncclReduceScatter, sum, FuncSum, ncclCollReduceScatter, ncclSum); -#elif NCCL_OP == 1 -IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd); -#elif NCCL_OP == 2 -IMPL_COLL2(ncclReduceScatter, min, FuncMin, ncclCollReduceScatter, ncclMin); -#elif NCCL_OP == 3 -IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax); -#endif +IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter); diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index cad011b..c70c845 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -8,156 +8,82 @@ #include "primitives.h" #include "collectives.h" -// Increase Step and poffset/noffset for buffer sync -#define NEXT_STEP \ - step++; \ - poffset = noffset; \ - noffset += sliceSize; \ - if (noffset == buffSize) noffset = 0; - template<int UNROLL, class FUNC, typename T> -__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) { const int tid = 
threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - - WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS); - PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS); - - typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims; - + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; const int nranks = comm->nRanks; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; - - if (tid == 0) { - // Update in case we skipped some collectives - *ring->recv.conn.opCount = args->opCount; - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - } - __syncthreads(); - - uint64_t step = 0ULL; - int poffset, noffset = 0; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC> + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + bid*chunkSize; + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t chunkOffset = gridOffset + bid*realChunkSize; /////////////// begin ReduceScatter steps /////////////// ssize_t offset; - int maxOffset = min(chunkSize, size-chunkOffset); + int nelem = min(realChunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU rankDest = ring->devUserRanks[nranks-1]; offset = chunkOffset + rankDest * size; - Prims::Copy(tid, nthreads, - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); - - NEXT_STEP; // Increases step, poffset, noffset + prims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; j<nranks; ++j) { rankDest = ring->devUserRanks[nranks-j]; offset = chunkOffset + rankDest * size; - Prims::Reduce(tid, nthreads, - prevInput + poffset, - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; + prims.recvReduceSend(thisInput+offset, nelem); } - // step k-1: reduce this buffer and data, which will produce the final - // result that we store in this data and push to the next GPU + // step k-1: reduce this buffer and data, which will produce the 
final result rankDest = ring->devUserRanks[0]; offset = chunkOffset + rankDest * size; - Prims::Reduce(tid, nthreads, - prevInput + poffset, - thisInput + offset, - thisOutput + chunkOffset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); - } - - if (tid == 0) { - waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS)); - *ring->send.conn.head = 0ULL; - *ring->recv.conn.tail = 0ULL; - __threadfence_system(); - *ring->recv.conn.opCount = args->opCount+1; + prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem); } } -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - poffset = noffset; \ - pflag = nflag; \ - noffset += NCCL_LL_SLICE_LINES; \ - if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \ - nflag++; \ - step++; +template<int UNROLL, class FUNC, typename T> +__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { } template<int UNUSED, class FUNC, typename T> -__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; + const int nthreads = args->nThreads; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; - typedef LLPrimitives<T, FUNC> LL; + ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); const ssize_t size = args->N; //const int rank = comm->rank; const int nranks = comm->nRanks; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t pflag, nflag = step + 1; - int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -167,37 +93,21 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) { /////////////// begin ReduceScatter steps /////////////// ssize_t offset; - int maxOffset = min(chunkSize, size-chunkOffset); + int nelem = min(chunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU rankDest = ring->devUserRanks[nranks-1]; offset = chunkOffset + rankDest * size; - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - nextOutput + noffset, - maxOffset, nflag, llNthreads); - POST_SIZE; - - NEXT_STEP_LL; + LLprims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; j<nranks; ++j) { rankDest = ring->devUserRanks[nranks-j]; offset = chunkOffset + rankDest * size; - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + 
LLprims.recvReduceSend(thisInput+offset, nelem); } // step k-1: reduce this buffer and data, which will produce the final @@ -205,13 +115,9 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) { rankDest = ring->devUserRanks[0]; offset = chunkOffset + rankDest * size; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - thisOutput + chunkOffset, - maxOffset, pflag, llNthreads); - ACK_PREV; + LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem); } - - FIFO_CLEANING_AND_SAVE_STEP(nflag); } + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/reduce.cu b/src/collectives/reduce.cu index d8fde80..302d4bc 100644 --- a/src/collectives/reduce.cu +++ b/src/collectives/reduce.cu @@ -4,30 +4,15 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "common_coll.h" #include "enqueue.h" #include "collectives.h" -ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm)); - NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1)); - } - - return ncclSuccess; -} - NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype, - op, root, comm, stream); + struct ncclInfo info = { ncclCollReduce, "Reduce", + sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ + REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; + return ncclEnqueueCheck(&info); } diff --git a/src/collectives/reduce_scatter.cu b/src/collectives/reduce_scatter.cu index 1447d4a..4ee77ef 100644 --- a/src/collectives/reduce_scatter.cu +++ b/src/collectives/reduce_scatter.cu @@ -4,29 +4,15 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "common_coll.h" #include "enqueue.h" #include "collectives.h" -ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - 
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm)); - NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1)); - } - return ncclSuccess; -} - NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, - op, 0, comm, stream); + struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter", + sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ + REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; + return ncclEnqueueCheck(&info); } diff --git a/src/enqueue.cu b/src/enqueue.cu new file mode 100644 index 0000000..d283223 --- /dev/null +++ b/src/enqueue.cu @@ -0,0 +1,442 @@ +/************************************************************************* + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "enqueue.h" +#include "checks.h" +#include "param.h" + +#include "collectives/collectives.h" + +// Only generate inline kernels for LL +#define NCCL_FUNC5(coll, op, dtype) \ + (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \ + (void*)NCCL_KERN_NAME(coll##LL, op, dtype) + +#define NCCL_FUNC4(coll, op, dtype) \ + (void*)NCCL_FUNC5(coll##Ring, op, dtype), \ + (void*)NCCL_FUNC5(coll##Tree, op, dtype) + +// Must be consistent with ncclDataType_t +#define NCCL_FUNCS3A(coll, op) \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, u8), \ + (void*)NCCL_FUNC4(coll, op, i32), \ + (void*)NCCL_FUNC4(coll, op, u32), \ + (void*)NCCL_FUNC4(coll, op, i64), \ + (void*)NCCL_FUNC4(coll, op, u64), \ + (void*)NCCL_FUNC4(coll, op, f16), \ + (void*)NCCL_FUNC4(coll, op, f32), \ + (void*)NCCL_FUNC4(coll, op, f64) +#define NCCL_FUNCS3B(coll, op) \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8) + +// Must be consistent with ncclRedOp_t -- but we only generate kernel for sums. 
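// The flat table below is addressed with FUNC_INDEX(coll, op, dtype, llMode,
// treeMode) at enqueue time (see computeColl). FUNC_INDEX itself is defined in
// collectives.h; given the macro nesting -- coll outermost, then op, then
// dtype, then Ring/Tree, then the two LL slots -- a flattening consistent with
// the table's ncclCollCount*ncclNumOps*ncclNumTypes*2*2 layout would be the
// following (hypothetical sketch, not the actual macro):
//   static inline int funcIndexSketch(int coll, int op, int dtype,
//                                     int llMode, int treeMode) {
//     return (((coll*ncclNumOps + op)*ncclNumTypes + dtype)*2 + treeMode)*2 + llMode;
//   }
// Because only the sum kernels are compiled inline here, every op slot of the
// reduction collectives points at the "sum" variant: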
+#define NCCL_FUNCS2A(coll) \ + NCCL_FUNCS3A(coll, sum), \ + NCCL_FUNCS3A(coll, sum), \ + NCCL_FUNCS3A(coll, sum), \ + NCCL_FUNCS3A(coll, sum) +#define NCCL_FUNCS2B(coll) \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy) + +// Must be consistent with the ncclFuncSet enum +static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = { + NCCL_FUNCS2B(ncclBroadcast), + NCCL_FUNCS2A(ncclReduce), + NCCL_FUNCS2B(ncclAllGather), + NCCL_FUNCS2A(ncclReduceScatter), + NCCL_FUNCS2A(ncclAllReduce) +}; + +/*****************************************************************************/ +/* Launch system : synchronization and CUDA kernel launch */ +/*****************************************************************************/ + +ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) { +#if CUDART_VERSION >= 9000 + if (cgMode & 0x01) { + CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices, + // These flags are to reduce the latency of using this API + cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync)); + return ncclSuccess; + } +#endif + int savedDev; + CUDACHECK(cudaGetDevice(&savedDev)); + for (int i = 0; i < numDevices; i++) { + struct cudaLaunchParams* params = paramsList+i; + CUDACHECK(cudaSetDevice(cudaDevs[i])); + CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); + } + CUDACHECK(cudaSetDevice(savedDev)); + return ncclSuccess; +} + +ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) { + params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels); + + // Set active = 2 for the last operation + for (int r=0; r<params->gridDim.x; r++) { + struct ncclChannel* channel = comm->channels+r; + channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active = 2; + } + + // Find the first operation, choose the kernel accordingly and pass it + // as the first argument. + struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart; + memcpy(&comm->args, coll, sizeof(struct ncclColl)); + // As we pass that coll directly, we can free it immediately. + coll->active = 0; + + params->func = ncclKerns[coll->funcIndex]; + return ncclSuccess; +} + +ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) { + volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); + int val = *ptr; + bool done = false; + while (done == false) { + if (val >= comm->intraRanks) { + WARN("Trying to launch too many collectives"); + return ncclInvalidUsage; + } + if (val+1 == comm->intraRanks) { + // Reset the barrier. 
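// The barrier is double-buffered over intraPhase: ranks count up in
// intraBarrier[intraPhase], and the last arriver clears the *other* slot
// (intraPhase^1) below, so the next barrier round starts from zero while
// stragglers may still be reading the current slot. ncclCpuBarrierOut then
// spins until the current slot reaches intraRanks and flips intraPhase.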
+ comm->intraBarrier[comm->intraPhase^1] = 0; + *isLast = 1; + return ncclSuccess; + } + done = __sync_bool_compare_and_swap(ptr, val, val+1); + val++; + } + *isLast = 0; + return ncclSuccess; +} + +ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) { + volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); + int val = *ptr; + if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) { + WARN("Trying to launch too many collectives"); + return ncclInternalError; + } + return ncclSuccess; +} + +ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { + volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); + while (*ptr < comm->intraRanks) pthread_yield(); + comm->intraPhase ^= 1; + return ncclSuccess; +} + +ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) { + if (comm->nRanks == 1) return ncclSuccess; + struct cudaLaunchParams* params = comm->myParams; + + NCCLCHECK(setupLaunch(comm, params)); + + // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL + if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) { + // Enqueue event in user stream + CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream)); + // Create dependency between user stream and internal NCCL stream + CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0)); + params->stream = comm->groupStream; + } else { + if (comm->userStream != params->stream) { + // Stream changed from last call, create dependency against last NCCL kernel launch + CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); + } + params->stream = comm->userStream; + } + + int isLast = 0; + NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); + + if (isLast) { + if (comm->launchMode == ncclComm::GROUP) { + // I'm the last. Launch all operations. + NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode)); + } + NCCLCHECK(ncclCpuBarrierLast(comm)); + } + return ncclSuccess; +} + +ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { + if (comm->nRanks == 1) return ncclSuccess; + // We can't print the CG mode before the first barrier happened. + if (comm->rank == 0 && *comm->intraCGMode & 0x10) { + *comm->intraCGMode ^= 0x10; + INFO(NCCL_INIT,"Launch mode %s%s%s", + comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel", + *comm->intraCGMode ? "/CGMD" : "", + (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : ""); + } + + NCCLCHECK(ncclCpuBarrierOut(comm)); + + struct cudaLaunchParams *params = comm->myParams; + if (comm->launchMode == ncclComm::PARALLEL) { + CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); + } + // Start the network proxies as soon as the kernel has been launched. We can't + // perform any CUDA call between the two or having a cudaFree between the CUDA + // launch and the transportStartProxy call could cause a deadlock. + // Also, starting the proxies after the CUDA launch seems to be better for + // performance (latency). 
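// Concretely, the sequence that must not be interleaved with other CUDA calls
// is:
//   cudaLaunchKernel(...);      // kernel may spin waiting on network data
//   transportStartProxy(comm);  // proxies feed the network and unblock it
// A cudaFree() in between would presumably synchronize on the device while the
// kernel is still spinning on data the proxies have not started sending.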
+ for (int r=0; r<params->gridDim.x; r++) { + struct ncclChannel* channel = comm->channels+r; + channel->collStart = channel->collFifoTail; + channel->collCount = 0; + } + params->gridDim.x = params->blockDim.x = 0; + NCCLCHECK(transportStartProxy(comm)); + return ncclSuccess; +} + +ncclResult_t ncclEnqueueEvents(ncclComm_t comm) { + struct cudaLaunchParams *params = comm->myParams; + // Enqueue event after NCCL kernel + CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream)); + // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL + if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) { + // Create dependency between NCCL internal stream and user stream + CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); + } + comm->userStreamSet = false; + return ncclSuccess; +} + +/*****************************************************************************/ +/* Enqueueing system : computation of kernel and proxy operations parameters */ +/*****************************************************************************/ + +static ncclResult_t getPatternInfo(struct ncclInfo* info) { + if (info->coll == ncclCollBroadcast) info->pattern = ncclPatternPipelineFrom; + else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo; + else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing; + else if (info->coll == ncclCollAllReduce) { + if (info->nBytes <= info->comm->treeThreshold) + info->pattern = ncclPatternTreeUpDown; + else + info->pattern = ncclPatternRingTwice; + } + else { + WARN("Unknown collective %d", info->coll); + return ncclInternalError; + } + return ncclSuccess; +} + +static ncclResult_t getLoopInfo(struct ncclInfo* info) { + switch (info->pattern) { + case ncclPatternTreeUp: + case ncclPatternTreeDown: + case ncclPatternTreeUpDown: + case ncclPatternPipelineFrom: + case ncclPatternPipelineTo: + info->nstepsPerLoop = info-> nchunksPerLoop = 1; break; + case ncclPatternRing: + info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break; + case ncclPatternRingTwice: + info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break; + default: + WARN("Unknown pattern %d\n", info->pattern); + return ncclInternalError; + } + return ncclSuccess; +} + +static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) { + // Compute thresholds and limits that users can override + int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD); + int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads); + + // First compute nThreads + int nt = NCCL_LL_MIN_NTHREADS; + while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2; + + // Then compute nChannels + int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold); + if (nc == 0) nc = 1; + if (nc > info->comm->nChannels) nc = info->comm->nChannels; + + // Check if we have a fixed LL threshold, otherwise compute it. + int perThreadThreshold = info->comm->threadThreshold; + if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4; + ssize_t llThreshold = info->comm->llThreshold >= 0 ? 
+ info->comm->llThreshold : + nc*nt*info->nchunksPerLoop*perThreadThreshold; + + if (info->nBytes <= llThreshold) { + *llMode = 1; + *nChannels = nc; + *nThreads = nt; + } else { + *llMode = 0; + *nChannels = info->comm->nChannels; + *nThreads = info->comm->nThreads+1; + } +} + +static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) { + // Set nstepsPerLoop and nchunksPerLoop + NCCLCHECK(getPatternInfo(info)); + NCCLCHECK(getLoopInfo(info)); + + coll->args.root = info->root; + coll->args.N = info->count; + coll->args.ThisInput = info->sendbuff; + coll->args.ThisOutput = info->recvbuff; + coll->args.comm = info->comm->devComm; + coll->args.opCount = info->comm->opCount; + + // Compute llMode, nChannels, nThreads + int llMode; + getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode); + + int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0; + coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode); + + int stepSize = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS; + int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps; + int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps; + int chunkSize = stepSize*chunkSteps; + + // Compute lastChunkSize + if (treeMode == 1 && llMode == 0) { + if (info->pattern == ncclPatternTreeUpDown) { + // Optimize chunkSize / nSteps + while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2; + while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2; + while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2; + } + // Use lastChunkSize as chunkSize + coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); + } else if (llMode == 1) { + int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t); + const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; + coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop); + ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t)); + coll->args.lastChunkSize /= ncclTypeSize(info->datatype); + } + + // Compute nSteps for proxies + size_t nBytes = llMode ? 
info->nBytes*2 : info->nBytes; + + int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize))); + proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps; + proxyArgs->sliceSteps = sliceSteps; + proxyArgs->chunkSteps = chunkSteps; + proxyArgs->llMode = llMode; + proxyArgs->opCount = info->comm->opCount; + TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p", + coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads, + nLoops, proxyArgs->nsteps, info->comm); + return ncclSuccess; +} + +static ncclResult_t saveKernel(struct ncclInfo* info) { + if (info->comm->nRanks == 1) { + if (info->sendbuff != info->recvbuff) + CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream)); + return ncclSuccess; + } + + struct ncclColl coll; + struct ncclProxyArgs proxyArgs; + memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs)); + NCCLCHECK(computeColl(info, &coll, &proxyArgs)); + + info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads); + if (info->comm->userStreamSet == false) { + info->comm->userStream = info->stream; + info->comm->userStreamSet = true; + } else if (info->stream != info->comm->userStream) { + WARN("Error : mixing different streams within a group call is not supported."); + return ncclInvalidUsage; + } + for (int bid=0; bid<coll.args.nChannels; bid++) { + struct ncclChannel* channel = info->comm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels); + + if (channel->collCount == NCCL_MAX_OPS) { + WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS); + return ncclInvalidUsage; + } + + // Proxy + proxyArgs.channel = channel; + NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks)); + + info->comm->myParams->gridDim.x++; + + int opIndex = channel->collFifoTail; + struct ncclColl* c = channel->collectives+opIndex; + volatile uint8_t* activePtr = (volatile uint8_t*)&c->active; + while (activePtr[0] != 0) sched_yield(); + + memcpy(c, &coll, sizeof(struct ncclColl)); + + c->args.bid = bid; + c->active = 1; + opIndex = (opIndex+1)%NCCL_MAX_OPS; + c->nextIndex = opIndex; + channel->collFifoTail = opIndex; + channel->collCount++; + } + /*if (llMode == 0)*/ info->comm->opCount++; + return ncclSuccess; +} + + +ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { + if (info->comm == NULL) return ncclInvalidArgument; + + INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", + info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, + info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); + + // Launch asynchronously if needed + if (ncclAsyncMode()) { + ncclResult_t ret = ncclSuccess; + int savedDev = -1; + if (info->comm->checkPointers) { + CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end); + CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end); + } + // Check arguments + NCCLCHECKGOTO(ArgsCheck(info), ret, end); + // Always register comm even in case of error to make sure ncclGroupEnd + // cleans it up. 
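// NCCLCHECKGOTO/CUDACHECKGOTO presumably expand to a check-and-branch pattern
// along these lines (hypothetical sketch; the real definitions live in the
// error-checking headers):
//   #define NCCLCHECKGOTO(call, res, label) do { \
//     res = (call); \
//     if (res != ncclSuccess) goto label; \
//   } while (0)
// so any failure below jumps to 'end', which restores the saved device and
// records the error through ncclAsyncErrCheck instead of returning early.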
+ NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end); + NCCLCHECKGOTO(saveKernel(info), ret, end); +end: + if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev)); + ncclAsyncErrCheck(ret); + return ret; + } else { + NCCLCHECK(ArgsCheck(info)); + NCCLCHECK(saveKernel(info)); + NCCLCHECK(ncclBarrierEnqueue(info->comm)); + NCCLCHECK(ncclBarrierEnqueueWait(info->comm)); + NCCLCHECK(ncclEnqueueEvents(info->comm)); + return ncclSuccess; + } +} diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 278593c..a1aaf50 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -13,5 +13,7 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv); ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out); ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState); ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); +ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size); +ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size); ncclResult_t bootstrapClose(void* commState); #endif diff --git a/src/include/channel.h b/src/include/channel.h new file mode 100644 index 0000000..76c5e8a --- /dev/null +++ b/src/include/channel.h @@ -0,0 +1,14 @@ +/************************************************************************* + * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CHANNEL_H_ +#define NCCL_CHANNEL_H_ +#include "core.h" + +ncclResult_t initChannel(struct ncclComm* comm, int channelid); +ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks); + +#endif diff --git a/src/include/checks.h b/src/include/checks.h new file mode 100644 index 0000000..bf7750e --- /dev/null +++ b/src/include/checks.h @@ -0,0 +1,10 @@ +/************************************************************************* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" + +ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); +ncclResult_t ArgsCheck(struct ncclInfo* info); diff --git a/src/include/common_coll.h b/src/include/common_coll.h deleted file mode 100644 index 3ec7354..0000000 --- a/src/include/common_coll.h +++ /dev/null @@ -1,195 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef COMMON_COLL_H_ -#define COMMON_COLL_H_ - -#include "core.h" -#include "enqueue.h" -#include "collectives/collectives.h" - -static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { - cudaPointerAttributes attr; - cudaError_t err = cudaPointerGetAttributes(&attr, pointer); - if (err != cudaSuccess || attr.devicePointer == NULL) { - WARN("%s : %s is not a valid pointer", opname, ptrname); - return ncclInvalidArgument; - } -#if CUDART_VERSION >= 10000 - if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { -#else - if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { -#endif - WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev); - return ncclInvalidArgument; - } - return ncclSuccess; -} - -static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { - if (ptr == NULL) { - WARN("%s : %s argument is NULL", opname, ptrname); - return ncclInvalidArgument; - } - return ncclSuccess; -} - -static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) { - NCCLCHECK(PtrCheck(comm, opname, "comm")); - // First, the easy ones - if (root < 0 || root >= comm->nRanks) { - WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks); - return ncclInvalidArgument; - } - if (type < 0 || type >= ncclNumTypes) { - WARN("%s : invalid type %d", opname, type); - return ncclInvalidArgument; - } - if (op < 0 || op >= ncclNumOps) { - WARN("%s : invalid reduction operation %d", opname, op); - return ncclInvalidArgument; - } - - if (comm->checkPointers) { - // Check CUDA device pointers - if (strcmp(opname, "Broadcast") != 0 || comm->rank == root) { - NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname)); - } - if (strcmp(opname, "Reduce") != 0 || comm->rank == root) { - NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname)); - } - } - return ncclSuccess; -} - -static __inline__ int ncclTypeSize(ncclDataType_t type) { - switch (type) { - case ncclInt8: - case ncclUint8: - return 1; - case ncclFloat16: - return 2; - case ncclInt32: - case ncclUint32: - case ncclFloat32: - return 4; - case ncclInt64: - case ncclUint64: - case ncclFloat64: - return 8; - default: - return -1; - } -} - -// In : comm, nbytes ; Out : nrings, nthreads, ll -// - We start with the minimum number of threads possible (64) and see if the size fits in LL; -// If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default) -// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads -// This ensures we don't use a large number of rings with a small number of threads -// - We use the NCCL_LL_RING_THRESHOLD as the per-thread threshold before we reach the max number of threads -// we use NCCL_THREAD_THRESHOLD when we reach the max -// - If by the max number of LL threads, the size still cannot fit in LL, then we use non-LL setting -// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too -static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) { - *ll = 0; - int 
llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */ - if (comm->llThreshold >= 0) { /* user sets total LL threshold */ - if (nbytes > comm->llThreshold) { /* non-LL */ - *nthreads = comm->nThreads+1; - *nrings = comm->nRings; - return; - } else { - llEnforced = 1; /* user wants to use LL */ - } - } - int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */ - size_t nr; - int ll_max_nthreads = std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* respect user's setting or platform's default setting */ - int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS; - ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD); - while (nt < ll_max_nthreads && *ll == 0) { - nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks)); - if (nr <= maxRings) { /* avoid using few threads but many rings */ - nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr; - *ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1; - } - if (*ll == 0) { - nt = nt << 1; - } - } - if (*ll == 1) { - *nthreads = nt; - *nrings = (int)nr; - return; /* we can use smaller number of threads to make LL work, stop here */ - } - nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */ - nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr; - *ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1; - *nthreads = *ll ? ll_max_nthreads : comm->nThreads+1; - *nrings = *ll ? (int)nr : comm->nRings; -} - -static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, size_t nbytes, int loopFactor) { - int llMode, nBlocks, nThreads; - ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode); - comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads); - if (comm->userStreamSet == false) { - comm->userStream = stream; - comm->userStreamSet = true; - } else if (stream != comm->userStream) { - WARN("Error : mixing different streams within a group call is not supported."); - return ncclInvalidUsage; - } - int lastChunkSize = 0; - if (llMode == 1) { - int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype); - const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize; - lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor); - ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype)); - } - for (int bid=0; bid<nBlocks; bid++) { - struct ncclRing* ring = comm->rings+(comm->myParams->gridDim.x % comm->nRings); - if (ring->collCount == NCCL_MAX_OPS) { - WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS); - return ncclInvalidUsage; - } - - comm->myParams->gridDim.x++; - - int opIndex = ring->collFifoTail; - struct ncclColl* c = ring->collectives+opIndex; - volatile uint8_t* activePtr = (volatile uint8_t*)&c->active; - while (activePtr[0] != 0) sched_yield(); - - struct CollectiveArgs* args = &c->args; - args->root = root; - args->N = count; - args->ThisInput = sendbuff; - args->ThisOutput = recvbuff; - args->comm = comm->devComm; - args->opCount = comm->opCount; - args->bid = bid; - args->nRings = nBlocks; - args->nThreads = nThreads; - args->lastChunkSize = lastChunkSize; - - c->nThreads = nThreads; - c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode); - c->active = 1; - opIndex = (opIndex+1)%NCCL_MAX_OPS; - c->nextIndex = opIndex; - 
ring->collFifoTail = opIndex; - ring->collCount++; - } - /*if (llMode == 0)*/ comm->opCount++; - return ncclSuccess; -} - -extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl); - -#endif diff --git a/src/include/core.h b/src/include/core.h index 8285df5..d57d271 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -8,6 +8,7 @@ #define NCCL_CORE_H_ #define NCCL_MAX_OPS 2048 +#define NCCL_STEPS 8 #include "nccl.h" #include "transport.h" @@ -29,15 +30,15 @@ struct cudaLaunchParams { }; #endif -#define MAXRINGS 16 +#define MAXCHANNELS 16 #define MAXTHREADS 256 #define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ -// Rings / LL tuning -#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings -#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL for Volta and above +// Channels / LL tuning +#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings +#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL #define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs -#define NCCL_LL_MAX_NTHREADS 256 +#define NCCL_LL_MAX_NTHREADS MAXTHREADS #define NCCL_LL_MIN_NTHREADS 64 #define DIVUP(x, y) \ @@ -63,43 +64,84 @@ union ncclLLFifoLine { int4 i4; }; +typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t; + +typedef enum { + ncclPatternRing, + ncclPatternRingTwice, + ncclPatternPipelineFrom, + ncclPatternPipelineTo, + ncclPatternTreeUp, + ncclPatternTreeDown, + ncclPatternTreeUpDown +} ncclPattern_t; + +typedef enum { + ncclDevSuccess, + ncclDevAssertedMismatch, + ncclDevSuspectedMismatch +} ncclDevError_t; + +// Used to pass NCCL call information between functions +struct ncclInfo { + ncclColl_t coll; + const char* opName; + // NCCL Coll Args + const void* sendbuff; + void* recvbuff; + size_t count; + ncclDataType_t datatype; + ncclRedOp_t op; + int root; + ncclComm_t comm; + cudaStream_t stream; + // Algorithm details + int chunkSteps; + int sliceSteps; + // Computed later + ncclPattern_t pattern; + size_t nBytes; + int nstepsPerLoop; + int nchunksPerLoop; +}; + struct ncclConnInfo { // Regular comm mechanism char *buff; // Local for recv, remote for send uint64_t *tail; // Local for recv, remote for send uint64_t *head; // Local for send, remote for recv - uint64_t *opCount; // Local for recv, remote for send + uint64_t *opCountLoc; // opCount of local rank + uint64_t *opCountRem; // opCount of remote rank int direct; // Direct communication void **ptrExchange; // Pointer exchange for direct communication int *fifo; // Size fifo for proxy + uint64_t step; // Keep where we are + // Low latency mechanism - char *llBuff; // Local for recv, remote for send - uint64_t *llHead; // Local for send, remote for recv - int *llFifo; // LL Size fifo for proxy - uint64_t llStep; // Keep where we are + union ncclLLFifoLine *llBuff; // Local for recv, remote for send uint64_t llLastCleaning; }; struct ncclConnector { - struct transportProxyInfo* proxyInfo; - struct ncclTransport* transport; + int connected; + struct ncclProxyArgs *proxyAppend; + struct ncclTransportComm* transportComm; void* transportResources; // Host-side resources struct ncclConnInfo conn; + struct ncclComm *comm; }; #define CACHE_LINE_SIZE 128 #define MEM_ALIGN 4096 -#define SIZES_FIFO_SIZE 32 #define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */ -#define NCCL_LL_CHUNKS 8 
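// Editor's note: an illustration, not part of the patch. The LL buffer
// macros here are rebased from per-chunk onto the NCCL_STEPS pipeline
// constant, and the arithmetic shows the totals are unchanged. With
// NUM_LINES_PER_THREAD = 8, NCCL_LL_MAX_NTHREADS = 256, NCCL_STEPS = 8
// and sizeof(union ncclLLFifoLine) = 16 bytes:
//   old: NCCL_LL_BUFF_SIZE   = 8*256*8*16           = 256KB
//        NCCL_LL_BUFF_LINES  = 256KB/16B            = 16384
//        NCCL_LL_SLICE_LINES = 16384/NCCL_LL_CHUNKS = 2048
//   new: NCCL_LL_SLICE_LINES = 8*256                = 2048
//        NCCL_LL_BUFF_LINES  = 2048*NCCL_STEPS      = 16384
//        NCCL_LL_BUFF_SIZE   = 16384*16B            = 256KB
// A slice now corresponds to one pipeline step, matching the step-based
// flow control (uint64_t step) added to ncclConnInfo above.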
#define NUM_LINES_PER_THREAD 8 -#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 256K -#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t))) -#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS) +#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) +#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS) +#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine)) #define NCCL_LL_CLEAN_FREQ 0x10000000 struct ncclSendMem { @@ -109,7 +151,7 @@ struct ncclSendMem { char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; void* ptrExchange; char pad2[CACHE_LINE_SIZE-sizeof(void*)]; - uint64_t llHead; + uint64_t opCount; }; char pad3[MEM_ALIGN]; }; @@ -119,37 +161,54 @@ struct ncclRecvMem { union { struct { uint64_t tail; - char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; uint64_t opCount; - char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)]; - int sizesFifo[SIZES_FIFO_SIZE]; - int llSizesFifo[SIZES_FIFO_SIZE]; + char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; + int sizesFifo[NCCL_STEPS]; }; - char pad5[MEM_ALIGN]; + char pad4[MEM_ALIGN]; }; - char llBuff[NCCL_LL_BUFF_SIZE]; + ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; char buff[1]; // Actually larger than that }; struct ncclRing { + // Shortcuts for userRanks[1] and userRanks[n-1] + int prev; + int next; + + // Maps an internal nccl index to user-specified rank order. This is necessary + // since we need to know how the user expects data to be ordered across + // devices. Ordered from current device. + int* userRanks; + int* devUserRanks; +}; + +#define NCCL_MAX_TREE_ARITY 3 +struct ncclTree { + int depth; + int up; + int down[NCCL_MAX_TREE_ARITY]; +}; + +struct ncclPeer { + struct ncclConnector send; + struct ncclConnector recv; +}; + +struct ncclChannel { union { struct { + struct ncclRing ring; + struct ncclTree tree; + int id; int nthreads; - // Per ring resources - struct ncclSendMem* devMemSend; // CUDA-size resources - struct ncclRecvMem* devMemRecv; // CUDA-size resources int buffSize; - int devMemSendSize; // Keep the size for IPCs - int devMemRecvSize; // Keep the size for IPCs - struct ncclConnector send; - struct ncclConnector recv; - // Maps an internal nccl index to user-specified rank order. This is necessary - // since we need to know how the user expects data to be ordered across - // devices. Ordered from current device. - int* userRanks; - int* devUserRanks; + // Communication structures + struct ncclPeer* peers; + struct ncclPeer* devPeers; // Operation list for aggregation struct ncclColl* collectives; @@ -162,7 +221,7 @@ struct ncclRing { int data[0x80]; }; }; -static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size"); +static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); /* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */ /* to make sure reads to host from the CUDA kernel are aligned. 
*/ @@ -179,7 +238,7 @@ struct CollectiveArgs { size_t N; uint32_t root; uint8_t bid; - uint8_t nRings; + uint8_t nChannels; uint16_t nThreads; int lastChunkSize; @@ -188,7 +247,6 @@ struct ncclColl { union { struct { struct CollectiveArgs args; - uint16_t nThreads; uint16_t funcIndex; uint16_t nextIndex; uint8_t active; @@ -199,11 +257,16 @@ struct ncclColl { static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size"); struct ncclComm { - struct ncclRing rings[MAXRINGS]; + struct ncclChannel channels[MAXCHANNELS]; + + struct ncclPeerInfo* peerInfo; + + void* bootstrap; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator int cudaDev; // my cuda device index + int nvmlDev; // my NVML device number enum { GROUP, PARALLEL } launchMode; cudaStream_t userStream; @@ -215,18 +278,31 @@ struct ncclComm { // where syncs are not symmetric). uint64_t opCount; - // Rings for collectives - int nRings; + // Channels for collectives + int nChannels; int nThreads; // Low-latency algorithm threshold ssize_t llThreshold; ssize_t threadThreshold; + // Tree algorithm threshold + ssize_t treeThreshold; + // An internal CUDA stream for NCCL kernel CGMD launches int groupCudaStream; cudaStream_t groupStream; + // Whether there has been a fatal error in this communicator. + ncclResult_t fatalError; + + // Error reported by GPU + volatile ncclDevError_t* fatalDevError; + + // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped) + // On device: this pointer has been obtained from cudaHostGetDevicePointer() + volatile uint32_t *abortFlag; + // Device copy of the communicator struct ncclComm *devComm; @@ -244,6 +320,10 @@ struct ncclComm { int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not struct ncclColl args; void* argsptr; + + // Global proxy thread + pthread_t proxyThread; + struct ncclProxyState proxyState; }; // Check CUDA calls @@ -324,6 +404,28 @@ struct ncclComm { #endif // end PROFAPI int ncclCudaCompCap(); +ncclResult_t ncclNvlinkGpu(int* nvlink); +int64_t ncclTreeThreshold(); + +static __inline__ int ncclTypeSize(ncclDataType_t type) { + switch (type) { + case ncclInt8: + case ncclUint8: + return 1; + case ncclFloat16: + return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: + return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: + return 8; + default: + return -1; + } +} #include <sys/mman.h> static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { diff --git a/src/include/cpuset.h b/src/include/cpuset.h new file mode 100644 index 0000000..f70d1d8 --- /dev/null +++ b/src/include/cpuset.h @@ -0,0 +1,61 @@ +/************************************************************************* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CPUSET_H_ +#define NCCL_CPUSET_H_ + +// Convert local_cpus, e.g. 
0003ff,f0003fff to cpu_set_t + +static int hexToInt(char c) { + int v = c - '0'; + if (v < 0) return -1; + if (v > 9) v = 10 + c - 'a'; + if ((v < 0) || (v > 15)) return -1; + return v; +} + +#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) + +ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) { + uint32_t cpumasks[CPU_SET_N_U32]; + int m = CPU_SET_N_U32-1; + cpumasks[m] = 0; + for (int o=0; o<strlen(str); o++) { + char c = str[o]; + if (c == ',') { + m--; + cpumasks[m] = 0; + } else { + int v = hexToInt(c); + if (v == -1) break; + cpumasks[m] <<= 4; + cpumasks[m] += v; + } + } + // Copy cpumasks to mask + for (int a=0; m<CPU_SET_N_U32; a++,m++) { + memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t)); + } + return ncclSuccess; +} + +ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) { + int c = 0; + uint8_t* m8 = (uint8_t*)mask; + for (int o=sizeof(cpu_set_t)-1; o>=0; o--) { + if (c == 0 && m8[o] == 0) continue; + sprintf(str+c, "%02x", m8[o]); + c+=2; + if (o && o%4 == 0) { + sprintf(str+c, ","); + c++; + } + } + str[c] = '\0'; + return ncclSuccess; +} + +#endif diff --git a/src/include/debug.h b/src/include/debug.h index 55dee18..3acdf8c 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -25,6 +25,7 @@ extern uint64_t ncclDebugMask; extern pthread_mutex_t ncclDebugOutputLock; extern FILE *ncclDebugFile; extern ncclResult_t getHostName(char* hostname, int maxlen); +extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev); extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...); diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 69d0463..4db7094 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -10,12 +10,7 @@ #include "core.h" #include "group.h" -typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); - -ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff, - void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, - ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast); ncclResult_t ncclCpuBarrierLast(ncclComm_t comm); ncclResult_t ncclCpuBarrierOut(ncclComm_t comm); diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index ce3f6ca..89edbf5 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -58,8 +58,50 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v1_t; -typedef ncclNet_v1_t ncclNet_t; +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Return the device path in /sys. NCCL will call free on this path. + ncclResult_t (*pciPath)(int dev, char** path); + // Return whether this device supports host pointers and/or CUDA pointers + // as data from the current GPU. Supported types should be composed with + // NCCL_PTR_HOST and NCCL_PTR_CUDA. + ncclResult_t (*ptrSupport)(int dev, int* supportedTypes); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. 
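// Editor's note: an illustrative calling sequence, not part of the patch.
// The main change from v1 is memory registration: buffers are registered
// once with regMr() and the opaque mhandle replaces v1's per-call pointer
// type argument to isend/irecv. Going by the comments in this struct, a
// transfer between two peers proceeds roughly as follows:
//   receiver: listen(dev, handle, &listenComm);    // handle exchanged out-of-band
//   sender:   connect(dev, handle, &sendComm);
//   receiver: accept(listenComm, &recvComm);
//   both:     regMr(comm, buf, size, NCCL_PTR_HOST /*or NCCL_PTR_CUDA*/, &mhandle);
//   sender:   isend(sendComm, buf, size, mhandle, &request);
//   receiver: irecv(recvComm, buf, size, mhandle, &request);
//   both:     test(request, &done, &size);         // poll until done != 0
//   receiver: flush(recvComm, buf, size, mhandle); // if data landed in CUDA memory
//   both:     deregMr(comm, mhandle), then closeSend/closeRecv/closeListen.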
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connectHandle + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); + // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v2_t; + +typedef ncclNet_v2_t ncclNet_t; -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1 +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2 #endif // end include guard diff --git a/src/include/net.h b/src/include/net.h index ebc9677..e75e6bb 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -26,9 +26,11 @@ static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK( static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } -static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; } -static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; } -static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; } +static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } +static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; } +static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; } +static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int 
size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; } +static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; } static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; } static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } diff --git a/src/include/nvlink.h b/src/include/nvlink.h index 7eb74c9..1baf9e5 100644 --- a/src/include/nvlink.h +++ b/src/include/nvlink.h @@ -67,18 +67,15 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) { if (res != ncclSuccess) return 0; for(int l=0; l<maxNvLinks; ++l) { - // nvmlDeviceGetNvLinkCapability(NVML_NVLINK_CAP_P2P_SUPPORTED) would seem to - // report whether the NVLink connects to a peer GPU (versus a POWER CPU?). I - // don't know whether nvmlDeviceGetNvLinkRemotePciInfo() would succeed in - // the POWER CPU case, so it seems best to check this as well. + // Check whether we can use this NVLink for P2P unsigned canP2P; if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue; - // nvmlDeviceGetNvLinkRemotePciInfo() will return NVML_ERROR_NOT_SUPPORTED - // if the links don't exist, or are disabled. So checking for that return - // here would probably make the nvmlDeviceGetNvLinkCapability check above - // redundant. Presumably, we still need to check the P2P capability above, - // since even non-GPUs would possess PCI info. + // Make sure the Nvlink is up. The previous call should have trained the link. + nvmlEnableState_t isActive; + if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue; + + // Try to figure out what's on the other side of the NVLink nvmlPciInfo_t remoteProc; if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue; @@ -89,7 +86,7 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) { p[c] = toupper(p[c]); } - if (strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) { + if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) { links++; } else { // Make a lower case copy of the bus ID for calling ncclDeviceType @@ -101,11 +98,21 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) { lowerId[c] = tolower(p[c]); } - // Determine if the remote side is NVswitch + // Determine if the remote side is NVswitch or a GPU enum ncclNvLinkDeviceType type; - if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) { - //TODO: we are making an assumption that all GPUs are connected to this switch - //This assumption may change for future architectures + ncclResult_t ret = ncclDeviceType(lowerId, &type); + if (ret == ncclSuccess) { + if (type == ncclNvLinkDeviceSwitch) { + //TODO: we are making an assumption that all GPUs are connected to this switch + //This assumption may change for future architectures + nvswitch_links++; + } else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) { + links++; + } + } else { + // The NVLink is up but we couldn't find the PCI device on the other + // side. 
Assume it's an NVswitch outside a VM. + if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch"); nvswitch_links++; } } @@ -113,43 +120,4 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) { return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links; } -static int getNumNvlinks(const char* busId) { - nvmlDevice_t nvmlDev; - ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev); - if (res != ncclSuccess) return 0; - - int nvlinks = 0, nvswitch_links = 0; - int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4; - for(int l=0; l<maxNvLinks; ++l) { - unsigned canP2P; - nvmlEnableState_t isActive; - if (wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) == ncclSuccess && canP2P && - wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) == ncclSuccess && isActive == NVML_FEATURE_ENABLED) { - nvlinks++; - } else { - continue; - } - - nvmlPciInfo_t remoteProc; - if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue; - - // Make a lower case copy of the bus ID for calling ncclDeviceType - // PCI system path is in lower case - char* p = remoteProc.busId; - char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) { - if (p[c] == 0) break; - lowerId[c] = tolower(p[c]); - } - - // Determine if the remote side is NVswitch - enum ncclNvLinkDeviceType type; - if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) { - //TODO: we are making an assumption that all GPUs are connected to this switch - //This assumption may change for future architectures - nvswitch_links++; - } - } - return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*nvlinks; -} #endif diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index ddfd233..0b6198a 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -7,7 +7,7 @@ #ifndef NCCL_NVMLWRAP_H_ #define NCCL_NVMLWRAP_H_ -#include "core.h" +#include "nccl.h" //#define NVML_DIRECT 1 #ifdef NVML_DIRECT @@ -32,14 +32,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) NVMLCHECK(nvmlDeviceGetIndex(device, index)); return ncclSuccess; } -static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) { - NVMLCHECK(nvmlDeviceSetCpuAffinity(device)); - return ncclSuccess; -} -static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) { - NVMLCHECK(nvmlDeviceClearCpuAffinity(device)); - return ncclSuccess; -} static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) { NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device)); return ncclSuccess; @@ -61,6 +53,10 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult)); return ncclSuccess; } +static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { + NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber)); + return ncclSuccess; +} #else // Dynamically handle dependencies on NVML @@ -136,14 +132,14 @@ ncclResult_t wrapNvmlInit(void); ncclResult_t wrapNvmlShutdown(void); ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); -ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device); -ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device); ncclResult_t 
wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci); ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); +ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber); + #endif // NVML_DIRECT #endif // End include guard diff --git a/src/include/ring.h b/src/include/ring.h deleted file mode 100644 index fa5e099..0000000 --- a/src/include/ring.h +++ /dev/null @@ -1,14 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_RING_H_ -#define NCCL_RING_H_ -#include "core.h" - -ncclResult_t initRing(struct ncclComm* comm, int ringid); -ncclResult_t freeRing(struct ncclRing* ring); - -#endif diff --git a/src/include/rings.h b/src/include/rings.h index 751846c..43fc595 100644 --- a/src/include/rings.h +++ b/src/include/rings.h @@ -12,6 +12,6 @@ static int getDefaultThreads() { return ncclCudaCompCap() == 3 ? 128 : 256; } -ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next); +ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut); #endif diff --git a/src/include/socket.h b/src/include/socket.h index 624af40..fb5cfc0 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -60,7 +60,9 @@ static inline int envSocketFamily(void) { } static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) { +#ifdef ENABLE_TRACE char line[1024]; +#endif struct netIf userIfs[MAX_IFS]; bool searchNot = prefixList && prefixList[0] == '^'; int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); @@ -106,7 +108,6 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre // Store the IP address int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6); memcpy(addrs+found, interface->ifa_addr, salen); - INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line)); found++; } } @@ -336,8 +337,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line)); #endif - /* Put the socket in listen mode */ - SYSCHECK(listen(sockfd, 128), "listen"); + /* Put the socket in listen mode + * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn + */ + SYSCHECK(listen(sockfd, 16384), "listen"); *fd = sockfd; return ncclSuccess; } diff --git a/src/include/transport.h b/src/include/transport.h index 59f83c9..6231a71 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -9,6 +9,7 @@ #include "nccl.h" #include <stdint.h> +#include "nvmlwrap.h" #define NTRANSPORTS 3 @@ -19,11 +20,13 @@ struct ncclRing; struct ncclConnector; struct ncclComm; -#define RANK_INFO_SIZE 64 -typedef char ncclTinfo_t[RANK_INFO_SIZE]; - -struct ncclInfo { - ncclTinfo_t tinfo[NTRANSPORTS]; +struct ncclPeerInfo { + int rank; + int cudaDev; + int nvmlDev; + uint64_t hostHash; + uint64_t pidHash; + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; }; // Used to hold the transport connection values @@ -34,18 +37,47 @@ struct ncclConnect { char data[CONNECT_SIZE]; }; +enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone }; + +struct ncclProxyArgs; +typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*); + struct ncclProxyArgs { - struct ncclRing* ring; - int substeps; + proxyProgressFunc_t progress; + struct ncclChannel* channel; + struct ncclConnector* connector; + int sliceSteps; + int chunkSteps; int nsteps; uint64_t opCount; int llMode; - bool needProxy; - int active; // add component before this line -- it is left out during initialization + int state; // add component before this line -- it is left out during initialization + + // Internal state + uint64_t head; + uint64_t tail; + uint64_t end; + void* requests[NCCL_STEPS]; + int idle; + + // Element linking + pthread_mutex_t mutex; + struct ncclProxyArgs* next; + struct ncclProxyArgs* nextPeer; +}; + +struct ncclProxyPool; +struct ncclProxyState { + pthread_cond_t cond; + pthread_mutex_t mutex; + bool stop; + struct ncclProxyArgs* ops; + struct ncclProxyArgs* pool; + struct ncclProxyPool* pools; }; struct ncclTransportComm { - ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*); + ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId); ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*); ncclResult_t (*free)(void*); ncclResult_t (*proxy)(struct ncclProxyArgs*); @@ -53,8 +85,7 @@ struct ncclTransportComm { struct ncclTransport { const char name[4]; - ncclResult_t (*fillInfo)(ncclTinfo_t*, int); - ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*); + ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*); ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*); struct ncclTransportComm send; struct ncclTransportComm recv; @@ -64,37 +95,17 @@ struct ncclTransport { typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); -#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS - -struct transportProxyInfo { - struct ncclComm* comm; - pthread_t thread; - threadFunc_t func; - volatile int proxyReady; - struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE]; - volatile uint64_t argsFifoHead; - volatile uint64_t argsFifoTail; - pthread_cond_t cond; - pthread_mutex_t mutex; -}; - -ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm); -ncclResult_t transportDestroyProxy(struct ncclConnector* connector); - enum proxyMode { proxyRing = 0, proxyFrom = 1, proxyTo = 2 }; -static int proxyPatternRing = proxyRing; -static inline int proxyPatternFrom(int root) { return 1+root; } -static inline int proxyPatternTo(int root) { return -1-root; } -static inline enum 
proxyMode proxyPatternMode(int pattern) { return (pattern == 0) ? proxyRing : ((pattern > 0) ? proxyFrom : proxyTo); } -static inline int proxyPatternRoot(int pattern) { return (pattern > 0) ? pattern-1 : -pattern-1; } - -ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm); -ncclResult_t transportStartProxies(struct ncclComm* comm); +ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr); +ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks); +ncclResult_t transportStartProxy(struct ncclComm* comm); +ncclResult_t transportCreateProxy(struct ncclComm* comm); +ncclResult_t transportDestroyProxy(struct ncclComm* comm); #include <unistd.h> diff --git a/src/include/trees.h b/src/include/trees.h new file mode 100644 index 0000000..1a151d1 --- /dev/null +++ b/src/include/trees.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TREES_H_ +#define NCCL_TREES_H_ + +ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0); +ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1); + +#endif diff --git a/src/init.cu b/src/init.cu index 9d0188e..75822e6 100644 --- a/src/init.cu +++ b/src/init.cu @@ -1,21 +1,26 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #include "core.h" -#include "ring.h" +#include "channel.h" #include "param.h" #include "nvmlwrap.h" #include "rings.h" +#include "trees.h" #include "bootstrap.h" #include "transport.h" -#include "common_coll.h" #include "group.h" #include "utils.h" #include "net.h" +#include "checks.h" +#include "enqueue.h" +#include "topo.h" +#include "nvlink.h" +#include "cpuset.h" #include <stdio.h> #include <stdlib.h> #include <sys/mman.h> @@ -55,6 +60,16 @@ NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); ncclNet_t* ncclNet = NULL; // We define this as weak to let tests redefine their own +#pragma weak ncclNvlinkGpu +ncclResult_t ncclNvlinkGpu(int* nvlink) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev)); + *nvlink = getNvlinkGpu(busId, NULL); + return ncclSuccess; +} +// We define this as weak to let tests redefine their own #pragma weak ncclCudaCompCap int ncclCudaCompCap() { int cudaDev; @@ -77,10 +92,7 @@ ncclResult_t initNet(ncclNet_t* net) { int ndev; if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError; if (net->devices(&ndev) != ncclSuccess) return ncclInternalError; - if (ndev <= 0) { - INFO(NCCL_INIT|NCCL_NET, "Net/%s: call to devices() returned 0 devices.", net->name); - return ncclSystemError; - } + if (ndev <= 0) return ncclSystemError; return ncclSuccess; } @@ -91,15 +103,15 @@ ncclResult_t initNetPlugin(ncclNet_t** net) { // string, so checking errno doesn't hurt to try to provide a better // error message if (errno == ENOENT) { - INFO(NCCL_INIT|NCCL_NET, "No network plugin found."); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so)."); } else { - INFO(NCCL_INIT|NCCL_NET, "Unable to load libnccl-net.so : %s", dlerror()); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); } return ncclSuccess; } ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL)); if (extNet == NULL) { - INFO(NCCL_INIT|NCCL_NET, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol"); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol."); goto cleanup; } if (initNet(extNet) == ncclSuccess) { @@ -116,21 +128,18 @@ ncclResult_t initNet() { NCCLCHECK(initNet(&ncclNetSocket)); NCCLCHECK(initNetPlugin(&ncclNet)); - if (ncclNet != NULL) { - INFO(NCCL_INIT|NCCL_NET, "Using network plugin %s", ncclNetName()); - return ncclSuccess; - } + if (ncclNet != NULL) return ncclSuccess; if (initNet(&ncclNetIb) == ncclSuccess) { ncclNet = &ncclNetIb; } else { ncclNet = &ncclNetSocket; } - INFO(NCCL_INIT|NCCL_NET,"Using network %s", ncclNetName()); return ncclSuccess; } NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2); NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2); +NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", -2); int ncclThreadThreshold(int minCompCap, int multiNode) { int threshold = ncclParamThreadThreshold(); @@ -177,10 +186,15 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + free(comm->peerInfo); + + if (comm->bootstrap) + NCCLCHECK(bootstrapClose(comm->bootstrap)); + CUDACHECK(cudaFree(comm->devComm)); - for (int ring=0; ring<comm->nRings; ring++) - NCCLCHECK(freeRing(comm->rings+ring)); + for (int channel=0; channel<comm->nChannels; channel++) + 
NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks)); if (comm->doneEvent != NULL) CUDACHECK(cudaEventDestroy(comm->doneEvent)); @@ -199,6 +213,8 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->intraCGMode); free(comm->intraCC); } + CUDACHECK(cudaFreeHost((void *)comm->abortFlag)); + CUDACHECK(cudaFreeHost((void *)comm->fatalDevError)); free(comm); return ncclSuccess; @@ -222,12 +238,15 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { struct ncclComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); - INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev); comm->rank = rank; comm->nRanks = ndev; cudaGetDevice(&comm->cudaDev); + getNvmlDevice(comm->cudaDev, &comm->nvmlDev); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev); + comm->doneEvent = doneEvent; comm->llThreshold = ncclParamLlThreshold(); + comm->treeThreshold = ncclParamTreeThreshold(); comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; #if CUDART_VERSION >= 9200 comm->groupCudaStream = ncclParamGroupCudaStream(); @@ -235,6 +254,13 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { // Don't allow the user to overload the default setting in older CUDA builds comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM; #endif + comm->fatalError = ncclSuccess; + + CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped)); + *comm->fatalDevError = ncclDevSuccess; + + CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped)); + *comm->abortFlag = 0; comm->argsptr = &comm->args; @@ -248,9 +274,18 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { // Copy the comm on the device NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1)); // Copy userRanks - for (int r=0; r<comm->nRings; r++) { - NCCLCHECK(ncclCudaMemcpy(comm->rings[r].devUserRanks, comm->rings[r].userRanks, comm->nRanks)); + for (int r=0; r<comm->nChannels; r++) { + NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); + NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks)); } + // Copy the device-accessible pointer to comm->abortFlag + void *devAbortFlag; + CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0)); + CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice)); + // Copy the device-accessible pointer to comm->fatalDevError + void *devFatalError; + CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0)); + CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice)); return ncclSuccess; } @@ -267,35 +302,81 @@ static void showVersion() { } } -static ncclResult_t fillInfo(struct ncclInfo* info, int rank) { - for (int t=0; t<NTRANSPORTS; t++) { - NCCLCHECK(ncclTransports[t].fillInfo(info->tinfo+t, rank)); - } +static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) { + info->rank = rank; + CUDACHECK(cudaGetDevice(&info->cudaDev)); + NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev)) + info->hostHash=getHostHash(); + info->pidHash=getPidHash(); + + // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the + // cudaDev is a CUDA runtime dev number which could be different from the + // NVML device number. 
Then we get the busID from NVML to be sure it is + // consistent with NVML remote PCI bus Ids. + CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev)); + nvmlDevice_t nvmlDevice; + NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice)); + nvmlPciInfo_t pciInfo; + NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo)); + strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE); return ncclSuccess; } template <int type> -static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) { +static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) { for (int t=0; t<NTRANSPORTS; t++) { struct ncclTransport *transport = ncclTransports+t; struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv; ncclTvalue_t ret = 0; - NCCLCHECK(transport->canConnect(&ret, myInfo->tinfo+t, peerInfo->tinfo+t)); + NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo)); if (ret > 0) { - NCCLCHECK(transportComm->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring)); - *transportRet = transport; + connector->transportComm = transportComm; + NCCLCHECK(transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId)); return ncclSuccess; } } WARN("No transport found !"); - *transportRet = NULL; return ncclInternalError; } -static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) { - NCCLCHECK(initRing(comm, ringid)); +static int log2(int n) { + int l = 0; + while (n>>=1) l++; + return l; +} + +static ncclResult_t ncclTreeThreshold(int nnodes, int nranks, int nChannels, ssize_t *treeThreshold) { + int nvlink; + NCCLCHECK(ncclNvlinkGpu(&nvlink)); + float ringbw = nvlink ? 5000*nChannels : 5000; // approx, in MB/s or B/us + float ringlatinter = 6; + float treelatintra = 4; + float treelatinter = 15; + float treebw; + if (!nvlink) { + treebw = ringbw * 2 / 3; + } else { + treebw = ringbw * 3 / 4; + if (nnodes == 2) treebw *= 2; + } + float ringlat = ringlatinter*(nranks-1); + float treelat = treelatinter*log2(nnodes)+treelatintra*(nranks/nnodes-1); + if (nnodes < 2 || ringlat <= treelat) + *treeThreshold = 0; + else if (treebw > ringbw) + *treeThreshold = 0x7fffffffffffffff; + else + *treeThreshold = (ssize_t)(((ringbw*treebw/(ringbw-treebw)))*(ringlat-treelat)); + return ncclSuccess; +} + +static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks, int* treeMasters) { + TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); + NCCLCHECK(initChannel(comm, channelId)); + + struct ncclChannel* channel = comm->channels+channelId; + struct ncclRing* ring = &channel->ring; - struct ncclRing* ring = comm->rings+ringid; // Reorganize ranks to start with rank. 
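// Editor's note: a worked example, not part of the patch. With
// ringRanks = {2,0,3,1} and rank = 3, the loop below finds shift = 2 and
// userRanks becomes {3,1,2,0}: this rank first, then the ring order as
// seen from it. Hence prev = userRanks[nranks-1] = 0 and
// next = userRanks[1] = 1 for this rank.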
int shift; for (shift = 0; shift<nranks; shift++) { @@ -306,21 +387,85 @@ static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int n for (int i=0; i<nranks; i++) { ring->userRanks[i] = ringRanks[(i+shift)%nranks]; } - int prev = ring->userRanks[nranks-1]; - int next = ring->userRanks[1]; + int prev = ring->prev = ring->userRanks[nranks-1]; + int next = ring->next = ring->userRanks[1]; + + struct ncclTree* tree = &channel->tree; + tree->up = -1; + tree->down[0] = tree->down[1] = tree->down[2] = -1; + + // + // Find per-node masters and connect them via a binary tree + // + + int nMasters = 0; + for (int r=0; r<nranks; r++) nMasters += treeMasters[r]; + if (nMasters == 0) { + nMasters = 1; + treeMasters[0] = 1; + } + + if (comm->treeThreshold == -2) + NCCLCHECK(ncclTreeThreshold(nMasters, comm->nRanks, comm->nChannels, &comm->treeThreshold)); + + if (comm->treeThreshold > 0) { + // Compute tree depth. Not an exact value but a good approximation in most + // cases and consistent across nodes + tree->depth = nranks/nMasters + log2(nMasters); + + // Find my master : go backwards in the ring to find my root + int master = 0; + for (int i = 0; i<nranks; i++) { + int r = ring->userRanks[(nranks-i)%nranks]; + if (treeMasters[r]) { + master = r; + break; + } + } - NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+0, &ring->recv.transport, ring)); - NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+1, &ring->send.transport, ring)); - NCCLCHECK(transportCreateProxy(0, ring, comm)); - NCCLCHECK(transportCreateProxy(1, ring, comm)); + int ranks[nMasters]; + int i = 0, masterIndex = -1; + // Build binary tree + for (int r=0; r<nranks; r++) { + // Create index table + if (r == master) masterIndex = i; + if (treeMasters[r]) ranks[i++] = r; + } + int btreeUp, btreeDown0, btreeDown1; + int u0, d0_0, d0_1, u1, d1_0, d1_1; + NCCLCHECK(ncclGetDtree(nMasters, masterIndex, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1)); + if (channelId < DIVUP(comm->nChannels, 2)) { + btreeUp = u0; btreeDown0 = d0_0; btreeDown1 = d0_1; + } else { + btreeUp = u1; btreeDown0 = d1_0; btreeDown1 = d1_1; + } + + // + // Now build the full tree, combining the intra-node ring and the + // inter-node binary tree. 
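// Editor's note: a concrete example, not part of the patch, assuming
// ncclGetDtree pairs the masters into two complementary binary trees.
// Take 2 nodes of 4 GPUs, ring 0->1->2->3->4->5->6->7->0, masters {0,4}.
// For a channel in the first half (using u0/d0_*), the code below yields:
//   rank 0 (master): up=-1, down={1,4}    (intra-node chain + btree child)
//   ranks 1,2:       up=prev, down={next}, e.g. rank 1: up=0, down={2}
//   rank 3:          up=2, down={}         (next==4 is the other master)
//   rank 4 (master): up=0, down={5}, with 5,6,7 chained below it
// Each node is thus a chain hanging off its master, masters are linked by
// the binary tree, and the second half of the channels uses the
// complementary tree rooted at the other master.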
+ // + + if (rank == master) { + int nDown = 0; + if (btreeUp != -1) tree->up = ranks[btreeUp]; + if (treeMasters[next] == 0) tree->down[nDown++] = next; + if (btreeDown0 != -1) tree->down[nDown++] = ranks[btreeDown0]; + if (btreeDown1 != -1) tree->down[nDown++] = ranks[btreeDown1]; + } else { + tree->up = prev; + if (treeMasters[next] == 0) tree->down[0] = next; + } + } + + TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); return ncclSuccess; } -static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) { +static ncclResult_t fillConnect(struct ncclPeerInfo* peerInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) { for (int r=0; r<nranks; r++) { connectTransport[r] = -1; for (int t=0; t<NTRANSPORTS; t++) { - NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, allInfo[rank].tinfo+t, allInfo[r].tinfo+t)); + NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, peerInfo+rank, peerInfo+r)); if (connectValue[r] > 0) { connectTransport[r] = t; break; @@ -330,11 +475,6 @@ static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, return ncclSuccess; } -static void swap(void* mem1, void* mem2, int size) { - char tmp[size]; - memcpy(tmp, mem1, size); memcpy(mem1, mem2, size); memcpy(mem2, tmp, size); -} - #define MAXWIDTH 20 #define PREFIXLEN 15 #define STRLENGTH (PREFIXLEN+5*MAXWIDTH) @@ -380,9 +520,9 @@ void dumpLine(int* values, int nranks, const char* prefix) { static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { for (int r=0; r<nrings; r++) { char prefix[30]; - /*sprintf(prefix, "[%d] Ring %d Prev : ", rank, r); + /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r); dumpLine(prev+r*nranks, nranks, prefix); - sprintf(prefix, "[%d] Ring %d Next : ", rank, r); + sprintf(prefix, "[%d] Channel %d Next : ", rank, r); dumpLine(next+r*nranks, nranks, prefix);*/ int current = rank; @@ -390,7 +530,7 @@ static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int rings[r*nranks+i] = current; current = next[r*nranks+current]; } - sprintf(prefix, "Ring %02d : ", r); + sprintf(prefix, "Channel %02d : ", r); if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix); if (current != rank) { WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank); @@ -488,140 +628,274 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct return ncclSuccess; } +static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { + TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); + uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */ + struct ncclConnect connect; + struct ncclConnector* conn; + for (int i=0; i<nrecv; i++) { + int peer = peerRecv[i]; + if (peer == -1) continue; + conn = &channel->peers[peer].recv; + if (conn->connected) { ++nSkippedRecv; continue; } + NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + } + for (int i=0; i<nsend; i++) { + int peer = peerSend[i]; + if (peer == -1) continue; + conn = &channel->peers[peer].send; + if (conn->connected) { ++nSkippedSend; continue; } + NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + 
+    NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+  }
+  for (int i=0; i<nsend; i++) {
+    int peer = peerSend[i];
+    if (peer == -1) continue;
+    conn = &channel->peers[peer].send;
+    if (conn->connected) {++nSkippedSend; continue; }
+    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+    NCCLCHECK(conn->transportComm->connect(&connect, conn));
+    conn->connected = 1;
+  }
+  for (int i=0; i<nrecv; i++) {
+    int peer = peerRecv[i];
+    if (peer == -1) continue;
+    conn = &channel->peers[peer].recv;
+    if (conn->connected) {++nSkippedRecv; continue; }
+    NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+    NCCLCHECK(conn->transportComm->connect(&connect, conn));
+    conn->connected = 1;
+  }
+  TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
+  return ncclSuccess;
+}
+
 static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
+  // We use 3 AllGathers
+  // 1. { peerInfo, comm }
+  // 2. ConnectTransport[nranks], ConnectValue[nranks]
+  // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
+
   int rank = comm->rank;
   int nranks = comm->nRanks;
-  void* commState;
-  NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState));
+  TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
+  NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));

-  struct ncclInfo* allInfo;
-  NCCLCHECK(ncclCalloc(&allInfo, nranks));
-  NCCLCHECK(fillInfo(allInfo+rank, rank));
-  NCCLCHECK(bootstrapAllGather(commState, allInfo, sizeof(struct ncclInfo)));
+  // AllGather1 - begin
+  struct {
+    struct ncclPeerInfo peerInfo;
+    struct ncclComm* comm;
+  } *allGather1Data;
+
+  NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
+  allGather1Data[rank].comm = comm;
+  NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
+  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
+
+  NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
+  for (int i = 0; i < nranks; i++) {
+    memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
+  }
+  // AllGather1 data is used again below
+  // AllGather1 - end
+
+  // AllGather2 - begin
+  size_t allGather2DataRowSize = sizeof(int)*nranks + sizeof(ncclTvalue_t)*nranks;
+  void *allGather2Data;
+  NCCLCHECK(ncclCalloc((char **)&allGather2Data, allGather2DataRowSize*nranks));
+  int *myTransportRow = (int *)((char *)allGather2Data + allGather2DataRowSize*rank);
+  ncclTvalue_t *myValueRow = (ncclTvalue_t *)(myTransportRow + nranks);
+
+  NCCLCHECK(fillConnect(comm->peerInfo, nranks, rank, myTransportRow, myValueRow));
+  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather2Data, allGather2DataRowSize));

   int* connectTransport;
   ncclTvalue_t* connectValue;
   NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
   NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
+  for (int i = 0; i < nranks; i++) {
+    memcpy(connectTransport + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize, sizeof(int)*nranks);
+    memcpy(connectValue + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize + nranks*sizeof(int), sizeof(ncclTvalue_t)*nranks);
+  }
+  free(allGather2Data);
+  // AllGather2 - end

-  NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
-  NCCLCHECK(bootstrapAllGather(commState, connectTransport, nranks*(sizeof(int))));
-  NCCLCHECK(bootstrapAllGather(commState, connectValue, nranks*(sizeof(ncclTvalue_t))));
   //if (rank == 0) dumpMatrix(connectTransport, nranks);
   //if (rank == 0) dumpMatrixTvalue(connectValue, nranks);

   // Get my rings
   int nrings;
-  int* prev, *next;
-  NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
+  int* prev, *next, *treeIn, *treeOut;
+  NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
   comm->nThreads = getDefaultThreads();
-  NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next));
+  NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
+  TRACE(NCCL_INIT, "rank %d nranks %d - BUILD %d RINGS", rank, nranks, nrings);
+  assert(nrings <= MAXCHANNELS);
   free(connectTransport);
   free(connectValue);

+  // AllGather3 - begin
+  struct {
+    int nThreads;
+    int nrings;
+    int cudaCompCap;
+    int prev[MAXCHANNELS];
+    int next[MAXCHANNELS];
+  } *allGather3Data;
+
+  NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
+  allGather3Data[rank].nThreads = comm->nThreads;
+  allGather3Data[rank].nrings = nrings;
+  allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
+  for (int r=0; r<nrings; r++) {
+    allGather3Data[rank].prev[r] = *(prev+r*nranks+rank);
+    allGather3Data[rank].next[r] = *(next+r*nranks+rank);
+  }
+  NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
+
   // Find max nThreads
-  int allData[nranks];
-  allData[rank] = comm->nThreads;
-  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
   for (int i=0; i<nranks; i++)
-    comm->nThreads = std::max(allData[i], comm->nThreads);
-  if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads);
+    comm->nThreads = std::max(allGather3Data[i].nThreads, comm->nThreads);

   // Determine the minimum CUDA Compute capability of all GPUs
-  int myCompCap = ncclCudaCompCap();
+  int myCompCap = allGather3Data[rank].cudaCompCap;
   int minCompCap = myCompCap;
-  allData[rank] = myCompCap;
-  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
-  for (int i=0; i<nranks; i++)
-    minCompCap = std::min(allData[i], minCompCap);
-  if (rank == 0) INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
+  for (int i = 0; i < nranks; i++)
+    minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
+
+  // Determine thread threshold across all GPUs
+  int nnodes = 0;
+  for (int r=0; r<nranks; r++) nnodes += treeIn[r];
+  comm->threadThreshold = ncclThreadThreshold(minCompCap, nnodes);

   // Find min nrings across ranks
-  allData[rank] = nrings;
-  NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
   for (int i=0; i<nranks; i++)
-    nrings = std::min(allData[i], nrings);
-
-  // Exchange data with others to build complete rings
-  comm->nRings = nrings;
-  for (int r=0; r<nrings; r++) {
-    NCCLCHECK(bootstrapAllGather(commState, prev+r*nranks, sizeof(int)));
-    NCCLCHECK(bootstrapAllGather(commState, next+r*nranks, sizeof(int)));
+    nrings = std::min(allGather3Data[i].nrings, nrings);
+  comm->nChannels = nrings;
+
+  // Unpack the per ring prev/next arrays
+  for (int i = 0; i < nranks; i++) {
+    for (int r = 0; r < nrings; r++) {
+      prev[r*nranks+i] = allGather3Data[i].prev[r];
+      next[r*nranks+i] = allGather3Data[i].next[r];
+    }
   }
+  free(allGather3Data);
+  // AllGather3 - end
+
   int *rings;
-  NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
+  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
   NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
   free(prev);
   free(next);
+  TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d RINGS", rank, nranks, nrings);

   // Connect with prev/next for each ring
-  struct ncclConnect *connectData;
-  NCCLCHECK(ncclCalloc(&connectData, 2*nranks));
+  struct ncclConnect *connect;
+  NCCLCHECK(ncclCalloc(&connect, 2));
   for (int r=0; r<nrings; r++) {
-    int* ringRanks = rings+r*nranks;
-    struct ncclRing *ring = comm->rings+r;
-    NCCLCHECK(setupRing(comm, r, rank, nranks, ringRanks, allInfo, connectData+2*rank));
-    int prev_offset = ring->userRanks[nranks-1]*2+1;
-    int next_offset = ring->userRanks[1]*2;
-    NCCLCHECK(bootstrapAllGather(commState, connectData, sizeof(struct ncclConnect)*2));
-    NCCLCHECK(ring->send.transport->send.connect(connectData+next_offset, &ring->send));
-    NCCLCHECK(ring->recv.transport->recv.connect(connectData+prev_offset, &ring->recv));
-  }
-  free(connectData);
-  free(rings);
-  free(allInfo);
+    struct ncclChannel* channel = comm->channels+r;
+    NCCLCHECK(setupChannel(comm, r, rank, nranks, rings+r*nranks, treeIn+r*nranks));
+    NCCLCHECK(p2pSetup(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
+    NCCLCHECK(p2pSetup(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up));
+    NCCLCHECK(p2pSetup(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down));
+  }
+  if (comm->treeThreshold > 0) {
+    char line[1024];
+    line[0]='\0';
+    for (int c=0; c<nrings; c++) {
+      struct ncclTree* tree = &comm->channels[c].tree;
+      snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d/%d/%d",
+          c, tree->up, rank, tree->down[0], tree->down[1], tree->down[2]);
+    }
+    line[1023] = '\0';
+    INFO(NCCL_INIT, "Trees%s", line);
+  }
+  if (rank == 0) {
+    char treeline[64];
+    snprintf(treeline, 64, "enabled up to size %ld", comm->treeThreshold);
+    INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees %s", comm->nThreads, minCompCap,
+        comm->treeThreshold == 0 ? "disabled" :
+        comm->treeThreshold == 0x7fffffffffffffff ? "enabled for all sizes" :
+        treeline);
+  }

-  // Intra-process barrier setup
-  struct rankInfo {
-    uint64_t hostHash;
-    uint64_t pidHash;
-    struct ncclComm* comm;
-  } rankInfos[nranks];
-  rankInfos[rank].hostHash = getHostHash();
-  rankInfos[rank].pidHash = getPidHash();
-  rankInfos[rank].comm = comm;
-  NCCLCHECK(bootstrapAllGather(commState, rankInfos, sizeof(struct rankInfo)));
+  TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, nrings);
+  free(connect);
+  free(rings);
+  free(treeIn);
+  free(treeOut);

-  // Compute intra ranks
+  // Compute intra ranks (using AllGather1 data)
   int intraRank0 = -1, intraRank = -1, intraRanks = 0;
-  int multiNode = 0;
-  for (int r=0; r<nranks; r++) {
-    if ((rankInfos[r].hostHash == rankInfos[rank].hostHash) &&
-        (rankInfos[r].pidHash == rankInfos[rank].pidHash)) {
-      if (intraRanks == 0) intraRank0 = r;
-      if (r == rank) intraRank = intraRanks;
+  for (int i = 0; i < nranks; i++) {
+    if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
+        (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
+      if (intraRanks == 0) intraRank0 = i;
+      if (i == rank) intraRank = intraRanks;
       intraRanks++;
-    } else if (rankInfos[r].hostHash != rankInfos[rank].hostHash) {
-      multiNode = 1;
     }
   }
   TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-      rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
-  if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
+      rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+  if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
     WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-        rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
+        rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
     return ncclInternalError;
   }
-  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, rankInfos[intraRank0].comm));
+  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));

-  // Determine thread threshold across all GPUs
-  comm->threadThreshold = ncclThreadThreshold(minCompCap, multiNode);
+  // Done with AllGather1 data
+  free(allGather1Data);

-  // Barrier
-  bootstrapClose(commState);
+  if (nnodes) NCCLCHECK(transportCreateProxy(comm));
+
+  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
   return ncclSuccess;
 }

-bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) {
-  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-  if (cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != cudaSuccess) return false;
-  if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false;
-  if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) {
-    WARN("Failed to set CPU affinity");
-    return false;
+static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
+  CPU_ZERO_S(sizeof(cpu_set_t), mask);
+  char* cudaPath;
+  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+  char path[PATH_MAX];
+  strncpy(path, cudaPath, PATH_MAX-1);
+  snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus");
+  path[PATH_MAX-1] = '\0';
+  int fd;
+  SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
+  char affinityStr[sizeof(cpu_set_t)*2];
+  int r = read(fd, affinityStr, sizeof(cpu_set_t)*2);
+  if (r > 0)
+    NCCLCHECK(ncclStrToCpuset(affinityStr, mask));
+  close(fd);
+  free(cudaPath);
+  return ncclSuccess;
+}
+
+static ncclResult_t setCpuAffinity(int cudaDev) {
+  // Work within the envelope we were provided
+  cpu_set_t mask;
+  SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
+
+  // Find the subpart that is local to our GPU
+  cpu_set_t gpuMask;
+  NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
+  cpu_set_t finalMask;
+  CPU_AND(&finalMask, &mask, &gpuMask);
+
+  // If those are not disjoint, try to stay local
+  if (CPU_COUNT(&finalMask)) {
+    char affinityStr[sizeof(cpu_set_t)*2];
+    NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
+    INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr);
+    SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
   }
-  return true;
+  return ncclSuccess;
 }

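setCpuAffinity keeps only the CPUs that are both in the process's current mask and local to the GPU. A self-contained sketch of the glibc mask operations it relies on (hypothetical CPU ranges):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void) {
      cpu_set_t proc, gpu, both;
      CPU_ZERO(&proc); CPU_ZERO(&gpu);
      for (int c = 0; c < 16; c++) CPU_SET(c, &proc);   /* process may run on 0-15 */
      for (int c = 8; c < 24; c++) CPU_SET(c, &gpu);    /* CPUs local to the GPU   */
      CPU_AND(&both, &proc, &gpu);                      /* intersection: 8-15      */
      printf("%d usable GPU-local CPUs\n", CPU_COUNT(&both));
      return 0;
    }

If the intersection is empty, the patch above leaves the process mask untouched rather than pinning the process to a foreign socket.
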
 ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
@@ -633,9 +907,8 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId

   // Make sure all host memory allocations are close to the GPU
   int cudaDev;
-  nvmlDevice_t nvmlDevice;
   CUDACHECK(cudaGetDevice(&cudaDev));
-  SetCpuAffinity(cudaDev, &nvmlDevice);
+  NCCLCHECK(setCpuAffinity(cudaDev));
   ncclResult_t res;

   NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
@@ -645,7 +918,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
   sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
   NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);

-  INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
+  INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->nvmlDev);
   return ncclSuccess;
 cleanup:
@@ -664,8 +937,6 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
   NCCLCHECK(ncclInit());
   if (myrank == 0) showVersion();

-  INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks);
-
   // Make sure the CUDA runtime is initialized.
   CUDACHECK(cudaFree(NULL));
@@ -685,7 +956,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
 }

 static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
-  struct ncclInfo* allInfo;
+  struct ncclPeerInfo* allInfo;
   NCCLCHECK(ncclCalloc(&allInfo, nranks));
   for (int rank=0; rank<nranks; rank++) {
     CUDACHECK(cudaSetDevice(devs[rank]));
@@ -699,12 +970,14 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
   for (int rank=0; rank<nranks; rank++)
     NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));

-  int* prev, *prevFinal, *next, *nextFinal;
-  NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXRINGS));
-  int nrings = MAXRINGS;
+  int* prev, *prevFinal, *next, *nextFinal, *treeIn, *treeOut;
+  NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
+  int nrings = MAXCHANNELS;
   int nthreads=0;
   int myCompCap = ncclCudaCompCap();
   int minCompCap = myCompCap;
@@ -713,7 +986,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
     int nringsRank;
     int nthreadsRank = getDefaultThreads();
     myCompCap = ncclCudaCompCap();
-    NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next));
+    NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
     nrings = std::min(nrings, nringsRank);
     nthreads = std::max(nthreads, nthreadsRank);
     minCompCap = std::min(minCompCap, myCompCap);
@@ -728,11 +1001,10 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
   free(prev);
   free(next);

-  INFO(NCCL_INIT,"Using %d threads", nthreads);
-  INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
+  INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees disabled", nthreads, minCompCap);

   int* rings;
-  NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
+  NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
   NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
   free(prevFinal);
   free(nextFinal);
@@ -741,7 +1013,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
   int threadThreshold = ncclThreadThreshold(minCompCap, 0);
   for (int rank=0; rank<nranks; rank++) {
-    comms[rank]->nRings = nrings;
+    comms[rank]->nChannels = nrings;
     comms[rank]->nThreads = nthreads;
     comms[rank]->threadThreshold = threadThreshold;
   }
@@ -751,26 +1023,32 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
     int* ringRanks = rings+r*nranks;
     for (int rank=0; rank<nranks; rank++) {
       CUDACHECK(cudaSetDevice(devs[rank]));
-      NCCLCHECK(setupRing(comms[rank], r, rank, nranks, ringRanks, allInfo, connect+2*rank));
-    }
-    // RingExchange connect information
-    for (int rank=0; rank<nranks; rank++) {
-      // Swap rank->prev and prevRank->next
-      struct ncclRing *ring = comms[rank]->rings+r;
-      int prevRank = ring->userRanks[nranks-1];
-      struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1;
-      struct ncclConnect* rankPrevConnect = connect+2*rank;
-      swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect));
+      struct ncclChannel* channel = comms[rank]->channels+r;
+      struct ncclRing *ring = &channel->ring;
+      NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn));
+      // Make sure we don't use trees, we cannot use them with initAll
+      comms[rank]->treeThreshold = 0;
+      int prev = channel->ring.prev = ring->userRanks[nranks-1];
+      int next = channel->ring.next = ring->userRanks[1];
+      struct ncclConnector* recv = &channel->peers[prev].recv;
+      struct ncclConnector* send = &channel->peers[next].send;
+      NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+rank*2+0, recv, channel->buffSize, channel->id));
+      NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id));
     }
     for (int rank=0; rank<nranks; rank++) {
       CUDACHECK(cudaSetDevice(devs[rank]));
-      struct ncclRing *ring = comms[rank]->rings+r;
-      NCCLCHECK(ring->send.transport->send.connect(connect+2*rank+1, &ring->send));
-      NCCLCHECK(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv));
+      struct ncclChannel* channel = comms[rank]->channels+r;
+      struct ncclRing *ring = &channel->ring;
+      struct ncclConnector* recv = &channel->peers[ring->prev].recv;
+      struct ncclConnector* send = &channel->peers[ring->next].send;
+      NCCLCHECK(recv->transportComm->connect(connect+ring->prev*2+1, recv));
+      NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
     }
   }
-  free(rings);
   free(allInfo);
+  free(rings);
+  free(treeIn);
+  free(treeOut);
   return ncclSuccess;
 }

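initTransportsAll backs the single-process, multi-GPU path. For reference, the public entry point it serves is used like this (error handling elided):

    ncclComm_t comms[4];
    int devs[4] = { 0, 1, 2, 3 };
    ncclCommInitAll(comms, 4, devs);   /* one communicator per local GPU, one thread */
    /* ... enqueue collectives on each comm ... */
    for (int i = 0; i < 4; i++) ncclCommDestroy(comms[i]);
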
@@ -794,7 +1072,6 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
   int savedDevice;
   int rank, cudaDev;
   ncclComm_t comm = NULL;
-  nvmlDevice_t nvmlDevice;
   int ncclDevList[ndev];
   for (int i=0; i<ndev; i++) {
     ncclDevList[i] = devlist ? devlist[i] : i;
   }
@@ -812,7 +1089,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
     cudaDev = ncclDevList[rank];
     CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);

-    SetCpuAffinity(cudaDev, &nvmlDevice);
+    NCCLCHECK(setCpuAffinity(cudaDev));

     NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
     comms[rank] = comm;
@@ -848,27 +1125,50 @@ final:
   return res;
 }

-NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
-ncclResult_t ncclCommDestroy(ncclComm_t comm) {
-  if (comm == NULL)
-    return ncclSuccess;
+static ncclResult_t commDestroy(ncclComm_t comm) {
   int savedDevice;
   CUDACHECK(cudaGetDevice(&savedDevice));
   int commDevice = comm->cudaDev;
+  int rank = comm->rank;

   if (savedDevice != commDevice) {
     CUDACHECK(cudaSetDevice(commDevice));
   }

+  TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError);
+
+  CUDACHECK(cudaStreamSynchronize(comm->groupStream));
+  NCCLCHECK(transportDestroyProxy(comm));
   NCCLCHECK(commFree(comm));

   if (savedDevice != commDevice)
     CUDACHECK(cudaSetDevice(savedDevice));

+  INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
+
   return ncclSuccess;
 }

+NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
+ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+  if (comm == NULL)
+    return ncclSuccess;
+
+  return commDestroy(comm);
+}
+
+NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
+ncclResult_t ncclCommAbort(ncclComm_t comm) {
+  if (comm == NULL)
+    return ncclSuccess;
+
+  // Ask anything that might still be running on the device to quit
+  *comm->abortFlag = 1;
+
+  return commDestroy(comm);
+}
+
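Both tear-down paths now funnel into commDestroy: ncclCommDestroy synchronizes the group stream first, while ncclCommAbort raises the abort flag so kernels and the proxy thread bail out before the same cleanup runs. A typical caller-side pattern (sketch):

    if (jobFailed) ncclCommAbort(comm);   /* don't wait for possibly hung device work */
    else           ncclCommDestroy(comm); /* graceful: waits for completion */
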
 NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
 const char* ncclGetErrorString(ncclResult_t code) {
   switch (code) {
@@ -882,6 +1182,39 @@ const char* ncclGetErrorString(ncclResult_t code) {
   }
 }

+NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
+  NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
+  NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
+
+  // Check device reported error
+  static ncclDevError_t printedDevErr = ncclDevSuccess;
+  switch(*comm->fatalDevError) {
+    case ncclDevSuccess :
+      break;
+    case ncclDevAssertedMismatch :
+      if (printedDevErr != ncclDevAssertedMismatch) {
+        WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+        printedDevErr = ncclDevAssertedMismatch;
+      }
+      if (comm->fatalError == ncclSuccess) {
+        comm->fatalError = ncclInvalidUsage;
+      }
+      break;
+    case ncclDevSuspectedMismatch :
+      if (printedDevErr != ncclDevSuspectedMismatch) {
+        WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+        printedDevErr = ncclDevSuspectedMismatch;
+      }
+      break;
+    default:
+      WARN("Unknown device error %d", *comm->fatalDevError);
+      return ncclInternalError;
+  }
+  *asyncError = comm->fatalError;
+  return ncclSuccess;
+}
+
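One plausible consumer of this call is a host-side polling loop that watches for asynchronous failures while NCCL work enqueued on a stream is still in flight (sketch):

    ncclResult_t asyncErr = ncclSuccess;
    while (cudaStreamQuery(stream) == cudaErrorNotReady) {
      ncclCommGetAsyncError(comm, &asyncErr);
      if (asyncErr != ncclSuccess) { ncclCommAbort(comm); break; }
    }
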
 NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
 ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
   NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
diff --git a/src/misc/checks.cu b/src/misc/checks.cu
new file mode 100644
index 0000000..a07e577
--- /dev/null
+++ b/src/misc/checks.cu
@@ -0,0 +1,69 @@
+/*************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "checks.h"
+
+static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
+  cudaPointerAttributes attr;
+  cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
+  if (err != cudaSuccess || attr.devicePointer == NULL) {
+    WARN("%s : %s is not a valid pointer", opname, ptrname);
+    return ncclInvalidArgument;
+  }
+#if CUDART_VERSION >= 10000
+  if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+#else
+  if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+#endif
+    WARN("%s : %s allocated on device %d mismatches with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
+    return ncclInvalidArgument;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
+  if (ptr == NULL) {
+    WARN("%s : %s argument is NULL", opname, ptrname);
+    return ncclInvalidArgument;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ArgsCheck(struct ncclInfo* info) {
+  NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
+  // First, the easy ones
+  if (info->root < 0 || info->root >= info->comm->nRanks) {
+    WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks);
+    return ncclInvalidArgument;
+  }
+  if (info->datatype < 0 || info->datatype >= ncclNumTypes) {
+    WARN("%s : invalid type %d", info->opName, info->datatype);
+    return ncclInvalidArgument;
+  }
+  // Type is OK, compute nbytes. Convert AllGather/Broadcast calls to chars.
+  info->nBytes = info->count * ncclTypeSize(info->datatype);
+  if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) {
+    info->count = info->nBytes;
+    info->datatype = ncclInt8;
+  }
+  if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
+
+  if (info->op < 0 || info->op >= ncclNumOps) {
+    WARN("%s : invalid reduction operation %d", info->opName, info->op);
+    return ncclInvalidArgument;
+  }
+
+  if (info->comm->checkPointers) {
+    // Check CUDA device pointers
+    if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
+      NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
+    }
+    if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
+      NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
+    }
+  }
+  return ncclSuccess;
+}
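Worked example of the conversion above: an ncclAllGather of count=1024 with ncclFloat gives nBytes = 1024 * 4 = 4096, so the call is rewritten internally as count=4096 of ncclInt8; and because the count is per rank, nBytes is then scaled by nranks (with 8 ranks, nBytes = 32768).
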
diff --git a/src/misc/enqueue.cu b/src/misc/enqueue.cu
deleted file mode 100644
index 80846dd..0000000
--- a/src/misc/enqueue.cu
+++ /dev/null
@@ -1,248 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "common_coll.h"
-#include "param.h"
-
-#include "collectives/collectives.h"
-
-#define NCCL_FUNC4(coll, op, dtype) \
-  (void*)NCCL_KERN_NAME(coll, op, dtype), \
-  (void*)NCCL_KERN_NAME(coll##LL, op, dtype)
-
-// Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(coll, op) \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, u8), \
-  (void*)NCCL_FUNC4(coll, op, i32), \
-  (void*)NCCL_FUNC4(coll, op, u32), \
-  (void*)NCCL_FUNC4(coll, op, i64), \
-  (void*)NCCL_FUNC4(coll, op, u64), \
-  (void*)NCCL_FUNC4(coll, op, f16), \
-  (void*)NCCL_FUNC4(coll, op, f32), \
-  (void*)NCCL_FUNC4(coll, op, f64)
-#define NCCL_FUNCS3B(coll, op) \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8), \
-  (void*)NCCL_FUNC4(coll, op, i8)
-
-// Must be consistent with ncclRedOp_t
-#define NCCL_FUNCS2A(coll) \
-  NCCL_FUNCS3A(coll, sum ), \
-  NCCL_FUNCS3A(coll, prod), \
-  NCCL_FUNCS3A(coll, max ), \
-  NCCL_FUNCS3A(coll, min )
-#define NCCL_FUNCS2B(coll) \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy), \
-  NCCL_FUNCS3B(coll, copy)
-
-// Must be consistent with the ncclFuncSet enum
-static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
-  NCCL_FUNCS2B(ncclBroadcast),
-  NCCL_FUNCS2A(ncclReduce),
-  NCCL_FUNCS2B(ncclAllGather),
-  NCCL_FUNCS2A(ncclReduceScatter),
-  NCCL_FUNCS2A(ncclAllReduce)
-};
-
-ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
-#if CUDART_VERSION >= 9000
-  if (cgMode & 0x01) {
-    CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
-          // These flags are to reduce the latency of using this API
-          cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync));
-    return ncclSuccess;
-  }
-#endif
-  int savedDev;
-  CUDACHECK(cudaGetDevice(&savedDev));
-  for (int i = 0; i < numDevices; i++) {
-    struct cudaLaunchParams* params = paramsList+i;
-    CUDACHECK(cudaSetDevice(cudaDevs[i]));
-    CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
-  }
-  CUDACHECK(cudaSetDevice(savedDev));
-  return ncclSuccess;
-}
-
-ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
-  params->gridDim.x = std::min((int) params->gridDim.x, comm->nRings);
-
-  // Set active = 2 for the last operation
-  for (int r=0; r<params->gridDim.x; r++) {
-    struct ncclRing* ring = comm->rings+r;
-    ring->collectives[(ring->collStart+ring->collCount-1)%NCCL_MAX_OPS].active = 2;
-  }
-
-  // Find the first operation, choose the kernel accordingly and pass it
-  // as the first argument.
-  struct ncclColl* coll = comm->rings[0].collectives+comm->rings[0].collStart;
-  memcpy(&comm->args, coll, sizeof(struct ncclColl));
-  // As we pass that coll directly, we can free it immediately.
-  coll->active = 0;
-
-  params->func = ncclKerns[coll->funcIndex];
-  return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
-  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  int val = *ptr;
-  bool done = false;
-  while (done == false) {
-    if (val >= comm->intraRanks) {
-      WARN("Trying to launch too many collectives");
-      return ncclInvalidUsage;
-    }
-    if (val+1 == comm->intraRanks) {
-      // Reset the barrier.
-      comm->intraBarrier[comm->intraPhase^1] = 0;
-      *isLast = 1;
-      return ncclSuccess;
-    }
-    done = __sync_bool_compare_and_swap(ptr, val, val+1);
-    val++;
-  }
-  *isLast = 0;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
-  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  int val = *ptr;
-  if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
-    WARN("Trying to launch too many collectives");
-    return ncclInternalError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
-  volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
-  while (*ptr < comm->intraRanks) pthread_yield();
-  comm->intraPhase ^= 1;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
-  if (comm->nRanks == 1) return ncclSuccess;
-  struct cudaLaunchParams* params = comm->myParams;
-
-  NCCLCHECK(setupLaunch(comm, params));
-
-  // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
-  if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
-    // Enqueue event in user stream
-    CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream));
-    // Create dependency between user stream and internal NCCL stream
-    CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
-    params->stream = comm->groupStream;
-  } else {
-    if (comm->userStream != params->stream) {
-      // Stream changed from last call, create dependency against last NCCL kernel launch
-      CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
-    }
-    params->stream = comm->userStream;
-  }
-
-  int isLast = 0;
-  NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
-
-  if (isLast) {
-    if (comm->launchMode == ncclComm::GROUP) {
-      // I'm the last. Launch all operations.
-      NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
-    }
-    NCCLCHECK(ncclCpuBarrierLast(comm));
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
-  if (comm->nRanks == 1) return ncclSuccess;
-  // We can't print the CG mode before the first barrier happened.
-  if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
-    *comm->intraCGMode ^= 0x10;
-    INFO(NCCL_INIT,"Launch mode %s%s%s",
-        comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
-        *comm->intraCGMode ? "/CGMD" : "",
-        (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
-  }
-
-  NCCLCHECK(ncclCpuBarrierOut(comm));
-
-  struct cudaLaunchParams *params = comm->myParams;
-  if (comm->launchMode == ncclComm::PARALLEL) {
-    CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
-  }
-  // Start the network proxies as soon as the kernel has been launched. We can't
-  // perform any CUDA call between the two or having a cudaFree between the CUDA
-  // launch and the transportStartProxies call could cause a deadlock.
-  // Also, starting the proxies after the CUDA launch seems to be better for
-  // performance (latency).
-  for (int r=0; r<params->gridDim.x; r++) {
-    struct ncclRing* ring = comm->rings+r;
-    ring->collStart = ring->collFifoTail;
-    ring->collCount = 0;
-  }
-  params->gridDim.x = params->blockDim.x = 0;
-  NCCLCHECK(transportStartProxies(comm));
-  return ncclSuccess;
-}
-
-ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
-  struct cudaLaunchParams *params = comm->myParams;
-  // Enqueue event after NCCL kernel
-  CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
-  // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
-  if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
-    // Create dependency between NCCL internal stream and user stream
-    CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
-  }
-  comm->userStreamSet = false;
-  return ncclSuccess;
-}
-
-ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
-    void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
-    ncclComm_t comm, cudaStream_t stream) {
-  if (comm == NULL) return ncclInvalidArgument;
-  // Launch asynchronously if needed
-  if (ncclAsyncMode()) {
-    ncclResult_t ret = ncclSuccess;
-    int savedDev = -1;
-    if (comm->checkPointers) {
-      CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
-      CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, end);
-    }
-    // Check arguments
-    NCCLCHECKGOTO(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName), ret, end);
-    // Always register comm even in case of error to make sure ncclGroupEnd
-    // cleans it up.
-    NCCLCHECK(ncclAsyncColl(comm));
-    NCCLCHECKGOTO(func(sendbuff, recvbuff, count, type, op, root, comm, stream), ret, end);
-end:
-    if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
-    ncclAsyncErrCheck(ret);
-    return ret;
-  } else {
-    NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName));
-    NCCLCHECK(func(sendbuff, recvbuff, count, type, op, root, comm, stream));
-    NCCLCHECK(ncclBarrierEnqueue(comm));
-    NCCLCHECK(ncclBarrierEnqueueWait(comm));
-    NCCLCHECK(ncclEnqueueEvents(comm));
-    return ncclSuccess;
-  }
-}
diff --git a/src/misc/group.cu b/src/misc/group.cu
index 1716a75..c428a22 100644
--- a/src/misc/group.cu
+++ b/src/misc/group.cu
@@ -179,13 +179,13 @@ group_cleanup:
     // an atomic operation, we need to cancel all operations.
     for (int i=0; i<ncclGroupIndex; i++) {
       struct ncclComm* comm = ncclGroupArgs[i].coll.comm;
-      for (int r=0; r<comm->nRings; r++) {
-        struct ncclRing* ring = comm->rings+r;
-        for (int i=0; i<ring->collCount; i++) {
-          ring->collectives[(ring->collStart + i)%NCCL_MAX_OPS].active = 0;
+      for (int c=0; c<comm->nChannels; c++) {
+        struct ncclChannel* channel = comm->channels+c;
+        for (int i=0; i<channel->collCount; i++) {
+          channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
         }
-        ring->collFifoTail = ring->collStart;
-        ring->collCount = 0;
+        channel->collFifoTail = channel->collStart;
+        channel->collCount = 0;
       }
       comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
       comm->userStreamSet = false;
diff --git a/src/misc/nvmlwrap.cu b/src/misc/nvmlwrap.cu
index d9407f4..635f332 100644
--- a/src/misc/nvmlwrap.cu
+++ b/src/misc/nvmlwrap.cu
@@ -16,14 +16,14 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
 static nvmlReturn_t (*nvmlInternalShutdown)(void);
 static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
 static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
-static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
 static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
 static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
 static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult);
+static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
+

 ncclResult_t wrapNvmlSymbols(void) {
   if (nvmlState == nvmlInitialized)
@@ -70,10 +70,9 @@ ncclResult_t wrapNvmlSymbols(void) {
   LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity);
-  LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
   LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
   LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
+  LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
   LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
@@ -86,9 +85,8 @@ teardown:
   nvmlInternalShutdown = NULL;
   nvmlInternalDeviceGetHandleByPciBusId = NULL;
   nvmlInternalDeviceGetIndex = NULL;
-  nvmlInternalDeviceSetCpuAffinity = NULL;
-  nvmlInternalDeviceClearCpuAffinity = NULL;
   nvmlInternalDeviceGetPciInfo = NULL;
+  nvmlInternalDeviceGetMinorNumber = NULL;
   nvmlInternalDeviceGetNvLinkState = NULL;
   nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
   nvmlInternalDeviceGetNvLinkCapability = NULL;
@@ -155,46 +153,28 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
   return ncclSuccess;
 }

-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
-  if (nvmlInternalDeviceSetCpuAffinity == NULL) {
-    WARN("lib wrapper not initialized.");
-    return ncclInternalError;
-  }
-  // Workaround : it seems SetCpuAffinity is not thread safe.
-  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
-  pthread_mutex_lock(&lock);
-  nvmlReturn_t ret = nvmlInternalDeviceSetCpuAffinity(device);
-  pthread_mutex_unlock(&lock);
-  if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceSetCpuAffinity() failed: %s ",
-        nvmlInternalErrorString(ret));
-    return ncclSystemError;
-  }
-  return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
-  if (nvmlInternalInit == NULL) {
+ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
+  if (nvmlInternalDeviceGetPciInfo == NULL) {
     WARN("lib wrapper not initialized.");
     return ncclInternalError;
   }
-  nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device);
+  nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
   if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceClearCpuAffinity() failed: %s ",
+    WARN("nvmlDeviceGetPciInfo() failed: %s ",
        nvmlInternalErrorString(ret));
     return ncclSystemError;
   }
   return ncclSuccess;
 }

-ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
-  if (nvmlInternalDeviceGetPciInfo == NULL) {
+ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
+  if (nvmlInternalDeviceGetMinorNumber == NULL) {
     WARN("lib wrapper not initialized.");
     return ncclInternalError;
   }
-  nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
+  nvmlReturn_t ret = nvmlInternalDeviceGetMinorNumber(device, minorNumber);
   if (ret != NVML_SUCCESS) {
-    WARN("nvmlDeviceGetPciInfo() failed: %s ",
+    WARN("nvmlDeviceGetMinorNumber() failed: %s ",
        nvmlInternalErrorString(ret));
     return ncclSystemError;
   }
@@ -208,8 +188,9 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
   }
   nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
   if (ret != NVML_SUCCESS) {
-    INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
-        nvmlInternalErrorString(ret));
+    if (ret != NVML_ERROR_NOT_SUPPORTED)
+      INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
+          nvmlInternalErrorString(ret));
     return ncclSystemError;
   }
   return ncclSuccess;
 }
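The LOAD_SYM/LOAD_SYM_OPTIONAL machinery reduces to dlopen/dlsym; a stripped-down sketch of the pattern (minimal error handling, not the real macros):

    #include <dlfcn.h>
    void* h = dlopen("libnvidia-ml.so.1", RTLD_NOW);
    nvmlReturn_t (*initFn)(void) =
        h ? (nvmlReturn_t (*)(void))dlsym(h, "nvmlInit") : NULL;
    if (initFn == NULL) { /* mandatory symbol: fail; optional: leave NULL, skip feature */ }

The optional variant is what lets the library keep working on older drivers whose NVML predates the NvLink queries.
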
diff --git a/src/misc/rings.cu b/src/misc/rings.cu
index a5d4616..a7b122c 100644
--- a/src/misc/rings.cu
+++ b/src/misc/rings.cu
@@ -160,7 +160,10 @@ static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankTo
     while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) {
       current[transport] = 0;
       transport++;
-      if (transport == NTRANSPORTS) { free(p2pConnected); return ncclInternalError; }
+      if (transport == NTRANSPORTS) {
+        WARN("Error : Could not find transport to connect next group\n");
+        free(p2pConnected);
+        return ncclInternalError; }
     }
     curRank = rank;
     current[transport]++;
@@ -179,8 +182,20 @@ ncclResult_t getEnvThreads(int* nthreads) {
   return ncclSuccess;
 }

+static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) {
+  if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS;
+  for (int r=nrings; r<newNrings; r++) {
+    for (int i=0; i<nranks; i++) {
+      a[r*nranks+i] = a[(r-nrings)*nranks+i];
+      b[r*nranks+i] = b[(r-nrings)*nranks+i];
+      c[r*nranks+i] = c[(r-nrings)*nranks+i];
+      d[r*nranks+i] = d[(r-nrings)*nranks+i];
+    }
+  }
+  return newNrings;
+}
 /* Main ring creation function */
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next) {
+ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) {
   *nrings = 0;
   if (nranks == 1) return ncclSuccess;

@@ -191,6 +206,12 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
   if (ret == ncclSuccess && *nrings > 0) {
     if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
     NCCLCHECK(getEnvThreads(nthreads));
+    for (int r = 0; r<*nrings; r++) {
+      for (int i = 0; i<nranks; i++) {
+        if (transports[i*nranks+prev[i]] == 2) treeIn[i] = 1;
+        if (transports[i*nranks+next[i]] == 2) treeOut[i] = 1;
+      }
+    }
     return ncclSuccess;
   }
   if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
@@ -210,8 +231,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
   int minScore = NCCL_MAX_SCORE;
   int nringsTmp;
   int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
-  NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXRINGS));
-  NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXRINGS));
+  NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS));
+  NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS));
   NCCLCHECK(ncclCalloc(&idxToRank, nranks));
   NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
   NCCLCHECK(ncclCalloc(&groups, nranks));
@@ -220,8 +241,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
   int nThreads;
   do {
     nThreads = *nthreads;
-    for (int i=0; i<nranks*MAXRINGS; i++) prevTmp[i] = nextTmp[i] = -1;
-    nringsTmp = MAXRINGS;
+    for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1;
+    nringsTmp = MAXCHANNELS;
     // Loop over transports to connect groups
     for (int t=NTRANSPORTS-1; t>=0; t--) {
       for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;
@@ -282,6 +303,11 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
       for (int i=0; i<nidx; i++) {
         if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
         if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
+        if (t == NTRANSPORTS-1) {
+          // Save node-level masters for trees
+          treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
+          treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
+        }
       }
     }
     //for (int r=0; r<nringsTmp; r++) {
@@ -316,6 +342,15 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*

   *nthreads = nThreads;

+  /* Duplicate the rings in case of multinode+NVLink */
+  int nnodes = 0;
+  for (int r=0; r<nranks; r++) nnodes += treeIn[r];
+  int nvlink;
+  NCCLCHECK(ncclNvlinkGpu(&nvlink));
+  if (nnodes > 1 && nvlink) {
+    *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut);
+  }
+
   if (*nrings == 0) {
     WARN("Could not create rings, falling back on simple ring");
     *nrings = 1;
@@ -329,9 +364,9 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
     if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
     minNrings = 0;
   }
-  if (minNrings > MAXRINGS) {
-    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXRINGS, MAXRINGS);
-    minNrings = MAXRINGS;
+  if (minNrings > MAXCHANNELS) {
+    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS);
+    minNrings = MAXCHANNELS;
   }
   if (maxNrings > 0 && maxNrings <= *nrings) {
     if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
@@ -341,13 +376,7 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
   if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
   if (minNrings > 0 && minNrings > *nrings) {
     if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
-    for (int r=*nrings; r<MAXRINGS && r <minNrings; r++) {
-      for (int i=0; i<nranks; i++) {
-        prev[r*nranks+i] = prev[(r-*nrings)*nranks+i];
-        next[r*nranks+i] = next[(r-*nrings)*nranks+i];
-      }
-    }
-    *nrings = minNrings;
+    *nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut);
   }
 }
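For context, copyRings only fills new slots by repeating existing ones: with 2 rings and a requested minimum of 5, rings 2 and 3 copy rings 0 and 1, and ring 4 copies ring 2 (itself a copy of ring 0). The user-facing knobs referenced here are the existing NCCL_MIN_NRINGS and NCCL_MAX_NRINGS environment variables, e.g. running with NCCL_MIN_NRINGS=4 forces duplication up to 4 rings while NCCL_MAX_NRINGS=2 caps the count at 2.
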
diff --git a/src/misc/trees.cu b/src/misc/trees.cu
new file mode 100644
index 0000000..e53ea0b
--- /dev/null
+++ b/src/misc/trees.cu
@@ -0,0 +1,108 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "net.h"
+#include "param.h"
+
+#define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank)
+
+/* Btree which alternates leaves and nodes.
+ * Assumes root is 0, which conveniently builds a tree on powers of two,
+ * (because we have pow2-1 ranks) which lets us manipulate bits.
+ * Find first non-zero bit, then :
+ * Find the parent :
+ *   xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below)
+ *   xx11[0] -> xx10[0] (3,7,11 below)
+ * Find the children :
+ *   xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13)
+ *   xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13)
+ *
+ * Illustration :
+ * 0---------------8
+ *          ______/ \______
+ *         4               12
+ *        / \             /  \
+ *       2   6          10    \
+ *      / \ / \        /  \    \
+ *     1   3 5  7     9    11   13
+ */
+ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
+  int up, down0, down1;
+  int bit;
+  for (bit=1; bit<nranks; bit<<=1) {
+    if (bit & rank) break;
+  }
+
+  if (rank == 0) {
+    *u = -1;
+    *d0 = nranks > 1 ? bit >> 1 : -1;
+    *d1 = -1;
+    return ncclSuccess;
+  }
+
+  up = (rank ^ bit) | (bit << 1);
+  if (up >= nranks) up = (rank ^ bit);
+  *u = up;
+
+  int lowbit = bit >> 1;
+  // down0 is always within bounds
+  down0 = lowbit == 0 ? -1 : rank-lowbit;
+
+  down1 = lowbit == 0 ? -1 : rank+lowbit;
+  // Make sure down1 is within bounds
+  while (down1 >= nranks) {
+    down1 = lowbit == 0 ? -1 : rank+lowbit;
+    lowbit >>= 1;
+  }
+  *d0 = down0; *d1 = down1;
+
+  return ncclSuccess;
+}
+
+/* Build a double binary tree. Take the previous tree for the first tree.
+ * For the second tree, we use a mirror tree (if nranks is odd)
+ *
+ * 8---------0---------5
+ *  ______/ \______       _____/ \______
+ * 4               12    1               9
+ * / \            /      / \            / \
+ * 2   6        10       3   7        10
+ * / \ / \     /  \      / \ / \      /  \
+ * 1  3 5  7  9    11    2  4 6  8   11   12
+ *
+ * or shift it by one rank (if nranks is even)
+ *
+ * 8---------0--------------9
+ *  ______/ \        ______/ \
+ * 4         \      5         \
+ * / \         \    / \         \
+ * 2   6        10  3   7        11
+ * / \ / \     / \  / \ / \     / \
+ * 1  3 5  7  9  11 2  4 6  8  10  1
+ */
+ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* s1, int* d1_0, int* d1_1) {
+  // First tree ... use a btree
+  ncclGetBtree(nranks, rank, s0, d0_0, d0_1);
+  // Second tree ... mirror or shift
+  if (nranks % 2 == 0) {
+    // shift
+    int shiftrank = (rank-1+nranks) % nranks;
+    int u, d0, d1;
+    ncclGetBtree(nranks, shiftrank, &u, &d0, &d1);
+    *s1 = u == -1 ? -1 : (u+1) % nranks;
+    *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
+    *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks;
+  } else {
+    // mirror
+    int u, d0, d1;
+    ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1);
+    *s1 = u == -1 ? -1 : nranks-1-u;
+    *d1_0 = d0 == -1 ? -1 : nranks-1-d0;
+    *d1_1 = d1 == -1 ? -1 : nranks-1-d1;
+  }
+  return ncclSuccess;
+}
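Worked example of ncclGetBtree against the first picture (nranks=14, rank=12, binary 1100): the first set bit gives bit=4, so the parent is (12^4)|(4<<1) = 8. Then lowbit=2: d0 = 12-2 = 10, while d1 = 12+2 = 14 is out of bounds, so lowbit halves and d1 = 12+1 = 13. Rank 12 therefore connects up to 8 and down to 10 and 13, matching the illustration.
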
diff --git a/src/misc/utils.cu b/src/misc/utils.cu
index d8e3aec..c618e71 100644
--- a/src/misc/utils.cu
+++ b/src/misc/utils.cu
@@ -11,6 +11,24 @@
 #include <string.h>
 #include <stdarg.h>

+#include "nvmlwrap.h"
+#include "core.h"
+
+// Convert a logical cudaDev index to the NVML device minor number
+ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
+  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+  nvmlDevice_t nvmlDevice;
+  unsigned int dev;
+  *nvmlDev = -1;
+  CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice));
+  NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev));
+
+  *nvmlDev = dev;
+
+  return ncclSuccess;
+}
+
 ncclResult_t getHostName(char* hostname, int maxlen) {
   if (gethostname(hostname, maxlen) != 0) {
     strncpy(hostname, "unknown", maxlen);
diff --git a/src/nccl.h.in b/src/nccl.h.in
index 7227625..985274e 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -68,14 +68,24 @@ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId
 ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
 ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);

-/* Frees resources associated with communicator object. */
+/* Frees resources associated with communicator object, but waits for any operations
+ * that might still be running on the device. */
 ncclResult_t ncclCommDestroy(ncclComm_t comm);
 ncclResult_t pncclCommDestroy(ncclComm_t comm);

+/* Frees resources associated with communicator object and aborts any operations
+ * that might still be running on the device. */
+ncclResult_t ncclCommAbort(ncclComm_t comm);
+ncclResult_t pncclCommAbort(ncclComm_t comm);
+
 /* Returns a human-readable error message. */
 const char* ncclGetErrorString(ncclResult_t result);
 const char* pncclGetErrorString(ncclResult_t result);

+/* Checks whether the comm has encountered any asynchronous errors */
+ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+
 /* Gets the number of ranks in the communicator clique. */
 ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
 ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
diff --git a/src/ring.cu b/src/ring.cu
deleted file mode 100644
index fede793..0000000
--- a/src/ring.cu
+++ /dev/null
@@ -1,70 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "ring.h"
-#include "param.h"
-
-NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
-
-ncclResult_t initRing(struct ncclComm* comm, int ringid) {
-  struct ncclRing* ring = comm->rings+ringid;
-  ring->id = ringid;
-
-  // Setup intermediate buffering
-  ring->buffSize = ncclParamBuffsize();
-
-  const int sendSize = ring->devMemSendSize = sizeof(struct ncclSendMem);
-  struct ncclSendMem* sendMem;
-  NCCLCHECK(ncclCudaCalloc((char**)&sendMem, sendSize));
-  ring->devMemSend = sendMem;
-
-  const int recvSize = ring->devMemRecvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
-  struct ncclRecvMem* recvMem;
-  NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize));
-  ring->devMemRecv = recvMem;
-
-  TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
-
-  // Pre-configure send/recv pointers. Those are the default, they may change later.
-  ring->recv.conn.buff = recvMem->buff;
-  ring->recv.conn.llBuff = recvMem->llBuff;
-  ring->recv.conn.tail = &recvMem->tail;
-  ring->recv.conn.opCount = &recvMem->opCount;
-  ring->recv.conn.direct = 0;
-  ring->send.conn.head = &sendMem->head;
-  ring->send.conn.llHead = &sendMem->llHead;
-  ring->send.conn.direct = 0;
-  ring->send.conn.llStep = 0;
-  ring->send.conn.llLastCleaning = 0;
-
-  // Ring index to user rank table.
-  NCCLCHECK(ncclCudaCalloc(&ring->devUserRanks, comm->nRanks));
-  NCCLCHECK(ncclCalloc(&ring->userRanks, comm->nRanks));
-
-  // Per-ring operation list.
-  NCCLCHECK(ncclCudaHostAlloc((void**)&ring->collectives, (void**)&ring->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
-  return ncclSuccess;
-}
-
-ncclResult_t freeRing(struct ncclRing* ring) {
-  // Intermediate buffering
-  CUDACHECK(cudaFree(ring->devMemSend));
-  CUDACHECK(cudaFree(ring->devMemRecv));
-
-  // Index to rank table
-  free(ring->userRanks);
-  CUDACHECK(cudaFree(ring->devUserRanks));
-
-  // Operation list
-  NCCLCHECK(ncclCudaHostFree(ring->collectives));
-
-  // Free transport proxy resources
-  if (ring->send.transportResources) NCCLCHECK(ring->send.transport->send.free(ring->send.transportResources));
-  NCCLCHECK(transportDestroyProxy(&ring->send));
-  if (ring->recv.transportResources) NCCLCHECK(ring->recv.transport->recv.free(ring->recv.transportResources));
-  NCCLCHECK(transportDestroyProxy(&ring->recv));
-  return ncclSuccess;
-}
diff --git a/src/transport.cu b/src/transport.cu
index 7c13d5c..1436a5b 100644
--- a/src/transport.cu
+++ b/src/transport.cu
@@ -1,11 +1,10 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

 #include "core.h"
-#include "common_coll.h"

 extern struct ncclTransport p2pTransport;
 extern struct ncclTransport shmTransport;
@@ -17,74 +16,16 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = {
   netTransport,
 };

-static void FifoPullArgs(struct transportProxyInfo* info, struct ncclProxyArgs *args) {
-  struct ncclProxyArgs *fifoArgs = info->argsFifo + (info->argsFifoHead % TRANSPORT_PROXY_FIFO_SIZE);
-  pthread_mutex_lock(&info->mutex);
-  while (fifoArgs->active == 0)
-    pthread_cond_wait(&info->cond, &info->mutex);
-  __sync_synchronize();
-  memcpy(args, fifoArgs, sizeof(struct ncclProxyArgs));
-  __sync_synchronize();
-  fifoArgs->active = 0;
-  pthread_cond_signal(&info->cond);
-  pthread_mutex_unlock(&info->mutex);
-  info->argsFifoHead++;
-}
-
-static struct ncclProxyArgs* FifoGetNextArgs(struct transportProxyInfo* info) {
-  if (info == NULL) return NULL;
-  struct ncclProxyArgs* fifoArgs = info->argsFifo + (info->argsFifoTail % TRANSPORT_PROXY_FIFO_SIZE);
-  pthread_mutex_lock(&info->mutex);
-  while (fifoArgs->active == 1)
-    pthread_cond_wait(&info->cond, &info->mutex);
-  pthread_mutex_unlock(&info->mutex);
-  info->argsFifoTail++;
-  return fifoArgs;
-}
-
-static void FifoPushArgs(struct transportProxyInfo* info) {
-  if (info == NULL) return;
-
-  struct ncclProxyArgs* fifoArgs = info->argsFifo + ((info->argsFifoTail-1) % TRANSPORT_PROXY_FIFO_SIZE);
-  if (fifoArgs->active == 0) return;
-
-  pthread_mutex_lock(&info->mutex);
-  pthread_cond_signal(&info->cond);
-  pthread_mutex_unlock(&info->mutex);
-}
-
-static void WaitProxyReady(struct transportProxyInfo* info) {
-  pthread_mutex_lock(&info->mutex);
-  while (info->proxyReady == 0)
-    pthread_cond_wait(&info->cond, &info->mutex);
-  pthread_mutex_unlock(&info->mutex);
-}
-
-static void SetProxyReady(struct transportProxyInfo* info) {
-  pthread_mutex_lock(&info->mutex);
-  info->proxyReady = 1;
-  pthread_cond_signal(&info->cond);
-  pthread_mutex_unlock(&info->mutex);
-}
-
-static void StopProxy(struct transportProxyInfo* info) {
-  struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);
-  fifoArgs->active = -1;
-  FifoPushArgs(info);
-}
-
 #define RECV 0
 #define SEND 1

-static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks) {
-  enum proxyMode mode = proxyPatternMode(pattern);
-  if (mode == proxyRing) return true;
+static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
+  if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;

   /* In chains, one rank does not need a proxy. Let's figure out which one it is */
-  int root = proxyPatternRoot(pattern);
   // Which index in the reorganized rings should we compare root against */
   const int myrank = 0, nextrank = 1, prevrank = nranks-1;
-  int index = mode == proxyFrom ?
+  int index = pattern == ncclPatternPipelineFrom ?
       /*                            no recv /  no send    if root = */
       /* bcast  */ (type == RECV ?   myrank : nextrank ):
       /* reduce */ (type == RECV ? prevrank :   myrank );
@@ -92,96 +33,216 @@ static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks)
   return (root != rank);
 }

-static void SaveProxy(struct ncclConnector* connector, struct ncclProxyArgs* args, int needProxy) {
-  struct transportProxyInfo* info = connector->proxyInfo;
-  if (info == NULL) return;
-  struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);
-  args->needProxy = needProxy;
-  __sync_synchronize();
-  memcpy(fifoArgs, args, sizeof(struct ncclProxyArgs));
-  __sync_synchronize();
-  fifoArgs->active = 1;
+enum { proxyRecv=0, proxySend=1 };
+
+#define PROXYARGS_ALLOCATE_SIZE 32
+struct ncclProxyPool {
+  struct ncclProxyPool *next;
+  struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
+};
+
+ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
+  struct ncclProxyState* state = &comm->proxyState;
+  struct ncclProxyArgs* elem;
+  pthread_mutex_lock(&state->mutex);
+  if (state->pool == NULL) {
+    // Allocate a new pool of elements
+    struct ncclProxyPool* newPool;
+    NCCLCHECK(ncclCalloc(&newPool, 1));
+    struct ncclProxyArgs* newElems = newPool->elems;
+    // Chain newly allocated elements
+    for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
+      if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
+    }
+    // Add them all to the pool list
+    state->pool = newElems;
+    // Save the pool memory block for later resource release
+    newPool->next = state->pools;
+    state->pools = newPool;
+  }
+  elem = state->pool;
+  state->pool = state->pool->next;
+  pthread_mutex_unlock(&state->mutex);
+  elem->next = elem->nextPeer = NULL;
+  *argsptr = elem;
+  return ncclSuccess;
 }
-ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t nbytes, int pattern, struct ncclComm* comm) {
-  int llMode, nrings, nthreads;
-  ncclGetCollResource(comm, nbytes, &nrings, &nthreads, &llMode);
-  nbytes = llMode ? nbytes * 2 : nbytes;
-  substeps = llMode ? 1 : substeps;
-  subchunks = llMode ? NCCL_LL_CHUNKS : subchunks;
-  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : comm->rings[0].buffSize;
-
-  int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow
-  int nsteps = nstepsPerRound * nrounds * substeps;
-  TRACE(NCCL_NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm);
-  TRACE(NCCL_NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
-  for (int r=0; r<nrings; r++) {
-    struct ncclRing* ring = comm->rings+((comm->myParams->gridDim.x+r)%comm->nRings);
-    struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 };
-    SaveProxy(&ring->recv, &args, NeedProxy(RECV, pattern, ring, comm->nRanks));
-    SaveProxy(&ring->send, &args, NeedProxy(SEND, pattern, ring, comm->nRanks));
+static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) {
+  struct ncclComm* comm = connector->comm;
+  struct ncclProxyState* state = &comm->proxyState;
+  pthread_mutex_lock(&state->mutex);
+  if (connector->proxyAppend == NULL) {
+    // Nothing running for that peer. Add to the circular list
+    if (state->ops == NULL) {
+      // Create the list
+      args->next = args;
+      state->ops = args;
+    } else {
+      // Insert element in the list
+      args->next = state->ops->next;
+      state->ops->next = args;
+    }
+    connector->proxyAppend = args;
+  } else {
+    // There is an active operation already for that peer.
+    // Add it to the per-peer list
+    connector->proxyAppend->nextPeer = args;
+    connector->proxyAppend = args;
   }
+  pthread_mutex_unlock(&state->mutex);
+}
+
+template <int type>
+static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
+  if (peer < 0) return ncclSuccess;
+
+  struct ncclPeer* peerComm = args->channel->peers+peer;
+  struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
+  if (connector->transportComm->proxy == NULL) return ncclSuccess;
+
+  struct ncclProxyArgs* op;
+  NCCLCHECK(transportAllocateProxyArgs(connector->comm, &op));
+  memcpy(op, args, sizeof(struct ncclProxyArgs));
+  op->connector = connector;
+  op->progress = connector->transportComm->proxy;
+  op->state = ncclProxyOpReady;
+  ProxyAppend(connector, op);
   return ncclSuccess;
 }

-ncclResult_t transportStartProxies(ncclComm* comm) {
-  for (int r=0; r<comm->nRings; r++) {
-    FifoPushArgs(comm->rings[r].send.proxyInfo);
-    FifoPushArgs(comm->rings[r].recv.proxyInfo);
+ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
+  if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
+    struct ncclRing* ring = &args->channel->ring;
+    if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args));
+    if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxySend>(ring->next, args));
+  }
+  if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
+    // Tree up
+    struct ncclTree* tree = &args->channel->tree;
+    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
+    NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+  }
+  if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
+    // Tree down
+    struct ncclTree* tree = &args->channel->tree;
+    for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
+    NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
   }
-  pthread_yield(); // Let other threads run
   return ncclSuccess;
 }

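transportAllocateProxyArgs above is a classic intrusive free list refilled one block at a time; the pattern in isolation (names hypothetical):

    #include <stdlib.h>

    struct Elem { struct Elem* next; /* payload fields */ };
    static struct Elem* freeList = NULL;

    struct Elem* elemAlloc(void) {
      if (freeList == NULL) {
        /* one allocation covers 32 elements, chained together */
        struct Elem* block = (struct Elem*)calloc(32, sizeof(struct Elem));
        for (int i = 0; i < 31; i++) block[i].next = &block[i+1];
        freeList = block;
      }
      struct Elem* e = freeList;
      freeList = e->next;
      return e;
    }
    /* release is a push: e->next = freeList; freeList = e; */

Keeping whole blocks on a separate list (state->pools) is what lets transportDestroyProxy release the memory later.
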
-void* persistentThread(void *opaqueInfo) {
-  struct transportProxyInfo* info = (struct transportProxyInfo*)opaqueInfo;
-  // We need to initialize the context before launching any NCCL cuda kernel,
-  // otherwise we would create it during the first cudaMemcpyAsync inside the
-  // proxy function and that would cause a deadlock
-  cudaSetDevice(info->comm->cudaDev);
-  // Signal the main thread the context is created and it can proceed.
-  SetProxyReady(info);
+void* persistentThread(void *comm_) {
+  struct ncclComm* comm = (struct ncclComm*)comm_;
+  struct ncclProxyState* state = &comm->proxyState;
+  struct ncclProxyArgs* op = NULL;
+  ncclResult_t ret = ncclSuccess;
+  int idle = 1;
+  int idleSpin = 0;
   while (1) {
-    struct ncclProxyArgs args;
-    FifoPullArgs(info, &args);
-    if (args.active == -1) {
-      // Main thread asked to stop
+    do {
+      if (*comm->abortFlag) return NULL;
+      if (op == NULL) {
+        pthread_mutex_lock(&state->mutex);
+        op = state->ops;
+        if (op == NULL) {
+          if (state->stop) {
+            // No more commands to process and proxy has been requested to stop
+            pthread_mutex_unlock(&state->mutex);
+            return NULL;
+          }
+          pthread_cond_wait(&state->cond, &state->mutex);
+        }
+        pthread_mutex_unlock(&state->mutex);
+      }
+    } while (op == NULL);
+    op->idle = 0;
+    if (op->state != ncclProxyOpNone) ret = op->progress(op);
+    if (ret != ncclSuccess) {
+      comm->fatalError = ret;
+      INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
       return NULL;
     }
-    ncclResult_t res = info->func(&args);
-    if (res != ncclSuccess) {
-      WARN("%s:%d -> %d [Proxy thread error]", __FILE__, __LINE__, res);
+    idle &= op->idle;
+    pthread_mutex_lock(&state->mutex);
+    if (!idle) idleSpin = 0;
+    struct ncclProxyArgs *next = op->next;
+    if (next->state == ncclProxyOpNone) {
+      struct ncclProxyArgs *freeOp = next;
+      if (next->nextPeer) {
+        // Replace next by its next per-peer element.
+        next = next->nextPeer;
+        if (op != freeOp) {
+          next->next = freeOp->next;
+          op->next = next;
+        } else {
+          next->next = next;
+        }
+      } else {
+        // Remove next from circular list
+        next->connector->proxyAppend = NULL;
+        if (op != freeOp) {
+          next = next->next;
+          op->next = next;
+        } else {
+          next = NULL;
+        }
+      }
+      if (freeOp == state->ops) state->ops = next;
+      freeOp->next = state->pool;
+      state->pool = freeOp;
     }
+    op = next;
+    if (op == state->ops) {
+      if (idle == 1) {
+        if (++idleSpin == 10) {
+          sched_yield();
+          idleSpin = 0;
+        }
+      }
+      idle = 1;
+    }
+    pthread_mutex_unlock(&state->mutex);
   }
 }
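The loop above round-robins over the circular list and only yields the CPU after ten consecutive fully idle sweeps, trading a little CPU for lower latency when work is flowing. A minimal sketch of that backoff pattern, with a generic poll() standing in for op->progress():

    #include <sched.h>

    // Backoff sketch: spin while work arrives, yield after N fully idle sweeps.
    template <typename Poll>
    void progressLoop(Poll poll, volatile bool* stop) {
      int idleSpin = 0;
      while (!*stop) {
        bool idle = poll();            // returns true when nothing advanced
        if (!idle) { idleSpin = 0; continue; }
        if (++idleSpin == 10) {        // same threshold as the proxy thread
          sched_yield();               // let other host threads run
          idleSpin = 0;
        }
      }
    }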
-ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm) {
-  struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send;
-  threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy);
-  if (proxyfunc) {
-    TRACE(NCCL_NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
-    struct transportProxyInfo* info;
-    NCCLCHECK(ncclCalloc(&info, 1));
-    connector->proxyInfo = info;
-    info->comm = comm;
-    info->cond = PTHREAD_COND_INITIALIZER;
-    info->mutex = PTHREAD_MUTEX_INITIALIZER;
-    info->func = proxyfunc;
-    info->argsFifoHead = info->argsFifoTail = 0;
-    info->proxyReady = 0;
-    pthread_create(&connector->proxyInfo->thread, NULL, persistentThread, info);
-    // Wait for thread to initialize its CUDA context.
-    WaitProxyReady(info);
+ncclResult_t transportStartProxy(struct ncclComm* comm) {
+  pthread_mutex_lock(&comm->proxyState.mutex);
+  if (comm->proxyState.ops != NULL)
+    pthread_cond_signal(&comm->proxyState.cond);
+  pthread_mutex_unlock(&comm->proxyState.mutex);
+  return ncclSuccess;
+}
+
+ncclResult_t transportCreateProxy(struct ncclComm* comm) {
+  if (!comm->proxyThread) {
+    comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
+    comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER;
+    comm->proxyState.ops = NULL;
+    pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
   }
   return ncclSuccess;
 }
 
-ncclResult_t transportDestroyProxy(struct ncclConnector* connector) {
-  if (connector->proxyInfo) {
-    StopProxy(connector->proxyInfo);
-    pthread_join(connector->proxyInfo->thread, NULL);
-    free(connector->proxyInfo);
-    connector->proxyInfo = NULL;
+ncclResult_t transportDestroyProxy(struct ncclComm* comm) {
+  struct ncclProxyState* state = &comm->proxyState;
+
+  // Request the proxy to stop and then wake it
+  pthread_mutex_lock(&state->mutex);
+  state->stop = true;
+  pthread_cond_signal(&state->cond);
+  pthread_mutex_unlock(&state->mutex);
+  if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
+
+  // Free any memory allocated for the proxy arg pools
+  pthread_mutex_lock(&state->mutex);
+  struct ncclProxyState* proxyState = &comm->proxyState;
+  while (proxyState->pools != NULL) {
+    struct ncclProxyPool *next = proxyState->pools->next;
+    free(proxyState->pools);
+    proxyState->pools = next;
   }
+  pthread_mutex_unlock(&state->mutex);
+
   return ncclSuccess;
 }
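The shutdown handshake above is the classic condition-variable pattern: set a stop flag under the mutex, signal, then join, so the worker cannot miss the request while blocked in pthread_cond_wait. Sketched in isolation, with a hypothetical Worker type rather than NCCL's proxy state:

    #include <pthread.h>
    #include <stdbool.h>

    // Shutdown handshake sketch: flag + signal under the lock, then join.
    struct Worker {
      pthread_mutex_t mutex;
      pthread_cond_t cond;
      bool stop;
      pthread_t thread;
    };

    void workerStop(struct Worker* w) {
      pthread_mutex_lock(&w->mutex);
      w->stop = true;                  // worker re-checks this after every wait
      pthread_cond_signal(&w->cond);   // wake it if it is blocked in wait
      pthread_mutex_unlock(&w->mutex);
      pthread_join(w->thread, NULL);   // worker exits once its queue drains
    }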
diff --git a/src/transport/net.cu b/src/transport/net.cu
index 9c366b3..06a6e23 100644
--- a/src/transport/net.cu
+++ b/src/transport/net.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -9,11 +9,17 @@
 #include "nvmlwrap.h"
 #include "net.h"
 #include "param.h"
-#include "nvlink.h"
+#include "topo.h"
 #include <cuda_runtime.h>
 #include <assert.h>
 
 #define NET_MAX_IFS 16
+#define NET_MAX_GPUS 32
+
+// Cache GPU-NIC distances to avoid re-computing them
+#define NET_TVALUE_UNKNOWN 0ULL
+static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN };
+static int ncclNetNDev;
 
 // We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit)
 #define NET_BITS_PER_IF 3
@@ -28,13 +34,9 @@ static ncclTvalue_t getTvalue(short* distances, int ndev) {
   }
   return tvalue;
 }
-
-struct netInfo {
-  int rank;
-  int ndev;
-  ncclTvalue_t tValue;
-  short distances[NET_MAX_IFS];
-};
+static int getScore(ncclTvalue_t tvalue, int dev) {
+  return (tvalue >> (dev*NET_BITS_PER_IF)) & NET_BITS_PER_IF_MASK;
+}
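With NET_BITS_PER_IF = 3, each NIC contributes a 3-bit score at bit offset dev*3, so a 64-bit ncclTvalue_t can rank up to 21 interfaces. A small worked example of the pack/unpack, assuming NET_BITS_PER_IF_MASK is (1<<NET_BITS_PER_IF)-1 = 0x7 to match the shift-and-mask in getScore:

    #include <cstdint>
    #include <cstdio>

    // Worked example of the 3-bit-per-interface encoding used above.
    int main() {
      const int bits = 3, mask = (1 << bits) - 1;     // mask == 0x7
      int scores[3] = {5, 2, 7};                      // hypothetical per-NIC scores
      uint64_t tvalue = 0;
      for (int d = 0; d < 3; d++)
        tvalue |= (uint64_t)(scores[d] & mask) << (d * bits);
      // tvalue == 5 + 2*8 + 7*64 == 469 (binary 111 010 101)
      for (int d = 0; d < 3; d++)
        printf("dev %d score %d\n", d, (int)((tvalue >> (d * bits)) & mask));
      return 0;
    }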
 
 struct netConnectInfo {
   ncclNetHandle_t netHandle;
 };
@@ -46,11 +48,13 @@ struct netSendResources {
   struct ncclRecvMem* hostRecvMem;
   struct ncclSendMem* devHostSendMem;
   struct ncclRecvMem* devHostRecvMem;
-  struct ncclSendMem* hostDevMem;
   int netDev;
   int useGdr;
-  struct ncclRecvMem* devNetMem;
-  uint64_t llStep;
+  int buffSize;
+  void* mhandle;
+  void* llMhandle;
+  struct ncclRecvMem* devRecvMem;
+  uint64_t step;
   uint64_t llLastCleaning;
 };
@@ -61,50 +65,70 @@ struct netRecvResources {
   struct ncclRecvMem* hostRecvMem;
   struct ncclSendMem* devHostSendMem;
   struct ncclRecvMem* devHostRecvMem;
-  struct ncclRecvMem* hostDevMem;
   int netDev;
   int useGdr;
-  uint64_t llStep;
+  int buffSize;
+  void* mhandle;
+  void* llMhandle;
+  struct ncclRecvMem* devRecvMem;
+  uint64_t step;
   uint64_t llLastCleaning;
 };
 
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
-  struct netInfo* info = (struct netInfo*)opaqueInfo;
-  static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large");
-  info->rank = rank;
-  NCCLCHECK(ncclNetDevices(&info->ndev));
-  if (info->ndev == 0) {
+static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
+  char* cudaPath = NULL;
+  char* nicPath = NULL;
+  ncclResult_t err;
+  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+  err = ncclNetPciPath(dev, &nicPath);
+  *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
+  if (nicPath) free(nicPath);
+  if (cudaPath) free(cudaPath);
+  return ncclSuccess;
+}
+
+static ncclResult_t netDevices(int* ndev, short** distances) {
+  NCCLCHECK(ncclNetDevices(ndev));
+  if (*ndev == 0) {
     WARN("Error : Network returned 0 device");
     return ncclSystemError;
   }
-  if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS;
+  if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS;
 
-  // Find distance with current GPU
-  int cudaDev;
-  cudaGetDevice(&cudaDev);
-  char* cudaPath;
-  NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+  *distances = (short*)malloc(*ndev*sizeof(short));
+  if (*distances == NULL) return ncclSystemError;
+  // Find distance with current GPU
+  int cudaDev, nvmlDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
   char line[1024];
-  sprintf(line, "CUDA Dev %d, %s NIC distance : ", cudaDev, ncclNetName());
-  for (int d=0; d<info->ndev; d++) {
-    char* nicPath;
-    ncclResult_t err = ncclNetPciPath(d, &nicPath);
-    info->distances[d] = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
-    sprintf(line+strlen(line), " %s", pathDists[info->distances[d]]);
-    if (err == ncclSuccess) free(nicPath);
+  sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName());
+  for (int d=0; d<*ndev; d++) {
+    NCCLCHECK(netDistance(cudaDev, d, *distances+d));
+    sprintf(line+strlen(line), " %s", pathDists[(*distances)[d]]);
   }
   INFO(NCCL_INIT|NCCL_NET, "%s", line);
-  free(cudaPath);
   return ncclSuccess;
 }
 
 /* Determine if we can communicate with the peer */
-ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
-  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  ret[0] = getTvalue(myInfo->distances, myInfo->ndev);
+ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  ret[0] = ncclNetTvalues[cudaDev];
+  if (ret[0] == NET_TVALUE_UNKNOWN) {
+    if (cudaDev >= NET_MAX_GPUS) {
+      WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS);
+      return ncclInternalError;
+    }
+    int nDev;
+    short* distances;
+    NCCLCHECK(netDevices(&nDev, &distances));
+    ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev);
+    ncclNetNDev = nDev;
+    free(distances);
+  }
   return ncclSuccess;
 }
@@ -196,45 +220,51 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
   return ncclSuccess;
 }
 
-int getDev(int ringId, int nDev, short* distances) {
-  int minDistance = PATH_SOC;
-  for (int d=0; d<nDev; d++) if (distances[d] < minDistance) minDistance = distances[d];
+int getDev(int cudaDev, int ringId) {
+  ncclTvalue_t tvalues = ncclNetTvalues[cudaDev];
+
+  int dev = 0;
+  int maxScore = 0;
+  for (int d=0; d<ncclNetNDev; d++) if (getScore(tvalues,d) > maxScore) maxScore = getScore(tvalues,d);
   int skip = ringId+1;
   while (skip) {
-    for (int d=0; d<nDev; d++) {
-      if (distances[d] == minDistance) {
+    for (int d=0; d<ncclNetNDev; d++) {
+      if (getScore(tvalues, d) == maxScore) {
        skip--;
-        if (skip == 0) return d;
+        if (skip == 0) { dev = d; goto end; }
      }
    }
  }
-  return 0;
+end:
+  return dev;
 }
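getDev spreads channels across all NICs that share the best score: channel 0 takes the first best NIC, channel 1 the second, wrapping around when there are more channels than best NICs. A self-contained sketch of the same selection, with hypothetical scores {3, 5, 5}:

    #include <cstdio>

    // Round-robin over best-scoring NICs, mirroring getDev() above.
    static int pickDev(const int* score, int n, int ringId) {
      int maxScore = 0;
      for (int d = 0; d < n; d++) if (score[d] > maxScore) maxScore = score[d];
      int skip = ringId + 1;
      while (true)
        for (int d = 0; d < n; d++)
          if (score[d] == maxScore && --skip == 0) return d;
    }

    int main() {
      int score[3] = {3, 5, 5};          // hypothetical per-NIC scores
      for (int ring = 0; ring < 4; ring++)
        printf("ring %d -> dev %d\n", ring, pickDev(score, 3, ring));
      // ring 0 -> dev 1, ring 1 -> dev 2, ring 2 -> dev 1, ring 3 -> dev 2
      return 0;
    }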
 
 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
 NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
 
-static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGdr) {
+static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
   *useGdr = 0;
 
-  int cudaDev;
+  int cudaDev, nvmlDev;
   CUDACHECK(cudaGetDevice(&cudaDev));
+  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
 
   if (read) { // For reads (sends) only enable under certain conditions
     int gdrReadParam = ncclParamNetGdrRead();
     if (gdrReadParam == 0) return ncclSuccess;
-    else if (gdrReadParam < 0) { // default : enable only on DGX2
-      char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-      CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
-      int nvlinks = getNumNvlinks(busId);
-      if (nvlinks < CONNECT_NVSWITCH || ncclCudaCompCap() < 7) return ncclSuccess;
+    if (gdrReadParam < 0) {
+      int nvlink;
+      NCCLCHECK(ncclNvlinkGpu(&nvlink));
+      if (!nvlink) return ncclSuccess;
     }
   }
 
   // Check if we are close enough that it makes sense to enable GDR
   int netGdrLevel = ncclParamNetGdrLevel();
+  short distance;
+  NCCLCHECK(netDistance(cudaDev, dev, &distance));
   if (distance >= netGdrLevel) {
-    INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, dev, distance, netGdrLevel);
+    INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel);
     return ncclSuccess;
   }
@@ -243,51 +273,59 @@ static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGd
   NCCLCHECK(ncclNetPtrSupport(dev, &flags));
   if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
   *useGdr = 1;
-  INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d >= %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read);
+  INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read);
   return ncclSuccess;
 }
 
 /* Determine if we will use this transport for this peer and return connect
  * information for this peer */
-ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
   struct netSendResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
-  ring->send.transportResources = resources;
+  send->transportResources = resources;
+
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  resources->netDev = getDev(cudaDev, channelId);
+  NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr));
 
-  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
-  NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 1, &resources->useGdr));
+  int sendSize = sizeof(struct ncclSendMem);
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
 
-  int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
   if (resources->useGdr) {
-    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size));
+    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
   }
+  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
+  resources->buffSize = buffSize;
 
-  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, size));
-  NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, size));
-
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev,
+      resources->useGdr ? "/GDRDMA" : "");
   return ncclSuccess;
 }
 
-ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
   struct netRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
-  ring->recv.transportResources = resources;
+  recv->transportResources = resources;
 
-  struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
-  resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
-  NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 0, &resources->useGdr));
+  int cudaDev;
+  CUDACHECK(cudaGetDevice(&cudaDev));
+  resources->netDev = getDev(cudaDev, channelId);
+  NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr));
 
   int sendSize = sizeof(struct ncclSendMem);
   NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
 
-  int recvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+  int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+  if (resources->useGdr) {
+    NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+  }
   NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
+  resources->buffSize = buffSize;
 
-  struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo;
-  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
-      resources->useGdr ? "/GDRDMA" : "",
-      (resources->hostDevMem != NULL) ? "/GDCopy" : "");
+  INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
+      resources->useGdr ? "/GDRDMA" : "");
 
   struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
   NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
   return ncclSuccess;
@@ -297,27 +335,28 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   // Setup device pointers
   struct netSendResources* resources = (struct netSendResources*)send->transportResources;
 
-  if (resources->useGdr) {
-    send->conn.buff = resources->devNetMem->buff;
-    // We don't use devMem for llMode because the CPU has to read the data
-    send->conn.llBuff = resources->devHostRecvMem->llBuff;
-  } else {
-    send->conn.buff = resources->devHostRecvMem->buff;
-    send->conn.llBuff = resources->devHostRecvMem->llBuff;
-  }
+  // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
+  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+  send->conn.buff = recvMem->buff;
+  send->conn.llBuff = resources->devHostRecvMem->llBuff;
+
+  // Head/Tail/Opcount/Fifos are always on host
   send->conn.tail = &resources->devHostRecvMem->tail;
-  send->conn.opCount = &resources->devHostRecvMem->opCount;
+  send->conn.opCountRem = &resources->devHostRecvMem->opCount;
   send->conn.fifo = resources->devHostRecvMem->sizesFifo;
-  send->conn.llFifo = resources->devHostRecvMem->llSizesFifo;
-
-  if (resources->hostDevMem == NULL) {
-    send->conn.head = &resources->devHostSendMem->head;
-    send->conn.llHead = &resources->devHostSendMem->llHead;
-  }
+  send->conn.head = &resources->devHostSendMem->head;
+  send->conn.opCountLoc = &resources->devHostSendMem->opCount;
+  for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
 
   // Connect to remote peer
   struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
   NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
+
+  NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->buff, resources->buffSize,
+        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
+  NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff,
+        NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle));
+
   return ncclSuccess;
 }
 
@@ -326,32 +365,37 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   // Setup device pointers
   struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
 
-  recv->conn.head = &resources->devHostSendMem->head;
-  recv->conn.llHead = &resources->devHostSendMem->llHead;
-
-  if (resources->useGdr == 0) {
-    recv->conn.buff = resources->devHostRecvMem->buff;
-    recv->conn.llBuff = resources->devHostRecvMem->llBuff;
-  }
+  // Intermediate buffering on GPU for GPU Direct RDMA
+  struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+  recv->conn.buff = recvMem->buff;
+  recv->conn.llBuff = recvMem->llBuff;
 
-  if (resources->hostDevMem == NULL) {
-    recv->conn.tail = &resources->devHostRecvMem->tail;
-    recv->conn.opCount = &resources->devHostRecvMem->opCount;
-  }
+  // Head/Tail/Opcount are always on host
+  recv->conn.tail = &resources->devHostRecvMem->tail;
+  recv->conn.opCountLoc = &resources->devHostRecvMem->opCount;
+  recv->conn.head = &resources->devHostSendMem->head;
+  recv->conn.opCountRem = &resources->devHostSendMem->opCount;
 
-  // Finish connection establishment
+  // Finish connection establishment from remote peer
  NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
  NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
 
+  NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->buff, resources->buffSize,
+        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
+  NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE,
+        resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle));
+
   return ncclSuccess;
 }
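Buffers are now registered once per connection at connect/accept time and deregistered at teardown, rather than being looked up or re-registered on every Isend/Irecv. A minimal sketch of that pairing, with hypothetical netRegister/netDeregister hooks standing in for the transport's register/deregister calls (not NCCL's API):

    // Hypothetical transport hooks, declared here only for the sketch.
    int netRegister(void* comm, void* data, int size, void** mhandle);
    int netDeregister(void* comm, void* mhandle);

    struct Conn { void* comm; void* mhandle; };

    int connOpen(Conn* c, void* buf, int size) {
      return netRegister(c->comm, buf, size, &c->mhandle);  // pin once per connection
    }
    int connClose(Conn* c) {
      return netDeregister(c->comm, c->mhandle);            // unpin at teardown
    }

Amortizing registration over the connection lifetime matters because pinning and mapping memory for RDMA is expensive relative to posting a send or receive.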
 
 ncclResult_t netSendFree(void* transportResources) {
   struct netSendResources* resources = (struct netSendResources*)transportResources;
   NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle));
+  NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
   NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
   if (resources->useGdr)
-    CUDACHECK(cudaFree(resources->devNetMem));
+    CUDACHECK(cudaFree(resources->devRecvMem));
   NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
   free(resources);
   return ncclSuccess;
@@ -360,196 +404,166 @@ ncclResult_t netSendFree(void* transportResources) {
 
 ncclResult_t netRecvFree(void* transportResources) {
   struct netRecvResources* resources = (struct netRecvResources*)transportResources;
   NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle));
+  NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
   NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+  if (resources->useGdr)
+    CUDACHECK(cudaFree(resources->devRecvMem));
   NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
   free(resources);
   return ncclSuccess;
 }
 
 ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
-  struct ncclRing* ring = args->ring;
-  struct netSendResources* resources = (struct netSendResources*) (ring->send.transportResources);
-  const int llMode = args->llMode;
-
-  volatile uint64_t* prevTail = &resources->hostRecvMem->tail;
-  struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem;
-  uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head;
-  struct ncclRecvMem* localMem = resources->useGdr ? resources->devNetMem : resources->hostRecvMem;
-  char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff;
-  int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
-  volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo;
-  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
-  int sliceSize = buffSize / args->substeps;
-
-  assert(args->substeps <= SIZES_FIFO_SIZE);
-
-  uint64_t head = llMode ? resources->llStep : 0ULL;
-  uint64_t tail = llMode ? resources->llStep : 0ULL;
-  uint64_t end = head + args->nsteps;
-
-  int idle = 0;
-  void* requests[args->substeps];
-
-  if (!args->needProxy) goto nextColl;
-
-  TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
-  TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
-
-  // Update in case we skipped some collectives
-  if (llMode == 0) resources->hostRecvMem->opCount = args->opCount;
-
-  while (head < end) {
-    idle++;
-    if (llMode) {
-      if (tail < end && tail < head + args->substeps) {
-        int slot = tail%args->substeps;
-        int size = sizesFifo[slot];
-        if (size != 0) {
-          if (size == -1) size = 0;
-          uint32_t flag = tail + 1;
-          int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
-          size = nFifoLines * sizeof(union ncclLLFifoLine);
-          union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+slot*sliceSize);
-          for (int i=0; i<nFifoLines; i++) {
-            volatile uint32_t *f1 = &lines[i].flag1;
-            volatile uint32_t *f2 = &lines[i].flag2;
-            while (f1[0] != flag || f2[0] != flag);
+  struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources);
+  if (args->state == ncclProxyOpReady) {
+    // Update opCount
+    resources->hostRecvMem->opCount = args->opCount;
+
+    // Round to next multiple of sliceSteps
+    resources->step = ROUNDUP(resources->step, args->chunkSteps);
+    args->head = resources->step;
+    args->tail = resources->step;
+    args->end = args->head + args->nsteps;
+    args->state = ncclProxyOpProgress;
+  }
+  if (args->state == ncclProxyOpProgress) {
+    args->idle = 1;
+    if (args->head < args->end) {
+      if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
+        volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
+        if (args->llMode) {
+          int buffSlot = args->tail%NCCL_STEPS;
+          int size = sizesFifo[buffSlot];
+          if (size != -1) {
+            uint32_t flag = args->tail + 1;
+            int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+            size = nFifoLines * sizeof(union ncclLLFifoLine);
+            union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
+            int ready = 1;
+            for (int i=0; i<nFifoLines; i++) {
+              volatile uint32_t *f1 = &lines[i].flag1;
+              volatile uint32_t *f2 = &lines[i].flag2;
+              if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
+            }
+            if (ready) {
+              NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, resources->llMhandle, args->requests+buffSlot));
+              if (args->requests[buffSlot] != NULL) {
+                sizesFifo[buffSlot] = -1;
+                // Make sure size is reset before we update the tail.
+                __sync_synchronize();
+                args->tail += args->sliceSteps;
+                args->idle = 0;
+              }
+            }
          }
-          NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, ptrType, requests+slot));
-          if (requests[slot] != NULL) {
-            sizesFifo[slot] = size;
-            tail++;
-            idle = 0;
+        } else if (args->tail < resources->hostRecvMem->tail) {
+          struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+          int stepSize = args->channel->buffSize/NCCL_STEPS;
+          // Send through network
+          int buffSlot = args->tail%NCCL_STEPS;
+          NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
+          if (args->requests[buffSlot] != NULL) {
+            sizesFifo[buffSlot] = -1;
+            // Make sure size is reset before we update the tail.
+            __sync_synchronize();
+            args->tail += args->sliceSteps;
+            args->idle = 0;
          }
        }
      }
-    } else while (tail < *prevTail) {
-      // Send through network
-      int slot = tail%args->substeps;
-      NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+slot*sliceSize, sizesFifo[slot], ptrType, requests+slot));
-      if (requests[slot] != NULL) {
-        tail++;
-        idle = 0;
-      }
-    }
-    if (head < tail) {
-      int done;
-      int slot = head%args->substeps;
-      NCCLCHECK(ncclNetTest(requests[slot], &done, NULL));
-      if (done) {
-        if (llMode) {
-          sizesFifo[slot] = 0;
-          // Make sure size is reset to zero before we update the head.
-          __sync_synchronize();
+      if (args->head < args->tail) {
+        int done;
+        int buffSlot = args->head%NCCL_STEPS;
+        NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
+        if (done) {
+          args->head += args->sliceSteps;
+          resources->hostSendMem->head = args->head;
+          args->idle = 0;
        }
-        head++;
-        *prevHead = head;
-        idle = 0;
      }
    }
-    if (idle) transportProxyIdle(idle);
+    if (args->head == args->end) {
+      resources->step = args->end;
+      args->idle = 0;
+      args->state = ncclProxyOpDone;
+    }
  }
-
-  // Reset
-  if (llMode == 0) *prevTail = 0;
-
-nextColl:
-  if (llMode) {
-    resources->llStep += args->nsteps;
-    // Don't forget to ack otherwise the GPU won't be able to push data.
-    *prevHead = resources->llStep;
-    if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      memset(localBuff, 0, NCCL_LL_BUFF_SIZE);
-      resources->llStep += NCCL_LL_CHUNKS;
-      *prevHead = resources->llStep;
-      resources->llLastCleaning = resources->llStep;
+  if (args->state == ncclProxyOpDone) {
+    union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff;
+    if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+      for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step;
+      resources->step += NCCL_STEPS;
+      resources->hostSendMem->head = resources->step;
+      resources->llLastCleaning = resources->step;
    }
+    args->state = ncclProxyOpNone;
  }
   return ncclSuccess;
 }
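In LL (low-latency) mode the GPU writes each 16-byte line with two copies of a per-step flag, and the proxy may only hand a slot to the network once every line in it carries flag == tail + 1, which is what the ready loop above checks. A reduced sketch of the consumer-side handshake, with a simplified line layout (NCCL's ncclLLFifoLine also aliases the fields through other views):

    #include <cstdint>

    // One LL line: two 4-byte data words, each paired with a flag word.
    union LLLine { struct { uint32_t data1, flag1, data2, flag2; }; };

    // Consumer-side readiness check, as in the proxy's ready loop above.
    static bool slotReady(volatile LLLine* lines, int nLines, uint32_t flag) {
      for (int i = 0; i < nLines; i++)
        if (lines[i].flag1 != flag || lines[i].flag2 != flag) return false;
      return true;
    }

Because the flag doubles as the synchronization token, no separate tail pointer write from the GPU is needed, which is what makes the LL path cheap for small messages.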
 
 ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
-  struct ncclRing* ring = args->ring;
-  struct netRecvResources* resources = (struct netRecvResources*) (ring->recv.transportResources);
-  int llMode = args->llMode;
-
-  volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head;
-  struct ncclRecvMem* localMem = resources->useGdr ? ring->devMemRecv : resources->hostRecvMem;
-  char* localBuff = llMode ? localMem->llBuff : localMem->buff;
-  char* nextBuff = (resources->useGdr == 0 && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
-  int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
-  uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail;
-
-  int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
-  int sliceSize = buffSize / args->substeps;
-
-  uint64_t head = llMode ? resources->llStep : 0ULL;
-  uint64_t tail = llMode ? resources->llStep : 0ULL;
-  uint64_t end = head + args->nsteps;
-
-  int idle = 0;
-  void* requests[args->substeps];
-
-  if (!args->needProxy) goto nextColl;
-
-  TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
-  TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
-
-  if (llMode == 0) {
-    // Waiting for next opCount is only needed before writing nextTail.
-    uint64_t* nextOpCount = resources->hostDevMem ? &resources->hostDevMem->opCount : &resources->hostRecvMem->opCount;
-    transportProxyWait([=] { return *nextOpCount >= args->opCount; });
+  struct netRecvResources* resources = (struct netRecvResources*) (args->connector->transportResources);
+  if (args->state == ncclProxyOpReady) {
+    // Update opCount
+    resources->hostSendMem->opCount = args->opCount;
+
+    // Round to next multiple of sliceSteps
+    resources->step = ROUNDUP(resources->step, args->chunkSteps);
+    args->head = resources->step;
+    args->tail = resources->step;
+    args->end = args->head + args->nsteps;
+    args->state = ncclProxyOpProgress;
  }
-
-  while (head < end) {
-    idle++;
-    if ((tail < head + args->substeps) && (tail < *nextHead + args->substeps) && (tail < end)) {
-      int slot = tail%args->substeps;
-      NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+slot*sliceSize, sliceSize, ptrType, requests+slot));
-      if (requests[slot] != NULL) {
-        tail++;
-        idle = 0;
+  if (args->state == ncclProxyOpProgress) {
+    args->idle = 1;
+    int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
+    if (args->head < args->end) {
+      struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+      char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
+      void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
+      if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
+        int buffSlot = args->tail%NCCL_STEPS;
+        int sliceSize = stepSize * args->sliceSteps;
+        NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
+        if (args->requests[buffSlot] != NULL) {
+          args->tail += args->sliceSteps;
+          args->idle = 0;
+        }
      }
-    }
-    if (tail > head) {
-      int done;
-      int slot = head%args->substeps;
-      int size;
-      NCCLCHECK(ncclNetTest(requests[slot], &done, &size));
-      if (done) {
-        if (nextBuff) memcpy(nextBuff+slot*sliceSize, localBuff+slot*sliceSize, size);
-        head++;
-        if (llMode == 0) {
-          if (ptrType == NCCL_PTR_CUDA) ncclNetFlush(resources->netRecvComm, localBuff+slot*sliceSize, size);
-          *nextTail = head;
+      if (args->tail > args->head) {
+        int buffSlot = args->head%NCCL_STEPS;
+        int done, size;
+        NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
+        if (done) {
+          args->head += args->sliceSteps;
+          if (args->llMode == 0) {
+            if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
+            resources->hostRecvMem->tail = args->head;
          }
+          args->idle = 0;
        }
      }
-    if (idle) transportProxyIdle(idle);
-  }
-
-  // Wait for last ack and reset
-  if (llMode == 0) {
-    transportProxyWait([=] { return *nextHead == head; });
-    *nextHead = 0;
+    if (args->head == args->end) {
+      resources->step = args->end;
+      args->idle = 0;
+      args->state = ncclProxyOpDone;
    }
  }
-
-nextColl:
-  if (llMode) {
-    resources->llStep += args->nsteps;
-    if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
-      resources->llStep += NCCL_LL_CHUNKS;
-      while (*nextHead < resources->llStep);
-      resources->llLastCleaning = resources->llStep;
+  if (args->state == ncclProxyOpDone) {
+    if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+      resources->step += NCCL_STEPS;
+      while (resources->hostSendMem->head < resources->step);
+      resources->llLastCleaning = resources->step;
    }
+    args->state = ncclProxyOpNone;
  }
   return ncclSuccess;
 }
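Each queued operation now advances through a small state machine (Ready, Progress, Done, None) driven by repeated progress() calls from the single proxy thread, instead of running to completion on a per-connector thread. A compact sketch of that pattern, with hypothetical types:

    // Proxy-op state machine sketch: progress() is called repeatedly and does a
    // bounded amount of work per call, as netSendProxy/netRecvProxy do above.
    enum class OpState { Ready, Progress, Done, None };

    struct Op {
      OpState state = OpState::Ready;
      int head = 0, end = 4;
    };

    void progress(Op* op) {
      if (op->state == OpState::Ready) { op->head = 0; op->state = OpState::Progress; }
      if (op->state == OpState::Progress) {
        op->head++;                                  // one bounded unit of work
        if (op->head == op->end) op->state = OpState::Done;
      }
      if (op->state == OpState::Done) op->state = OpState::None;  // recyclable
    }

Bounding the work per call is what allows one thread to multiplex many connections without starving any of them.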
 
 struct ncclTransport netTransport = {
   "NET",
-  netFillInfo,
   netCanConnect,
   netGetRings,
   { netSendSetup, netSendConnect, netSendFree, netSendProxy },
diff --git a/src/transport/net_ib.cu b/src/transport/net_ib.cu
index 18e158d..f7c574b 100644
--- a/src/transport/net_ib.cu
+++ b/src/transport/net_ib.cu
@@ -32,6 +32,7 @@ static int ncclNIbDevs = -1;
 struct ncclIbDev {
   int device;
   uint8_t port;
+  uint8_t link;
   ibv_context* context;
   char devName[MAXNAMESIZE];
 };
@@ -97,7 +98,6 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
       WARN("NET/IB : No IP interface found.");
       return ncclInternalError;
     }
-    INFO(NCCL_INIT|NCCL_NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
 
     // Detect IB cards
     int nIbDevs;
@@ -113,47 +113,59 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
     for (int d=0; d<nIbDevs; d++) {
       struct ibv_context * context;
-      if (ncclSuccess != wrap_ibv_open_device(&context, devices[d])) {
+      if (ncclSuccess != wrap_ibv_open_device(&context, devices[d]) || context == NULL) {
         WARN("NET/IB : Unable to open device %s", devices[d]->name);
         continue;
       }
       int found = 0;
-      if (context) {
-        struct ibv_device_attr devAttr;
-        if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
-          WARN("NET/IB : Unable to query device %s", devices[d]->name);
+      struct ibv_device_attr devAttr;
+      if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
+        WARN("NET/IB : Unable to query device %s", devices[d]->name);
+        if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
+        continue;
+      }
+      for (int port = 1; port <= devAttr.phys_port_cnt; port++) {
+        struct ibv_port_attr portAttr;
+        if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) {
+          WARN("NET/IB : Unable to query port %d", port);
          continue;
        }
-        for (int port = 1; port <= devAttr.phys_port_cnt; port++) {
-          struct ibv_port_attr portAttr;
-          if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) {
-            WARN("NET/IB : Unable to query port %d", port);
-            continue;
-          }
-          if (portAttr.state != IBV_PORT_ACTIVE) continue;
-          if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
-              && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
-
-          // check against user specified HCAs/ports
-          if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
-            continue;
-          }
-          INFO(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
-              portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
-          ncclIbDevs[ncclNIbDevs].device = d;
-          ncclIbDevs[ncclNIbDevs].port = port;
-          ncclIbDevs[ncclNIbDevs].context = context;
-          strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
-          ncclNIbDevs++;
-          found++;
-          pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
-        }
+        if (portAttr.state != IBV_PORT_ACTIVE) continue;
+        if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
+            && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
 
-        if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } }
+        // check against user specified HCAs/ports
+        if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
+          continue;
+        }
+        TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
+            portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+        ncclIbDevs[ncclNIbDevs].device = d;
+        ncclIbDevs[ncclNIbDevs].port = port;
+        ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
+        ncclIbDevs[ncclNIbDevs].context = context;
+        strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
+        ncclNIbDevs++;
+        found++;
+        pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
      }
+      if (found == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
    }
     if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
   }
+  if (ncclNIbDevs == 0) {
+    INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found.");
+  } else {
+    char line[1024];
+    line[0] = '\0';
+    for (int d=0; d<ncclNIbDevs; d++) {
+      snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName,
+          ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+    }
+    line[1023] = '\0';
+    char addrline[1024];
+    INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s ; OOB %s:%s", line, ncclIbIfName, socketToString(&ncclIbIfAddr.sa, addrline));
+  }
   pthread_mutex_unlock(&ncclIbLock);
   }
   return ncclSuccess;
@@ -205,11 +217,12 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
 ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
   *supportedTypes = NCCL_PTR_HOST;
-  int cudaDev;
+  int cudaDev, nvmlDev;
   CUDACHECK(cudaGetDevice(&cudaDev));
+  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
   if (ncclIbGdrSupport(dev) != ncclSuccess) {
-    INFO(NCCL_INIT|NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (no module or not supported by GPU)", cudaDev, ncclIbDevs[dev].devName);
+    INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d '%s' (no module or not supported by GPU)", cudaDev, nvmlDev, dev, ncclIbDevs[dev].devName);
     return ncclSuccess;
   }
   *supportedTypes |= NCCL_PTR_CUDA;
@@ -242,23 +255,15 @@ struct ncclIbHandle {
   union socketAddress connectAddr;
 };
 
-struct ncclIbMr {
-  struct ibv_mr* mr;
-  int refcnt;
-};
-
 struct ncclIbVerbs {
   struct ibv_pd* pd;
   struct ibv_cq* cq;
-  struct ncclIbMr mrPool[MAX_REQUESTS];
-  int mrRotation;
 };
 
 struct ncclIbRequest {
   int used;
   int type;
   struct ncclIbVerbs* verbs;
-  struct ncclIbMr * ibMr;
   int done;
   int size;
   int free;
@@ -278,12 +283,12 @@ struct ncclIbSendFifo {
 };
 
 struct ncclIbSendComm {
+  struct ncclIbVerbs verbs;
   struct ncclIbSendFifo fifo[MAX_REQUESTS];
   struct ncclIbRequest reqs[MAX_REQUESTS];
   uint32_t fifoHead;
   int fd;
   int ready;
-  struct ncclIbVerbs verbs;
   struct ibv_qp* qp;
   struct ibv_mr* fifoMr;
 };
@@ -307,11 +312,11 @@ struct ncclIbRemFifo {
 };
 
 struct ncclIbRecvComm {
+  struct ncclIbVerbs verbs;
   struct ncclIbRemFifo remFifo;
   struct ncclIbRequest reqs[MAX_REQUESTS];
   int fd;
   int ready;
-  struct ncclIbVerbs verbs;
   struct ibv_qp* qp;
   struct ncclIbGpuFlush gpuFlush;
 };
@@ -434,13 +439,13 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
   // RoCE support
   qpInfo.lid = portAttr.lid;
   if (qpInfo.lid) { // IB
-    INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
+    INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
   } else { // RoCE
     union ibv_gid gid;
     NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
     qpInfo.spn = gid.global.subnet_prefix;
     qpInfo.iid = gid.global.interface_id;
-    INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
+    INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
   }
 
   NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo)));
@@ -537,7 +542,6 @@ ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest**
       r->used = 1;
       r->type = 0;
       r->verbs = NULL;
-      r->ibMr = NULL;
       r->done = 0;
       r->size = -1;
       r->free = 0;
@@ -583,57 +587,34 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size);
 
 #define REG_ALIGN (4096)
 
-// Cache previous MRs to avoid registering/unregistering for each Isend/Irecv
-ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct ncclIbMr** mrRet) {
+ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+  struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
   uint64_t addr = (uint64_t)data;
-  int elem = -1;
   assert(size > 0);
 
-  // Look for an already existing MR
-  for (int i=0; i<MAX_REQUESTS; i++) {
-    if (verbs->mrPool[i].mr == NULL) continue;
-    uint64_t regAddr = (uint64_t)verbs->mrPool[i].mr->addr;
-    uint64_t regSize = (uint64_t)verbs->mrPool[i].mr->length;
-    if (regAddr <= addr && addr+size <= regAddr+regSize) {
-      *mrRet = verbs->mrPool+i;
-      verbs->mrPool[i].refcnt++;
-      return ncclSuccess;
-    }
-  }
-
-  // Find an unused element
-  if (elem == -1) {
-    elem = (verbs->mrRotation++);
-    for (int i=0; i<MAX_REQUESTS; i++) {
-      elem %= MAX_REQUESTS;
-      if (verbs->mrPool[elem].refcnt > 0) elem++; else break;
-    }
-    if (verbs->mrPool[elem].refcnt > 0) {
-      WARN("NET/IB : memory register : no MR available");
-      return ncclInternalError;
-    }
-  }
-
-  assert(elem < MAX_REQUESTS);
-  assert(verbs->mrPool[elem].refcnt == 0);
-
   // Deregister / register
   uint64_t regAddr = addr & (~(REG_ALIGN-1));
   uint64_t regSize = addr+size - regAddr;
   regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN;
-  if (verbs->mrPool[elem].mr) NCCLCHECK(wrap_ibv_dereg_mr(verbs->mrPool[elem].mr));
-  NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
-  *mrRet = verbs->mrPool+elem;
-  verbs->mrPool[elem].refcnt++;
-  TRACE(NCCL_INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
+  struct ibv_mr* mr;
+  NCCLCHECK(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+  *mhandle = (void*)mr;
+  TRACE(NCCL_INIT,"regAddr %lx size %ld rkey %x", regAddr, regSize, mr->rkey);
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** request) {
+ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
+  NCCLCHECK(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle));
+  return ncclSuccess;
+}
+
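Registration rounds the pinned region out to REG_ALIGN (4096-byte) boundaries: the start address is rounded down and the length rounded up, so for example registering 100 bytes at address 0x1010 pins the whole page range [0x1000, 0x2000). A worked sketch of the same arithmetic:

    #include <cstdint>
    #include <cstdio>

    // Same rounding as ncclIbRegMr above, with REG_ALIGN = 4096.
    int main() {
      const uint64_t REG_ALIGN = 4096;
      uint64_t addr = 0x1010, size = 100;
      uint64_t regAddr = addr & ~(REG_ALIGN-1);                      // 0x1000
      uint64_t regSize = addr + size - regAddr;                      // 0x74
      regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN) * REG_ALIGN;   // 0x1000
      printf("register [0x%llx, 0x%llx)\n",
             (unsigned long long)regAddr, (unsigned long long)(regAddr+regSize));
      return 0;
    }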
+ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
   struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
   if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm));
   if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
 
+  struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+
   // Wait for the receiver to have posted the corresponding receive
   volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS);
   volatile uint32_t * readyPtr = &slot->ready;
@@ -641,7 +622,6 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void**
 
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->type = type;
   req->verbs = &comm->verbs;
   req->size = size;
@@ -654,8 +634,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void**
     wr.sg_list = NULL;
     wr.num_sge = 0;
   } else {
-    NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
-    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey;
     wr.sg_list = &sge;
     wr.num_sge = 1;
   }
@@ -720,14 +699,15 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void** request) {
+ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
   if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm));
   if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
 
+  struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
-  req->type = type;
   req->verbs = &comm->verbs;
   req->size = size;
@@ -739,10 +719,8 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void**
   if (size == 0) {
     wr.sg_list = NULL;
     wr.num_sge = 0;
-    req->ibMr = NULL;
   } else {
-    NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
-    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+    sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey;
     wr.sg_list = &sge;
     wr.num_sge = 1;
   }
@@ -752,25 +730,25 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void**
   *request = req;
 
   // Post to FIFO to notify sender
-  NCCLCHECK(ncclIbPostFifo(comm, req->ibMr->mr->rkey, (uint64_t)data, size));
+  NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size));
   return ncclSuccess;
 }
 
-ncclResult_t ncclIbFlush(void* recvComm, void* data, int size) {
+ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) {
   struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
   if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess;
 
   struct ncclIbRequest* req;
   NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
   req->verbs = &comm->verbs;
-  NCCLCHECK(ncclIbGetMr(&comm->verbs, data, 1, &req->ibMr));
+  struct ibv_mr* mr = (struct ibv_mr*)mhandle;
 
   struct ibv_send_wr wr;
   memset(&wr, 0, sizeof(wr));
   wr.wr_id = (uint64_t)req;
 
   wr.wr.rdma.remote_addr = (uint64_t)data;
-  wr.wr.rdma.rkey = req->ibMr->mr->rkey;
+  wr.wr.rdma.rkey = mr->rkey;
   wr.sg_list = &comm->gpuFlush.sge;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_RDMA_READ;
@@ -800,32 +778,31 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
     }
 
     int wrDone = 0;
-    struct ibv_wc wc;
-    NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 1, &wc, &wrDone));
+    struct ibv_wc wcs[4];
+    NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone));
     if (wrDone == 0) return ncclSuccess;
 
-    if (wc.status != IBV_WC_SUCCESS) {
-      WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc.status, wc.opcode, wc.byte_len, wc.vendor_err);
-      return ncclSystemError;
-    }
+    for (int w=0; w<wrDone; w++) {
+      struct ibv_wc *wc = wcs+w;
+      if (wc->status != IBV_WC_SUCCESS) {
+        WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
+        return ncclSystemError;
+      }
 
-    struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc.wr_id;
-    if (doneReq) {
-      if (wc.opcode == IBV_WC_RECV) {
-        doneReq->size = wc.byte_len;
+      struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc->wr_id;
+      if (doneReq) {
+        if (wc->opcode == IBV_WC_RECV) {
+          doneReq->size = wc->byte_len;
 #if USE_RDMA_WRITE
-      } else if (wc.opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
-        doneReq->size = wc.imm_data;
+        } else if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+          doneReq->size = wc->imm_data;
 #endif
-      }
-      if (doneReq->ibMr != NULL) {
-        doneReq->ibMr->refcnt--;
-        if (doneReq->ibMr->refcnt < 0) WARN("NET/IB : doneReq %p MR %p refcount now %d", doneReq, doneReq->ibMr, doneReq->ibMr->refcnt);
-      }
-      doneReq->done = 1;
-      if (doneReq->free == 1) {
-        // This is an internal (FIFO post) req. Free it immediately.
-        doneReq->used = 0;
+        }
+        doneReq->done = 1;
+        if (doneReq->free == 1) {
+          // This is an internal (FIFO post) req. Free it immediately.
+          doneReq->used = 0;
+        }
      }
    }
  }
@@ -837,12 +814,6 @@ ncclResult_t ncclIbCloseSend(void* sendComm) {
     close(comm->fd);
     if (comm->qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qp));
     if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
-    for (int i=0; i<MAX_REQUESTS; i++) {
-      if (comm->verbs.mrPool[i].mr != NULL) {
-        if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : TX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt);
-        NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr));
-      }
-    }
     NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
     free(comm);
   }
@@ -859,12 +830,6 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) {
       if (comm->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->gpuFlush.hostMr));
     }
     if (comm->remFifo.mr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remFifo.mr));
-    for (int i=0; i<MAX_REQUESTS; i++) {
-      if (comm->verbs.mrPool[i].mr != NULL) {
-        if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : RX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt);
-        NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr));
-      }
-    }
     NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
     free(comm);
   }
@@ -889,6 +854,8 @@ ncclNet_t ncclNetIb = {
   ncclIbListen,
   ncclIbConnect,
   ncclIbAccept,
+  ncclIbRegMr,
+  ncclIbDeregMr,
   ncclIbIsend,
   ncclIbIrecv,
   ncclIbFlush,
diff --git a/src/transport/net_socket.cu b/src/transport/net_socket.cu
index 1efee15..0464b43 100644
--- a/src/transport/net_socket.cu
+++ b/src/transport/net_socket.cu
@@ -27,10 +27,19 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
   pthread_mutex_lock(&ncclSocketLock);
   if (ncclNetIfs == -1) {
     ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
-    INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs);
     if (ncclNetIfs <= 0) {
       WARN("NET/Socket : no interface found");
       return ncclInternalError;
+    } else {
+      char line[1024];
+      char addrline[1024];
+      line[0] = '\0';
+      for (int i=0; i<ncclNetIfs; i++) {
+        snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE,
+            socketToString(&ncclNetIfAddrs[i].sa, addrline));
+      }
+      line[1023] = '\0';
+      INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
     }
   }
   pthread_mutex_unlock(&ncclSocketLock);
@@ -113,7 +122,7 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
   union socketAddress localAddr;
   char ifName[MAX_IF_NAME_SIZE];
   if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
-    WARN("No usable listening interface found");
+    WARN("NET/Socket : No usable listening interface found");
     return ncclSystemError;
   }
   // pass the local address back
@@ -205,21 +214,24 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
   return ncclSuccess;
 }
 
-ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int type, void** request) {
-  if (type != NCCL_PTR_HOST) return ncclInternalError;
+ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+  return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
+}
+ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
+
+ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
   struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
   NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_SEND, data, size, comm->fd, (struct ncclSocketRequest**)request));
   return ncclSuccess;
 }
 
-ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, int type, void** request) {
-  if (type != NCCL_PTR_HOST) return ncclInternalError;
+ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
   struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
   NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_RECV, data, size, comm->fd, (struct ncclSocketRequest**)request));
   return ncclSuccess;
 }
 
-ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size) {
+ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) {
   // We don't support CUDA pointers, so we don't need a flush operation
   return ncclInternalError;
 }
@@ -243,6 +255,8 @@ ncclNet_t ncclNetSocket = {
   ncclSocketListen,
   ncclSocketConnect,
   ncclSocketAccept,
+  ncclSocketRegMr,
+  ncclSocketDeregMr,
   ncclSocketIsend,
   ncclSocketIrecv,
   ncclSocketFlush,
diff --git a/src/transport/p2p.cu b/src/transport/p2p.cu
index 6c4626a..9f3e0b6 100644
--- a/src/transport/p2p.cu
+++ b/src/transport/p2p.cu
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -11,18 +11,9 @@
 #include "param.h"
 #include <unistd.h>
 #include <cuda_runtime.h>
-#include "nvmlwrap.h"
 #include <ctype.h>
 #include "nvlink.h"
 
-struct p2pInfo {
-  int rank;
-  int cudaDev;
-  uint64_t hostHash;
-  uint64_t pidHash;
-  char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-};
-
 struct p2pConnectInfo {
   int direct;
   union {
@@ -31,36 +22,40 @@
   };
 };
 
-#include <sys/types.h>
+struct p2pSendResources {
+  struct ncclSendMem* devMem;
+  void* ipcPtr;
+};
 
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t p2pFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
-  struct p2pInfo* info = (struct p2pInfo*)opaqueInfo;
-  static_assert(sizeof(struct p2pInfo) <= sizeof(ncclTinfo_t), "p2p Info too large");
-  info->rank = rank;
-  CUDACHECK(cudaGetDevice(&info->cudaDev));
-  info->hostHash=getHostHash();
-  info->pidHash=getPidHash();
-
-  // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
-  // cudaDev is a CUDA runtime dev number which could be different from the
-  // NVML device number. Then we get the busID from NVML to be sure it is
-  // consistent with NVML remote PCI bus Ids.
-  CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
-  nvmlDevice_t nvmlDevice;
-  NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
-  nvmlPciInfo_t pciInfo;
-  NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
-  strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
-  return ncclSuccess;
-}
+struct p2pRecvResources {
+  struct ncclRecvMem* devMem;
+  void* ipcPtr;
+};
+
+#include <sys/types.h>
 
 NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
 NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
 
+/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
+static int busIdToCudaDev(const char* busId) {
+  int ndev;
+  if (cudaGetDeviceCount(&ndev) != cudaSuccess)
+    return -1;
+  for (int i = 0; i < ndev; i++) {
+    char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+    if (cudaDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess)
+      return -1;
+    if (strcmp(busId, devBusId) == 0) {
+      return i;
+    }
+  }
+  // BusId was not found in our locally visible CUDA devices
+  return -1;
+}
+
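busIdToCudaDev answers "which local CUDA ordinal, if any, owns this PCI bus ID?", returning -1 when CUDA_VISIBLE_DEVICES hides the peer's GPU from this process. A usage sketch built on the helper defined above (the bus ID string is hypothetical):

    #include <cuda_runtime.h>

    // Returns 1 if P2P with the GPU at peerBusId is worth probing from myDev.
    // busIdToCudaDev() is the helper above; -1 means "not visible here".
    static int canTryP2p(int myDev, const char* peerBusId) {
      int peerDev = busIdToCudaDev(peerBusId);      // e.g. "0000:1b:00.0"
      if (peerDev == -1) return 0;                  // hidden by CUDA_VISIBLE_DEVICES
      int p2p = 0;
      if (cudaDeviceCanAccessPeer(&p2p, myDev, peerDev) != cudaSuccess) return 0;
      return p2p;
    }

Exchanging bus IDs instead of raw device ordinals is what makes the check robust when ranks run with different CUDA_VISIBLE_DEVICES settings.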
 /* Determine if we can communicate with the peer through p2p */
-ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
+ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
   // Do not use P2P across root complexes by default (provided CUDA permits it)
   int p2pLevel = PATH_SOC;
   if (ncclParamP2pDisable() == 1) p2pLevel = 0;
@@ -70,23 +65,26 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
 
   if (p2pLevel == 0) return ncclSuccess;
 
-  struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
-  struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
-
   // Rule out different nodes
   if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess;
 
+  // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+  int peerCudaDev = busIdToCudaDev(peerInfo->busId);
+  if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process
+
+  TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
+
   // Do not detect topology if we're on the same GPU. Note this is not really supported.
-  if (myInfo->cudaDev == peerInfo->cudaDev) {
+  if (myInfo->cudaDev == peerCudaDev) {
     *ret = 1 + PATH_SOC;
     return ncclSuccess;
   }
 
   // See if CUDA can do P2P
   int p2p;
-  if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) {
-    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d and dev %d",
-        myInfo->cudaDev, peerInfo->cudaDev);
+  if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != cudaSuccess) {
+    INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)",
+        myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
     return ncclSuccess;
   }
   if (p2p == 0) return ncclSuccess;
@@ -102,7 +100,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
   char* myPath;
   char* peerPath;
   ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath);
-  ncclResult_t err2 = getCudaPath(peerInfo->cudaDev, &peerPath);
+  ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath);
   if (err1 == ncclSuccess && err2 == ncclSuccess) {
     int distance = pciDistance(myPath, peerPath);
     if (distance < p2pLevel) {
@@ -174,8 +172,8 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR
 static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) {
   if (nrings == 0) return 0;
   // Copy rings by dup times
-  if (newNrings > MAXRINGS) {
-    newNrings = MAXRINGS;
+  if (newNrings > MAXCHANNELS) {
+    newNrings = MAXCHANNELS;
   }
   for (int r=nrings; r<newNrings; r++) {
     for (int i=0; i<nranks; i++) rings[r*nranks+i] = rings[(r%nrings)*nranks+i];
@@ -191,7 +189,6 @@ int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nrin
   if (connect) {
     inTheRing[rings[0]] = 1;
     nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, rings[1], nranks-2, connect);
-    nrings = copyRings(nranks, rings, nrings, nringsMax);
   } else {
     rings[0] = 0;
     nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, 0, nranks-1, connect);
@@ -209,9 +206,9 @@ static inline int findConnect(int nranks, int* ranks) {
 int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) {
   if (nrings == 0) return 0;
-  if (nrings > MAXRINGS) {
-    WARN("Max rings reached, limiting to %d", MAXRINGS);
-    nrings = MAXRINGS;
+  if (nrings > MAXCHANNELS) {
+    WARN("Max rings reached, limiting to %d", MAXCHANNELS);
+    nrings = MAXCHANNELS;
   }
   // Find existing constraints / connections
   int connect = 0;
@@ -239,9 +236,9 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
 
   if (compNrings && compNrings < nrings && nranks <= 4) {
     // Try to oversubscribe to get a better result
-    int *rings2 = (int *)malloc(sizeof(int)*MAXRINGS*nranks);
-    if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXRINGS*nranks); return 0; }
-    for (int i=0; i<MAXRINGS*nranks; i++) rings2[i] = -1;
+    int *rings2 = (int *)malloc(sizeof(int)*MAXCHANNELS*nranks);
+    if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXCHANNELS*nranks); return 0; }
+    for (int i=0; i<MAXCHANNELS*nranks; i++) rings2[i] = -1;
     int nThreads = *nthreads;
     int compNrings2 = p2pComputeRingsNvLink(values, nranks, rings2, nrings, prev, next, 1, &nThreads);
     if (compNrings2 > compNrings*2) {
@@ -255,7 +252,6 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
 
   // Duplicate the rings for direct NVLink
   compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
@@ -367,8 +363,8 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings,
 ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
   if (*nringsRet == 0) return ncclSuccess;
   int *rings;
-  NCCLCHECK(ncclCalloc(&rings, MAXRINGS*nranks));
-  for (int i=0; i<MAXRINGS*nranks; i++) rings[i] = -1;
+  NCCLCHECK(ncclCalloc(&rings, MAXCHANNELS*nranks));
+  for (int i=0; i<MAXCHANNELS*nranks; i++) rings[i] = -1;
   int nrings = *nringsRet;
 
   // NVswitch
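The setup functions that follow enable peer access with cudaDeviceEnablePeerAccess() and deliberately tolerate cudaErrorPeerAccessAlreadyEnabled. A minimal sketch of that pattern in isolation (enablePeer is an illustrative name):

  // Sketch of the peer-access pattern used by the setup functions below:
  // query cudaDeviceCanAccessPeer, then treat "already enabled" as success.
  #include <cuda_runtime.h>
  #include <stdio.h>

  static int enablePeer(int myDev, int peerDev) {
    int p2p = 0;
    if (cudaDeviceCanAccessPeer(&p2p, myDev, peerDev) != cudaSuccess || p2p == 0)
      return -1;                      // P2P not possible between these devices
    if (cudaSetDevice(myDev) != cudaSuccess) return -1;
    cudaError_t err = cudaDeviceEnablePeerAccess(peerDev, 0);
    if (err == cudaErrorPeerAccessAlreadyEnabled) {
      cudaGetLastError();             // clear the sticky error and continue
      return 0;
    }
    return (err == cudaSuccess) ? 0 : -1;
  }

  int main() {
    printf("peer access 0->1: %s\n", enablePeer(0, 1) == 0 ? "enabled" : "unavailable");
    return 0;
  }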
@@ -446,39 +442,47 @@ end:
 } while (0)
 
 /* Send: Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
-  struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+    struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+
+  struct p2pSendResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  send->transportResources = resources;
+  const int sendSize = sizeof(struct ncclSendMem);
+  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
+
   struct p2pConnectInfo info;
   if (myInfo->pidHash == peerInfo->pidHash) {
     info.direct = 1;
-    info.directPtr = ring->devMemSend;
+    info.directPtr = resources->devMem;
     if (myInfo->cudaDev == peerInfo->cudaDev) {
-      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
+      INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank);
     } else {
       // Enable P2P access
      cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
       if (err == cudaErrorPeerAccessAlreadyEnabled) {
         cudaGetLastError();
       } else if (err != cudaSuccess) {
-        WARN("failed to peer with device %d: %d %s",
-            peerInfo->cudaDev, err, cudaGetErrorString(err));
+        WARN("failed to peer with device %d(=%d): %d %s",
+            peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
         return ncclInternalError;
       }
       INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
-        ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+        channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
     }
   } else {
+    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
     info.direct = 0;
     // Map IPC and enable P2P access
-    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemSend);
+    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
     if (err != cudaSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
-          myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+      WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
+          myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
       return ncclInternalError;
     }
     INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
-        ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+        channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
     //TRACE_DUMP_IPC(&info.devIpc);
   }
   static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
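In the non-direct branch above, the exporter calls cudaIpcGetMemHandle() and ships the handle to the peer inside struct ncclConnect; p2pSendConnect() further down reopens it with cudaIpcOpenMemHandle(). A sketch of that round trip, assuming the handle travels out-of-band between two separately launched processes (exportBuf/importBuf are illustrative names):

  // Sketch of the CUDA IPC round trip used by the non-direct path. In real
  // use the two halves run in different processes and the 64-byte handle
  // travels out-of-band (NCCL sends it inside struct ncclConnect).
  #include <cuda_runtime.h>
  #include <stdio.h>

  // Process A: allocate device memory and export a handle for it.
  int exportBuf(void** devBuf, cudaIpcMemHandle_t* handle) {
    if (cudaMalloc(devBuf, 1024) != cudaSuccess) return -1;
    return (cudaIpcGetMemHandle(handle, *devBuf) == cudaSuccess) ? 0 : -1;
  }

  // Process B: map the peer's allocation from the received handle.
  int importBuf(cudaIpcMemHandle_t handle, void** remPtr) {
    cudaError_t err = cudaIpcOpenMemHandle(remPtr, handle, cudaIpcMemLazyEnablePeerAccess);
    if (err != cudaSuccess) {
      fprintf(stderr, "open failed: %s\n", cudaGetErrorString(err));
      return -1;
    }
    return 0;  // later: cudaIpcCloseMemHandle(*remPtr), as p2pSendFree does
  }

  int main() {
    void* devBuf; cudaIpcMemHandle_t handle;
    if (exportBuf(&devBuf, &handle) != 0) return 1;
    printf("exported a 1 KB device buffer; send the handle to the peer\n");
    cudaFree(devBuf);
    return 0;
  }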
@@ -487,13 +491,19 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
 }
 
 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
-  struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+    struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
+
+  struct p2pRecvResources* resources;
+  NCCLCHECK(ncclCalloc(&resources, 1));
+  recv->transportResources = resources;
+  const int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+  NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
+
   struct p2pConnectInfo info;
   if (myInfo->pidHash == peerInfo->pidHash) {
     info.direct = 1;
-    info.directPtr = ring->devMemRecv;
+    info.directPtr = resources->devMem;
     if (myInfo->cudaDev == peerInfo->cudaDev) {
       TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
     } else {
@@ -502,22 +512,24 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
       if (err == cudaErrorPeerAccessAlreadyEnabled) {
         cudaGetLastError();
       } else if (err != cudaSuccess) {
-        WARN("failed to peer with device %d: %d %s",
-            peerInfo->cudaDev, err, cudaGetErrorString(err));
+        WARN("failed to peer with device %d(=%d): %d %s",
+            peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
         return ncclInternalError;
       }
-      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+      TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
     }
   } else {
+    // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+    int peerCudaDev = busIdToCudaDev(peerInfo->busId);
     info.direct = 0;
     // Map IPC and enable P2P access
-    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemRecv);
+    cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
     if (err != cudaSuccess) {
-      WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
-          myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+      WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
+          myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
       return ncclInternalError;
     }
-    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+    TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
     //TRACE_DUMP_IPC(&info.devIpc);
   }
   static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
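The connect functions below wire each ncclConnector so that the two sides share one head counter (living in the sender's ncclSendMem) and one tail counter (living in the receiver's ncclRecvMem). A host-only sketch of that crossed wiring, with plain memory standing in for device/IPC mappings (Mem and Conn are illustrative types, not NCCL's):

  // Host-only sketch of the crossed head/tail wiring done below: the
  // producer advances a tail the consumer polls, and the consumer advances
  // a head the producer polls. NCCL does this with device memory reached
  // through direct pointers or IPC mappings; plain memory keeps it simple.
  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  struct Mem { uint64_t head, tail; };
  struct Conn { uint64_t *headPtr, *tailPtr; };  // what each side sees

  int main() {
    struct Mem sendMem = {0, 0}, recvMem = {0, 0};
    // Both sides reach the same two counters through their own mappings:
    // head lives in the sender's memory, tail in the receiver's.
    struct Conn send = { &sendMem.head, &recvMem.tail };
    struct Conn recv = { &sendMem.head, &recvMem.tail };

    *send.tailPtr += 1;               // sender publishes one slot
    printf("receiver sees tail=%" PRIu64 "\n", *recv.tailPtr);
    *recv.headPtr += 1;               // receiver frees the slot
    printf("sender sees head=%" PRIu64 "\n", *send.headPtr);
    return 0;
  }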
@@ -527,22 +539,16 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
 
 /* Connect/Send to this peer */
 static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
-  void** resources = &send->transportResources;
+  struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
   struct ncclRecvMem* remDevMem;
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
   if (info->direct) {
     remDevMem = (struct ncclRecvMem*)(info->directPtr);
     send->conn.direct = 1;
-    *resources = NULL;
   } else {
-    void* remPtr = NULL;
     //TRACE_DUMP_IPC(&info->devIpc);
-    cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
-    void** ipcPtrSave;
-    NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
-    *resources = ipcPtrSave;
-    *ipcPtrSave = remPtr;
-    remDevMem = (struct ncclRecvMem*)remPtr;
+    cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+    remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
     if (err != cudaSuccess) {
       WARN("failed to open CUDA IPC handle : %d %s",
           err, cudaGetErrorString(err));
@@ -553,30 +559,26 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
 
   send->conn.buff = remDevMem->buff;
   send->conn.llBuff = remDevMem->llBuff;
   send->conn.tail = &remDevMem->tail;
-  send->conn.opCount = &remDevMem->opCount;
-  // send->conn->head should have been set to devMemSend already
+  send->conn.opCountRem = &remDevMem->opCount;
+  send->conn.head = &resources->devMem->head;
+  send->conn.ptrExchange = &resources->devMem->ptrExchange;
+  send->conn.opCountLoc = &resources->devMem->opCount;
   return ncclSuccess;
 }
 
 /* Connect/Recv from this peer */
 ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
-  void** resources = &recv->transportResources;
+  struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
   struct ncclSendMem* remDevMem;
   struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
   if (info->direct) {
     remDevMem = (struct ncclSendMem*)(info->directPtr);
     recv->conn.direct = 1;
     recv->conn.ptrExchange = &remDevMem->ptrExchange;
-    *resources = NULL;
   } else {
-    void* remPtr = NULL;
     //TRACE_DUMP_IPC(&info->devIpc);
-    cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
-    void** ipcPtrSave;
-    NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
-    *resources = ipcPtrSave;
-    *ipcPtrSave = remPtr;
-    remDevMem = (struct ncclSendMem*)remPtr;
+    cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+    remDevMem = (struct ncclSendMem*)resources->ipcPtr;
     if (err != cudaSuccess) {
       WARN("failed to open CUDA IPC handle : %d %s",
           err, cudaGetErrorString(err));
@@ -584,28 +586,35 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
     }
   }
 
-  // recv->conn->buff should have been set to devMemRecv already
-  // recv->conn->tail should have been set to devMemRecv already
-  // recv->conn->opCount should have been set to devMemRecv already
+  recv->conn.buff = resources->devMem->buff;
+  recv->conn.llBuff = resources->devMem->llBuff;
+  recv->conn.tail = &resources->devMem->tail;
+  recv->conn.opCountLoc = &resources->devMem->opCount;
   recv->conn.head = &remDevMem->head;
-  recv->conn.llHead = &remDevMem->llHead;
+  recv->conn.opCountRem = &remDevMem->opCount;
   return ncclSuccess;
 }
 
-ncclResult_t p2pFree(void* resources) {
-  if (resources != NULL) {
-    void** ipcPtrSave = (void**) resources;
-    CUDACHECK(cudaIpcCloseMemHandle(*ipcPtrSave));
-    free(resources);
-  }
+ncclResult_t p2pSendFree(void* resources) {
+  struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
+  if (sendRes->ipcPtr)
+    CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
+  CUDACHECK(cudaFree(sendRes->devMem));
+  return ncclSuccess;
+}
+
+ncclResult_t p2pRecvFree(void* resources) {
+  struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
+  if (recvRes->ipcPtr)
+    CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
+  CUDACHECK(cudaFree(recvRes->devMem));
   return ncclSuccess;
 }
 
 struct ncclTransport p2pTransport = {
   "P2P",
-  p2pFillInfo,
   p2pCanConnect,
   p2pGetRings,
-  { p2pSendSetup, p2pSendConnect, p2pFree, NULL },
-  { p2pRecvSetup, p2pRecvConnect, p2pFree, NULL }
+  { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
+  { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
 };
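p2pTransport above (and shmTransport at the end of this diff) is a table of function pointers: canConnect and getRings callbacks plus per-direction {setup, connect, free, proxy} operations, with NULL marking the unused proxy slot. A stripped-down sketch of the shape (demoTransport and transportOp are illustrative, not NCCL's types):

  // Stripped-down sketch of the transport vtable shape used above: a name,
  // match/ring callbacks, and per-direction {setup, connect, free, proxy}
  // ops where a NULL proxy means no proxy thread is needed.
  #include <stdio.h>

  typedef int (*transportOp)(void);

  struct transportOps { transportOp setup, connect, free_, proxy; };

  struct transport {
    const char* name;
    transportOp canConnect, getRings;
    struct transportOps send, recv;
  };

  static int ok(void) { return 0; }

  static struct transport demoTransport = {
    "DEMO", ok, ok, { ok, ok, ok, NULL }, { ok, ok, ok, NULL }
  };

  int main() {
    printf("%s has proxy: %s\n", demoTransport.name,
           demoTransport.send.proxy ? "yes" : "no");
    return 0;
  }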
diff --git a/src/transport/shm.cu b/src/transport/shm.cu
index 317f652..56e0242 100644
--- a/src/transport/shm.cu
+++ b/src/transport/shm.cu
@@ -12,13 +12,6 @@
 #include <unistd.h>
 #include <cuda_runtime.h>
 
-struct shmInfo {
-  int rank;
-  int cudaDev;
-  uint64_t hostHash;
-  uint64_t pidHash;
-};
-
 struct shmSendConnectInfo {
   uint64_t pidHash;
   int id;
@@ -51,24 +44,10 @@ struct shmRecvResources {
   struct ncclRecvMem* devHostMem;
 };
 
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t shmFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
-  struct shmInfo* info = (struct shmInfo*)opaqueInfo;
-  static_assert(sizeof(struct shmInfo) <= sizeof(ncclTinfo_t), "shm Info too large");
-  info->rank = rank;
-  CUDACHECK(cudaGetDevice(&info->cudaDev));
-  info->hostHash=getHostHash();
-  info->pidHash=getPidHash();
-  return ncclSuccess;
-}
-
 NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
 
 /* Determine if we can communicate with the peer */
-ncclResult_t shmCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
-  struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
-  struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo;
+ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
   *ret = ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) ? 0 : 1;
   return ncclSuccess;
 }
@@ -88,7 +67,7 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid)
 }
 
 ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
-  if (*nringsRet == MAXRINGS) *nringsRet = 1;
+  if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
   int nGroups = groups[nranks-1] + 1;
   int starts[nGroups];
   int ends[nGroups];
@@ -156,43 +135,40 @@ ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
 
 #define MAX_SHM_NAME_LEN 1024
 
 /* Create and return connect structures for this peer to connect to me */
-ncclResult_t shmSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
-  struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo;
+ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
   struct shmSendResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
-  ring->send.transportResources = resources;
+  send->transportResources = resources;
 
   struct shmRecvConnectInfo info;
   char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
+  sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
   info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
   TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
   NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
 
-  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
-  info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
+  INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+  info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
   static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
   memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
   return ncclSuccess;
 }
 
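shmOpen() itself is outside this diff. On Linux it plausibly amounts to POSIX shared memory plus a pinned, device-visible mapping; a sketch under that assumption (shmOpenDemo is illustrative and may differ from NCCL's real helper in src/misc):

  // Plausible sketch of a shmOpen-style helper: create a POSIX shm segment,
  // mmap it, pin it for CUDA, and return both the host pointer and its
  // device alias.
  #include <cuda_runtime.h>
  #include <fcntl.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int shmOpenDemo(const char* name, size_t size, void** hostPtr, void** devPtr, int create) {
    int fd = shm_open(name, create ? (O_CREAT|O_RDWR) : O_RDWR, 0600);
    if (fd == -1) return -1;
    if (create && ftruncate(fd, size) != 0) { close(fd); return -1; }
    void* p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
    close(fd);
    if (p == MAP_FAILED) return -1;
    if (create) memset(p, 0, size);
    // Pin the mapping and get a device-visible alias for it.
    if (cudaHostRegister(p, size, cudaHostRegisterMapped) != cudaSuccess) return -1;
    if (cudaHostGetDevicePointer(devPtr, p, 0) != cudaSuccess) return -1;
    *hostPtr = p;
    return 0;
  }

  int main() {
    void *h, *d;
    if (shmOpenDemo("/nccl-demo", 4096, &h, &d, 1) != 0) return 1;
    shm_unlink("/nccl-demo");  // mirrors the shmUnlink() in shmRecvConnect below
    return 0;
  }

Older glibc needs -lrt for shm_open when linking.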
-ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
-  struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
+ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
   struct shmRecvResources* resources;
   NCCLCHECK(ncclCalloc(&resources, 1));
-  ring->recv.transportResources = resources;
+  recv->transportResources = resources;
 
   struct shmSendConnectInfo info;
 
   char shmName[MAX_SHM_NAME_LEN];
-  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
-  info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+  sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
+  info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize;
   TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
   NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
 
-  info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
+  info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
   static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
   memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo));
   return ncclSuccess;
@@ -216,10 +192,10 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
 
   send->conn.buff = resources->devRemHostMem->buff;
   send->conn.llBuff = resources->devRemHostMem->llBuff;
   send->conn.tail = &resources->devRemHostMem->tail;
-  send->conn.opCount = &resources->devRemHostMem->opCount;
+  send->conn.opCountRem = &resources->devRemHostMem->opCount;
 
   send->conn.head = &resources->devHostMem->head;
-  send->conn.llHead = &resources->devHostMem->llHead;
+  send->conn.opCountLoc = &resources->devHostMem->opCount;
   return ncclSuccess;
 }
 
@@ -235,12 +211,12 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
   NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
   NCCLCHECK(shmUnlink(shmName));
   recv->conn.head = &resources->devRemHostMem->head;
-  recv->conn.llHead = &resources->devRemHostMem->llHead;
+  recv->conn.opCountRem = &resources->devRemHostMem->opCount;
 
   recv->conn.buff = resources->devHostMem->buff;
   recv->conn.llBuff = resources->devHostMem->llBuff;
   recv->conn.tail = &resources->devHostMem->tail;
-  recv->conn.opCount = &resources->devHostMem->opCount;
+  recv->conn.opCountLoc = &resources->devHostMem->opCount;
   return ncclSuccess;
 }
 
@@ -262,7 +238,6 @@ ncclResult_t shmRecvFree(void* transportResources) {
 
 struct ncclTransport shmTransport = {
   "SHM",
-  shmFillInfo,
   shmCanConnect,
   shmGetRings,
   { shmSendSetup, shmSendConnect, shmSendFree, NULL },
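The opCount-to-opCountLoc/opCountRem split that runs through these hunks gives each connector a view of both its own operation counter and its peer's. A sketch of the handshake that this enables (an interpretation, not NCCL code; Conn is an illustrative type):

  // Sketch of the opCount handshake implied by the opCountLoc/opCountRem
  // split above: each side bumps a per-connection counter in its own
  // memory, and a peer proceeds once the remote counter reaches the
  // expected operation number.
  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  struct Conn { uint64_t *opCountLoc, *opCountRem; };

  int main() {
    uint64_t sendOp = 0, recvOp = 0;          // one counter per side's memory
    struct Conn send = { &sendOp, &recvOp };  // local = mine, remote = peer's
    struct Conn recv = { &recvOp, &sendOp };

    uint64_t expected = 1;
    *send.opCountLoc = expected;              // sender starts collective #1
    if (*recv.opCountRem == expected)         // receiver sees the sender is ready
      *recv.opCountLoc = expected;
    printf("both sides at op %" PRIu64 "\n", *send.opCountLoc);
    return 0;
  }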