github.com/marian-nmt/nccl.git
-rw-r--r--  makefiles/common.mk                        |   7
-rw-r--r--  makefiles/version.mk                       |   4
-rw-r--r--  pkg/redhat/nccl.spec.in                    |   4
-rw-r--r--  pkg/srctxz/Makefile                        |   1
-rw-r--r--  pkg/srctxz/create_srctxz.sh.in             |   3
-rw-r--r--  src/Makefile                               |  16
-rw-r--r--  src/bootstrap.cu                           | 242
-rw-r--r--  src/channel.cu                             |  51
-rw-r--r--  src/collectives/all_gather.cu              |  22
-rw-r--r--  src/collectives/all_reduce.cu              |  26
-rw-r--r--  src/collectives/broadcast.cu               |  34
-rw-r--r--  src/collectives/collectives.h              |  37
-rw-r--r--  src/collectives/device/Makefile            |  39
-rw-r--r--  src/collectives/device/all_gather.cu       |   8
-rw-r--r--  src/collectives/device/all_gather.h        | 218
-rw-r--r--  src/collectives/device/all_reduce.cu       |  14
-rw-r--r--  src/collectives/device/all_reduce.h        | 381
-rw-r--r--  src/collectives/device/broadcast.cu        |   8
-rw-r--r--  src/collectives/device/broadcast.h         | 200
-rw-r--r--  src/collectives/device/common.h            | 112
-rw-r--r--  src/collectives/device/common_kernel.h     | 186
-rw-r--r--  src/collectives/device/functions.cu        |  10
-rwxr-xr-x  src/collectives/device/gen_rules.sh        |  28
-rw-r--r--  src/collectives/device/ll_kernel.h         | 154
-rw-r--r--  src/collectives/device/primitives.h        | 709
-rw-r--r--  src/collectives/device/reduce.cu           |  14
-rw-r--r--  src/collectives/device/reduce.h            | 165
-rw-r--r--  src/collectives/device/reduce_kernel.h     |  94
-rw-r--r--  src/collectives/device/reduce_scatter.cu   |  14
-rw-r--r--  src/collectives/device/reduce_scatter.h    | 158
-rw-r--r--  src/collectives/reduce.cu                  |  23
-rw-r--r--  src/collectives/reduce_scatter.cu          |  22
-rw-r--r--  src/enqueue.cu                             | 442
-rw-r--r--  src/include/bootstrap.h                    |   2
-rw-r--r--  src/include/channel.h                      |  14
-rw-r--r--  src/include/checks.h                       |  10
-rw-r--r--  src/include/common_coll.h                  | 195
-rw-r--r--  src/include/core.h                         | 186
-rw-r--r--  src/include/cpuset.h                       |  61
-rw-r--r--  src/include/debug.h                        |   1
-rw-r--r--  src/include/enqueue.h                      |   7
-rw-r--r--  src/include/nccl_net.h                     |  46
-rw-r--r--  src/include/net.h                          |   8
-rw-r--r--  src/include/nvlink.h                       |  74
-rw-r--r--  src/include/nvmlwrap.h                     |  18
-rw-r--r--  src/include/ring.h                         |  14
-rw-r--r--  src/include/rings.h                        |   2
-rw-r--r--  src/include/socket.h                       |   9
-rw-r--r--  src/include/transport.h                    |  87
-rw-r--r--  src/include/trees.h                        |  13
-rw-r--r--  src/init.cu                                | 669
-rw-r--r--  src/misc/checks.cu                         |  69
-rw-r--r--  src/misc/enqueue.cu                        | 248
-rw-r--r--  src/misc/group.cu                          |  12
-rw-r--r--  src/misc/nvmlwrap.cu                       |  49
-rw-r--r--  src/misc/rings.cu                          |  61
-rw-r--r--  src/misc/trees.cu                          | 108
-rw-r--r--  src/misc/utils.cu                          |  18
-rw-r--r--  src/nccl.h.in                              |  14
-rw-r--r--  src/ring.cu                                |  70
-rw-r--r--  src/transport.cu                           | 325
-rw-r--r--  src/transport/net.cu                       | 532
-rw-r--r--  src/transport/net_ib.cu                    | 223
-rw-r--r--  src/transport/net_socket.cu                |  28
-rw-r--r--  src/transport/p2p.cu                       | 229
-rw-r--r--  src/transport/shm.cu                       |  57
66 files changed, 3700 insertions, 3205 deletions
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 83a2a39..d0e2ca8 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -15,8 +15,7 @@ PROFAPI ?= 0
NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_LIB ?= $(CUDA_HOME)/lib64
-CUDA_INC ?= $(CUDA_HOME)/include
-CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
+CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
@@ -36,14 +35,14 @@ CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
# Include Volta support if we're using CUDA9 or above
-ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0)
+ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0)
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
else
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
endif
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
-CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
+CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
CXXFLAGS += -Wall -Wno-sign-compare
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all
# Use addprefix so that we can specify more than one path
diff --git a/makefiles/version.mk b/makefiles/version.mk
index f9cee6a..a8c6e3a 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
-NCCL_MINOR := 3
-NCCL_PATCH := 7
+NCCL_MINOR := 4
+NCCL_PATCH := 2
NCCL_SUFFIX :=
PKG_REVISION := 1
diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in
index 65a2c60..f9d83a3 100644
--- a/pkg/redhat/nccl.spec.in
+++ b/pkg/redhat/nccl.spec.in
@@ -1,6 +1,6 @@
Name: libnccl
-Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}
-Release: ${pkg:Revision}
+Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
+Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
Summary: NVIDIA Collectives Communication Library (NCCL) Runtime
Group: Development/Libraries
diff --git a/pkg/srctxz/Makefile b/pkg/srctxz/Makefile
index 1cb7c06..ed677fe 100644
--- a/pkg/srctxz/Makefile
+++ b/pkg/srctxz/Makefile
@@ -36,4 +36,5 @@ $(TXZPREPDIR)/% : %.in
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
+ -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
$< > $@
diff --git a/pkg/srctxz/create_srctxz.sh.in b/pkg/srctxz/create_srctxz.sh.in
index 0b8e6d2..ae7d01f 100644
--- a/pkg/srctxz/create_srctxz.sh.in
+++ b/pkg/srctxz/create_srctxz.sh.in
@@ -25,8 +25,9 @@ NCCL_MAJOR=${nccl:Major}
NCCL_MINOR=${nccl:Minor}
NCCL_PATCH=${nccl:Patch}
NCCL_SUFFIX=${nccl:Suffix}
+NCCL_BUILD=${pkg:Revision}
-NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}"
+NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"
tar --exclude build \
--exclude ".git*" \
diff --git a/src/Makefile b/src/Makefile
index 481000a..fe60b11 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -9,8 +9,8 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
-LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \
- misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \
+LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \
+ misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \
transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \
collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu
@@ -29,11 +29,10 @@ LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH))
STATICLIBTARGET := $(STATICLIBNAME)
LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o)
DEPFILES := $(LIBOBJ:%.o=%.d)
-LDFLAGS += -L${CUDA_LIB} -lcudart_static -lrt
+LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl
DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a
-
##### rules
build : lib staticlib
@@ -41,9 +40,12 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET)
staticlib : $(LIBDIR)/$(STATICLIBTARGET)
-devicelib: $(INCDIR)/nccl.h
+$(DEVICELIB): ALWAYS_REBUILD
$(MAKE) -C collectives/device
+# Empty target to force rebuild
+ALWAYS_REBUILD:
+
-include $(DEPFILES)
$(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ)
@@ -59,14 +61,14 @@ $(INCDIR)/nccl.h : nccl.h.in
-e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \
$< > $@
-$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib
+$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
@printf "Linking %-35s > %s\n" $(LIBTARGET) $@
mkdir -p $(LIBDIR)
$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
-$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib
+$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
mkdir -p $(LIBDIR)
$(eval TMP := $(shell mktemp -d))
diff --git a/src/bootstrap.cu b/src/bootstrap.cu
index 13c6e92..6b1d573 100644
--- a/src/bootstrap.cu
+++ b/src/bootstrap.cu
@@ -15,27 +15,31 @@
// Always use sockets for bootstrap
ncclNet_t* ncclBootstrapNet = &ncclNetSocket;
-static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
-static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
-static ncclResult_t bootstrapCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; }
+static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; }
// Additional sync functions based on async + test for bootstrap, using host ptrs.
-static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) {
- void* request;
- NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &request));
+static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) {
+ void* request, *mhandle;
+ NCCLCHECK(ncclBootstrapNet->regMr(sendComm, data, size, NCCL_PTR_HOST, &mhandle));
+ NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, mhandle, &request));
+ NCCLCHECK(ncclBootstrapNet->deregMr(sendComm, mhandle));
int done = 0;
- while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
+ while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL));
return ncclSuccess;
}
-static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) {
- void* request;
- NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &request));
+static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) {
+ void* request, *mhandle;
+ NCCLCHECK(ncclBootstrapNet->regMr(recvComm, data, size, NCCL_PTR_HOST, &mhandle));
+ NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, mhandle, &request));
+ NCCLCHECK(ncclBootstrapNet->deregMr(recvComm, mhandle));
int done = 0;
- while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL));
+ while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL));
return ncclSuccess;
}
@@ -51,8 +55,8 @@ struct extId {
struct extInfo {
int rank;
int nranks;
- ncclNetHandle_t extHandleListenFromRoot;
- ncclNetHandle_t extHandleRing;
+ ncclNetHandle_t extHandleListenRoot;
+ ncclNetHandle_t extHandleListen;
};
#include <sys/resource.h>
@@ -68,28 +72,25 @@ static ncclResult_t setFilesLimit() {
static void *bootstrapRoot(void* commId) {
struct extInfo info;
struct extId* id = (struct extId*)commId;
- ncclNetHandle_t *extHandleBstrap = NULL; // for initial rank <-> root information exchange
- ncclNetHandle_t *extHandleRing = NULL; // for bootstrap ring creation
+ ncclNetHandle_t *rankHandles = NULL;
+ ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
ncclNetHandle_t zero = { 0 }; // for sanity checking
void* tmpComm;
ncclResult_t res;
setFilesLimit();
+ TRACE(NCCL_INIT, "BEGIN");
/* Receive addresses from all ranks */
int nranks = 0, c = 0;
do {
- NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpComm), res, out);
- NCCLCHECKGOTO(bootstrapRecv(tmpComm, &info, sizeof(info)), res, out);
- NCCLCHECKGOTO(bootstrapCloseRecv(tmpComm), res, out);
+ NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out);
+ NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out);
+ NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out);
if (c == 0) {
- extHandleBstrap = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
- extHandleRing = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t));
- if (extHandleBstrap == NULL || extHandleRing == NULL) {
- WARN("Bootstrap thread : failed to allocate memory");
- goto out;
- }
nranks = info.nranks;
+ NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out);
+ NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out);
}
if (nranks != info.nranks) {
@@ -97,40 +98,43 @@ static void *bootstrapRoot(void* commId) {
goto out;
}
- if (memcmp(&zero, &extHandleBstrap[info.rank], sizeof(ncclNetHandle_t)) != 0) {
+ if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) {
WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
goto out;
}
- // Save the connection handle for connecting back to the ranks
- memcpy(&extHandleBstrap[info.rank], info.extHandleListenFromRoot, sizeof(ncclNetHandle_t));
- // Save the connection handle for the AllGather ring
- memcpy(&extHandleRing[info.rank], info.extHandleRing, sizeof(ncclNetHandle_t));
+ // Save the connection handle for that rank
+ memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t));
+ memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t));
++c;
} while (c < nranks);
+ TRACE(NCCL_INIT, "COLLECTED HANDLES");
// Send the connect handle for the next rank in the AllGather ring
for (int r=0; r<nranks; ++r) {
int next = (r+1) % nranks;
void *tmpSendComm;
- NCCLCHECKGOTO(bootstrapConnect(0, extHandleBstrap[r], &tmpSendComm), res, out);
- NCCLCHECKGOTO(bootstrapSend(tmpSendComm, &extHandleRing[next], sizeof(ncclNetHandle_t)), res, out);
- NCCLCHECKGOTO(bootstrapCloseSend(tmpSendComm), res, out);
+ NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out);
+ NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
+ NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
}
+ TRACE(NCCL_INIT, "SENT OUT HANDLES");
out:
- bootstrapCloseListen(id->extListenComm);
+ bootstrapNetCloseListen(id->extListenComm);
free(commId);
- free(extHandleBstrap);
- free(extHandleRing);
+ if (rankHandles) free(rankHandles);
+ if (rankHandlesRoot) free(rankHandlesRoot);
+
+ TRACE(NCCL_INIT, "DONE");
return NULL;
}
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) {
struct extId* id = (struct extId*)commId;
id->hostHash = getHostHash();
- NCCLCHECK(bootstrapListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
+ NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm));
ncclUniqueId* threadIdCopy;
NCCLCHECK(ncclCalloc(&threadIdCopy, 1));
memcpy(threadIdCopy, id, sizeof(ncclUniqueId));
@@ -157,10 +161,18 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) {
return ncclSuccess;
}
+struct unexConn {
+ int peer;
+ void* comm;
+ struct unexConn* next;
+};
+
struct extState {
+ void* extBstrapListenComm;
void* extBstrapRingRecvComm;
void* extBstrapRingSendComm;
- ncclNetHandle_t extBstrapRootHandle;
+ ncclNetHandle_t* peerBstrapHandles;
+ struct unexConn* unexpectedConnections;
int rank;
int nranks;
int dev;
@@ -174,39 +186,56 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co
state->rank = rank;
state->nranks = nranks;
*commState = state;
- void* extBstrapRootListenComm; // comm on which we accept root's connections
+
+ TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
struct extInfo info = { 0 };
info.rank = rank;
info.nranks = nranks;
- void *tmpSendComm, *extBstrapRingListenComm, *tmpRecvComm;
+ void *tmpSendComm, *tmpRecvComm;
// Pass the remote address to listen via info
if (idFromEnv) {
- memcpy(&info.extHandleListenFromRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
- memcpy(&info.extHandleRing, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+ memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t));
+ memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t));
}
// listen will return the local address via info (specify interface type 'findSubnetIf')
state->dev = idFromEnv ? findSubnetIf : 0;
- NCCLCHECK(bootstrapListen(state->dev, &info.extHandleListenFromRoot, &extBstrapRootListenComm));
- NCCLCHECK(bootstrapListen(state->dev, &info.extHandleRing, &extBstrapRingListenComm)); // AllGather Ring
+ void* extBstrapListenCommRoot;
+ NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm));
+ NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot));
+
+ // stagger connection times to avoid an overload of the root at very high rank counts
+ if (nranks > 128) {
+ long msec = rank;
+ struct timespec tv;
+ tv.tv_sec = msec / 1000;
+ tv.tv_nsec = 1000000 * (msec % 1000);
+ TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec);
+ (void) nanosleep(&tv, NULL);
+ }
- memcpy(&state->extBstrapRootHandle, &id->extHandleRoot, sizeof(ncclNetHandle_t));
- // send info on my listening sockets to root
- NCCLCHECK(bootstrapConnect(state->dev, id->extHandleRoot, &tmpSendComm));
- NCCLCHECK(bootstrapSend(tmpSendComm, &info, sizeof(info)));
- NCCLCHECK(bootstrapCloseSend(tmpSendComm));
+ // send info on my listening socket to root
+ NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm));
+ NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info)));
+ NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
// get info on my "next" rank in the bootstrap ring from root
ncclNetHandle_t extHandleNext;
- NCCLCHECK(bootstrapAccept(extBstrapRootListenComm, &tmpRecvComm));
- NCCLCHECK(bootstrapRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
- NCCLCHECK(bootstrapCloseRecv(tmpRecvComm));
+ NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
+ NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
+ NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+ NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));
- NCCLCHECK(bootstrapConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
+ NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm));
// Accept the connect request from the previous rank in the AllGather ring
- NCCLCHECK(bootstrapAccept(extBstrapRingListenComm, &state->extBstrapRingRecvComm));
- NCCLCHECK(bootstrapCloseListen(extBstrapRingListenComm));
- NCCLCHECK(bootstrapCloseListen(extBstrapRootListenComm));
+ NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm));
+
+ // AllGather all listen handlers
+ NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks));
+ memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t));
+ NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t)));
+
+ TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
return ncclSuccess;
}
@@ -224,25 +253,106 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
* and send previous step's data from (rank-i) to right
*/
for (int i=0; i<nranks-1; i++) {
- int rslice = (rank - i - 1 + nranks) % nranks;
- int sslice = (rank - i + nranks) % nranks;
+ size_t rslice = (rank - i - 1 + nranks) % nranks;
+ size_t sslice = (rank - i + nranks) % nranks;
// Send slice to the right
- NCCLCHECK(bootstrapSend(state->extBstrapRingSendComm, data+sslice*size, size));
+ NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, data+sslice*size, size));
// Recv slice from the left
- NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
+ NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size));
}
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
return ncclSuccess;
}
-ncclResult_t bootstrapClose(void* commState) {
+ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) {
struct extState* state = (struct extState*)commState;
+ void* tmpSendComm;
+ NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm));
+ NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int)));
+ NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size));
+ NCCLCHECK(bootstrapNetCloseSend(tmpSendComm));
+ return ncclSuccess;
+}
+
+ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) {
+ // New unex
+ struct unexConn* unex;
+ NCCLCHECK(ncclCalloc(&unex, 1));
+ unex->peer = peer;
+ unex->comm = comm;
+
+ // Enqueue
+ struct unexConn* list = state->unexpectedConnections;
+ if (list == NULL) {
+ state->unexpectedConnections = unex;
+ return ncclSuccess;
+ }
+ while (list->next) list = list->next;
+ list->next = unex;
+ return ncclSuccess;
+}
- NCCLCHECK(bootstrapCloseSend(state->extBstrapRingSendComm));
- NCCLCHECK(bootstrapCloseRecv(state->extBstrapRingRecvComm));
+void* unexpectedDequeue(struct extState* state, int peer) {
+ struct unexConn* elem = state->unexpectedConnections;
+ struct unexConn* prev = NULL;
+ while (elem) {
+ if (elem->peer == peer) {
+ if (prev == NULL) {
+ state->unexpectedConnections = elem->next;
+ } else {
+ prev->next = elem->next;
+ }
+ void* comm = elem->comm;
+ free(elem);
+ return comm;
+ }
+ prev = elem;
+ elem = elem->next;
+ }
+ return NULL;
+}
+
+// We can't know who we'll receive from, so we need to receive everything at once
+ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
+ struct extState* state = (struct extState*)commState;
+
+ void* tmpRecvComm;
+
+ // Search unexpected connections first
+ if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) {
+ NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
+ NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+ return ncclSuccess;
+ }
+
+ // Then look for new connections
+ while (1) {
+ NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm));
+ int newPeer;
+ NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int)));
+ if (newPeer == peer) {
+ NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size));
+ NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
+ return ncclSuccess;
+ }
+ // Unexpected connection. Save for later.
+ NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm));
+ }
+}
+
+ncclResult_t bootstrapClose(void* commState) {
+ struct extState* state = (struct extState*)commState;
+ if (state->unexpectedConnections != NULL) {
+ WARN("Unexpected connections are not empty.\n");
+ return ncclInternalError;
+ }
+ NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm));
+ NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm));
+ NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm));
+ free(state->peerBstrapHandles);
free(state);
return ncclSuccess;
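The reworked bootstrap keeps one listen comm per rank (extBstrapListenComm), all-gathers every rank's listen handle into peerBstrapHandles, and adds point-to-point bootstrapSend/bootstrapRecv; connections that arrive from a different peer than the one currently expected are parked on the unexConn list until a later bootstrapRecv asks for that peer. A minimal sketch of how two ranks could exchange a small payload with these calls (the peerInfo struct and the lower-rank-sends-first ordering are illustrative only, not part of the patch):

struct peerInfo { int rank; uint64_t busId; };   // hypothetical payload

ncclResult_t exchangeWithPeer(void* commState, int myRank, int peer,
                              struct peerInfo* mine, struct peerInfo* theirs) {
  // The ordering below only keeps the example deterministic; because
  // bootstrapRecv queues unexpected connections, both sides could also
  // post their send first.
  if (myRank < peer) {
    NCCLCHECK(bootstrapSend(commState, peer, mine, (int)sizeof(*mine)));
    NCCLCHECK(bootstrapRecv(commState, peer, theirs, (int)sizeof(*theirs)));
  } else {
    NCCLCHECK(bootstrapRecv(commState, peer, theirs, (int)sizeof(*theirs)));
    NCCLCHECK(bootstrapSend(commState, peer, mine, (int)sizeof(*mine)));
  }
  return ncclSuccess;
}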
diff --git a/src/channel.cu b/src/channel.cu
new file mode 100644
index 0000000..937e84e
--- /dev/null
+++ b/src/channel.cu
@@ -0,0 +1,51 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "channel.h"
+#include "param.h"
+
+NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
+
+ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
+ struct ncclChannel* channel = comm->channels+channelid;
+ channel->id = channelid;
+
+ // Setup intermediate buffering
+ channel->buffSize = ncclParamBuffsize();
+
+ // Ring index to user rank table.
+ NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
+ NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
+
+ // Communication structures with peers.
+ NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
+ NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
+ for (size_t i=0; i<comm->nRanks; ++i) {
+ channel->peers[i].send.comm = comm;
+ channel->peers[i].recv.comm = comm;
+ }
+
+ // Per-channel operation list.
+ NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
+ return ncclSuccess;
+}
+
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
+ // Operation list
+ NCCLCHECK(ncclCudaHostFree(channel->collectives));
+
+ // Free Ring index to rank tables
+ free(channel->ring.userRanks);
+ CUDACHECK(cudaFree(channel->ring.devUserRanks));
+
+ // Free transport proxy resources
+ for (int r=0; r<nRanks; r++) {
+ struct ncclPeer* peer = channel->peers+r;
+ if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
+ if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
+ }
+ return ncclSuccess;
+}
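channel.cu is new: a communicator now owns an array of ncclChannel structures, and initChannel/freeChannel manage each channel's buffer size, ring rank tables, peer connectors and host-mapped operation list. Roughly how a communicator would drive them (a sketch only; the nChannels loop bound is assumed here, and the real call sites are in init.cu further down in this patch):

// Sketch: bring every channel up, and later tear them all down.
ncclResult_t setupChannels(struct ncclComm* comm, int nChannels) {
  for (int c = 0; c < nChannels; c++)
    NCCLCHECK(initChannel(comm, c));   // ring tables, peers, op list
  return ncclSuccess;
}

ncclResult_t teardownChannels(struct ncclComm* comm, int nChannels) {
  for (int c = 0; c < nChannels; c++)
    NCCLCHECK(freeChannel(comm->channels + c, comm->nRanks));
  return ncclSuccess;
}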
diff --git a/src/collectives/all_gather.cu b/src/collectives/all_gather.cu
index 8dec28e..db21dee 100644
--- a/src/collectives/all_gather.cu
+++ b/src/collectives/all_gather.cu
@@ -4,29 +4,15 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
-#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
-ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- size_t nbytes = count*ncclTypeSize(datatype);
- INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
- } else {
- NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
- NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes*comm->nRanks, 1));
- }
- return ncclSuccess;
-}
-
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
- return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype,
- ncclSum, 0, comm, stream);
+ struct ncclInfo info = { ncclCollAllGather, "AllGather",
+ sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
+ ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
+ return ncclEnqueueCheck(&info);
}
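The host-side collectives now just fill in a ncclInfo descriptor (collective type, buffers, count, datatype, op, root, plus the chunk/slice step constants) and hand it to ncclEnqueueCheck; the argument checking, proxy setup and kernel launch that used to live in ncclAllGatherFunc move behind that call. The public entry point is unchanged, e.g. (sketch assuming an initialized comm and stream, and nranks ranks):

// Each rank contributes `count` floats; recvbuff receives nranks*count.
float *sendbuff, *recvbuff;
CUDACHECK(cudaMalloc(&sendbuff, count * sizeof(float)));
CUDACHECK(cudaMalloc(&recvbuff, nranks * count * sizeof(float)));
NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, ncclFloat, comm, stream));
CUDACHECK(cudaStreamSynchronize(stream));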
diff --git a/src/collectives/all_reduce.cu b/src/collectives/all_reduce.cu
index cc14083..1492c90 100644
--- a/src/collectives/all_reduce.cu
+++ b/src/collectives/all_reduce.cu
@@ -4,29 +4,15 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
-#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
-ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- size_t nbytes = count*ncclTypeSize(datatype);
- INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
- } else {
- NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, nbytes, proxyPatternRing, comm));
- NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, comm->nRanks));
- }
- return ncclSuccess;
-}
-
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
- return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype,
- op, 0, comm, stream);
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
+ struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
+ sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
+ ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
+ return ncclEnqueueCheck(&info);
}
diff --git a/src/collectives/broadcast.cu b/src/collectives/broadcast.cu
index 91ce905..6a3d0a8 100644
--- a/src/collectives/broadcast.cu
+++ b/src/collectives/broadcast.cu
@@ -4,39 +4,23 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
-#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
-ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- size_t nbytes = count*ncclTypeSize(datatype);
- INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
- } else {
- NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, nbytes, proxyPatternFrom(root), comm));
- NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes, 1));
- }
-
- return ncclSuccess;
+NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+ ncclComm_t comm, cudaStream_t stream) {
+ struct ncclInfo info = { ncclCollBroadcast, "Broadcast",
+ sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
+ BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
+ return ncclEnqueueCheck(&info);
}
-
/* Deprecated original "in place" function, similar to MPI */
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
- return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype,
- ncclSum, root, comm, stream);
+ return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
}
-NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
- ncclComm_t comm, cudaStream_t stream);
-ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
- ncclComm_t comm, cudaStream_t stream) {
- return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype,
- ncclSum, root, comm, stream);
-}
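With this change ncclBcast becomes a thin wrapper around ncclBroadcast with sendbuff == recvbuff, so the two calls below are interchangeable (buff, count, root, comm and stream assumed to be set up by the caller):

NCCLCHECK(ncclBcast(buff, count, ncclFloat, root, comm, stream));
NCCLCHECK(ncclBroadcast(buff, buff, count, ncclFloat, root, comm, stream));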
diff --git a/src/collectives/collectives.h b/src/collectives/collectives.h
index 4a5cb7a..e6b19cb 100644
--- a/src/collectives/collectives.h
+++ b/src/collectives/collectives.h
@@ -7,9 +7,7 @@
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
-typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
-
-#define FUNC_INDEX(coll, redop, dtype, ll) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll)
+#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll))
#define NCCL_COLL_NAME(coll, op, dtype) \
coll##_##op##_##dtype
@@ -18,13 +16,17 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
coll##Kernel_##op##_##dtype
/* Declare all collective operations */
-#define DECL_COLL4(coll, op, dtype) \
+#define DECL_COLL5(coll, op, dtype) \
extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
- extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll); \
+ extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \
+
+#define DECL_COLL4(coll, op, dtype) \
+ DECL_COLL5(coll, op, dtype) \
+ DECL_COLL5(coll##LL, op, dtype)
#define DECL_COLL3(coll, op, dtype) \
- DECL_COLL4(coll##LL, op, dtype) \
- DECL_COLL4(coll, op, dtype)
+ DECL_COLL4(coll##Ring, op, dtype) \
+ DECL_COLL4(coll##Tree, op, dtype)
#define DECL_COLL2(coll, op) \
DECL_COLL3(coll, op, i8) \
@@ -52,15 +54,16 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed
DECL_ALL_COLLS
-#define ALLREDUCE_SUBSTEPS 2
-#define ALLREDUCE_BUFCHUNKS 2
-#define ALLGATHER_SUBSTEPS 2
-#define ALLGATHER_BUFCHUNKS 2
-#define REDUCESCATTER_SUBSTEPS 2
-#define REDUCESCATTER_BUFCHUNKS 2
-#define BROADCAST_SUBSTEPS 8
-#define BROADCAST_BUFCHUNKS 2
-#define REDUCE_SUBSTEPS 8
-#define REDUCE_BUFCHUNKS 2
+// CHUNKSIZE must be a multiple of SLICESIZE
+#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
+#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
+#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
+#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
+#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
+#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
+#define BROADCAST_SLICESTEPS 1
+#define BROADCAST_CHUNKSTEPS 1
+#define REDUCE_SLICESTEPS 1
+#define REDUCE_CHUNKSTEPS 1
#endif
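FUNC_INDEX gains an extra dimension: the kernel table is now indexed by collective, reduction op, datatype, algorithm (the `al` argument; ring vs. tree, as the Ring/Tree declaration order above suggests) and the low-latency flag (`ll`). A worked example, assuming the enum values of this NCCL generation (ncclNumOps == 4, ncclNumTypes == 9, ncclCollAllReduce == 4, ncclSum == 0, ncclFloat == 7):

  FUNC_INDEX(ncclCollAllReduce, ncclSum, ncclFloat, /*ll=*/1, /*al=*/1)
    = (((4*4 + 0)*9 + 7)*2 + 1)*2 + 1
    = (151*2 + 1)*2 + 1
    = 607

so the low-latency flag is the fastest-varying index, followed by the algorithm, then the datatype.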
diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile
index e2bcd49..8e92596 100644
--- a/src/collectives/device/Makefile
+++ b/src/collectives/device/Makefile
@@ -12,18 +12,13 @@ OBJDIR := $(BUILDDIR)/obj/collectives/device
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu
-LIBOBJ := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \
- $(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \
- $(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \
- $(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \
- $(OBJDIR)/functions.o
-
LIBSRCFILES += functions.cu
DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
-DEPENDFILES := $(DEPFILES:%.d=%.dep)
+DEPENDFILES:= $(DEPFILES:%.d=%.dep)
STATICLIB := $(OBJDIR)/colldevice.a
DEVOBJ := $(OBJDIR)/devlink.o
+RULESFILE := $(OBJDIR)/Makefile.rules
NVCUFLAGS += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden"
@@ -33,6 +28,16 @@ all: $(STATICLIB)
# Dummy rule so that the extra dependency (%.dep) files are preserved by make
all_deps: $(DEPENDFILES)
+# Auto-generating the rules per op/reduction/datatype/algorithm
+$(RULESFILE) :
+ @printf "Generating %-35s > %s\n" rules $@
+ @mkdir -p $(OBJDIR)
+ @./gen_rules.sh $(OBJDIR) > $@
+
+-include $(RULESFILE)
+
+LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o
+
-include $(DEPFILES)
$(STATICLIB): $(LIBOBJ) $(DEVOBJ)
@@ -58,26 +63,6 @@ $(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
mkdir -p `dirname $@`
$(NVCC) $(NVCUFLAGS) -dc $< -o $@
-$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep
- @printf "Compiling %-35s > %s\n" $< $@
- mkdir -p `dirname $@`
- $(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep
- @printf "Compiling %-35s > %s\n" $< $@
- mkdir -p `dirname $@`
- $(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep
- @printf "Compiling %-35s > %s\n" $< $@
- mkdir -p `dirname $@`
- $(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@
-
-$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep
- @printf "Compiling %-35s > %s\n" $< $@
- mkdir -p `dirname $@`
- $(NVCC) -DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@
-
# ... and create the device-side linked object with all those.
$(DEVOBJ) : $(LIBOBJ)
$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu
index 0f572ce..530bf14 100644
--- a/src/collectives/device/all_gather.cu
+++ b/src/collectives/device/all_gather.cu
@@ -4,12 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "common.h"
#include "all_gather.h"
+#include "common.h"
#include "collectives.h"
-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8);
-#endif
+IMPL_COLL_C(ncclAllGather, ncclCollAllGather);
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index a30e575..36809c9 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -8,72 +8,35 @@
#include "primitives.h"
#include "collectives.h"
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
- step++; \
- poffset = noffset; \
- noffset += sliceSize; \
- if (noffset == buffSize) noffset = 0;
-
template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- __shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- int prevdirect = ring->recv.conn.direct;
- int nextdirect = ring->send.conn.direct;
-
- WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS);
- PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0);
- PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS);
-
- typedef Primitives<UNROLL, ALLGATHER_SUBSTEPS, T> Prims;
-
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
- const int buffSize = ring->buffSize / sizeof(T);
- const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS;
- const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
- if (tid == 0) {
- // Update in case we skipped some collectives
- *ring->recv.conn.opCount = args->opCount;
- // Wait for next to be ready
- WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
- waitOpCountNext.wait(args->opCount);
- if (prevdirect) {
- *ring->recv.conn.ptrExchange = args->ThisOutput;
- }
- if (nextdirect) {
- void* volatile* ptr = &(ring->devMemSend->ptrExchange);
- while (*ptr == nullptr);
- sharedNextOutput = (T*)*ptr;
- *ptr = nullptr;
- }
- }
- __syncthreads();
-
- uint64_t step = 0ULL;
- int poffset, noffset = 0;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+ const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
- T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
- ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t chunkOffset = gridOffset + bid*chunkSize;
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t chunkOffset = gridOffset + bid*realChunkSize;
/////////////// begin AllGather steps ///////////////
ssize_t offset;
- int maxOffset = min(chunkSize, size-chunkOffset);
+ int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
@@ -81,129 +44,51 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) {
offset = chunkOffset + rankDest * size;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
- Prims::Copy(tid, nthreads,
- thisInput + chunkOffset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
+ prims.directSend(thisInput+chunkOffset, offset, nelem);
} else {
- Prims::DoubleCopy(tid, nthreads,
- thisInput + chunkOffset,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
+ prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem);
}
- NEXT_STEP; // Increases step, poffset, noffset
-
// k-2 steps: copy to next GPU
- if (prevdirect) {
- for (int j=1; j<nranks-1; ++j) {
- rankDest = ring->devUserRanks[nranks-j];
- offset = chunkOffset + rankDest * size;
-
- Prims::Copy(tid, nthreads,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
- Prims::Copy(tid, nthreads,
- NULL,
- NULL,
- 0, 0,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
- } else {
- for (int j=1; j<nranks-1; ++j) {
- rankDest = ring->devUserRanks[nranks-j];
- offset = chunkOffset + rankDest * size;
-
- Prims::DoubleCopy(tid, nthreads,
- prevInput + poffset,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
-
- // Make final copy from buffer to dest.
- rankDest = ring->devUserRanks[1];
+ for (int j=1; j<nranks-1; ++j) {
+ rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
- // Here we need to copy from buffer to this output.
- Prims::Copy(tid, nthreads,
- prevInput + poffset,
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
+ prims.directRecvCopySend(thisOutput+offset, offset, nelem);
}
- }
- if (tid == 0) {
- waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS));
- *ring->send.conn.head = 0ULL;
- *ring->recv.conn.tail = 0ULL;
- __threadfence_system();
- *ring->recv.conn.opCount = args->opCount+1;
+ // Make final copy from buffer to dest.
+ rankDest = ring->devUserRanks[1];
+ offset = chunkOffset + rankDest * size;
+
+ // Final wait/copy.
+ prims.directRecv(thisOutput+offset, offset, nelem);
}
}
-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
- poffset = noffset; \
- pflag = nflag; \
- noffset += NCCL_LL_SLICE_LINES; \
- if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
- nflag++; \
- step++;
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
- const int llNthreads = args->nThreads;
+ const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
- volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
- volatile int * sizesFifo = ring->send.conn.llFifo;
- uint64_t sendHead = sendHeadPtr[0];
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
- typedef LLPrimitives<T, FUNC> LL;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nRings*chunkSize;
-
- uint64_t step = ring->send.conn.llStep;
- uint32_t pflag, nflag = step + 1;
- int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+ const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
- union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -213,57 +98,34 @@ __device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) {
/////////////// begin AllGather steps ///////////////
ssize_t offset;
- int maxOffset = min(chunkSize, size-chunkOffset);
+ int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
- WAIT_NEXT;
if (thisInput + chunkOffset == thisOutput + offset) { // In place
- LL::ReduceCopy(
- thisInput + chunkOffset,
- nextOutput + noffset,
- maxOffset, nflag, llNthreads);
+ LLprims.send(thisInput+chunkOffset, nelem);
} else {
- LL::ReduceCopy(
- thisInput + chunkOffset,
- thisOutput + offset,
- nextOutput + noffset,
- maxOffset, nflag, llNthreads);
+ LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem);
}
- POST_SIZE;
-
- NEXT_STEP_LL;
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
- WAIT_NEXT;
- LL::ReduceCopy(
- prevInput + poffset,
- thisOutput + offset,
- nextOutput + noffset,
- maxOffset, pflag, nflag, llNthreads);
- POST_SIZE;
- ACK_PREV;
-
- NEXT_STEP_LL;
+ LLprims.recvCopySend(thisOutput+offset, nelem);
}
// step k-1: final store
rankDest = ring->devUserRanks[1];
offset = chunkOffset + rankDest * size;
- LL::ReduceCopy(
- prevInput + poffset,
- thisOutput + offset,
- maxOffset, pflag, llNthreads);
- ACK_PREV;
+ LLprims.recv(thisOutput+offset, nelem);
}
-
- FIFO_CLEANING_AND_SAVE_STEP(nflag);
}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
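The rewritten ring all-gather above delegates all flag handling and buffering to the new ncclPrimitives/ncclLLPrimitives objects (directSend, directRecvCopySend, directRecv and their LL counterparts); the slice schedule itself is unchanged: each chunk makes nranks-1 hops around the ring. A small host-side illustration of that schedule, computed from the same devUserRanks ordering the kernel uses (illustrative helper, not part of the patch):

#include <stdio.h>

// For an nranks-GPU ring, each chunk takes nranks-1 hops:
//   step 0            : send our own slice (devUserRanks[0])
//   steps 1..nranks-2 : receive a slice, copy it into place, forward it
//   last step         : receive the final slice (devUserRanks[1]) into place
void printAllGatherSchedule(const int* devUserRanks, int nranks) {
  printf("send slice %d\n", devUserRanks[0]);
  for (int j = 1; j < nranks - 1; ++j)
    printf("recvCopySend slice %d\n", devUserRanks[nranks - j]);
  printf("recv slice %d\n", devUserRanks[1]);
}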
diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu
index caa1479..aaa96b4 100644
--- a/src/collectives/device/all_reduce.cu
+++ b/src/collectives/device/all_reduce.cu
@@ -4,18 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "common.h"
#include "all_reduce.h"
+#include "common.h"
#include "collectives.h"
-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum);
-#elif NCCL_OP == 1
-IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd);
-#elif NCCL_OP == 2
-IMPL_COLL2(ncclAllReduce, min, FuncMin, ncclCollAllReduce, ncclMin);
-#elif NCCL_OP == 3
-IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax);
-#endif
+IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce);
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index d7abc64..ea89a71 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -8,233 +8,152 @@
#include "primitives.h"
#include "collectives.h"
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
- step++; \
- poffset = noffset; \
- noffset += sliceSize; \
- if (noffset == buffSize) noffset = 0;
-
template<int UNROLL, class FUNC, typename T>
-__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- __shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- int prevdirect = ring->recv.conn.direct;
- int nextdirect = ring->send.conn.direct;
-
- WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS);
- PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0);
- PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS);
-
- typedef Primitives<UNROLL, ALLREDUCE_SUBSTEPS, T, FUNC> Prims;
-
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
- //const int rank = comm->rank;
const int nranks = comm->nRanks;
- const int buffSize = ring->buffSize / sizeof(T);
- const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS;
- const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
- if (tid == 0) {
- // Update in case we skipped some collectives
- *ring->recv.conn.opCount = args->opCount;
- // Wait for next to be ready
- WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
- waitOpCountNext.wait(args->opCount);
- if (prevdirect) {
- *ring->recv.conn.ptrExchange = args->ThisOutput;
- }
- if (nextdirect) {
- void* volatile* ptr = &(ring->devMemSend->ptrExchange);
- while (*ptr == nullptr);
- sharedNextOutput = (T*)*ptr;
- *ptr = nullptr;
- }
- }
- __syncthreads();
-
- uint64_t step = 0ULL;
- int poffset, noffset = 0;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+ const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
- T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) {
- int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings));
- ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize;
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize;
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
- int maxOffset;
+ int nelem;
int slice;
// step 0: push data to next GPU
slice = ring->devUserRanks[nranks-1];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
+ offset = chunkOffset + slice * realChunkSize;
+ nelem = min(realChunkSize, size-offset);
- Prims::Copy(tid, nthreads,
- thisInput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
-
- NEXT_STEP; // Increases step, poffset, noffset
+ prims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring->devUserRanks[nranks-j];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- Prims::Reduce(tid, nthreads,
- prevInput + poffset,
- thisInput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
+ offset = chunkOffset + slice * realChunkSize;
+ nelem = min(realChunkSize, size-offset);
+
+ prims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring->devUserRanks[0];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
+ offset = chunkOffset + slice * realChunkSize;
+ nelem = min(realChunkSize, size-offset);
- Prims::ReduceCopy(tid, nthreads,
- prevInput + poffset,
- thisInput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
+ prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem);
// k-2 steps: copy to next GPU
- if (prevdirect) {
- for (int j=1; j<nranks-1; ++j) {
- slice = ring->devUserRanks[nranks - j];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- Prims::Copy(tid, nthreads,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
- Prims::Copy(tid, nthreads,
- NULL,
- NULL,
- 0, 0,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
- } else {
- for (int j=1; j<nranks-1; ++j) {
- slice = ring->devUserRanks[nranks - j];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- Prims::DoubleCopy(tid, nthreads,
- prevInput + poffset,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
- }
+ for (int j=1; j<nranks-1; ++j) {
+ slice = ring->devUserRanks[nranks-j];
+ offset = chunkOffset + slice * realChunkSize;
+ nelem = min(realChunkSize, size-offset);
- // Make final copy from buffer to dest.
- slice = ring->devUserRanks[1];
- offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- // Here we need to copy from buffer to this output.
- Prims::Copy(tid, nthreads,
- prevInput + poffset,
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
+ prims.directRecvCopySend(thisOutput+offset, offset, nelem);
}
- }
- if (tid == 0) {
- // Wait for next to have consumed all data before we reset the flag
- waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS));
- *ring->send.conn.head = 0ULL;
- *ring->recv.conn.tail = 0ULL;
- __threadfence_system();
- *ring->recv.conn.opCount = args->opCount+1;
+ // Make final copy from buffer to dest.
+ slice = ring->devUserRanks[1];
+ offset = chunkOffset + slice * realChunkSize;
+ nelem = min(realChunkSize, size-offset);
+
+ // Final wait/copy.
+ prims.directRecv(thisOutput+offset, offset, nelem);
}
}
-#include "ll_kernel.h"
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = blockDim.x - 1;
+ const int bid = args->bid;
+ struct ncclComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclTree* tree = &channel->tree;
+ const ssize_t size = args->N;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = args->lastChunkSize;
+ const ssize_t loopSize = args->nChannels*chunkSize;
-#define NEXT_STEP_LL \
- poffset = noffset; \
- pflag = nflag; \
- noffset += NCCL_LL_SLICE_LINES; \
- if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
- nflag++; \
- step++;
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ do {
+ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+ ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.send(thisInput+offset, nelem);
+ } else {
+ prims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ } while(0);
+
+ do {
+ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+ ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.send(thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.recv(thisOutput+offset, nelem);
+ } else {
+ prims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
+ } while(0);
+}
template<int UNUSED, class FUNC, typename T>
-__device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
- const int llNthreads = args->nThreads;
+ const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
- volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
- volatile int * sizesFifo = ring->send.conn.llFifo;
- uint64_t sendHead = sendHeadPtr[0];
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
- typedef LLPrimitives<T, FUNC> LL;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nRings*nranks*chunkSize;
-
- uint64_t step = ring->send.conn.llStep;
- uint32_t pflag, nflag = step + 1;
- int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+ const ssize_t loopSize = args->nChannels*nranks*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
- union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -244,89 +163,99 @@ __device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) {
/////////////// begin AllReduce steps ///////////////
ssize_t offset;
- int maxOffset;
+ int nelem;
int slice;
// step 0: push data to next GPU
slice = ring->devUserRanks[nranks-1];
offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
+ nelem = min(chunkSize, size-offset);
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- nextOutput + noffset,
- maxOffset, nflag, llNthreads);
- POST_SIZE;
-
- NEXT_STEP_LL;
+ LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- prevInput + poffset,
- nextOutput + noffset,
- maxOffset, pflag, nflag, llNthreads);
- POST_SIZE;
- ACK_PREV;
-
- NEXT_STEP_LL;
+ nelem = min(chunkSize, size-offset);
+
+ LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
slice = ring->devUserRanks[0];
offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
+ nelem = min(chunkSize, size-offset);
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- prevInput + poffset,
- thisOutput + offset,
- nextOutput + noffset,
- maxOffset, pflag, nflag, llNthreads);
- POST_SIZE;
- ACK_PREV;
-
- NEXT_STEP_LL;
+ LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
- slice = ring->devUserRanks[nranks - j];
+ slice = ring->devUserRanks[nranks-j];
offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
-
- WAIT_NEXT;
- LL::ReduceCopy(
- prevInput + poffset,
- thisOutput + offset,
- nextOutput + noffset,
- maxOffset, pflag, nflag, llNthreads);
- POST_SIZE;
- ACK_PREV;
-
- NEXT_STEP_LL;
+ nelem = min(chunkSize, size-offset);
+
+ LLprims.recvCopySend(thisOutput+offset, nelem);
}
// Make final copy from buffer to dest.
slice = ring->devUserRanks[1];
offset = chunkOffset + slice * chunkSize;
- maxOffset = min(chunkSize, size-offset);
+ nelem = min(chunkSize, size-offset);
// Here we need to copy from buffer to this output.
- LL::ReduceCopy(
- prevInput + poffset,
- thisOutput + offset,
- maxOffset, pflag, llNthreads);
- ACK_PREV;
+ LLprims.recv(thisOutput+offset, nelem);
}
+}
- FIFO_CLEANING_AND_SAVE_STEP(nflag);
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->bid;
+ struct ncclComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclTree* tree = &channel->tree;
+ const ssize_t size = args->N;
+ ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nChannels*chunkSize;
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ do {
+ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
+ ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ LLprims.send(thisInput+offset, nelem);
+ } else {
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ } while(0);
+
+ do {
+ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
+ ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ LLprims.send(thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ LLprims.recv(thisOutput+offset, nelem);
+ } else {
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
+ } while(0);
}
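
The rewritten ring all-reduce above boils down to a fixed per-chunk schedule over the new primitives: one send of the local slice, nranks-2 recvReduceSend steps, one (direct)RecvReduceCopySend that produces the final value, nranks-2 (direct)RecvCopySend steps, and a closing (direct)Recv. The host-side sketch below is not part of the patch; it reproduces the same dataflow on plain buffers to make the schedule easier to check, assuming a sum reduction, one contiguous slice per rank, an in-place buffer of nranks*sliceSize elements per rank, and a slice numbering that is rotated relative to the kernel's devUserRanks indexing.

    #include <vector>

    // Host reference for the ring all-reduce schedule implemented by
    // ncclAllReduceRingKernel / ncclAllReduceRingLLKernel above.
    // buf[r] is rank r's in-place buffer; slices are laid out contiguously.
    void ringAllReduceReference(std::vector<std::vector<float>>& buf, int sliceSize) {
      const int nranks = (int)buf.size();
      auto slice = [&](int r, int s) {
        return buf[r].begin() + ((s % nranks + nranks) % nranks) * sliceSize;
      };
      // Reduce phase (send, then recvReduceSend, then recvReduceCopySend):
      // in step j, rank r forwards slice (r - j); the next rank accumulates it.
      for (int j = 0; j < nranks - 1; ++j) {
        std::vector<std::vector<float>> msg(nranks);
        for (int r = 0; r < nranks; ++r) msg[r].assign(slice(r, r - j), slice(r, r - j) + sliceSize);
        for (int r = 0; r < nranks; ++r) {
          int from = (r - 1 + nranks) % nranks;
          auto dst = slice(r, from - j);
          for (int i = 0; i < sliceSize; ++i) dst[i] += msg[from][i];
        }
      }
      // Broadcast phase (recvCopySend, then the final recv):
      // in step j, rank r forwards the already-reduced slice (r + 1 - j).
      for (int j = 0; j < nranks - 1; ++j) {
        std::vector<std::vector<float>> msg(nranks);
        for (int r = 0; r < nranks; ++r) msg[r].assign(slice(r, r + 1 - j), slice(r, r + 1 - j) + sliceSize);
        for (int r = 0; r < nranks; ++r) {
          int from = (r - 1 + nranks) % nranks;
          auto dst = slice(r, from + 1 - j);
          for (int i = 0; i < sliceSize; ++i) dst[i] = msg[from][i];
        }
      }
    }

In the kernels the two phases are fused per chunk and pipelined through the connection FIFOs; the LL variant replaces the FIFO head/tail handshake with per-line flags but keeps the same slice order.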
diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu
index 4125de4..b83ee70 100644
--- a/src/collectives/device/broadcast.cu
+++ b/src/collectives/device/broadcast.cu
@@ -4,12 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "common.h"
#include "broadcast.h"
+#include "common.h"
#include "collectives.h"
-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8);
-#endif
+IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast);
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index c2f6d00..fb18312 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -8,174 +8,74 @@
#include "primitives.h"
#include "collectives.h"
-// Increase Step and boffset for buffer sync
-#define NEXT_STEP \
- step++; \
- boffset += sliceSize; \
- if (boffset == buffSize) boffset = 0;
-
template<int UNROLL, class FUNC, typename T>
-__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) {
+__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
- __shared__ T* sharedNextOutput;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- int prevdirect = ring->recv.conn.direct;
- int nextdirect = ring->send.conn.direct;
-
- WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
- PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
- PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS);
-
- typedef Primitives<UNROLL, BROADCAST_SUBSTEPS, T> Prims;
-
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
- const int buffSize = ring->buffSize / sizeof(T);
- const int sliceSize = buffSize / BROADCAST_BUFCHUNKS;
- const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS;
+ const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
const int root = args->root;
- if (tid == 0) {
- // Update in case we skipped some collectives
- *ring->recv.conn.opCount = args->opCount;
- if (nextRank != root) {
- // Wait for next to be ready
- WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
- waitOpCountNext.wait(args->opCount);
- }
- if (rank != root && prevdirect) {
- *ring->recv.conn.ptrExchange = args->ThisOutput;
- }
- if (nextRank != root && nextdirect) {
- void* volatile* ptr = &(ring->devMemSend->ptrExchange);
- while (*ptr == nullptr);
- sharedNextOutput = (T*)*ptr;
- *ptr = nullptr;
- }
- }
- __syncthreads();
-
- uint64_t step = 0ULL;
- int boffset = 0;
-
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
- T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
- ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t offset = gridOffset + bid*chunkSize;
- int maxOffset = min(chunkSize, size-offset);
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t offset = gridOffset + bid*realChunkSize;
+ int nelem = min(realChunkSize, size-offset);
if (rank == root) {
if (thisInput == thisOutput) {
- Prims::Copy(tid, nthreads,
- thisInput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
+ prims.send(thisInput+offset, nelem);
} else {
- Prims::DoubleCopy(tid, nthreads,
- thisInput + offset,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
+ prims.copySend(thisInput+offset, thisOutput+offset, nelem);
}
} else if (nextRank == root) {
- if (prevdirect) maxOffset = 0; // Only wait for signals
- Prims::Copy(tid, nthreads,
- prevInput + boffset,
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
+ prims.recv(thisOutput+offset, nelem);
} else {
- if (prevdirect) {
- Prims::Copy(tid, nthreads,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
- } else {
- Prims::DoubleCopy(tid, nthreads,
- prevInput + boffset,
- thisOutput + offset,
- nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset),
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
- }
- }
- NEXT_STEP; // Increases step, boffset
- }
-
- if (tid == 0) {
- if (nextRank != root) {
- // Wait for next to have consumed data before resetting the flag
- waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1));
- *ring->send.conn.head = 0ULL;
+ prims.recvCopySend(thisOutput+offset, nelem);
}
- *ring->recv.conn.tail = 0ULL;
- __threadfence_system();
- *ring->recv.conn.opCount = args->opCount+1;
}
}
-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
- boffset += NCCL_LL_SLICE_LINES; \
- if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
- flag++; \
- step++;
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
-__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
- const int llNthreads = args->nThreads;
+ const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
- volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
- volatile int * sizesFifo = ring->send.conn.llFifo;
- uint64_t sendHead = sendHeadPtr[0];
- const int rank = comm->rank;
- const int nextRank = ring->devUserRanks[1];
- const int root = args->root;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
- typedef LLPrimitives<T, FUNC> LL;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
- ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nRings*chunkSize;
+ const int rank = ring->devUserRanks[0];
+ const int nextRank = ring->devUserRanks[1];
+ const int root = args->root;
- uint64_t step = ring->send.conn.llStep;
- uint32_t flag = step + 1;
- int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+ ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
- union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -183,46 +83,20 @@ __device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) {
}
ssize_t offset = gridOffset + bid*chunkSize;
- int maxOffset = min(chunkSize, size-offset);
+ int nelem = min(chunkSize, size-offset);
if (rank == root) {
- WAIT_NEXT;
if (thisInput == thisOutput) {
- LL::ReduceCopy(
- thisInput + offset,
- nextOutput + boffset,
- maxOffset, flag, llNthreads);
+ LLprims.send(thisInput+offset, nelem);
} else {
- LL::ReduceCopy(
- thisInput + offset,
- thisOutput + offset,
- nextOutput + boffset,
- maxOffset, flag, llNthreads);
+ LLprims.copySend(thisInput + offset, thisOutput + offset, nelem);
}
- POST_SIZE;
- NEXT_STEP_LL;
} else if (nextRank == root) {
- LL::ReduceCopy(
- prevInput + boffset,
- thisOutput + offset,
- maxOffset, flag, llNthreads);
- NEXT_STEP_LL;
- ACK_PREV;
+ LLprims.recv(thisOutput + offset, nelem);
} else {
- WAIT_NEXT;
- LL::ReduceCopy(
- prevInput + boffset,
- thisOutput + offset,
- nextOutput + boffset,
- maxOffset, flag, flag, llNthreads);
- POST_SIZE;
- NEXT_STEP_LL;
- ACK_PREV;
+ LLprims.recvCopySend(thisOutput + offset, nelem);
}
}
-
- // We need everyone to acknowledge data even if they didn't receive anything
- // so that the next collective can start right away.
- ACK_PREV;
-
- FIFO_CLEANING_AND_SAVE_STEP(flag);
}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
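
The ring broadcast kernels above pick one primitive per rank and per chunk: the root sends (or copySends when the broadcast is out of place), the rank whose next hop is the root only receives, and every other rank receives, stores, and forwards. The snippet below is only an illustration of that role selection, assuming an in-place broadcast and a ring in natural rank order 0 -> 1 -> ... -> n-1 -> 0 (the real kernels take the ordering from ring->devUserRanks).

    #include <cstdio>

    // Prints which primitive each rank of a 4-GPU ring would call per chunk
    // in ncclBroadcastRingKernel / ncclBroadcastRingLLKernel, for root = 2.
    int main() {
      const int nranks = 4, root = 2;
      for (int rank = 0; rank < nranks; ++rank) {
        int nextRank = (rank + 1) % nranks;
        const char* op = (rank == root)     ? "send"          // inject data into the ring
                       : (nextRank == root) ? "recv"          // last hop: the ring stops before the root
                                            : "recvCopySend"; // store locally, then forward
        printf("rank %d: %s\n", rank, op);
      }
      return 0;
    }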
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index c988913..e4aecbd 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -11,13 +11,29 @@
#include "core.h"
#include "nccl.h"
+// Exit-if-abort barrier across the CTA: make sure all threads exit consistently.
+// Each thread sets a predicate to true if abort == 1, then all of the CTA's
+// threads enter the barrier and popc the predicates.
+// If any thread's predicate was true, all threads call exit().
+static inline __device__ void exitIfAbortBarrier(int abort) {
+ uint32_t popc;
+ asm ("{");
+ asm volatile (" .reg .pred barr_pred;");
+ asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
+ asm volatile (" bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc));
+ asm ("}");
+ if (popc) { asm volatile ("exit;"); }
+}
+
typedef void(*ncclKern_t)(struct CollectiveArgs* args);
extern __device__ ncclKern_t ncclFuncs[];
static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
int* d = (int*)dst;
int* s = (int*)src;
- __syncthreads();
+ // When aggregation is effective, if some threads have aborted inside the LL kernel,
+ // make sure the rest of the threads abort as well
+ exitIfAbortBarrier(0);
for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o];
__syncthreads();
}
@@ -27,12 +43,14 @@ static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* ho
}
/* Functions for aggregation case */
-#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
+#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
__device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \
- coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(args); \
+ coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(args); \
}
+
+#if NCCL_OP == 0
/* Kernels with the first operation inlined */
-#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \
+#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \
__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \
__global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
int tid = threadIdx.x; \
@@ -40,25 +58,25 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
__shared__ struct ncclColl localColl; \
\
struct ncclComm* comm = firstColl.args.comm; \
- struct ncclRing* ring = comm->rings+bid; \
+ struct ncclChannel* channel = comm->channels+bid; \
struct ncclColl* c; \
if (bid == 0) { \
/* To optimize for latency, (only) the first operation is passed as argument.*/ \
c = &firstColl; \
} else { \
c = &localColl; \
- load_coll(c, ring->devCollectives+ring->collFifoHead, tid); \
+ load_coll(c, channel->devCollectives+channel->collFifoHead, tid); \
} \
while (1) { \
- if (tid < c->nThreads) { \
+ if (tid < c->args.nThreads) { \
if (c->funcIndex == fIndex) { \
- coll##Kernel<UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
+ coll##Kernel<COLL_UNROLL, ncclFunc<ctype>, ctype>(&c->args); \
} else { \
ncclFuncs[c->funcIndex](&c->args); \
} \
} \
int nextIndex = c->nextIndex; \
- if (tid == 0) ring->collFifoHead = nextIndex; \
+ if (tid == 0) channel->collFifoHead = nextIndex; \
\
if (c->active == 2) { \
return; \
@@ -66,25 +84,75 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
\
/* Load next collective operation*/ \
c = &localColl; /* for bid 0 */ \
- load_coll(c, ring->devCollectives+nextIndex, tid); \
+ load_coll(c, channel->devCollectives+nextIndex, tid); \
} \
}
+#else
+#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex)
+#endif
+
+// Only generate inline kernels for LL
+#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \
+ IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \
+ IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \
+ IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
- IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \
- IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \
- IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \
- IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0)) \
+ IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \
+ IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1)
+#if NCCL_TYPE == 0
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8)
+#elif NCCL_TYPE == 1
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8)
+#elif NCCL_TYPE == 2
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32)
+#elif NCCL_TYPE == 3
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32)
+#elif NCCL_TYPE == 4
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64)
+#elif NCCL_TYPE == 5
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64)
+#elif NCCL_TYPE == 6
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16)
+#elif NCCL_TYPE == 7
+#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
+ IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32)
+#elif NCCL_TYPE == 8
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
- IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
- IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) \
- IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) \
- IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \
- IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) \
- IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \
- IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) \
- IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \
IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64)
+#endif
+
+// Reduction define all functions
+#if NCCL_OP == 0
+#define IMPL_COLL_R(collf, colln) \
+ IMPL_COLL2(collf, sum, FuncSum, colln, ncclSum);
+#elif NCCL_OP == 1
+#define IMPL_COLL_R(collf, colln) \
+ IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd);
+#elif NCCL_OP == 2
+#define IMPL_COLL_R(collf, colln) \
+ IMPL_COLL2(collf, min, FuncMin, colln, ncclMin);
+#elif NCCL_OP == 3
+#define IMPL_COLL_R(collf, colln) \
+ IMPL_COLL2(collf, max, FuncMax, colln, ncclMax);
+#endif
+
+// Copy primitives only define one
+#if NCCL_OP == 0 && NCCL_TYPE == 0
+#define IMPL_COLL_C(collf, colln) \
+ IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8);
+#else
+#define IMPL_COLL_C(collf, colln)
+#endif
+
+#define COLL_UNROLL 4
#endif
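
With this restructuring, each device translation unit is built for a single (NCCL_OP, NCCL_TYPE) pair (see gen_rules.sh below), and IMPL_COLL_R / IMPL_COLL_C expand only the functions for that pair, with the inline __global__ kernels emitted only for the LL variants and only when NCCL_OP == 0. As a rough illustration, and assuming all_reduce.cu invokes IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce) the same way broadcast.cu invokes IMPL_COLL_C above (the NCCL_COLL_NAME / NCCL_KERN_NAME spellings live in collectives.h and are not shown in this diff), compiling all_reduce.cu with -DNCCL_OP=0 -DNCCL_TYPE=7 expands to:

    // IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce) with NCCL_OP=0, NCCL_TYPE=7 (sum, float)
    //   -> IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum)
    //   -> IMPL_COLL3(..., f32, float, ..., ncclFloat32)
    //   -> IMPL_COLL4(ncclAllReduceRing, ..., al=0) + IMPL_COLL4(ncclAllReduceTree, ..., al=1)
    // which emits, for each of the two algorithms:
    //   IMPL_COLL_FUNC(ncclAllReduce{Ring,Tree},   sum, FuncSum, f32, float)  // __device__ function
    //   IMPL_COLL_FUNC(ncclAllReduce{Ring,Tree}LL, sum, FuncSum, f32, float)  // __device__ LL function
    //   IMPL_COLL_KERN(ncclAllReduce{Ring,Tree}LL, ...)                       // inline __global__ LL kernel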
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index 0eaa061..e1fb096 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -192,14 +192,6 @@ struct MULTI<FUNC, int64_t> {
}
};
-#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a))
-
-template<typename T>
-__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
- size_t ptrval = reinterpret_cast<size_t>(ptr);
- return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
-}
-
template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
return *ptr;
@@ -236,25 +228,6 @@ void vStore<half>(volatile half* ptr, const half val) {
}
#endif
-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS>
-__device__ inline void ReduceCopy(
- const int tid, const int nthreads,
- const volatile T * __restrict__ const src0,
- const volatile T * __restrict__ const src1,
- volatile T * __restrict__ const dest0,
- volatile T * __restrict__ const dest1, const int N) {
- for (int idx = tid; idx < N; idx += nthreads) {
- T val = vFetch(src0+idx);
- if (TWO_INPUTS) {
- val = FUNC()(val, vFetch(src1+idx));
- }
- vStore(dest0+idx, val);
- if (TWO_OUTPUTS) {
- vStore(dest1+idx, val);
- }
- }
-}
-
typedef ulong2 Pack128;
template<class FUNC, typename T>
@@ -265,72 +238,111 @@ struct MULTI128 {
}
};
-inline __device__ void Fetch128(Pack128& v, Pack128* p) {
+inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
}
inline __device__ void Store128(Pack128* p, Pack128& v) {
asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory");
}
+template<class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthreads,
+ int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
+ const int offset, const int N) {
+ for (int idx = offset+tid; idx < offset+N; idx += nthreads) {
+ T val = vFetch(srcs[0]+idx);
+ #pragma unroll
+ for (int i=1; i<MINSRCS; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+ #pragma unroll 1
+ for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) val = FUNC()(val, vFetch(srcs[i]+idx));
+
+ #pragma unroll
+ for (int i=0; i<MINDSTS; i++) vStore(dsts[i]+idx, val);
+ #pragma unroll 1
+ for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) vStore(dsts[i]+idx, val);
+ }
+}
+
#define WARP_SIZE 32
-template<class FUNC, typename T, bool TWO_INPUTS, bool TWO_OUTPUTS, int UNROLL>
-__device__ inline void ReduceCopy128b( const int w, const int nw, const int t,
- Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1,
- const int N) {
- Pack128 t0[UNROLL];
- Pack128 t1[UNROLL];
- const Pack128* src0_end = src0 + N;
+
+template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t,
+ int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS],
+ const int elemOffset, const int Npack) {
const int inc = nw * UNROLL * WARP_SIZE;
- const int offset = w * UNROLL * WARP_SIZE + t;
- src0 += offset; if (TWO_INPUTS) src1 += offset;
- dest0 += offset; if (TWO_OUTPUTS) dest1 += offset;
-
- while (src0 < src0_end) {
-#pragma unroll
- for (int u = 0; u < UNROLL; ++u) {
- Fetch128(t0[u], src0+u*WARP_SIZE);
- if (TWO_INPUTS) Fetch128(t1[u], src1+u*WARP_SIZE);
+ int offset = w * UNROLL * WARP_SIZE + t;
+
+ const Pack128* srcs[MAXSRCS];
+ for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset;
+ Pack128* dsts[MAXDSTS];
+ for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset;
+
+ while (offset < Npack) {
+ Pack128 vals[UNROLL];
+ // Load and reduce
+ for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);
+
+ for (int i=1; i<MINSRCS; i++) {
+ Pack128 vals2[UNROLL];
+ for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+ for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
}
-#pragma unroll
- for (int u = 0; u < UNROLL; ++u) {
- if (TWO_INPUTS) MULTI128<FUNC, T>()(t0[u], t1[u]);
- Store128(dest0+u*WARP_SIZE, t0[u]);
- if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]);
+ #pragma unroll 1
+ for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
+ Pack128 vals2[UNROLL];
+ for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
+ for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
}
- src0 += inc; if (TWO_INPUTS) src1 += inc;
- dest0 += inc; if (TWO_OUTPUTS) dest1 += inc;
+
+ // Store
+ for (int i = 0; i < MINDSTS; i++) {
+ for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+ }
+ #pragma unroll 1
+ for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
+ for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
+ }
+ for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
+ for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
+ offset += inc;
}
}
-template<int UNROLL, class FUNC, typename T, bool HAS_DEST1, bool HAS_SRC1>
-__device__ inline void ReduceOrCopy(const int tid, const int nthreads,
- volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
- const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
+template <typename T>
+__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
+
+// Try to limit consecutive load/stores to 8.
+// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
+#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
+
+template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
+__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads,
+ int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS],
int N) {
int Nrem = N;
if (Nrem <= 0) return;
- int Npreamble = (Nrem<alignof(Pack128)) ? Nrem : AlignUp(dest0, alignof(Pack128)) - dest0;
+ int alignDiff = 0;
+ int align = ptrAlign128(srcs[0]);
+ #pragma unroll
+ for (int i=1; i<MINSRCS; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+ for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) alignDiff |= (align ^ ptrAlign128(srcs[i]));
+ #pragma unroll
+ for (int i=0; i<MINDSTS; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
+ for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) alignDiff |= (align ^ ptrAlign128(dsts[i]));
- // stage 0: check if we'll be able to use the fast, 128-bit aligned path.
- // If not, we'll just use the slow preamble path for the whole operation
- bool alignable = (((AlignUp(src0, alignof(Pack128)) == src0 + Npreamble)) &&
- (!HAS_DEST1 || (AlignUp(dest1, alignof(Pack128)) == dest1 + Npreamble)) &&
- (!HAS_SRC1 || (AlignUp(src1, alignof(Pack128)) == src1 + Npreamble)));
-
- if (!alignable) {
- Npreamble = Nrem;
- }
+ int Npreamble = alignDiff ? Nrem :
+ N < alignof(Pack128) ? N :
+ (alignof(Pack128) - align) % alignof(Pack128);
// stage 1: preamble: handle any elements up to the point of everything coming
// into alignment
- ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Npreamble);
-
- Nrem -= Npreamble;
- if (Nrem == 0) return;
-
- dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; }
- src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; }
+ if (Npreamble) {
+ ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble);
+ Nrem -= Npreamble;
+ if (Nrem == 0) return;
+ }
+ int offset = Npreamble;
// stage 2: fast path: use 128b loads/stores to do the bulk of the work,
// assuming the pointers we have are all 128-bit alignable.
@@ -338,35 +350,33 @@ __device__ inline void ReduceOrCopy(const int tid, const int nthreads,
int nw = nthreads / WARP_SIZE; // Number of warps
int t = tid % WARP_SIZE; // Thread (inside the warp)
- const int PackFactor = sizeof(Pack128) / sizeof(T);
+ const int packFactor = sizeof(Pack128) / sizeof(T);
// stage 2a: main loop
- int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads))
- * (UNROLL * nthreads); // round down
+ int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE))
+ * (AUTOUNROLL * WARP_SIZE); // round down
+ int Nelem2a = Npack2a * packFactor;
- ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, UNROLL>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a);
+ ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a);
- int Ndone2a = Nalign2a * PackFactor;
- Nrem -= Ndone2a;
+ Nrem -= Nelem2a;
if (Nrem == 0) return;
- dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
- src0 += Ndone2a; if (HAS_SRC1) { src1 += Ndone2a; }
+ offset += Nelem2a;
// stage 2b: slightly less optimized for section when we don't have full
- // UNROLLs
+ // unrolling
- int Nalign2b = Nrem / PackFactor;
+ int Npack2b = Nrem / packFactor;
+ int Nelem2b = Npack2b * packFactor;
- ReduceCopy128b<FUNC, T, HAS_SRC1, HAS_DEST1, 1>(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b);
+ ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b);
- int Ndone2b = Nalign2b * PackFactor;
- Nrem -= Ndone2b;
+ Nrem -= Nelem2b;
if (Nrem == 0) return;
- dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
- src0 += Ndone2b; if (HAS_SRC1) { src1 += Ndone2b; }
+ offset += Nelem2b;
// stage 2c: tail
- ReduceCopy<FUNC, T, HAS_SRC1, HAS_DEST1>(tid, nthreads, src0, src1, dest0, dest1, Nrem);
+ ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
}
#endif // COMMON_KERNEL_H_
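
Stripped of the 128-bit fast path and the warp/unroll bookkeeping, the new multi-source/multi-destination helper has a simple contract: reduce up to MAXSRCS sources element-wise with FUNC and store the result into up to MAXDSTS destinations. The host-side sketch below models that contract with FUNC as plain addition (FuncSum); it is for reading convenience only and ignores alignment, unrolling, and the MINSRCS/MINDSTS compile-time bounds.

    #include <vector>

    // Host sketch of ReduceOrCopyMulti's observable behavior: element-wise
    // reduction of all sources, result written to every destination.
    template <typename T>
    void reduceOrCopyMultiReference(const std::vector<const T*>& srcs,
                                    const std::vector<T*>& dsts, int N) {
      for (int idx = 0; idx < N; ++idx) {
        T val = srcs[0][idx];
        for (size_t i = 1; i < srcs.size(); ++i) val = val + srcs[i][idx];  // FUNC()(val, srcs[i][idx])
        for (size_t i = 0; i < dsts.size(); ++i) dsts[i][idx] = val;
      }
    }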
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index 1fb8108..ea06b68 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -8,9 +8,13 @@
#include "collectives.h"
#include "common.h"
-#define NCCL_FUNC4(coll, op, dtype) \
+#define NCCL_FUNC5(coll, op, dtype) \
NCCL_COLL_NAME(coll, op, dtype), \
- NCCL_COLL_NAME(coll##LL, op, dtype) \
+ NCCL_COLL_NAME(coll##LL, op, dtype)
+
+#define NCCL_FUNC4(coll, op, dtype) \
+ NCCL_FUNC5(coll##Ring, op, dtype), \
+ NCCL_FUNC5(coll##Tree, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
@@ -55,7 +59,7 @@
NCCL_FUNCS2A(ncclAllReduce) }
// Must be consistent with the ncclFuncSet enum
-__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
+__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
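
Doubling the ncclFuncs table reflects the new algorithm dimension: each (collective, op, type) combination now owns four entries instead of two, in the order produced by NCCL_FUNC4 above (Ring, RingLL, Tree, TreeLL). The exact FUNC_INDEX arithmetic lives in collectives.h and is not part of this diff, so the helper below is only an assumption about the innermost slot layout implied by that ordering.

    // Innermost slot within one (collective, op, type) group of ncclFuncs,
    // assuming the NCCL_FUNC4 ordering Ring, RingLL, Tree, TreeLL.
    inline int funcSlot(int algorithm /*0 = Ring, 1 = Tree*/, int ll /*0 = regular, 1 = LL*/) {
      return algorithm * 2 + ll;
    }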
diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh
new file mode 100755
index 0000000..3942c8c
--- /dev/null
+++ b/src/collectives/device/gen_rules.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+dir=$1
+
+targets="GENOBJS := \\\\\n"
+
+for base in all_reduce all_gather broadcast reduce reduce_scatter; do
+ opn=0
+ for op in sum prod min max; do
+ dtn=0
+ for dt in i8 u8 i32 u32 i64 u64 f16 f32 f64; do
+ echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep"
+ echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"
+ echo " mkdir -p ${dir}"
+ echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o"
+ echo ""
+ targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
+ dtn=$(($dtn + 1))
+ done
+ opn=$(($opn + 1))
+ done
+done
+echo -e "$targets"
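
For each (collective, op, type) combination the script prints one make rule plus its recipe; with the loop counters above, sum maps to NCCL_OP=0 and f32 to NCCL_TYPE=7, so the emitted text for that combination looks roughly like the following (indentation and the value of ${dir} shown schematically as <dir>):

    <dir>/all_reduce_sum_f32.o : all_reduce.cu <dir>/all_reduce.dep
        @printf "Compiling %-35s > %s\\n" all_reduce.cu <dir>/all_reduce_sum_f32.o
        mkdir -p <dir>
        ${NVCC} -DNCCL_OP=0 -DNCCL_TYPE=7 ${NVCUFLAGS} -dc all_reduce.cu -o <dir>/all_reduce_sum_f32.o

The GENOBJS list accumulated in $targets and printed at the end is presumably consumed by src/collectives/device/Makefile, which is also changed in this patch.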
diff --git a/src/collectives/device/ll_kernel.h b/src/collectives/device/ll_kernel.h
deleted file mode 100644
index 5ec3c9a..0000000
--- a/src/collectives/device/ll_kernel.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_LL_KERNEL_H_
-#define NCCL_LL_KERNEL_H_
-
-static __device__ uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) {
- uint32_t data1, flag1, data2, flag2;
- do {
- asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
- } while ((flag1 != flag) || (flag2 != flag));
- uint64_t val64 = data1 + (((uint64_t)data2) << 32);
- return val64;
-}
-
-static __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
- asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
-}
-
-// Using memcpy handles misaligned pointers.
-static __device__ uint64_t readAL(uint64_t* src) {
- uint64_t val;
- memcpy((char*)&val, (char*)src, sizeof(uint64_t));
- return val;
-}
-static __device__ void storeAL(uint64_t* dst, uint64_t val) {
- memcpy((char*)dst, (char*)&val, sizeof(uint64_t));
-}
-
-template <typename T, class FUNC>
-class LLPrimitives {
- private:
- template <int HAS_SRC1, int HAS_SRC2, int HAS_DST1, int HAS_DST2>
- static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
- if (size <= 0) return;
- size_t size64 = size * sizeof(T) / sizeof(uint64_t);
- uint64_t* src1A = (uint64_t*)src1;
- uint64_t* dst1A = (uint64_t*)dst1;
- int offset = threadIdx.x;
- // Do multiples of 64 bits
-#pragma unroll 1
- for (; offset < size64; offset += nthreads) {
- uint64_t val;
- if (HAS_SRC1) {
- val = readAL(src1A+offset);
- if (HAS_SRC2) val = MULTI<FUNC, T>()(readLL(src2+offset, iflag), val);
- } else if (HAS_SRC2) {
- val = readLL(src2+offset, iflag);
- }
- if (HAS_DST1) storeAL(dst1A+offset, val);
- if (HAS_DST2) storeLL(dst2+offset, val, oflag);
- }
- // Finish last word
- int sizeDone = size64*(sizeof(uint64_t)/sizeof(T));
- int sizeRem = size - sizeDone;
- if (threadIdx.x == 0 && sizeRem) {
- const T* src1B = src1 + sizeDone;
- T* dst1B = dst1 + sizeDone;
-
- uint64_t lastVal;
- T* vals = (T*)&lastVal;
-
- if (HAS_SRC2) {
- uint64_t lastVal2 = readLL(src2+size64, iflag);
- T* src2B = (T*)&lastVal2;
- for (int offset = 0; offset < sizeRem; offset++) {
- vals[offset] = HAS_SRC1 ? FUNC()(src2B[offset], src1B[offset]) : src2B[offset];
- }
- } else if (HAS_SRC1) {
- for (int offset = 0; offset < sizeRem; offset++) {
- vals[offset] = src1B[offset];
- }
- }
- if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag);
- if (HAS_DST1) {
- for (int offset = 0; offset < sizeRem; offset++) {
- dst1B[offset] = vals[offset];
- }
- }
- }
- }
- public:
- static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) {
- return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads);
- }
-
- static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) {
- return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads);
- }
-
- static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
- return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads);
- }
-
- static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) {
- return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads);
- }
-
- static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) {
- return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads);
- }
-
- static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
- return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads);
- }
-
- static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) {
- return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, oflag, nthreads);
- }
-};
-
-// Common macros
-
-#define STEP_TO_SLOT(step) \
- (step % NCCL_LL_CHUNKS)
-
-#define WAIT_NEXT \
- if (tid == 0) { \
- while (sendHead + NCCL_LL_CHUNKS <= step) { \
- sendHead = sendHeadPtr[0]; \
- } \
- } \
- asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads));
-
-#define POST_SIZE \
- if (tid == 0 && sizesFifo) sizesFifo[step % NCCL_LL_CHUNKS] = (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T));
-
-#define ACK_PREV \
- asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)); \
- if (tid == 0) recvHeadPtr[0] = step;
-
-#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \
- if (step > ring->send.conn.llLastCleaning + NCCL_LL_CLEAN_FREQ) { \
- /* Reset all flags */ \
- static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \
- static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \
- const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \
- for (int i=0; i<NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*llNthreads); i++) { \
- prevInput[tid+i*llNthreads].i4 = resetLine.i4; \
- } \
- __threadfence_system(); \
- /* Restart from the same slot, only make sure sender waits for data to be reset */ \
- step += NCCL_LL_CHUNKS; \
- ACK_PREV; \
- while (sendHeadPtr[0] < step); \
- if (tid == 0) ring->send.conn.llLastCleaning = step; \
- } \
- ring->send.conn.llStep = step; \
-} while (0);
-
-#endif
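
The deleted header's readLL/storeLL pair survives in member form inside ncclLLPrimitives below: the LL protocol still packs 8 bytes of payload into a 16-byte fifo line in which each 32-bit half carries its own step flag, so one vectorized store delivers data and validity together and the reader simply spins until both flags match. A minimal stand-alone sketch of that line format (names are illustrative, not the real ncclLLFifoLine definition):

    #include <cstdint>

    // Illustrative layout of one LL fifo line: two 32-bit data words, each
    // guarded by the per-step flag written alongside it.
    struct LLLineSketch { uint32_t data1, flag1, data2, flag2; };

    // Reassemble the 64-bit payload once both flags equal the expected step
    // flag, mirroring readLL() in the old header and in ncclLLPrimitives below.
    static inline uint64_t unpackLL(const LLLineSketch& line) {
      return (uint64_t)line.data1 + ((uint64_t)line.data2 << 32);
    }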
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index e2baa4b..c5aaf54 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,218 +9,579 @@
#include <type_traits>
#include "reduce_kernel.h" // for reduction funcs
+#include "common.h"
+
+#define SPINS_BEFORE_CHECK_ABORT 1000000
+
+// Unroll unconditionally the first send/recv since nsend/nrecv should be at
+// least 1 if SEND/RECV is set.
+#define FOR_SEND(func, ...) do { \
+ if (SEND) { \
+ /* Send to far first, then close */ \
+ for (int i=1; i<NSEND && i<nsend; i++) func(i, ##__VA_ARGS__); \
+ func(0, ##__VA_ARGS__); \
+ } \
+} while (0)
+
+#define FOR_RECV(func, ...) do { \
+ if (RECV) { \
+ /* Recv from close first, then far */ \
+ func(0, ##__VA_ARGS__); \
+ for (int i=1; i<NRECV && i<nrecv; i++) func(i, ##__VA_ARGS__); \
+ } \
+} while (0)
+// Implementation of primitive types
+template <int UNROLL, int SLICESPERCHUNK, int SLICESTEPS, typename T, int NRECV, int NSEND, class FUNC>
+class ncclPrimitives {
+ private:
+ const int tid;
+ const int nthreads;
+ int nrecv = 0;
+ int nsend = 0;
+ const int stepSize;
+ struct ncclConnInfo* recvConn[NRECV];
+ struct ncclConnInfo* sendConn[NSEND];
+ volatile uint64_t* waitPtr;
+ uint64_t recvStep[NRECV];
+ uint64_t sendStep[NSEND];
+ uint64_t sendConnHead[NSEND];
+ const T* recvDirectBuff[NRECV];
+ T* sendDirectBuff[NSEND];
+ const T* recvBuff[NRECV];
+ T* sendBuff[NSEND];
+ struct ncclComm* comm;
+
+ inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
+ inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
+ inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); }
+ inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); }
+
+ inline __device__ void barrier() {
+ asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+ }
-/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy.
- *
- * In order to reduce the reptetion of template arguments, the operations
- * are bundled as static methods of the Primitives class.
- *
- * Each primitive operation copies/reduces a contiguous buffer and syncs
- * an optional set of flags against a sub-step counter. The sync value is
- * based on the step parameter. Sync flags must be of type WaitFlag or
- * PostFlag. The primitive routines wait for all WaitFlag args to attain
- * at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of
- * corresponding substep by previous step) before executing the transfer.
- * After each substep is transfered, all PostFlag arguments get updated to
- * the value SUBSTEPS*step+substep+1.
- */
-
-
-class WaitFlag {
- volatile uint64_t * const flag;
- const int shift;
- public:
- __device__ __forceinline__
- WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { }
- __device__ __forceinline__
- void wait(uint64_t val) { while ((*flag + shift) < val) /*SPIN*/; }
-};
+ uint32_t mismatch = 0;
+ const uint64_t opCount;
+ inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
+ if (mismatch) {
+      // In non-LL, we use __threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch
+ *(comm->fatalDevError) = ncclDevAssertedMismatch;
+ } else if (remoteOpCount && *remoteOpCount > opCount) {
+ mismatch += 1;
+ }
+ }
+
+ uint32_t spins = 0;
+ uint32_t abort = 0;
+
+ inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
+ spins++;
+ if (spins == SPINS_BEFORE_CHECK_ABORT) {
+ abort = *(comm->abortFlag);
+ checkMismatch(remoteOpCount);
+ spins = 0;
+ }
+ return abort;
+ }
+
+ inline __device__ void waitRecv(int i) {
+ spins = 0;
+ mismatch = 0;
+ recvStep[i] += SLICESTEPS;
+ if (tid == i) {
+ while (*(waitPtr) < recvStep[i]) {
+ if (checkAbort(recvConn[i]->opCountRem)) break;
+ }
+ }
+ }
+
+ inline __device__ void waitSend(int i) {
+ spins = 0;
+ mismatch = 0;
+ sendStep[i] += SLICESTEPS;
+ if (tid == WARP_SIZE+i) {
+ while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) {
+ sendConnHead[i] = *waitPtr;
+ if (checkAbort(sendConn[i]->opCountRem)) break;
+ }
+ }
+ }
+
+ inline __device__ void postRecv(int i) {
+ *(recvConn[i]->head) = recvStep[i] += SLICESTEPS;
+ }
+
+ inline __device__ void postSend(int i) {
+ *(sendConn[i]->tail) = sendStep[i] += SLICESTEPS;
+ }
+
+ inline __device__ void postSendSize(int i, int size) {
+ if (sendConn[i]->fifo) sendConn[i]->fifo[sendStep[i]%NCCL_STEPS] = size;
+ }
+
+ template <int DIRECTRECV>
+ inline __device__ const T* directRecvPtr(int i, int directOffset) {
+ return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i);
+ }
+
+ template <int DIRECTSEND>
+ inline __device__ T* directSendPtr(int i, int directOffset) {
+ return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i);
+ }
+
+ template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST>
+ inline __device__ void
+ GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) {
+ int offset = 0;
+ int sliceSize = stepSize * SLICESTEPS;
+
+ const T* srcs[RECV*NRECV+SRC];
+ srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset);
+ if (RECV) {
+ if (SRC) srcs[1] = recvPtr(0);
+ for (int i=1; i<NRECV && i<nrecv; i++) srcs[SRC+i] = recvPtr(i);
+ }
+
+ T* dsts[SEND*NSEND+DST];
+ dsts[0] = DST ? dstPtr : directSendPtr<DIRECTSEND>(0, directOffset);
+ if (SEND) {
+ if (DST) dsts[1] = directSendPtr<DIRECTSEND>(0, directOffset);
+ for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset);
+ }
+
+ #pragma unroll 1
+ for (int slice=0; slice<SLICESPERCHUNK; ++slice) {
+ int realSize = max(0, min(sliceSize, nelem-offset));
+ if (tid < nthreads) {
+ FOR_SEND(waitSend);
+ FOR_RECV(waitRecv);
+ if (realSize > 0) {
+ barrier();
+ if (DIRECTRECV && recvDirectBuff[0]) {
+ // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
+ if (SEND) {
+ ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize);
+ }
+ } else {
+ ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize);
+ }
+ }
+ exitIfAbortBarrier(abort);
+ } else {
+ exitIfAbortBarrier(abort);
+ FOR_SEND(postSendSize, realSize*sizeof(T));
+ if (SEND) __threadfence_system();
+ FOR_SEND(postSend);
+ FOR_RECV(postRecv);
+ }
+ for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize;
+ for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize;
+ offset += sliceSize;
+ }
+ }
+
+ __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) {
+ recvConn[i] = conn;
+ recvBuff[i] = (const T*)recvConn[i]->buff;
+ recvStep[i] = recvConn[i]->step;
+ recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
+ // Return credits in case we rounded up.
+ if (tid == nthreads) *recvConn[i]->head = recvStep[i];
+ if (tid == i) {
+ waitPtr = recvConn[i]->tail;
+ *(recvConn[i]->opCountLoc) = opCount;
+ }
+ recvDirectBuff[i] = NULL;
+ if (directBuff && recvConn[i]->direct) {
+ recvDirectBuff[i] = directBuff;
+ if (tid == 0) *recvConn[i]->ptrExchange = directBuff;
+ }
+ nrecv++;
+ }
+
+ __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) {
+ sendConn[i] = conn;
+ sendBuff[i] = (T*)sendConn[i]->buff;
+ sendStep[i] = sendConn[i]->step;
+ sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
+ if (tid == WARP_SIZE+i) {
+ waitPtr = sendConn[i]->head;
+ sendConnHead[i] = *waitPtr;
+ *(sendConn[i]->opCountLoc) = opCount;
+ }
+ sendDirectBuff[i] = NULL;
+ if (directBuff && sendConn[i]->direct) {
+ void* volatile* ptr = sendConn[i]->ptrExchange;
+ while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
+ __syncthreads();
+ if (tid == 0) *ptr = NULL;
+ }
+ nsend++;
+ }
+
+ __device__ __forceinline__ void saveRecvConn(int i) {
+ if (tid == i) {
+ recvConn[i]->step = recvStep[i];
+ __threadfence_system();
+ *(recvConn[i]->opCountLoc) += 1;
+ }
+ }
+
+ __device__ __forceinline__ void saveSendConn(int i) {
+ if (tid == WARP_SIZE+i) {
+ sendConn[i]->step = sendStep[i];
+ __threadfence_system();
+ *(sendConn[i]->opCountLoc) += 1;
+ }
+ }
-class PostFlag {
- volatile uint64_t * const flag;
- const int shift;
- volatile int * const fifo;
- const int fifo_size;
public:
__device__ __forceinline__
- PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { }
- __device__ __forceinline__
- void post(uint64_t val) { *flag = (val - shift); }
- __device__ __forceinline__
- void postSize(uint64_t step, int size) { if (fifo != NULL) fifo[step%fifo_size] = size; };
-};
+ ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
+ : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) {
+ // Make sure step is updated before we read it
+ __syncthreads();
+ for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff);
+ for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff);
+ }
-// Helper to check if any argument is of type T.
-// e.g. AnyAre<WaitFlag>(Flag1, Flag2, ...)
-template<typename T> __device__ __forceinline__
-bool AnyAre() { return false; }
+ __device__ __forceinline__ void
+ send(const T* src, int nelem) {
+ GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0);
+ }
+ __device__ __forceinline__ void
+ directSend(const T* src, int directOffset, int nelem) {
+ GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset);
+ }
-template<typename T, typename FIRST_T, typename... TAIL_Ts>
-__device__ __forceinline__
-bool AnyAre(FIRST_T first, TAIL_Ts... tail) {
- return std::is_same<T, FIRST_T>::value || AnyAre<T>(tail...);
-}
+ __device__ __forceinline__ void
+ recv(T* dst, int nelem) {
+ GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0);
+ }
+ __device__ __forceinline__ void
+ directRecv(T* dst, int directOffset, int nelem) {
+ GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset);
+ }
+ __device__ __forceinline__ void
+ copySend(const T* src, T* dst, int nelem) {
+ GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0);
+ }
+ __device__ __forceinline__ void
+ directCopySend(const T* src, T* dst, int directOffset, int nelem) {
+ GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset);
+ }
-// Wait on all WaitFlags, ignore PostFlags
-__device__ __forceinline__
-void WaitOnFlags(uint64_t val) { }
+ __device__ __forceinline__ void
+ recvCopySend(T* dst, int nelem) {
+ GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0);
+ }
+ __device__ __forceinline__ void
+ directRecvCopySend(T* dst, int directOffset, int nelem) {
+ GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset);
+ }
-template <typename... TAIL_Ts> __device__ __forceinline__
-void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
- flag.wait(val);
- WaitOnFlags(val, tail...);
-}
+ __device__ __forceinline__ void
+ recvReduceCopy(const T* src, T* dst, int nelem) {
+ GenericOp<0, 0, 1, 0, 1, 1>(src, dst, nelem, 0);
+ }
-template <typename... TAIL_Ts> __device__ __forceinline__
-void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) {
- WaitOnFlags(val, tail...);
-}
+ __device__ __forceinline__ void
+ recvReduceSend(const T* src, int nelem) {
+ GenericOp<0, 0, 1, 1, 1, 0>(src, NULL, nelem, 0);
+ }
+ __device__ __forceinline__ void
+ recvReduceCopySend(const T* src, T* dst, int nelem) {
+ GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0);
+ }
+ __device__ __forceinline__ void
+ directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) {
+ // Direct is only for the send part
+ GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset);
+ }
-// Post all PostFlags, ignore WaitFlags
-__device__ __forceinline__
-void PostToFlags(uint64_t val) { }
+ __device__ __forceinline__ ~ncclPrimitives() {
+ // Save steps for next collective. Have thread 0 do it to be compatible
+ // with the way LL works.
+ for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
+ for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
+ }
+};
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) {
- PostToFlags(val, tail...);
-}
+template <typename T, class FUNC, int NRECV, int NSEND>
+class ncclLLPrimitives {
+ private:
+ const int tid;
+ const int nthreads;
+ int nrecv = 0;
+ int nsend = 0;
+ struct ncclConnInfo* recvConn[NRECV];
+ struct ncclConnInfo* sendConn[NSEND];
+ volatile uint64_t* waitPtr;
+ volatile uint64_t* postPtr;
+ volatile int* fifoPtr;
+ uint64_t recvStep[NRECV];
+ uint64_t sendStep[NSEND];
+ uint64_t sendConnHead;
+ union ncclLLFifoLine* recvBuff[NRECV];
+ union ncclLLFifoLine* sendBuff[NSEND];
+ struct ncclComm* comm;
+
+ inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+ inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; }
+ inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
+ inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
+ inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; }
+ inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; }
+
+ // Exit If Abort Barrier : make sure all threads exit consistently
+ // Each thread sets a predicate to true if abort == 1
+ // All the CTA's threads enter the barrier and do a popc on their predicates being True
+ // If any thread's predicate was True, all threads call exit()
+ inline __device__ void exitIfAbortLocalBarrier() {
+ uint32_t popc;
+ asm ("{");
+ asm volatile (" .reg .pred barr_pred;");
+ asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort));
+ asm volatile (" bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads));
+ asm ("}");
+ if (popc) {
+ // Make sure threads not participating in the operation get the abort and all threads exit
+ exitIfAbortBarrier(1);
+ }
+ }
+
+ inline __device__ void barrier() {
+ asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+ }
+
+ uint32_t mismatch = 0;
+ const uint64_t opCount;
+
+ inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) {
+ if (mismatch > 20) {
+ // The peer has advanced its opCount many times while we are still waiting for credit on the current op, so this is most likely a mismatch
+ // Note that LL does not use __threadfence_system, so the error cannot be asserted with certainty
+ *(comm->fatalDevError) = ncclDevSuspectedMismatch;
+ } else if (remoteOpCount && *remoteOpCount > opCount) {
+ mismatch += 1;
+ }
+ }
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) {
- flag.post(val);
- PostToFlags(val, tail...);
-}
+ uint32_t spins = 0;
+ uint32_t abort = 0;
+ inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) {
+ spins++;
+ if (spins == SPINS_BEFORE_CHECK_ABORT) {
+ abort = *(comm->abortFlag);
+ checkMismatch(remoteOpCount);
+ spins = 0;
+ }
+ return abort;
+ }
-// Post sizes for PostFlags, ignore WaitFlags
-__device__ __forceinline__
-void PostSizeToFlags(uint64_t step, int size) { }
+ inline __device__ void waitSend(int i, int nbytes) {
+ spins = 0;
+ mismatch = 0;
+ if (tid == WARP_SIZE+i) {
+ while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) {
+ sendConnHead = *waitPtr;
+ if (checkAbort(sendConn[i]->opCountRem)) break;
+ }
+ if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes;
+ }
+ }
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) {
- PostSizeToFlags(step, size, tail...);
-}
+ inline __device__ void postRecv(int i) {
+ recvStep[i]++;
+ if (tid == i) *postPtr = recvStep[i];
+ }
-template <typename... TAIL_Ts> __device__ __forceinline__
-void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) {
- flag.postSize(step, size);
- PostSizeToFlags(step, size, tail...);
-}
+ inline __device__ void postSend(int i) {
+ sendStep[i]++;
+ }
+ __device__ uint64_t readLL(int i, int offset) {
+ union ncclLLFifoLine* src = recvPtr(i) + offset;
+ uint32_t flag = recvFlag(i);
+ uint32_t data1, flag1, data2, flag2;
+ spins = 0;
+ mismatch = 0;
+ do {
+ asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
+ if (checkAbort(recvConn[i]->opCountRem)) break;
+ } while ((flag1 != flag) || (flag2 != flag));
+ uint64_t val64 = data1 + (((uint64_t)data2) << 32);
+ return val64;
+ }
-// Create pointer arithmetic syntax that doesn't break for std::nullptr_t
-template <typename Tptr> __device__ __forceinline__
-Tptr ptradd(Tptr ptr, int i) {
- return ptr + i;
-}
+ __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
+ asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
+ }
-__device__ __forceinline__
-std::nullptr_t ptradd(std::nullptr_t ptr, int i) {
- return nullptr;
-}
+ // Using memcpy handles misaligned pointers.
+ __device__ uint64_t readAL(uint64_t* src) {
+ uint64_t val;
+ memcpy((char*)&val, (char*)src, sizeof(uint64_t));
+ return val;
+ }
+ __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) {
+ memcpy((char*)dst, (char*)&val, nbytes);
+ }
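
readAL and storeAL go through memcpy rather than a raw 64-bit load or store so that misaligned user buffers, and the trailing partial word handled a few lines below, stay well-defined; for a fixed 8-byte size, compilers lower the memcpy to a single load/store when alignment allows. A tiny host-side illustration of the point (illustrative only, not part of the diff):

    // Why the helpers above use memcpy: dereferencing a misaligned uint64_t*
    // is undefined behavior, while an 8-byte memcpy is always well-defined.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      unsigned char buf[16] = {0};
      uint64_t v = 0x0102030405060708ull;
      std::memcpy(buf + 3, &v, sizeof v);   // storeAL-style write at a misaligned offset
      uint64_t r = 0;
      std::memcpy(&r, buf + 3, sizeof r);   // readAL-style read
      assert(r == v);
    }
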
-// Implementation of primitive types
-template <int UNROLL, int SUBSTEPS, typename T, typename REDOP=FuncSum<T> >
-class Primitives {
- private:
- template <typename SRC2_T, // either T* or std::nullptr_t
- typename DST2_T, // either T* or std::nullptr_t
- typename... SYNC_Ts> // either WaitFunc or PostFunc
- static __device__ __forceinline__ void
- GenericOp(const int tid, const int nthreads,
- const T* src1,
- const SRC2_T src2,
- T* dst1,
- DST2_T dst2,
- int len, int maxoffset, uint64_t step, SYNC_Ts... flags) {
-
- enum { noSrc2 = std::is_same<SRC2_T, std::nullptr_t>::value };
- enum { noDst2 = std::is_same<DST2_T, std::nullptr_t>::value };
- static_assert(noSrc2 || std::is_same<SRC2_T, const T*>::value,
- "src2 must be of type T* or std::nullptr_t");
- static_assert(noDst2 || std::is_same<DST2_T, T*>::value,
- "dst2 must be of type T* or std::nullptr_t");
-
- using OpType = typename std::conditional<noSrc2, FuncSum<T>, REDOP>::type;
-
- int sliceSize = len / SUBSTEPS;
- int sliceOffset = 0;
-
-#pragma unroll 1
- for (int sub=0; sub<SUBSTEPS; ++sub) {
- int realSize = max(0, min(sliceSize, maxoffset-sliceOffset));
- if (tid < nthreads) {
- if (AnyAre<WaitFlag>(flags...)) {
- if (tid == 0) {
- WaitOnFlags(SUBSTEPS*step + sub + 1, flags...);
- }
- asm volatile ("bar.sync 1, %0;" :: "r"(nthreads));
+ template <int RECV, int SEND, int SRC, int DST>
+ __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) {
+ uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T);
+ FOR_SEND(waitSend, nbytes*2);
+ barrier();
+ uint32_t npack = DIVUP(nbytes, sizeof(uint64_t));
+ uint64_t* srcPack = (uint64_t*)srcPtr;
+ uint64_t* dstPack = (uint64_t*)dstPtr;
+ // Do multiples of 64 bits
+ #pragma unroll 2
+ for (int offset=tid; offset<npack; offset+=nthreads) {
+ // Recv : local, then intra-node, then inter-node
+ uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset);
+ if (RECV) {
+ if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val);
+ for (int i=1; i<NRECV && i<nrecv; i++) {
+ val = MULTI<FUNC, T>()(readLL(i, offset), val);
}
- ReduceOrCopy
- <
- UNROLL,
- OpType,
- T,
- !std::is_same<DST2_T, std::nullptr_t>::value, // HAS_DEST1
- !std::is_same<SRC2_T, std::nullptr_t>::value // HAS_SRC1
- >
- (
- tid, nthreads,
- ptradd(dst1, sliceOffset),
- ptradd(dst2, sliceOffset),
- ptradd(src1, sliceOffset),
- ptradd(src2, sliceOffset),
- realSize
- );
- if (AnyAre<PostFlag>(flags...)) {
- __syncthreads();
+ }
+
+ // Send : inter-node, then intra-node, then local
+ if (SEND) {
+ for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i));
+ storeLL(sendPtr(0)+offset, val, sendFlag(0));
+ }
+ if (DST) {
+ if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) {
+ // Last incomplete word
+ storeAL(dstPack+offset, val, nbytes & 0x7);
+ } else {
+ storeAL(dstPack+offset, val, sizeof(uint64_t));
}
- } else {
- if (AnyAre<PostFlag>(flags...)) {
- __syncthreads();
- PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...);
- __threadfence_system();
- PostToFlags(SUBSTEPS*step + sub + 1, flags...);
+ }
+ }
+ exitIfAbortLocalBarrier();
+ FOR_RECV(postRecv);
+ FOR_SEND(postSend);
+ }
+
+ __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
+ recvConn[i] = conn;
+ recvBuff[i] = recvConn[i]->llBuff;
+ recvStep[i] = recvConn[i]->step;
+ if (tid == i) {
+ postPtr = recvConn[i]->head;
+ *(recvConn[i]->opCountLoc) = opCount;
+ }
+ nrecv++;
+ }
+
+ __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
+ sendConn[i] = conn;
+ sendBuff[i] = sendConn[i]->llBuff;
+ sendStep[i] = sendConn[i]->step;
+ if (tid == WARP_SIZE+i) {
+ waitPtr = sendConn[i]->head;
+ fifoPtr = sendConn[i]->fifo;
+ sendConnHead = *waitPtr;
+ *(sendConn[i]->opCountLoc) = opCount;
+ }
+ nsend++;
+ }
+
+ __device__ __forceinline__ void saveRecvConn(int i) {
+ if (tid == i) {
+ recvConn[i]->step = recvStep[i];
+ *(recvConn[i]->opCountLoc) += 1;
+ __threadfence_block();
+ }
+ }
+
+ __device__ __forceinline__ void saveSendConn(int i) {
+ if (tid == WARP_SIZE+i) {
+ sendConn[i]->step = sendStep[i];
+ *(sendConn[i]->opCountLoc) += 1;
+ __threadfence_block();
+ }
+ }
+
+ __device__ __forceinline__ void llSendCleaning(int i) {
+ if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+ /* Reset all flags */
+ static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS");
+ static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS");
+ for (int s=0; s<NCCL_STEPS; s++) {
+ waitSend(i, 0);
+ for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) {
+ const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) };
+ sendPtr(i)[o].i4 = resetLine.i4;
}
}
- sliceOffset += sliceSize;
+ if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i];
+ }
+ }
+
+ __device__ __forceinline__ void llRecvCleaning(int i) {
+ if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+ recvStep[i] += NCCL_STEPS;
+ if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i];
}
}
public:
- template <typename... SYNC_Ts>
- static __device__ __forceinline__ void
- Copy(const int tid, const int nthreads, const T* src, T* dst,
- int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
- GenericOp(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...);
+ __device__ __forceinline__
+ ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount)
+ : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) {
+ // Make sure step is updated before we read it.
+ barrier();
+
+ for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i);
+ for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i);
}
- template <typename... SYNC_Ts>
- static __device__ __forceinline__ void
- DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2,
- int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
- GenericOp(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...);
+ __device__ void send(const T* src, int nelem) {
+ return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem);
}
- template <typename... SYNC_Ts>
- static __device__ __forceinline__ void
- Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst,
- int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
- GenericOp(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...);
+ __device__ void recv(T* dst, int nelem) {
+ return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem);
}
- template <typename... SYNC_Ts>
- static __device__ __forceinline__ void
- ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2,
- int len, int maxOffset, uint64_t step, SYNC_Ts... flags) {
- GenericOp(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...);
+ __device__ void recvReduceSend(const T* src, int nelem) {
+ return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem);
+ }
+
+ __device__ void recvReduceCopy(const T* src, T* dst, int nelem) {
+ return LLGenericOp<1, 0, 1, 1>(src, dst, nelem);
}
-};
-#endif // end include guard
+ __device__ void copySend(const T* src, T* dst, int nelem) {
+ return LLGenericOp<0, 1, 1, 1>(src, dst, nelem);
+ }
+
+ __device__ void recvCopySend(T* dst, int nelem) {
+ return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem);
+ }
+
+ __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) {
+ return LLGenericOp<1, 1, 1, 1>(src, dst, nelem);
+ }
+
+ __device__ __forceinline__ ~ncclLLPrimitives() {
+ for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i);
+ for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i);
+ // Save steps for the next operation
+ for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i);
+ for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i);
+ }
+};
+#endif
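
The LL ("low latency") path above needs no separate ready flag per slice because the flag is interleaved with the data inside each 16-byte fifo line: storeLL writes {data lo, flag, data hi, flag} with one vectorized store, and readLL spins until both flag words equal the step-derived value before recombining the two 32-bit halves. Below is a single-threaded host C++ sketch of the same idea, with std::atomic standing in for the volatile vectorized accesses; it is illustrative only, not the NCCL implementation.

    // Host-side sketch of the LL flag-in-line idea above; illustrative only.
    #include <atomic>
    #include <cassert>
    #include <cstdint>

    struct LLLine {                     // conceptual stand-in for union ncclLLFifoLine
      std::atomic<uint32_t> data1{0}, flag1{0}, data2{0}, flag2{0};
    };

    void storeLL(LLLine* dst, uint64_t val, uint32_t flag) {
      dst->data1.store((uint32_t)val, std::memory_order_relaxed);
      dst->data2.store((uint32_t)(val >> 32), std::memory_order_relaxed);
      dst->flag1.store(flag, std::memory_order_release);
      dst->flag2.store(flag, std::memory_order_release);
    }

    uint64_t readLL(LLLine* src, uint32_t flag) {
      uint32_t f1, f2;
      do {                              // spin until both halves carry the expected flag
        f1 = src->flag1.load(std::memory_order_acquire);
        f2 = src->flag2.load(std::memory_order_acquire);
      } while (f1 != flag || f2 != flag);
      uint64_t lo = src->data1.load(std::memory_order_relaxed);
      uint64_t hi = src->data2.load(std::memory_order_relaxed);
      return lo | (hi << 32);
    }

    int main() {
      LLLine line;
      storeLL(&line, 0x1122334455667788ull, /*flag=*/1);
      assert(readLL(&line, 1) == 0x1122334455667788ull);
    }

On the GPU the whole line is written with one 128-bit store, so data and flags land together; the sketch only mimics the observable protocol.
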
diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu
index bd1d23c..1ef66d4 100644
--- a/src/collectives/device/reduce.cu
+++ b/src/collectives/device/reduce.cu
@@ -4,18 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "common.h"
#include "reduce.h"
+#include "common.h"
#include "collectives.h"
-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL2(ncclReduce, sum, FuncSum, ncclCollReduce, ncclSum);
-#elif NCCL_OP == 1
-IMPL_COLL2(ncclReduce, prod, FuncProd, ncclCollReduce, ncclProd);
-#elif NCCL_OP == 2
-IMPL_COLL2(ncclReduce, min, FuncMin, ncclCollReduce, ncclMin);
-#elif NCCL_OP == 3
-IMPL_COLL2(ncclReduce, max, FuncMax, ncclCollReduce, ncclMax);
-#endif
+IMPL_COLL_R(ncclReduce, ncclCollReduce);
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index f5694b1..302d053 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -8,143 +8,71 @@
#include "primitives.h"
#include "collectives.h"
-// Increase Step and boffset for buffer sync
-#define NEXT_STEP \
- step++; \
- boffset += sliceSize; \
- if (boffset == buffSize) boffset = 0;
-
template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
-
- WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0);
- PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0);
- PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS);
-
- typedef Primitives<UNROLL, REDUCE_SUBSTEPS, T, FUNC> Prims;
-
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
- const int buffSize = ring->buffSize / sizeof(T);
- const int sliceSize = buffSize / REDUCE_BUFCHUNKS;
- const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * REDUCE_CHUNKSTEPS;
+ const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
const int rank = ring->devUserRanks[0];
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->root;
- if (tid == 0) {
- // Update in case we skipped some collectives
- *ring->recv.conn.opCount = args->opCount;
-
- if (rank != root) {
- // Wait for next to be ready
- WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
- waitOpCountNext.wait(args->opCount);
- }
- }
- __syncthreads();
-
- uint64_t step = 0ULL;
- int boffset = 0;
-
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
- T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
- ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t offset = gridOffset + bid*chunkSize;
- int maxOffset = min(chunkSize, size-offset);
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t offset = gridOffset + bid*realChunkSize;
+ int nelem = min(realChunkSize, size-offset);
if (prevRank == root) {
- Prims::Copy(tid, nthreads,
- thisInput + offset,
- nextOutput + boffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
+ prims.send(thisInput+offset, nelem);
} else if (rank == root) {
- Prims::Reduce(tid, nthreads,
- prevInput + boffset,
- thisInput + offset,
- thisOutput + offset,
- sliceSize, maxOffset,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
+ prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
- Prims::Reduce(tid, nthreads,
- prevInput + boffset,
- thisInput + offset,
- nextOutput + boffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
- }
- NEXT_STEP; // Increases step, boffset
- }
-
- if (tid == 0) {
- if (rank != root) {
- // Wait for next to have consumed data before resetting the flag
- waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1));
- *ring->send.conn.head = 0ULL;
+ prims.recvReduceSend(thisInput+offset, nelem);
}
- *ring->recv.conn.tail = 0ULL;
- __threadfence_system();
- *ring->recv.conn.opCount = args->opCount+1;
}
}
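
With the new primitives the ring Reduce body collapses to a three-way role split: the rank whose ring predecessor is the root only injects its input (send), intermediate ranks accumulate and forward (recvReduceSend), and the root accumulates into its output (recvReduceCopy). The host-side C++ walk below replays that schedule for one chunk and checks that the chain visits every rank exactly once; it is illustrative only, not NCCL code.

    // Host-side walk of the ring Reduce schedule above (sum to a root); illustrative only.
    #include <cassert>
    #include <vector>

    int main() {
      const int nranks = 4, root = 2;
      std::vector<int> input = {1, 2, 3, 4};   // one value per rank
      // Ring order 0 -> 1 -> ... -> nranks-1 -> 0. The chain starts at the rank
      // whose predecessor is the root (send) and ends at the root (recvReduceCopy).
      int cur = (root + 1) % nranks;           // prevRank == root  => send
      int inflight = input[cur];
      for (int step = 1; step < nranks - 1; step++) {
        cur = (cur + 1) % nranks;              // intermediate rank => recvReduceSend
        inflight += input[cur];
      }
      assert((cur + 1) % nranks == root);      // the next hop is the root
      int result = inflight + input[root];     // root => recvReduceCopy into its output
      assert(result == 1 + 2 + 3 + 4);
    }
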
-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
- boffset += NCCL_LL_SLICE_LINES; \
- if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \
- flag++; \
- step++;
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
- const int llNthreads = args->nThreads;
+ const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
- volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
- volatile int * sizesFifo = ring->send.conn.llFifo;
- uint64_t sendHead = sendHeadPtr[0];
- const int nranks = comm->nRanks;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
+
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
+
+ const ssize_t size = args->N;
const int rank = comm->rank;
+ const int nranks = comm->nRanks;
const int prevRank = ring->devUserRanks[nranks-1];
const int root = args->root;
- typedef LLPrimitives<T, FUNC> LL;
-
- const ssize_t size = args->N;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nRings*chunkSize;
-
- uint64_t step = ring->send.conn.llStep;
- uint32_t flag = step + 1;
- int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+ const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
- union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -152,39 +80,16 @@ __device__ void ncclReduceLLKernel(struct CollectiveArgs* args) {
}
ssize_t offset = gridOffset + bid*chunkSize;
- int maxOffset = min(chunkSize, size-offset);
+ int nelem = min(chunkSize, size-offset);
if (prevRank == root) {
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- nextOutput + boffset,
- maxOffset, flag, llNthreads);
- POST_SIZE;
- NEXT_STEP_LL;
+ LLprims.send(thisInput+offset, nelem);
} else if (rank == root) {
- LL::ReduceCopy(
- thisInput + offset,
- prevInput + boffset,
- thisOutput + offset,
- maxOffset, flag, llNthreads);
- NEXT_STEP_LL;
- ACK_PREV;
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else {
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- prevInput + boffset,
- nextOutput + boffset,
- maxOffset, flag, flag, llNthreads);
- POST_SIZE;
- NEXT_STEP_LL;
- ACK_PREV;
+ LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
-
- // We need everyone to acknowledge data even if they didn't receive anything
- // so that the next collective can start right away.
- ACK_PREV;
-
- FIFO_CLEANING_AND_SAVE_STEP(flag);
}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/reduce_kernel.h b/src/collectives/device/reduce_kernel.h
index 0cb8f13..0e90793 100644
--- a/src/collectives/device/reduce_kernel.h
+++ b/src/collectives/device/reduce_kernel.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -46,30 +46,28 @@ struct FuncMin {
}
};
+#define MASK0 0x00ff00ff
+#define MASK1 0xff00ff00
+static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) {
+ /* This can be used both for signed and unsigned 8-bit addition */
+ const uint32_t x0 = x & MASK0;
+ const uint32_t x1 = x & MASK1;
+ const uint32_t y0 = y & MASK0;
+ const uint32_t y1 = y & MASK1;
+ const uint32_t r0 = (x0+y0);
+ const uint32_t r1 = (x1+y1);
+ return (r0 & MASK0) | (r1 & MASK1);
+}
+
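addChar4 replaces the per-architecture inline PTX with a portable SWAR (SIMD-within-a-register) add: masking the even and odd bytes apart leaves a zeroed byte of headroom next to each lane, so a carry out of one byte cannot corrupt its neighbor, and the two halves are re-masked and recombined. The host-side check below uses the same masks and logic as the hunk above; the test value is just an example.

    // Host-side check of the byte-wise SWAR addition introduced above; illustrative only.
    #include <cassert>
    #include <cstdint>

    #define MASK0 0x00ff00ff
    #define MASK1 0xff00ff00

    static uint32_t addChar4(const uint32_t x, const uint32_t y) {
      const uint32_t x0 = x & MASK0, x1 = x & MASK1;
      const uint32_t y0 = y & MASK0, y1 = y & MASK1;
      // Even and odd bytes are added separately, so each lane has a zero byte
      // of headroom and carries never leak into the neighboring byte.
      return ((x0 + y0) & MASK0) | ((x1 + y1) & MASK1);
    }

    int main() {
      // 0xff + 0x01 wraps to 0x00 within its byte instead of carrying into the next one.
      assert(addChar4(0x01ff02fe, 0x01010101) == 0x020003ff);
    }
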
template<>
struct FuncSum<int8_t> {
- union converter { uint32_t storage; char4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
- int32_t rv;
- asm("vadd.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
- "vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
#else
- converter cx, cy, cr;
- cx.storage = x;
- cy.storage = y;
- cr.a.x = cx.a.x + cy.a.x;
- cr.a.y = cx.a.y + cy.a.y;
- cr.a.z = cx.a.z + cy.a.z;
- cr.a.w = cx.a.w + cy.a.w;
- return cr.storage;
+ return addChar4(x, y);
#endif
}
__device__ int8_t operator()(const int8_t x, const int8_t y) const {
@@ -78,28 +76,13 @@ struct FuncSum<int8_t> {
};
template<>
struct FuncSum<uint8_t> {
- union converter { uint32_t storage; uchar4 a; };
__device__ uint32_t operator()(const uint32_t x, const uint32_t y) const {
#if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500)
int32_t rv, z=0;
asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
- int32_t rv;
- asm("vadd.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
- "vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
#else
- converter cx, cy, cr;
- cx.storage = x;
- cy.storage = y;
- cr.a.x = cx.a.x + cy.a.x;
- cr.a.y = cx.a.y + cy.a.y;
- cr.a.z = cx.a.z + cy.a.z;
- cr.a.w = cx.a.w + cy.a.w;
- return cr.storage;
+ return addChar4(x, y);
#endif
}
__device__ uint8_t operator()(const uint8_t x, const uint8_t y) const {
@@ -109,22 +92,6 @@ struct FuncSum<uint8_t> {
static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
/* This can be used both for signed and unsigned 8-bit multiplication */
-#if (__CUDA_ARCH__ >= 300)
- uint32_t rv;
- asm("{ .reg .u32 t0, t1, t2, t3;\n\t"
- " vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t"
- " vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t"
- " shl.b32 t3, t3, 16;\n\t"
- " shl.b32 t2, t2, 16;\n\t"
- " vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t"
- " shl.b32 t1, t1, 8;\n\t"
- " vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t"
- " and.b32 t1, t1, 0xff00ff00;\n\t"
- " and.b32 t0, t0, 0x00ff00ff;\n\t"
- " or.b32 %0, t0, t1;\n\t"
- "}" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
-#else
union converter { uint32_t storage; char4 a; };
converter cx, cy, cr;
cx.storage = x;
@@ -134,7 +101,6 @@ static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) {
cr.a.z = cx.a.z * cy.a.z;
cr.a.w = cx.a.w * cy.a.w;
return cr.storage;
-#endif
}
template<>
@@ -164,13 +130,6 @@ struct FuncMax<int8_t> {
int32_t rv, z=0;
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
- int32_t rv;
- asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
- "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
#else
converter cx, cy, cr;
cx.storage = x;
@@ -194,13 +153,6 @@ struct FuncMax<uint8_t> {
int32_t rv, z=0;
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
- int32_t rv;
- asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
- "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
#else
converter cx, cy, cr;
cx.storage = x;
@@ -225,13 +177,6 @@ struct FuncMin<int8_t> {
int32_t rv, z=0;
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
- int32_t rv;
- asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t"
- "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
#else
converter cx, cy, cr;
cx.storage = x;
@@ -255,13 +200,6 @@ struct FuncMin<uint8_t> {
int32_t rv, z=0;
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z));
return rv;
-#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700)
- int32_t rv;
- asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t"
- "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t"
- "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t"
- "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y));
- return rv;
#else
converter cx, cy, cr;
cx.storage = x;
diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu
index b16053c..10857ed 100644
--- a/src/collectives/device/reduce_scatter.cu
+++ b/src/collectives/device/reduce_scatter.cu
@@ -4,18 +4,8 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "common.h"
#include "reduce_scatter.h"
+#include "common.h"
#include "collectives.h"
-#define UNROLL 4
-
-#if NCCL_OP == 0
-IMPL_COLL2(ncclReduceScatter, sum, FuncSum, ncclCollReduceScatter, ncclSum);
-#elif NCCL_OP == 1
-IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd);
-#elif NCCL_OP == 2
-IMPL_COLL2(ncclReduceScatter, min, FuncMin, ncclCollReduceScatter, ncclMin);
-#elif NCCL_OP == 3
-IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax);
-#endif
+IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter);
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index cad011b..c70c845 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -8,156 +8,82 @@
#include "primitives.h"
#include "collectives.h"
-// Increase Step and poffset/noffset for buffer sync
-#define NEXT_STEP \
- step++; \
- poffset = noffset; \
- noffset += sliceSize; \
- if (noffset == buffSize) noffset = 0;
-
template<int UNROLL, class FUNC, typename T>
-__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = blockDim.x - 1;
const int bid = args->bid;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
-
- WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
- WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS);
- PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0);
- PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS);
-
- typedef Primitives<UNROLL, REDUCESCATTER_SUBSTEPS, T, FUNC> Prims;
-
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
const ssize_t size = args->N;
const int nranks = comm->nRanks;
- const int buffSize = ring->buffSize / sizeof(T);
- const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS;
- const ssize_t loopSize = args->nRings*(ssize_t)sliceSize;
-
- if (tid == 0) {
- // Update in case we skipped some collectives
- *ring->recv.conn.opCount = args->opCount;
- // Wait for next to be ready
- WaitFlag waitOpCountNext(ring->send.conn.opCount, 0);
- waitOpCountNext.wait(args->opCount);
- }
- __syncthreads();
-
- uint64_t step = 0ULL;
- int poffset, noffset = 0;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS;
+ const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- T * __restrict__ prevInput = (T*)ring->recv.conn.buff;
- T * __restrict__ nextOutput = (T*)ring->send.conn.buff;
+
+ ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC>
+ prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
- int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings));
- ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
- ssize_t chunkOffset = gridOffset + bid*chunkSize;
+ int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels));
+ ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
+ ssize_t chunkOffset = gridOffset + bid*realChunkSize;
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
- int maxOffset = min(chunkSize, size-chunkOffset);
+ int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
- Prims::Copy(tid, nthreads,
- thisInput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext,
- postReadyToNext);
-
- NEXT_STEP; // Increases step, poffset, noffset
+ prims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
- Prims::Reduce(tid, nthreads,
- prevInput + poffset,
- thisInput + offset,
- nextOutput + noffset,
- sliceSize, maxOffset,
- step,
- waitDoneFromNext, waitReadyFromPrev,
- postReadyToNext, postDoneToPrev);
-
- NEXT_STEP;
+ prims.recvReduceSend(thisInput+offset, nelem);
}
- // step k-1: reduce this buffer and data, which will produce the final
- // result that we store in this data and push to the next GPU
+ // step k-1: reduce this buffer and data, which will produce the final result
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
- Prims::Reduce(tid, nthreads,
- prevInput + poffset,
- thisInput + offset,
- thisOutput + chunkOffset,
- sliceSize, maxOffset,
- step,
- waitReadyFromPrev,
- postDoneToPrev);
- }
-
- if (tid == 0) {
- waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS));
- *ring->send.conn.head = 0ULL;
- *ring->recv.conn.tail = 0ULL;
- __threadfence_system();
- *ring->recv.conn.opCount = args->opCount+1;
+ prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
}
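
ReduceScatter uses the same ring pipeline but rotates the chunk index by one rank per step, so after nranks-1 hops every rank ends up holding the fully reduced chunk for its own ring position (devUserRanks[0]). The host-side C++ simulation below replays that schedule for a sum and checks the invariant; it assumes the canonical ring next = (rank+1) % nranks, so devUserRanks[i] maps to (rank+i) % nranks, and is illustrative only.

    // Host-side replay of the ring ReduceScatter schedule above (sum); illustrative only.
    #include <cassert>
    #include <vector>

    int main() {
      const int n = 4;
      // input[r][c] : rank r's contribution to chunk c
      std::vector<std::vector<int>> input(n, std::vector<int>(n));
      for (int r = 0; r < n; r++)
        for (int c = 0; c < n; c++) input[r][c] = 10 * r + c;

      struct Msg { int chunk; int val; };
      std::vector<Msg> msg(n), fwd(n);
      std::vector<int> output(n);

      // step 0: each rank pushes the chunk owned by its ring predecessor (send)
      for (int r = 0; r < n; r++) msg[r] = { (r - 1 + n) % n, input[r][(r - 1 + n) % n] };
      // k-2 intermediate steps: recvReduceSend
      for (int step = 2; step < n; step++) {
        for (int r = 0; r < n; r++) {
          Msg in = msg[(r - 1 + n) % n];          // partial arriving from the previous rank
          fwd[r] = { in.chunk, in.val + input[r][in.chunk] };
        }
        msg = fwd;
      }
      // last step: recvReduceCopy into this rank's own chunk
      for (int r = 0; r < n; r++) {
        Msg in = msg[(r - 1 + n) % n];
        assert(in.chunk == r);
        output[r] = in.val + input[r][r];
      }
      // every rank now holds the full reduction of its own chunk
      for (int r = 0; r < n; r++) {
        int ref = 0;
        for (int s = 0; s < n; s++) ref += input[s][r];
        assert(output[r] == ref);
      }
    }
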
-#include "ll_kernel.h"
-
-#define NEXT_STEP_LL \
- poffset = noffset; \
- pflag = nflag; \
- noffset += NCCL_LL_SLICE_LINES; \
- if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \
- nflag++; \
- step++;
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
template<int UNUSED, class FUNC, typename T>
-__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
+__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int bid = args->bid;
- const int llNthreads = args->nThreads;
+ const int nthreads = args->nThreads;
struct ncclComm* comm = args->comm;
- struct ncclRing* ring = comm->rings+blockIdx.x;
- volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead;
- volatile uint64_t * sendHeadPtr = ring->send.conn.llHead;
- volatile int * sizesFifo = ring->send.conn.llFifo;
- uint64_t sendHead = sendHeadPtr[0];
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ struct ncclRing* ring = &channel->ring;
- typedef LLPrimitives<T, FUNC> LL;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount);
const ssize_t size = args->N;
//const int rank = comm->rank;
const int nranks = comm->nRanks;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
- const ssize_t loopSize = args->nRings*chunkSize;
-
- uint64_t step = ring->send.conn.llStep;
- uint32_t pflag, nflag = step + 1;
- int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step);
+ const ssize_t loopSize = args->nChannels*chunkSize;
// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;
- union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff;
- union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff;
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
if (size-gridOffset < loopSize) {
@@ -167,37 +93,21 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
- int maxOffset = min(chunkSize, size-chunkOffset);
+ int nelem = min(chunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ring->devUserRanks[nranks-1];
offset = chunkOffset + rankDest * size;
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- nextOutput + noffset,
- maxOffset, nflag, llNthreads);
- POST_SIZE;
-
- NEXT_STEP_LL;
+ LLprims.send(thisInput+offset, nelem);
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ring->devUserRanks[nranks-j];
offset = chunkOffset + rankDest * size;
- WAIT_NEXT;
- LL::ReduceCopy(
- thisInput + offset,
- prevInput + poffset,
- nextOutput + noffset,
- maxOffset, pflag, nflag, llNthreads);
- POST_SIZE;
- ACK_PREV;
-
- NEXT_STEP_LL;
+ LLprims.recvReduceSend(thisInput+offset, nelem);
}
// step k-1: reduce this buffer and data, which will produce the final
@@ -205,13 +115,9 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) {
rankDest = ring->devUserRanks[0];
offset = chunkOffset + rankDest * size;
- LL::ReduceCopy(
- thisInput + offset,
- prevInput + poffset,
- thisOutput + chunkOffset,
- maxOffset, pflag, llNthreads);
- ACK_PREV;
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem);
}
-
- FIFO_CLEANING_AND_SAVE_STEP(nflag);
}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/reduce.cu b/src/collectives/reduce.cu
index d8fde80..302d4bc 100644
--- a/src/collectives/reduce.cu
+++ b/src/collectives/reduce.cu
@@ -4,30 +4,15 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
-#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
-ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- size_t nbytes = count*ncclTypeSize(datatype);
- INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
- } else {
- NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm));
- NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1));
- }
-
- return ncclSuccess;
-}
-
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype,
- op, root, comm, stream);
+ struct ncclInfo info = { ncclCollReduce, "Reduce",
+ sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
+ REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
+ return ncclEnqueueCheck(&info);
}
diff --git a/src/collectives/reduce_scatter.cu b/src/collectives/reduce_scatter.cu
index 1447d4a..4ee77ef 100644
--- a/src/collectives/reduce_scatter.cu
+++ b/src/collectives/reduce_scatter.cu
@@ -4,29 +4,15 @@
* See LICENSE.txt for license information
************************************************************************/
-#include "core.h"
-#include "common_coll.h"
#include "enqueue.h"
#include "collectives.h"
-ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
- size_t nbytes = count*ncclTypeSize(datatype);
- INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream);
- if (comm->nRanks == 1) {
- if (sendbuff != recvbuff)
- CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream));
- } else {
- NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm));
- NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1));
- }
- return ncclSuccess;
-}
-
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
- return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype,
- op, 0, comm, stream);
+ struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter",
+ sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
+ REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
+ return ncclEnqueueCheck(&info);
}
diff --git a/src/enqueue.cu b/src/enqueue.cu
new file mode 100644
index 0000000..d283223
--- /dev/null
+++ b/src/enqueue.cu
@@ -0,0 +1,442 @@
+/*************************************************************************
+ * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "enqueue.h"
+#include "checks.h"
+#include "param.h"
+
+#include "collectives/collectives.h"
+
+// Only generate inline kernels for LL
+#define NCCL_FUNC5(coll, op, dtype) \
+ (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \
+ (void*)NCCL_KERN_NAME(coll##LL, op, dtype)
+
+#define NCCL_FUNC4(coll, op, dtype) \
+ (void*)NCCL_FUNC5(coll##Ring, op, dtype), \
+ (void*)NCCL_FUNC5(coll##Tree, op, dtype)
+
+// Must be consistent with ncclDataType_t
+#define NCCL_FUNCS3A(coll, op) \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, u8), \
+ (void*)NCCL_FUNC4(coll, op, i32), \
+ (void*)NCCL_FUNC4(coll, op, u32), \
+ (void*)NCCL_FUNC4(coll, op, i64), \
+ (void*)NCCL_FUNC4(coll, op, u64), \
+ (void*)NCCL_FUNC4(coll, op, f16), \
+ (void*)NCCL_FUNC4(coll, op, f32), \
+ (void*)NCCL_FUNC4(coll, op, f64)
+#define NCCL_FUNCS3B(coll, op) \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8), \
+ (void*)NCCL_FUNC4(coll, op, i8)
+
+// Must be consistent with ncclRedOp_t -- but we only generate kernels for sum.
+#define NCCL_FUNCS2A(coll) \
+ NCCL_FUNCS3A(coll, sum), \
+ NCCL_FUNCS3A(coll, sum), \
+ NCCL_FUNCS3A(coll, sum), \
+ NCCL_FUNCS3A(coll, sum)
+#define NCCL_FUNCS2B(coll) \
+ NCCL_FUNCS3B(coll, copy), \
+ NCCL_FUNCS3B(coll, copy), \
+ NCCL_FUNCS3B(coll, copy), \
+ NCCL_FUNCS3B(coll, copy)
+
+// Must be consistent with the ncclFuncSet enum
+static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = {
+ NCCL_FUNCS2B(ncclBroadcast),
+ NCCL_FUNCS2A(ncclReduce),
+ NCCL_FUNCS2B(ncclAllGather),
+ NCCL_FUNCS2A(ncclReduceScatter),
+ NCCL_FUNCS2A(ncclAllReduce)
+};
+
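The ncclKerns table is flattened with the LL bit innermost (NCCL_FUNC5 emits two entries), the ring/tree bit next (NCCL_FUNC4 emits Ring then Tree), then datatype, reduction op and collective. computeColl below looks kernels up through FUNC_INDEX, whose definition is not part of this hunk; the sketch that follows is only a hypothetical index computation consistent with the macro nesting above, not the actual macro, and the underscored constants are placeholders for the real enum values.

    // Hypothetical flattening consistent with the macro nesting above; the real
    // FUNC_INDEX macro lives in collectives.h (not shown here) and may differ.
    constexpr int ncclNumOps_ = 4, ncclNumTypes_ = 9, ncclCollCount_ = 5;

    constexpr int funcIndex(int coll, int op, int dtype, int llMode, int treeMode) {
      // innermost: LL bit (NCCL_FUNC5), then ring/tree bit (NCCL_FUNC4),
      // then datatype, reduction op and collective
      return (((coll * ncclNumOps_ + op) * ncclNumTypes_ + dtype) * 2 + treeMode) * 2 + llMode;
    }

    // First slot: broadcast / copy / i8 / ring / non-LL.
    static_assert(funcIndex(0, 0, 0, 0, 0) == 0, "first slot");
    // Last slot matches the declared table size ncclCollCount*ncclNumOps*ncclNumTypes*2*2.
    static_assert(funcIndex(4, 3, 8, 1, 1) == ncclCollCount_ * ncclNumOps_ * ncclNumTypes_ * 2 * 2 - 1,
                  "last slot");

    int main() { return 0; }
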
+/*****************************************************************************/
+/* Launch system : synchronization and CUDA kernel launch */
+/*****************************************************************************/
+
+ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
+#if CUDART_VERSION >= 9000
+ if (cgMode & 0x01) {
+ CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
+ // These flags are to reduce the latency of using this API
+ cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync));
+ return ncclSuccess;
+ }
+#endif
+ int savedDev;
+ CUDACHECK(cudaGetDevice(&savedDev));
+ for (int i = 0; i < numDevices; i++) {
+ struct cudaLaunchParams* params = paramsList+i;
+ CUDACHECK(cudaSetDevice(cudaDevs[i]));
+ CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
+ }
+ CUDACHECK(cudaSetDevice(savedDev));
+ return ncclSuccess;
+}
+
+ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
+ params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels);
+
+ // Set active = 2 for the last operation
+ for (int r=0; r<params->gridDim.x; r++) {
+ struct ncclChannel* channel = comm->channels+r;
+ channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active = 2;
+ }
+
+ // Find the first operation, choose the kernel accordingly and pass it
+ // as the first argument.
+ struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
+ memcpy(&comm->args, coll, sizeof(struct ncclColl));
+ // As we pass that coll directly, we can free it immediately.
+ coll->active = 0;
+
+ params->func = ncclKerns[coll->funcIndex];
+ return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
+ volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+ int val = *ptr;
+ bool done = false;
+ while (done == false) {
+ if (val >= comm->intraRanks) {
+ WARN("Trying to launch too many collectives");
+ return ncclInvalidUsage;
+ }
+ if (val+1 == comm->intraRanks) {
+ // Reset the barrier.
+ comm->intraBarrier[comm->intraPhase^1] = 0;
+ *isLast = 1;
+ return ncclSuccess;
+ }
+ done = __sync_bool_compare_and_swap(ptr, val, val+1);
+ val++;
+ }
+ *isLast = 0;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
+ volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+ int val = *ptr;
+ if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
+ WARN("Trying to launch too many collectives");
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
+ volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
+ while (*ptr < comm->intraRanks) pthread_yield();
+ comm->intraPhase ^= 1;
+ return ncclSuccess;
+}
+
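ncclCpuBarrierIn, ncclCpuBarrierLast and ncclCpuBarrierOut form a two-phase intra-node barrier: each local rank CAS-increments the counter of the current phase, the last rank to arrive resets the other phase's counter and, once it has launched the grouped kernel, bumps the counter to intraRanks so that everyone spinning in ncclCpuBarrierOut is released and flips to the other phase. The condensed host-side sketch below reproduces that handshake with std::atomic and threads; names are simplified, the phase flip is left out, and it is illustrative only.

    // Condensed sketch of the two-phase CPU barrier above; illustrative only.
    #include <atomic>
    #include <cstdio>
    #include <thread>
    #include <vector>

    constexpr int kRanks = 4;
    std::atomic<int> barrier[2];        // one counter per phase (zero-initialized)
    int phase = 0;                      // in NCCL each rank keeps this in its comm

    bool barrierIn() {                  // returns true for the last arriving rank
      int val = barrier[phase].load();
      while (true) {
        if (val + 1 == kRanks) {
          barrier[phase ^ 1].store(0);  // reset the other phase for the next round
          return true;
        }
        if (barrier[phase].compare_exchange_weak(val, val + 1)) return false;
      }
    }

    void barrierLastAndOut(bool isLast) {
      if (isLast) {
        // The real code launches the cooperative kernel here before releasing everyone.
        barrier[phase].fetch_add(1);    // counter reaches kRanks
      }
      while (barrier[phase].load() < kRanks) std::this_thread::yield();
    }

    int main() {
      std::vector<std::thread> ranks;
      for (int r = 0; r < kRanks; r++)
        ranks.emplace_back([] { barrierLastAndOut(barrierIn()); });
      for (auto& t : ranks) t.join();
      std::puts("all local ranks passed the barrier");
    }
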
+ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
+ if (comm->nRanks == 1) return ncclSuccess;
+ struct cudaLaunchParams* params = comm->myParams;
+
+ NCCLCHECK(setupLaunch(comm, params));
+
+ // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
+ if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
+ // Enqueue event in user stream
+ CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream));
+ // Create dependency between user stream and internal NCCL stream
+ CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
+ params->stream = comm->groupStream;
+ } else {
+ if (comm->userStream != params->stream) {
+ // Stream changed from last call, create dependency against last NCCL kernel launch
+ CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+ }
+ params->stream = comm->userStream;
+ }
+
+ int isLast = 0;
+ NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
+
+ if (isLast) {
+ if (comm->launchMode == ncclComm::GROUP) {
+ // I'm the last. Launch all operations.
+ NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
+ }
+ NCCLCHECK(ncclCpuBarrierLast(comm));
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
+ if (comm->nRanks == 1) return ncclSuccess;
+ // We can't print the CG mode until the first barrier has happened.
+ if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
+ *comm->intraCGMode ^= 0x10;
+ INFO(NCCL_INIT,"Launch mode %s%s%s",
+ comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
+ *comm->intraCGMode ? "/CGMD" : "",
+ (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
+ }
+
+ NCCLCHECK(ncclCpuBarrierOut(comm));
+
+ struct cudaLaunchParams *params = comm->myParams;
+ if (comm->launchMode == ncclComm::PARALLEL) {
+ CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
+ }
+ // Start the network proxies as soon as the kernel has been launched. We can't
+ // perform any CUDA call between the two: a cudaFree issued between the CUDA
+ // launch and the transportStartProxy call could cause a deadlock.
+ // Also, starting the proxies after the CUDA launch seems to be better for
+ // performance (latency).
+ for (int r=0; r<params->gridDim.x; r++) {
+ struct ncclChannel* channel = comm->channels+r;
+ channel->collStart = channel->collFifoTail;
+ channel->collCount = 0;
+ }
+ params->gridDim.x = params->blockDim.x = 0;
+ NCCLCHECK(transportStartProxy(comm));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
+ struct cudaLaunchParams *params = comm->myParams;
+ // Enqueue event after NCCL kernel
+ CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
+ // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
+ if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
+ // Create dependency between NCCL internal stream and user stream
+ CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
+ }
+ comm->userStreamSet = false;
+ return ncclSuccess;
+}
+
+/*****************************************************************************/
+/* Enqueueing system : computation of kernel and proxy operations parameters */
+/*****************************************************************************/
+
+static ncclResult_t getPatternInfo(struct ncclInfo* info) {
+ if (info->coll == ncclCollBroadcast) info->pattern = ncclPatternPipelineFrom;
+ else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo;
+ else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing;
+ else if (info->coll == ncclCollAllReduce) {
+ if (info->nBytes <= info->comm->treeThreshold)
+ info->pattern = ncclPatternTreeUpDown;
+ else
+ info->pattern = ncclPatternRingTwice;
+ }
+ else {
+ WARN("Unknown collective %d", info->coll);
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t getLoopInfo(struct ncclInfo* info) {
+ switch (info->pattern) {
+ case ncclPatternTreeUp:
+ case ncclPatternTreeDown:
+ case ncclPatternTreeUpDown:
+ case ncclPatternPipelineFrom:
+ case ncclPatternPipelineTo:
+ info->nstepsPerLoop = info->nchunksPerLoop = 1; break;
+ case ncclPatternRing:
+ info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
+ case ncclPatternRingTwice:
+ info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
+ default:
+ WARN("Unknown pattern %d\n", info->pattern);
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+}
+
+static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) {
+ // Compute thresholds and limits that users can override
+ int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD);
+ int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads);
+
+ // First compute nThreads
+ int nt = NCCL_LL_MIN_NTHREADS;
+ while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2;
+
+ // Then compute nChannels
+ int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold);
+ if (nc == 0) nc = 1;
+ if (nc > info->comm->nChannels) nc = info->comm->nChannels;
+
+ // Check if we have a fixed LL threshold, otherwise compute it.
+ int perThreadThreshold = info->comm->threadThreshold;
+ if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4;
+ ssize_t llThreshold = info->comm->llThreshold >= 0 ?
+ info->comm->llThreshold :
+ nc*nt*info->nchunksPerLoop*perThreadThreshold;
+
+ if (info->nBytes <= llThreshold) {
+ *llMode = 1;
+ *nChannels = nc;
+ *nThreads = nt;
+ } else {
+ *llMode = 0;
+ *nChannels = info->comm->nChannels;
+ *nThreads = info->comm->nThreads+1;
+ }
+}
+
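getKernelInfo sizes the LL launch by doubling nThreads until each thread's share of the message falls to the per-thread LL threshold, then derives nChannels from the same quota; if nBytes still exceeds the (possibly user-fixed) llThreshold, the regular high-bandwidth kernels are used instead. The standalone rendition below replays only the sizing loop; every numeric constant is a placeholder chosen for illustration, not NCCL's actual default.

    // Standalone rendition of the LL sizing loop above; all constants are placeholders.
    #include <algorithm>
    #include <cstdio>

    #define DIVUP(x, y) (((x) + (y) - 1) / (y))

    int main() {
      const long nBytes = 16l << 20;          // hypothetical 16 MB operation
      const long nchunksPerLoop = 2;          // e.g. a 2-rank ring
      const long perThreadLLThreshold = 8192; // placeholder threshold
      const int minThreads = 64, maxThreads = 256, maxChannels = 16;

      // Double nThreads while each thread still has more than its LL quota to move.
      int nt = minThreads;
      while (DIVUP(nBytes, nt * nchunksPerLoop) > perThreadLLThreshold && nt * 2 <= maxThreads)
        nt *= 2;

      // Then derive nChannels from the same per-thread quota.
      int nc = (int)DIVUP(nBytes, nt * nchunksPerLoop * perThreadLLThreshold);
      nc = std::max(1, std::min(nc, maxChannels));

      std::printf("LL geometry: nThreads=%d nChannels=%d\n", nt, nc);  // 256 and 4 here
    }
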
+static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) {
+ // Set nstepsPerLoop and nchunksPerLoop
+ NCCLCHECK(getPatternInfo(info));
+ NCCLCHECK(getLoopInfo(info));
+
+ coll->args.root = info->root;
+ coll->args.N = info->count;
+ coll->args.ThisInput = info->sendbuff;
+ coll->args.ThisOutput = info->recvbuff;
+ coll->args.comm = info->comm->devComm;
+ coll->args.opCount = info->comm->opCount;
+
+ // Compute llMode, nChannels, nThreads
+ int llMode;
+ getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode);
+
+ int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0;
+ coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode);
+
+ int stepSize = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS;
+ int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps;
+ int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps;
+ int chunkSize = stepSize*chunkSteps;
+
+ // Compute lastChunkSize
+ if (treeMode == 1 && llMode == 0) {
+ if (info->pattern == ncclPatternTreeUpDown) {
+ // Optimize chunkSize / nSteps
+ while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2;
+ while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2;
+ while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
+ }
+ // Use lastChunkSize as chunkSize
+ coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+ } else if (llMode == 1) {
+ int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
+ const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
+ coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop);
+ ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t));
+ coll->args.lastChunkSize /= ncclTypeSize(info->datatype);
+ }
+
+ // Compute nSteps for proxies
+ size_t nBytes = llMode ? info->nBytes*2 : info->nBytes;
+
+ int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize)));
+ proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
+ proxyArgs->sliceSteps = sliceSteps;
+ proxyArgs->chunkSteps = chunkSteps;
+ proxyArgs->llMode = llMode;
+ proxyArgs->opCount = info->comm->opCount;
+ TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
+ coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads,
+ nLoops, proxyArgs->nsteps, info->comm);
+ return ncclSuccess;
+}
+
+static ncclResult_t saveKernel(struct ncclInfo* info) {
+ if (info->comm->nRanks == 1) {
+ if (info->sendbuff != info->recvbuff)
+ CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream));
+ return ncclSuccess;
+ }
+
+ struct ncclColl coll;
+ struct ncclProxyArgs proxyArgs;
+ memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs));
+ NCCLCHECK(computeColl(info, &coll, &proxyArgs));
+
+ info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads);
+ if (info->comm->userStreamSet == false) {
+ info->comm->userStream = info->stream;
+ info->comm->userStreamSet = true;
+ } else if (info->stream != info->comm->userStream) {
+ WARN("Error : mixing different streams within a group call is not supported.");
+ return ncclInvalidUsage;
+ }
+ for (int bid=0; bid<coll.args.nChannels; bid++) {
+ struct ncclChannel* channel = info->comm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels);
+
+ if (channel->collCount == NCCL_MAX_OPS) {
+ WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
+ return ncclInvalidUsage;
+ }
+
+ // Proxy
+ proxyArgs.channel = channel;
+ NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
+
+ info->comm->myParams->gridDim.x++;
+
+ int opIndex = channel->collFifoTail;
+ struct ncclColl* c = channel->collectives+opIndex;
+ volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
+ while (activePtr[0] != 0) sched_yield();
+
+ memcpy(c, &coll, sizeof(struct ncclColl));
+
+ c->args.bid = bid;
+ c->active = 1;
+ opIndex = (opIndex+1)%NCCL_MAX_OPS;
+ c->nextIndex = opIndex;
+ channel->collFifoTail = opIndex;
+ channel->collCount++;
+ }
+ /*if (llMode == 0)*/ info->comm->opCount++;
+ return ncclSuccess;
+}
+
+
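saveKernel spreads the nChannels copies of the operation round-robin over the channels and, before reusing a slot in a channel's collectives FIFO, spins until the device has cleared that slot's active flag. The host-side model below reproduces just that producer/consumer handshake, with a CPU thread standing in for the GPU that drains and clears slots; kMaxOps is a stand-in for NCCL_MAX_OPS and the whole sketch is illustrative only.

    // Minimal model of the per-channel collective FIFO handshake in saveKernel;
    // a CPU thread stands in for the GPU. Illustrative only.
    #include <atomic>
    #include <cstdint>
    #include <cstdio>
    #include <thread>

    constexpr int kMaxOps = 4;            // stand-in for NCCL_MAX_OPS
    struct Slot { std::atomic<uint8_t> active{0}; int payload = 0; };
    Slot fifo[kMaxOps];

    void enqueueOp(int opIndex, int payload) {
      Slot* s = &fifo[opIndex % kMaxOps];
      while (s->active.load(std::memory_order_acquire) != 0)
        std::this_thread::yield();        // wait for the device to free the slot
      s->payload = payload;
      s->active.store(1, std::memory_order_release);
    }

    int main() {
      std::thread gpu([] {                // consumer: drains slots in order
        for (int i = 0; i < 8; i++) {
          Slot* s = &fifo[i % kMaxOps];
          while (s->active.load(std::memory_order_acquire) == 0) std::this_thread::yield();
          std::printf("consumed op %d (payload %d)\n", i, s->payload);
          s->active.store(0, std::memory_order_release);
        }
      });
      for (int i = 0; i < 8; i++) enqueueOp(i, 100 + i);  // more ops than slots: producer blocks
      gpu.join();
    }
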
+ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
+ if (info->comm == NULL) return ncclInvalidArgument;
+
+ INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
+ info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
+ info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
+
+ // Launch asynchronously if needed
+ if (ncclAsyncMode()) {
+ ncclResult_t ret = ncclSuccess;
+ int savedDev = -1;
+ if (info->comm->checkPointers) {
+ CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
+ CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end);
+ }
+ // Check arguments
+ NCCLCHECKGOTO(ArgsCheck(info), ret, end);
+ // Always register comm even in case of error to make sure ncclGroupEnd
+ // cleans it up.
+ NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
+ NCCLCHECKGOTO(saveKernel(info), ret, end);
+end:
+ if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
+ ncclAsyncErrCheck(ret);
+ return ret;
+ } else {
+ NCCLCHECK(ArgsCheck(info));
+ NCCLCHECK(saveKernel(info));
+ NCCLCHECK(ncclBarrierEnqueue(info->comm));
+ NCCLCHECK(ncclBarrierEnqueueWait(info->comm));
+ NCCLCHECK(ncclEnqueueEvents(info->comm));
+ return ncclSuccess;
+ }
+}
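
The per-channel FIFO that saveKernel() fills above relies on a simple host/device handshake: the producer spins until a slot's active flag is clear, copies the descriptor in, then publishes it by setting active; the consumer clears the flag once it has drained the slot. A minimal host-only sketch of that pattern, with generic names rather than the NCCL types:

    #include <sched.h>
    #include <cstdint>

    #define MAX_OPS 2048                        // mirrors NCCL_MAX_OPS

    struct Op { int payload; volatile uint8_t active; };

    struct OpFifo {
      Op ops[MAX_OPS];
      int tail;                                 // next slot the producer fills
    };

    // Producer side (what saveKernel() does per channel): wait for the slot to be
    // free, fill it, then publish it by setting 'active'.
    void push(OpFifo* f, int payload) {
      Op* slot = f->ops + f->tail;
      while (slot->active != 0) sched_yield();  // consumer still owns this slot
      slot->payload = payload;
      slot->active = 1;                         // publish
      f->tail = (f->tail + 1) % MAX_OPS;
    }

    // Consumer side (in NCCL this is the device kernel): drain, then release.
    void pop(OpFifo* f, int head) {
      Op* slot = f->ops + head;
      // ... consume slot->payload ...
      slot->active = 0;                         // slot can be reused
    }
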
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index 278593c..a1aaf50 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -13,5 +13,7 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
+ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
+ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
ncclResult_t bootstrapClose(void* commState);
#endif
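
The two new point-to-point bootstrap calls are what later lets p2pSetup() in init.cu trade ncclConnect blobs with individual peers instead of all-gathering them. A hedged sketch of a symmetric exchange with one peer, assuming the in-tree NCCLCHECK macro and a commState obtained from bootstrapInit(); posting the send before the recv mirrors the ordering p2pSetup() uses later in this patch:

    struct ConnectBlob { char data[128]; };      // stand-in for struct ncclConnect

    static ncclResult_t exchangeWithPeer(void* commState, int peer,
                                         ConnectBlob* mine, ConnectBlob* theirs) {
      NCCLCHECK(bootstrapSend(commState, peer, mine, sizeof(ConnectBlob)));
      NCCLCHECK(bootstrapRecv(commState, peer, theirs, sizeof(ConnectBlob)));
      return ncclSuccess;
    }
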
diff --git a/src/include/channel.h b/src/include/channel.h
new file mode 100644
index 0000000..76c5e8a
--- /dev/null
+++ b/src/include/channel.h
@@ -0,0 +1,14 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CHANNEL_H_
+#define NCCL_CHANNEL_H_
+#include "core.h"
+
+ncclResult_t initChannel(struct ncclComm* comm, int channelid);
+ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks);
+
+#endif
diff --git a/src/include/checks.h b/src/include/checks.h
new file mode 100644
index 0000000..bf7750e
--- /dev/null
+++ b/src/include/checks.h
@@ -0,0 +1,10 @@
+/*************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname);
+ncclResult_t ArgsCheck(struct ncclInfo* info);
diff --git a/src/include/common_coll.h b/src/include/common_coll.h
deleted file mode 100644
index 3ec7354..0000000
--- a/src/include/common_coll.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef COMMON_COLL_H_
-#define COMMON_COLL_H_
-
-#include "core.h"
-#include "enqueue.h"
-#include "collectives/collectives.h"
-
-static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
- cudaPointerAttributes attr;
- cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
- if (err != cudaSuccess || attr.devicePointer == NULL) {
- WARN("%s : %s is not a valid pointer", opname, ptrname);
- return ncclInvalidArgument;
- }
-#if CUDART_VERSION >= 10000
- if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
-#else
- if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
-#endif
- WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
- return ncclInvalidArgument;
- }
- return ncclSuccess;
-}
-
-static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
- if (ptr == NULL) {
- WARN("%s : %s argument is NULL", opname, ptrname);
- return ncclInvalidArgument;
- }
- return ncclSuccess;
-}
-
-static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) {
- NCCLCHECK(PtrCheck(comm, opname, "comm"));
- // First, the easy ones
- if (root < 0 || root >= comm->nRanks) {
- WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks);
- return ncclInvalidArgument;
- }
- if (type < 0 || type >= ncclNumTypes) {
- WARN("%s : invalid type %d", opname, type);
- return ncclInvalidArgument;
- }
- if (op < 0 || op >= ncclNumOps) {
- WARN("%s : invalid reduction operation %d", opname, op);
- return ncclInvalidArgument;
- }
-
- if (comm->checkPointers) {
- // Check CUDA device pointers
- if (strcmp(opname, "Broadcast") != 0 || comm->rank == root) {
- NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname));
- }
- if (strcmp(opname, "Reduce") != 0 || comm->rank == root) {
- NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname));
- }
- }
- return ncclSuccess;
-}
-
-static __inline__ int ncclTypeSize(ncclDataType_t type) {
- switch (type) {
- case ncclInt8:
- case ncclUint8:
- return 1;
- case ncclFloat16:
- return 2;
- case ncclInt32:
- case ncclUint32:
- case ncclFloat32:
- return 4;
- case ncclInt64:
- case ncclUint64:
- case ncclFloat64:
- return 8;
- default:
- return -1;
- }
-}
-
-// In : comm, nbytes ; Out : nrings, nthreads, ll
-// - We start with the minimum number of threads possible (64) and see if the size fits in LL;
-// If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default)
-// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads
-// This ensures we don't use a large number of rings with a small number of threads
-// - We use the NCCL_LL_RING_THRESHOLD as the per-thread threshold before we reach the max number of threads
-// we use NCCL_THREAD_THRESHOLD when we reach the max
-// - If by the max number of LL threads, the size still cannot fit in LL, then we use non-LL setting
-// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too
-static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) {
- *ll = 0;
- int llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */
- if (comm->llThreshold >= 0) { /* user sets total LL threshold */
- if (nbytes > comm->llThreshold) { /* non-LL */
- *nthreads = comm->nThreads+1;
- *nrings = comm->nRings;
- return;
- } else {
- llEnforced = 1; /* user wants to use LL */
- }
- }
- int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */
- size_t nr;
- int ll_max_nthreads = std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* respect user's setting or platform's default setting */
- int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS;
- ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD);
- while (nt < ll_max_nthreads && *ll == 0) {
- nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks));
- if (nr <= maxRings) { /* avoid using few threads but many rings */
- nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
- *ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1;
- }
- if (*ll == 0) {
- nt = nt << 1;
- }
- }
- if (*ll == 1) {
- *nthreads = nt;
- *nrings = (int)nr;
- return; /* we can use smaller number of threads to make LL work, stop here */
- }
- nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */
- nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr;
- *ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1;
- *nthreads = *ll ? ll_max_nthreads : comm->nThreads+1;
- *nrings = *ll ? (int)nr : comm->nRings;
-}
-
-static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, size_t nbytes, int loopFactor) {
- int llMode, nBlocks, nThreads;
- ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode);
- comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads);
- if (comm->userStreamSet == false) {
- comm->userStream = stream;
- comm->userStreamSet = true;
- } else if (stream != comm->userStream) {
- WARN("Error : mixing different streams within a group call is not supported.");
- return ncclInvalidUsage;
- }
- int lastChunkSize = 0;
- if (llMode == 1) {
- int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype);
- const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize;
- lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor);
- ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype));
- }
- for (int bid=0; bid<nBlocks; bid++) {
- struct ncclRing* ring = comm->rings+(comm->myParams->gridDim.x % comm->nRings);
- if (ring->collCount == NCCL_MAX_OPS) {
- WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
- return ncclInvalidUsage;
- }
-
- comm->myParams->gridDim.x++;
-
- int opIndex = ring->collFifoTail;
- struct ncclColl* c = ring->collectives+opIndex;
- volatile uint8_t* activePtr = (volatile uint8_t*)&c->active;
- while (activePtr[0] != 0) sched_yield();
-
- struct CollectiveArgs* args = &c->args;
- args->root = root;
- args->N = count;
- args->ThisInput = sendbuff;
- args->ThisOutput = recvbuff;
- args->comm = comm->devComm;
- args->opCount = comm->opCount;
- args->bid = bid;
- args->nRings = nBlocks;
- args->nThreads = nThreads;
- args->lastChunkSize = lastChunkSize;
-
- c->nThreads = nThreads;
- c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode);
- c->active = 1;
- opIndex = (opIndex+1)%NCCL_MAX_OPS;
- c->nextIndex = opIndex;
- ring->collFifoTail = opIndex;
- ring->collCount++;
- }
- /*if (llMode == 0)*/ comm->opCount++;
- return ncclSuccess;
-}
-
-extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl);
-
-#endif
diff --git a/src/include/core.h b/src/include/core.h
index 8285df5..d57d271 100644
--- a/src/include/core.h
+++ b/src/include/core.h
@@ -8,6 +8,7 @@
#define NCCL_CORE_H_
#define NCCL_MAX_OPS 2048
+#define NCCL_STEPS 8
#include "nccl.h"
#include "transport.h"
@@ -29,15 +30,15 @@ struct cudaLaunchParams {
};
#endif
-#define MAXRINGS 16
+#define MAXCHANNELS 16
#define MAXTHREADS 256
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-// Rings / LL tuning
-#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings
-#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL for Volta and above
+// Channels / LL tuning
+#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nchannels
+#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL
#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs
-#define NCCL_LL_MAX_NTHREADS 256
+#define NCCL_LL_MAX_NTHREADS MAXTHREADS
#define NCCL_LL_MIN_NTHREADS 64
#define DIVUP(x, y) \
@@ -63,43 +64,84 @@ union ncclLLFifoLine {
int4 i4;
};
+typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t;
+
+typedef enum {
+ ncclPatternRing,
+ ncclPatternRingTwice,
+ ncclPatternPipelineFrom,
+ ncclPatternPipelineTo,
+ ncclPatternTreeUp,
+ ncclPatternTreeDown,
+ ncclPatternTreeUpDown
+} ncclPattern_t;
+
+typedef enum {
+ ncclDevSuccess,
+ ncclDevAssertedMismatch,
+ ncclDevSuspectedMismatch
+} ncclDevError_t;
+
+// Used to pass NCCL call information between functions
+struct ncclInfo {
+ ncclColl_t coll;
+ const char* opName;
+ // NCCL Coll Args
+ const void* sendbuff;
+ void* recvbuff;
+ size_t count;
+ ncclDataType_t datatype;
+ ncclRedOp_t op;
+ int root;
+ ncclComm_t comm;
+ cudaStream_t stream;
+ // Algorithm details
+ int chunkSteps;
+ int sliceSteps;
+ // Computed later
+ ncclPattern_t pattern;
+ size_t nBytes;
+ int nstepsPerLoop;
+ int nchunksPerLoop;
+};
+
struct ncclConnInfo {
// Regular comm mechanism
char *buff; // Local for recv, remote for send
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
- uint64_t *opCount; // Local for recv, remote for send
+ uint64_t *opCountLoc; // opCount of local rank
+ uint64_t *opCountRem; // opCount of remote rank
int direct; // Direct communication
void **ptrExchange; // Pointer exchange for direct communication
int *fifo; // Size fifo for proxy
+ uint64_t step; // Keep where we are
+
// Low latency mechanism
- char *llBuff; // Local for recv, remote for send
- uint64_t *llHead; // Local for send, remote for recv
- int *llFifo; // LL Size fifo for proxy
- uint64_t llStep; // Keep where we are
+ union ncclLLFifoLine *llBuff; // Local for recv, remote for send
uint64_t llLastCleaning;
};
struct ncclConnector {
- struct transportProxyInfo* proxyInfo;
- struct ncclTransport* transport;
+ int connected;
+ struct ncclProxyArgs *proxyAppend;
+ struct ncclTransportComm* transportComm;
void* transportResources; // Host-side resources
struct ncclConnInfo conn;
+ struct ncclComm *comm;
};
#define CACHE_LINE_SIZE 128
#define MEM_ALIGN 4096
-#define SIZES_FIFO_SIZE 32
#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */
-#define NCCL_LL_CHUNKS 8
#define NUM_LINES_PER_THREAD 8
-#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 256K
-#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t)))
-#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS)
+#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS)
+#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS)
+#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine))
#define NCCL_LL_CLEAN_FREQ 0x10000000
struct ncclSendMem {
@@ -109,7 +151,7 @@ struct ncclSendMem {
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
void* ptrExchange;
char pad2[CACHE_LINE_SIZE-sizeof(void*)];
- uint64_t llHead;
+ uint64_t opCount;
};
char pad3[MEM_ALIGN];
};
@@ -119,37 +161,54 @@ struct ncclRecvMem {
union {
struct {
uint64_t tail;
- char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
+ char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
uint64_t opCount;
- char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)];
- int sizesFifo[SIZES_FIFO_SIZE];
- int llSizesFifo[SIZES_FIFO_SIZE];
+ char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)];
+ int sizesFifo[NCCL_STEPS];
};
- char pad5[MEM_ALIGN];
+ char pad4[MEM_ALIGN];
};
- char llBuff[NCCL_LL_BUFF_SIZE];
+ ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES];
char buff[1]; // Actually larger than that
};
struct ncclRing {
+ // Shortcuts for userRanks[1] and userRanks[n-1]
+ int prev;
+ int next;
+
+ // Maps an internal nccl index to user-specified rank order. This is necessary
+ // since we need to know how the user expects data to be ordered across
+ // devices. Ordered from current device.
+ int* userRanks;
+ int* devUserRanks;
+};
+
+#define NCCL_MAX_TREE_ARITY 3
+struct ncclTree {
+ int depth;
+ int up;
+ int down[NCCL_MAX_TREE_ARITY];
+};
+
+struct ncclPeer {
+ struct ncclConnector send;
+ struct ncclConnector recv;
+};
+
+struct ncclChannel {
union {
struct {
+ struct ncclRing ring;
+ struct ncclTree tree;
+
int id;
int nthreads;
- // Per ring resources
- struct ncclSendMem* devMemSend; // CUDA-size resources
- struct ncclRecvMem* devMemRecv; // CUDA-size resources
int buffSize;
- int devMemSendSize; // Keep the size for IPCs
- int devMemRecvSize; // Keep the size for IPCs
- struct ncclConnector send;
- struct ncclConnector recv;
- // Maps an internal nccl index to user-specified rank order. This is necessary
- // since we need to know how the user expects data to be ordered across
- // devices. Ordered from current device.
- int* userRanks;
- int* devUserRanks;
+ // Communication structures
+ struct ncclPeer* peers;
+ struct ncclPeer* devPeers;
// Operation list for aggregation
struct ncclColl* collectives;
@@ -162,7 +221,7 @@ struct ncclRing {
int data[0x80];
};
};
-static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size");
+static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size");
/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
@@ -179,7 +238,7 @@ struct CollectiveArgs {
size_t N;
uint32_t root;
uint8_t bid;
- uint8_t nRings;
+ uint8_t nChannels;
uint16_t nThreads;
int lastChunkSize;
@@ -188,7 +247,6 @@ struct ncclColl {
union {
struct {
struct CollectiveArgs args;
- uint16_t nThreads;
uint16_t funcIndex;
uint16_t nextIndex;
uint8_t active;
@@ -199,11 +257,16 @@ struct ncclColl {
static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size");
struct ncclComm {
- struct ncclRing rings[MAXRINGS];
+ struct ncclChannel channels[MAXCHANNELS];
+
+ struct ncclPeerInfo* peerInfo;
+
+ void* bootstrap;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
+ int nvmlDev; // my NVML device number
enum { GROUP, PARALLEL } launchMode;
cudaStream_t userStream;
@@ -215,18 +278,31 @@ struct ncclComm {
// where syncs are not symmetric).
uint64_t opCount;
- // Rings for collectives
- int nRings;
+ // Channels for collectives
+ int nChannels;
int nThreads;
// Low-latency algorithm threshold
ssize_t llThreshold;
ssize_t threadThreshold;
+ // Tree algorithm threshold
+ ssize_t treeThreshold;
+
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
cudaStream_t groupStream;
+ // Whether there has been a fatal error in this communicator.
+ ncclResult_t fatalError;
+
+ // Error reported by GPU
+ volatile ncclDevError_t* fatalDevError;
+
+ // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped)
+ // On device: this pointer has been obtained from cudaHostGetDevicePointer()
+ volatile uint32_t *abortFlag;
+
// Device copy of the communicator
struct ncclComm *devComm;
@@ -244,6 +320,10 @@ struct ncclComm {
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclColl args;
void* argsptr;
+
+ // Global proxy thread
+ pthread_t proxyThread;
+ struct ncclProxyState proxyState;
};
// Check CUDA calls
@@ -324,6 +404,28 @@ struct ncclComm {
#endif // end PROFAPI
int ncclCudaCompCap();
+ncclResult_t ncclNvlinkGpu(int* nvlink);
+int64_t ncclTreeThreshold();
+
+static __inline__ int ncclTypeSize(ncclDataType_t type) {
+ switch (type) {
+ case ncclInt8:
+ case ncclUint8:
+ return 1;
+ case ncclFloat16:
+ return 2;
+ case ncclInt32:
+ case ncclUint32:
+ case ncclFloat32:
+ return 4;
+ case ncclInt64:
+ case ncclUint64:
+ case ncclFloat64:
+ return 8;
+ default:
+ return -1;
+ }
+}
#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
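
For reference, the reshuffled LL defines above still describe the same 256 KiB buffer that the old NCCL_LL_BUFF_SIZE comment advertised, assuming sizeof(union ncclLLFifoLine) is 16 bytes (it holds an int4):

    // NCCL_LL_SLICE_LINES = NUM_LINES_PER_THREAD * NCCL_LL_MAX_NTHREADS
    //                     = 8 * 256                          = 2048 lines per step
    // NCCL_LL_BUFF_LINES  = NCCL_LL_SLICE_LINES * NCCL_STEPS
    //                     = 2048 * 8                         = 16384 lines
    // NCCL_LL_BUFF_SIZE   = NCCL_LL_BUFF_LINES * sizeof(union ncclLLFifoLine)
    //                     = 16384 * 16                       = 262144 bytes = 256 KiB
    static_assert(8 * 256 * 8 * 16 == 256 * 1024, "LL buffer stays at 256 KiB");
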
diff --git a/src/include/cpuset.h b/src/include/cpuset.h
new file mode 100644
index 0000000..f70d1d8
--- /dev/null
+++ b/src/include/cpuset.h
@@ -0,0 +1,61 @@
+/*************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_CPUSET_H_
+#define NCCL_CPUSET_H_
+
+// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
+
+static int hexToInt(char c) {
+ int v = c - '0';
+ if (v < 0) return -1;
+ if (v > 9) v = 10 + c - 'a';
+ if ((v < 0) || (v > 15)) return -1;
+ return v;
+}
+
+#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
+
+ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) {
+ uint32_t cpumasks[CPU_SET_N_U32];
+ int m = CPU_SET_N_U32-1;
+ cpumasks[m] = 0;
+ for (int o=0; o<strlen(str); o++) {
+ char c = str[o];
+ if (c == ',') {
+ m--;
+ cpumasks[m] = 0;
+ } else {
+ int v = hexToInt(c);
+ if (v == -1) break;
+ cpumasks[m] <<= 4;
+ cpumasks[m] += v;
+ }
+ }
+ // Copy cpumasks to mask
+ for (int a=0; m<CPU_SET_N_U32; a++,m++) {
+ memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t));
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
+ int c = 0;
+ uint8_t* m8 = (uint8_t*)mask;
+ for (int o=sizeof(cpu_set_t)-1; o>=0; o--) {
+ if (c == 0 && m8[o] == 0) continue;
+ sprintf(str+c, "%02x", m8[o]);
+ c+=2;
+ if (o && o%4 == 0) {
+ sprintf(str+c, ",");
+ c++;
+ }
+ }
+ str[c] = '\0';
+ return ncclSuccess;
+}
+
+#endif
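
A quick round trip through the two helpers above, using the sysfs-style mask from the file's own comment; the printed values are what the parsing logic should produce, assuming the standard CPU_* macros from sched.h:

    #include <sched.h>
    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include "nccl.h"     // ncclResult_t
    #include "cpuset.h"

    int main() {
      cpu_set_t mask;
      CPU_ZERO(&mask);                   // the helper only writes the words present in the string
      char in[] = "0003ff,f0003fff";     // most significant 32-bit group first
      ncclStrToCpuset(in, &mask);
      // Word 0 gets 0xf0003fff and word 1 gets 0x3ff, i.e. CPUs 0-13 and 28-41.
      printf("cpu0=%d cpu20=%d cpu40=%d\n",
             CPU_ISSET(0, &mask) ? 1 : 0, CPU_ISSET(20, &mask) ? 1 : 0, CPU_ISSET(40, &mask) ? 1 : 0);
      char out[2*sizeof(cpu_set_t) + sizeof(cpu_set_t)/4 + 1];
      ncclCpusetToStr(&mask, out);       // back to the string form
      printf("%s\n", out);
      return 0;
    }
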
diff --git a/src/include/debug.h b/src/include/debug.h
index 55dee18..3acdf8c 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -25,6 +25,7 @@ extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen);
+extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev);
extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
diff --git a/src/include/enqueue.h b/src/include/enqueue.h
index 69d0463..4db7094 100644
--- a/src/include/enqueue.h
+++ b/src/include/enqueue.h
@@ -10,12 +10,7 @@
#include "core.h"
#include "group.h"
-typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count,
- ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
-
-ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
- void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
- ncclComm_t comm, cudaStream_t stream);
+ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast);
ncclResult_t ncclCpuBarrierLast(ncclComm_t comm);
ncclResult_t ncclCpuBarrierOut(ncclComm_t comm);
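
With this signature change each public collective now just fills a struct ncclInfo (defined in core.h in this patch) and hands it to ncclEnqueueCheck. A hedged sketch of what a caller looks like; the chunk/slice step numbers are placeholders, the real per-collective tuning values live in src/collectives/*.cu:

    ncclResult_t ncclAllReduceSketch(const void* sendbuff, void* recvbuff, size_t count,
        ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
      struct ncclInfo info = { ncclCollAllReduce, "AllReduce",
        sendbuff, recvbuff, count, datatype, op, 0 /*root*/, comm, stream,
        /*chunkSteps*/ 4, /*sliceSteps*/ 2 };    // placeholder tuning values
      return ncclEnqueueCheck(&info);            // remaining ncclInfo fields are computed internally
    }
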
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index ce3f6ca..89edbf5 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -58,8 +58,50 @@ typedef struct {
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v1_t;
-typedef ncclNet_v1_t ncclNet_t;
+typedef struct {
+ // Name of the network (mainly for logs)
+ const char* name;
+ // Initialize the network.
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+ // Return the number of adapters.
+ ncclResult_t (*devices)(int* ndev);
+ // Return the device path in /sys. NCCL will call free on this path.
+ ncclResult_t (*pciPath)(int dev, char** path);
+ // Return whether this device supports host pointers and/or CUDA pointers
+ // as data from the current GPU. Supported types should be composed with
+ // NCCL_PTR_HOST and NCCL_PTR_CUDA.
+ ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
+ // Create a receiving object and provide a handle to connect to it. The
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+ // between ranks to create a connection.
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+ // Connect to a handle and return a sending comm object for that peer.
+ ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect()
+ ncclResult_t (*accept)(void* listenComm, void** recvComm);
+ // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+ ncclResult_t (*deregMr)(void* comm, void* mhandle);
+ // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ // May return request == NULL if the call cannot be performed (or would block)
+ ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
+ // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ // May return request == NULL if the call cannot be performed (or would block)
+ ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+ // visible to the GPU
+ ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
+ // Test whether a request is complete. If size is not NULL, it returns the
+ // number of bytes sent/received.
+ ncclResult_t (*test)(void* request, int* done, int* size);
+ // Close and free send/recv comm objects
+ ncclResult_t (*closeSend)(void* sendComm);
+ ncclResult_t (*closeRecv)(void* recvComm);
+ ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v2_t;
+
+typedef ncclNet_v2_t ncclNet_t;
-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2
#endif // end include guard
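
A skeletal external plugin against the new v2 interface, to show what libnccl-net.so now has to export; every callback here is a stub and the whole thing is a sketch, not code from this tree:

    #include "nccl.h"
    #include "nccl_net.h"

    static ncclResult_t stubInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
    static ncclResult_t stubDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
    static ncclResult_t stubPciPath(int dev, char** path) { return ncclInternalError; }
    static ncclResult_t stubPtrSupport(int dev, int* types) { return ncclInternalError; }
    static ncclResult_t stubListen(int dev, void* handle, void** listenComm) { return ncclInternalError; }
    static ncclResult_t stubConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; }
    static ncclResult_t stubAccept(void* listenComm, void** recvComm) { return ncclInternalError; }
    static ncclResult_t stubRegMr(void* comm, void* data, int size, int type, void** mhandle) { return ncclInternalError; }
    static ncclResult_t stubDeregMr(void* comm, void* mhandle) { return ncclInternalError; }
    static ncclResult_t stubIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { return ncclInternalError; }
    static ncclResult_t stubIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { return ncclInternalError; }
    static ncclResult_t stubFlush(void* recvComm, void* data, int size, void* mhandle) { return ncclInternalError; }
    static ncclResult_t stubTest(void* request, int* done, int* size) { return ncclInternalError; }
    static ncclResult_t stubClose(void* comm) { return ncclInternalError; }

    // NCCL_PLUGIN_SYMBOL expands to ncclNetPlugin_v2; NCCL looks it up with dlsym().
    extern "C" ncclNet_v2_t NCCL_PLUGIN_SYMBOL = {
      "Stub", stubInit, stubDevices, stubPciPath, stubPtrSupport,
      stubListen, stubConnect, stubAccept, stubRegMr, stubDeregMr,
      stubIsend, stubIrecv, stubFlush, stubTest,
      stubClose, stubClose, stubClose
    };
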
diff --git a/src/include/net.h b/src/include/net.h
index ebc9677..e75e6bb 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -26,9 +26,11 @@ static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
-static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; }
-static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; }
+static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
+static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
+static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
+static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
+static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
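
The net wrappers now split memory registration out of the send/recv calls: a buffer is registered once, its mhandle rides along on every isend/irecv, and it is deregistered when the buffer is retired. A hedged sketch of that sequence on the send side (buffer and size are illustrative):

    static ncclResult_t sendOnce(void* sendComm, void* buf, int size) {
      void* mhandle;
      NCCLCHECK(ncclNetRegMr(sendComm, buf, size, NCCL_PTR_HOST, &mhandle));
      void* request;
      // A real caller also handles request == NULL, meaning "retry later".
      NCCLCHECK(ncclNetIsend(sendComm, buf, size, mhandle, &request));
      int done = 0, sent = 0;
      while (!done) NCCLCHECK(ncclNetTest(request, &done, &sent));
      NCCLCHECK(ncclNetDeregMr(sendComm, mhandle));
      return ncclSuccess;
    }
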
diff --git a/src/include/nvlink.h b/src/include/nvlink.h
index 7eb74c9..1baf9e5 100644
--- a/src/include/nvlink.h
+++ b/src/include/nvlink.h
@@ -67,18 +67,15 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
if (res != ncclSuccess) return 0;
for(int l=0; l<maxNvLinks; ++l) {
- // nvmlDeviceGetNvLinkCapability(NVML_NVLINK_CAP_P2P_SUPPORTED) would seem to
- // report whether the NVLink connects to a peer GPU (versus a POWER CPU?). I
- // don't know whether nvmlDeviceGetNvLinkRemotePciInfo() would succeed in
- // the POWER CPU case, so it seems best to check this as well.
+ // Check whether we can use this NVLink for P2P
unsigned canP2P;
if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
- // nvmlDeviceGetNvLinkRemotePciInfo() will return NVML_ERROR_NOT_SUPPORTED
- // if the links don't exist, or are disabled. So checking for that return
- // here would probably make the nvmlDeviceGetNvLinkCapability check above
- // redundant. Presumably, we still need to check the P2P capability above,
- // since even non-GPUs would possess PCI info.
+ // Make sure the Nvlink is up. The previous call should have trained the link.
+ nvmlEnableState_t isActive;
+ if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+
+ // Try to figure out what's on the other side of the NVLink
nvmlPciInfo_t remoteProc;
if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
@@ -89,7 +86,7 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
p[c] = toupper(p[c]);
}
- if (strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
+ if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) {
links++;
} else {
// Make a lower case copy of the bus ID for calling ncclDeviceType
@@ -101,11 +98,21 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
lowerId[c] = tolower(p[c]);
}
- // Determine if the remote side is NVswitch
+ // Determine if the remote side is NVswitch or a GPU
enum ncclNvLinkDeviceType type;
- if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
- //TODO: we are making an assumption that all GPUs are connected to this switch
- //This assumption may change for future architectures
+ ncclResult_t ret = ncclDeviceType(lowerId, &type);
+ if (ret == ncclSuccess) {
+ if (type == ncclNvLinkDeviceSwitch) {
+ //TODO: we are making an assumption that all GPUs are connected to this switch
+ //This assumption may change for future architectures
+ nvswitch_links++;
+ } else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) {
+ links++;
+ }
+ } else {
+ // The NVLink is up but we couldn't find the PCI device on the other
+ // side. Assume it's an NVswitch outside a VM.
+ if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch");
nvswitch_links++;
}
}
@@ -113,43 +120,4 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) {
return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links;
}
-static int getNumNvlinks(const char* busId) {
- nvmlDevice_t nvmlDev;
- ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev);
- if (res != ncclSuccess) return 0;
-
- int nvlinks = 0, nvswitch_links = 0;
- int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4;
- for(int l=0; l<maxNvLinks; ++l) {
- unsigned canP2P;
- nvmlEnableState_t isActive;
- if (wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) == ncclSuccess && canP2P &&
- wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) == ncclSuccess && isActive == NVML_FEATURE_ENABLED) {
- nvlinks++;
- } else {
- continue;
- }
-
- nvmlPciInfo_t remoteProc;
- if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
-
- // Make a lower case copy of the bus ID for calling ncclDeviceType
- // PCI system path is in lower case
- char* p = remoteProc.busId;
- char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
- if (p[c] == 0) break;
- lowerId[c] = tolower(p[c]);
- }
-
- // Determine if the remote side is NVswitch
- enum ncclNvLinkDeviceType type;
- if (ncclDeviceType(lowerId, &type) == ncclSuccess && type == ncclNvLinkDeviceSwitch) {
- //TODO: we are making an assumption that all GPUs are connected to this switch
- //This assumption may change for future architectures
- nvswitch_links++;
- }
- }
- return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*nvlinks;
-}
#endif
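
After this change getNvlinkGpu() has two query modes, which is what lets the deleted getNumNvlinks() go away: pass a peer bus ID to count NVLinks to that specific GPU, or pass NULL to count NVLinks to any GPU or NVswitch. A tiny usage sketch (the bus ID argument is illustrative):

    static int hasAnyNvlink(const char* myBusId) {
      // NULL peer: count NVLinks to any GPU or NVswitch; this is the mode the new
      // weak ncclNvlinkGpu() helper in init.cu relies on.
      return getNvlinkGpu(myBusId, NULL) > 0;
    }
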
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
index ddfd233..0b6198a 100644
--- a/src/include/nvmlwrap.h
+++ b/src/include/nvmlwrap.h
@@ -7,7 +7,7 @@
#ifndef NCCL_NVMLWRAP_H_
#define NCCL_NVMLWRAP_H_
-#include "core.h"
+#include "nccl.h"
//#define NVML_DIRECT 1
#ifdef NVML_DIRECT
@@ -32,14 +32,6 @@ static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index)
NVMLCHECK(nvmlDeviceGetIndex(device, index));
return ncclSuccess;
}
-static ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
- NVMLCHECK(nvmlDeviceSetCpuAffinity(device));
- return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
- NVMLCHECK(nvmlDeviceClearCpuAffinity(device));
- return ncclSuccess;
-}
static ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
NVMLCHECK(nvmlDeviceGetHandleByIndex(index,device));
return ncclSuccess;
@@ -61,6 +53,10 @@ static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsig
NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
return ncclSuccess;
}
+static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
+ NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber));
+ return ncclSuccess;
+}
#else
// Dynamically handle dependencies on NVML
@@ -136,14 +132,14 @@ ncclResult_t wrapNvmlInit(void);
ncclResult_t wrapNvmlShutdown(void);
ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device);
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device);
ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci);
ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
+ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber);
+
#endif // NVML_DIRECT
#endif // End include guard
diff --git a/src/include/ring.h b/src/include/ring.h
deleted file mode 100644
index fa5e099..0000000
--- a/src/include/ring.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#ifndef NCCL_RING_H_
-#define NCCL_RING_H_
-#include "core.h"
-
-ncclResult_t initRing(struct ncclComm* comm, int ringid);
-ncclResult_t freeRing(struct ncclRing* ring);
-
-#endif
diff --git a/src/include/rings.h b/src/include/rings.h
index 751846c..43fc595 100644
--- a/src/include/rings.h
+++ b/src/include/rings.h
@@ -12,6 +12,6 @@ static int getDefaultThreads() {
return ncclCudaCompCap() == 3 ? 128 : 256;
}
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next);
+ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut);
#endif
diff --git a/src/include/socket.h b/src/include/socket.h
index 624af40..fb5cfc0 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -60,7 +60,9 @@ static inline int envSocketFamily(void) {
}
static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
+#ifdef ENABLE_TRACE
char line[1024];
+#endif
struct netIf userIfs[MAX_IFS];
bool searchNot = prefixList && prefixList[0] == '^';
int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
@@ -106,7 +108,6 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre
// Store the IP address
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
memcpy(addrs+found, interface->ifa_addr, salen);
- INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line));
found++;
}
}
@@ -336,8 +337,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr)
TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line));
#endif
- /* Put the socket in listen mode */
- SYSCHECK(listen(sockfd, 128), "listen");
+ /* Put the socket in listen mode
+ * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
+ */
+ SYSCHECK(listen(sockfd, 16384), "listen");
*fd = sockfd;
return ncclSuccess;
}
diff --git a/src/include/transport.h b/src/include/transport.h
index 59f83c9..6231a71 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,6 +9,7 @@
#include "nccl.h"
#include <stdint.h>
+#include "nvmlwrap.h"
#define NTRANSPORTS 3
@@ -19,11 +20,13 @@ struct ncclRing;
struct ncclConnector;
struct ncclComm;
-#define RANK_INFO_SIZE 64
-typedef char ncclTinfo_t[RANK_INFO_SIZE];
-
-struct ncclInfo {
- ncclTinfo_t tinfo[NTRANSPORTS];
+struct ncclPeerInfo {
+ int rank;
+ int cudaDev;
+ int nvmlDev;
+ uint64_t hostHash;
+ uint64_t pidHash;
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
};
// Used to hold the transport connection values
@@ -34,18 +37,47 @@ struct ncclConnect {
char data[CONNECT_SIZE];
};
+enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone };
+
+struct ncclProxyArgs;
+typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
+
struct ncclProxyArgs {
- struct ncclRing* ring;
- int substeps;
+ proxyProgressFunc_t progress;
+ struct ncclChannel* channel;
+ struct ncclConnector* connector;
+ int sliceSteps;
+ int chunkSteps;
int nsteps;
uint64_t opCount;
int llMode;
- bool needProxy;
- int active; // add component before this line -- it is left out during initialization
+ int state; // add component before this line -- it is left out during initialization
+
+ // Internal state
+ uint64_t head;
+ uint64_t tail;
+ uint64_t end;
+ void* requests[NCCL_STEPS];
+ int idle;
+
+ // Element linking
+ pthread_mutex_t mutex;
+ struct ncclProxyArgs* next;
+ struct ncclProxyArgs* nextPeer;
+};
+
+struct ncclProxyPool;
+struct ncclProxyState {
+ pthread_cond_t cond;
+ pthread_mutex_t mutex;
+ bool stop;
+ struct ncclProxyArgs* ops;
+ struct ncclProxyArgs* pool;
+ struct ncclProxyPool* pools;
};
struct ncclTransportComm {
- ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*);
+ ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
@@ -53,8 +85,7 @@ struct ncclTransportComm {
struct ncclTransport {
const char name[4];
- ncclResult_t (*fillInfo)(ncclTinfo_t*, int);
- ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*);
+ ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*);
ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
struct ncclTransportComm send;
struct ncclTransportComm recv;
@@ -64,37 +95,17 @@ struct ncclTransport {
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
-#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS
-
-struct transportProxyInfo {
- struct ncclComm* comm;
- pthread_t thread;
- threadFunc_t func;
- volatile int proxyReady;
- struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE];
- volatile uint64_t argsFifoHead;
- volatile uint64_t argsFifoTail;
- pthread_cond_t cond;
- pthread_mutex_t mutex;
-};
-
-ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm);
-ncclResult_t transportDestroyProxy(struct ncclConnector* connector);
-
enum proxyMode {
proxyRing = 0,
proxyFrom = 1,
proxyTo = 2
};
-static int proxyPatternRing = proxyRing;
-static inline int proxyPatternFrom(int root) { return 1+root; }
-static inline int proxyPatternTo(int root) { return -1-root; }
-static inline enum proxyMode proxyPatternMode(int pattern) { return (pattern == 0) ? proxyRing : ((pattern > 0) ? proxyFrom : proxyTo); }
-static inline int proxyPatternRoot(int pattern) { return (pattern > 0) ? pattern-1 : -pattern-1; }
-
-ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm);
-ncclResult_t transportStartProxies(struct ncclComm* comm);
+ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr);
+ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks);
+ncclResult_t transportStartProxy(struct ncclComm* comm);
+ncclResult_t transportCreateProxy(struct ncclComm* comm);
+ncclResult_t transportDestroyProxy(struct ncclComm* comm);
#include <unistd.h>
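
A rough idea of how a single proxy thread is meant to walk the ops list above, calling each op's transport-specific progress function until there is nothing left to do; this is only a sketch of the structure, the real loop (including retiring finished ops back to the pool) lives in src/transport.cu:

    #include <pthread.h>
    #include <sched.h>
    #include "transport.h"   // ncclProxyState / ncclProxyArgs declared above

    static void proxyProgressSketch(struct ncclProxyState* state) {
      while (!state->stop) {
        pthread_mutex_lock(&state->mutex);
        struct ncclProxyArgs* op = state->ops;   // head of the active ops list
        pthread_mutex_unlock(&state->mutex);
        int idle = 1;
        while (op) {
          if (op->state != ncclProxyOpNone) {
            op->progress(op);                    // transport-specific step; return code ignored in this sketch
            idle &= op->idle;
          }
          op = op->next;
        }
        if (idle) sched_yield();                 // nothing made progress this pass
      }
    }
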
diff --git a/src/include/trees.h b/src/include/trees.h
new file mode 100644
index 0000000..1a151d1
--- /dev/null
+++ b/src/include/trees.h
@@ -0,0 +1,13 @@
+/*************************************************************************
+ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TREES_H_
+#define NCCL_TREES_H_
+
+ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0);
+ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1);
+
+#endif
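
The double binary tree interface can be exercised directly to see the two complementary trees each rank belongs to; a small standalone driver, assuming nccl.h provides ncclResult_t and src/misc/trees.cu is linked in:

    #include <stdio.h>
    #include "nccl.h"
    #include "trees.h"

    int main() {
      const int nranks = 8;
      for (int rank = 0; rank < nranks; rank++) {
        int u0, d0_0, d0_1, u1, d1_0, d1_1;
        ncclGetDtree(nranks, rank, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1);
        printf("rank %d  tree0: up %2d down %2d %2d | tree1: up %2d down %2d %2d\n",
               rank, u0, d0_0, d0_1, u1, d1_0, d1_1);
      }
      return 0;
    }
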
diff --git a/src/init.cu b/src/init.cu
index 9d0188e..75822e6 100644
--- a/src/init.cu
+++ b/src/init.cu
@@ -1,21 +1,26 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nccl.h"
#include "core.h"
-#include "ring.h"
+#include "channel.h"
#include "param.h"
#include "nvmlwrap.h"
#include "rings.h"
+#include "trees.h"
#include "bootstrap.h"
#include "transport.h"
-#include "common_coll.h"
#include "group.h"
#include "utils.h"
#include "net.h"
+#include "checks.h"
+#include "enqueue.h"
+#include "topo.h"
+#include "nvlink.h"
+#include "cpuset.h"
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
@@ -55,6 +60,16 @@ NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
ncclNet_t* ncclNet = NULL;
// We define this as weak to let tests redefine their own
+#pragma weak ncclNvlinkGpu
+ncclResult_t ncclNvlinkGpu(int* nvlink) {
+ int cudaDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+ *nvlink = getNvlinkGpu(busId, NULL);
+ return ncclSuccess;
+}
+// We define this as weak to let tests redefine their own
#pragma weak ncclCudaCompCap
int ncclCudaCompCap() {
int cudaDev;
@@ -77,10 +92,7 @@ ncclResult_t initNet(ncclNet_t* net) {
int ndev;
if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
if (net->devices(&ndev) != ncclSuccess) return ncclInternalError;
- if (ndev <= 0) {
- INFO(NCCL_INIT|NCCL_NET, "Net/%s: call to devices() returned 0 devices.", net->name);
- return ncclSystemError;
- }
+ if (ndev <= 0) return ncclSystemError;
return ncclSuccess;
}
@@ -91,15 +103,15 @@ ncclResult_t initNetPlugin(ncclNet_t** net) {
// string, so checking errno doesn't hurt to try to provide a better
// error message
if (errno == ENOENT) {
- INFO(NCCL_INIT|NCCL_NET, "No network plugin found.");
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so).");
} else {
- INFO(NCCL_INIT|NCCL_NET, "Unable to load libnccl-net.so : %s", dlerror());
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
}
return ncclSuccess;
}
ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
if (extNet == NULL) {
- INFO(NCCL_INIT|NCCL_NET, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol");
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
goto cleanup;
}
if (initNet(extNet) == ncclSuccess) {
@@ -116,21 +128,18 @@ ncclResult_t initNet() {
NCCLCHECK(initNet(&ncclNetSocket));
NCCLCHECK(initNetPlugin(&ncclNet));
- if (ncclNet != NULL) {
- INFO(NCCL_INIT|NCCL_NET, "Using network plugin %s", ncclNetName());
- return ncclSuccess;
- }
+ if (ncclNet != NULL) return ncclSuccess;
if (initNet(&ncclNetIb) == ncclSuccess) {
ncclNet = &ncclNetIb;
} else {
ncclNet = &ncclNetSocket;
}
- INFO(NCCL_INIT|NCCL_NET,"Using network %s", ncclNetName());
return ncclSuccess;
}
NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2);
NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2);
+NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", -2);
int ncclThreadThreshold(int minCompCap, int multiNode) {
int threshold = ncclParamThreadThreshold();
@@ -177,10 +186,15 @@ static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
+ free(comm->peerInfo);
+
+ if (comm->bootstrap)
+ NCCLCHECK(bootstrapClose(comm->bootstrap));
+
CUDACHECK(cudaFree(comm->devComm));
- for (int ring=0; ring<comm->nRings; ring++)
- NCCLCHECK(freeRing(comm->rings+ring));
+ for (int channel=0; channel<comm->nChannels; channel++)
+ NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks));
if (comm->doneEvent != NULL)
CUDACHECK(cudaEventDestroy(comm->doneEvent));
@@ -199,6 +213,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
free(comm->intraCGMode);
free(comm->intraCC);
}
+ CUDACHECK(cudaFreeHost((void *)comm->abortFlag));
+ CUDACHECK(cudaFreeHost((void *)comm->fatalDevError));
free(comm);
return ncclSuccess;
@@ -222,12 +238,15 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
struct ncclComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
- INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev);
comm->rank = rank;
comm->nRanks = ndev;
cudaGetDevice(&comm->cudaDev);
+ getNvmlDevice(comm->cudaDev, &comm->nvmlDev);
+ INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev);
+
comm->doneEvent = doneEvent;
comm->llThreshold = ncclParamLlThreshold();
+ comm->treeThreshold = ncclParamTreeThreshold();
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
#if CUDART_VERSION >= 9200
comm->groupCudaStream = ncclParamGroupCudaStream();
@@ -235,6 +254,13 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
// Don't allow the user to overload the default setting in older CUDA builds
comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM;
#endif
+ comm->fatalError = ncclSuccess;
+
+ CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped));
+ *comm->fatalDevError = ncclDevSuccess;
+
+ CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped));
+ *comm->abortFlag = 0;
comm->argsptr = &comm->args;
@@ -248,9 +274,18 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
// Copy the comm on the device
NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1));
// Copy userRanks
- for (int r=0; r<comm->nRings; r++) {
- NCCLCHECK(ncclCudaMemcpy(comm->rings[r].devUserRanks, comm->rings[r].userRanks, comm->nRanks));
+ for (int r=0; r<comm->nChannels; r++) {
+ NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
+ NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
}
+ // Copy the device-accessible pointer to comm->abortFlag
+ void *devAbortFlag;
+ CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0));
+ CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice));
+ // Copy the device-accessible pointer to comm->fatalDevError
+ void *devFatalError;
+ CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0));
+ CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice));
return ncclSuccess;
}
@@ -267,35 +302,81 @@ static void showVersion() {
}
}
-static ncclResult_t fillInfo(struct ncclInfo* info, int rank) {
- for (int t=0; t<NTRANSPORTS; t++) {
- NCCLCHECK(ncclTransports[t].fillInfo(info->tinfo+t, rank));
- }
+static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) {
+ info->rank = rank;
+ CUDACHECK(cudaGetDevice(&info->cudaDev));
+ NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev))
+ info->hostHash=getHostHash();
+ info->pidHash=getPidHash();
+
+ // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
+ // cudaDev is a CUDA runtime dev number which could be different from the
+ // NVML device number. Then we get the busID from NVML to be sure it is
+ // consistent with NVML remote PCI bus Ids.
+ CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
+ nvmlDevice_t nvmlDevice;
+ NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
+ nvmlPciInfo_t pciInfo;
+ NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
+ strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
return ncclSuccess;
}
template <int type>
-static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) {
+static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) {
for (int t=0; t<NTRANSPORTS; t++) {
struct ncclTransport *transport = ncclTransports+t;
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
ncclTvalue_t ret = 0;
- NCCLCHECK(transport->canConnect(&ret, myInfo->tinfo+t, peerInfo->tinfo+t));
+ NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo));
if (ret > 0) {
- NCCLCHECK(transportComm->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring));
- *transportRet = transport;
+ connector->transportComm = transportComm;
+ NCCLCHECK(transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId));
return ncclSuccess;
}
}
WARN("No transport found !");
- *transportRet = NULL;
return ncclInternalError;
}
-static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) {
- NCCLCHECK(initRing(comm, ringid));
+static int log2(int n) {
+ int l = 0;
+ while (n>>=1) l++;
+ return l;
+}
+
+static ncclResult_t ncclTreeThreshold(int nnodes, int nranks, int nChannels, ssize_t *treeThreshold) {
+ int nvlink;
+ NCCLCHECK(ncclNvlinkGpu(&nvlink));
+ float ringbw = nvlink ? 5000*nChannels : 5000; // approx, in MB/s or B/us
+ float ringlatinter = 6;
+ float treelatintra = 4;
+ float treelatinter = 15;
+ float treebw;
+ if (!nvlink) {
+ treebw = ringbw * 2 / 3;
+ } else {
+ treebw = ringbw * 3 / 4;
+ if (nnodes == 2) treebw *= 2;
+ }
+ float ringlat = ringlatinter*(nranks-1);
+ float treelat = treelatinter*log2(nnodes)+treelatintra*(nranks/nnodes-1);
+ if (nnodes < 2 || ringlat <= treelat)
+ *treeThreshold = 0;
+ else if (treebw > ringbw)
+ *treeThreshold = 0x7fffffffffffffff;
+ else
+ *treeThreshold = (ssize_t)(((ringbw*treebw/(ringbw-treebw)))*(ringlat-treelat));
+ return ncclSuccess;
+}
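
For intuition, plugging illustrative numbers into the formula above (NVLink machine, 4 nodes of 8 GPUs, 12 channels; only the constants come from this code):

    //   ringbw  = 5000 * 12                        = 60000 B/us
    //   treebw  = 60000 * 3/4                      = 45000 B/us   (nnodes != 2)
    //   ringlat = 6 * (32 - 1)                     = 186 us
    //   treelat = 15 * log2(4) + 4 * (32/4 - 1)    = 30 + 28 = 58 us
    //   treeThreshold = 60000*45000/(60000-45000) * (186 - 58)
    //                 = 180000 * 128               = 23040000 B  (~22 MiB)
    // so on this hypothetical topology, sizes below roughly 22 MiB would favor the tree.
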
+
+static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks, int* treeMasters) {
+ TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
+ NCCLCHECK(initChannel(comm, channelId));
+
+ struct ncclChannel* channel = comm->channels+channelId;
+ struct ncclRing* ring = &channel->ring;
- struct ncclRing* ring = comm->rings+ringid;
// Reorganize ranks to start with rank.
int shift;
for (shift = 0; shift<nranks; shift++) {
@@ -306,21 +387,85 @@ static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int n
for (int i=0; i<nranks; i++) {
ring->userRanks[i] = ringRanks[(i+shift)%nranks];
}
- int prev = ring->userRanks[nranks-1];
- int next = ring->userRanks[1];
+ int prev = ring->prev = ring->userRanks[nranks-1];
+ int next = ring->next = ring->userRanks[1];
+
+ struct ncclTree* tree = &channel->tree;
+ tree->up = -1;
+ tree->down[0] = tree->down[1] = tree->down[2] = -1;
+
+ //
+ // Find per-node masters and connect them via a binary tree
+ //
+
+ int nMasters = 0;
+ for (int r=0; r<nranks; r++) nMasters += treeMasters[r];
+ if (nMasters == 0) {
+ nMasters = 1;
+ treeMasters[0] = 1;
+ }
+
+ if (comm->treeThreshold == -2)
+ NCCLCHECK(ncclTreeThreshold(nMasters, comm->nRanks, comm->nChannels, &comm->treeThreshold));
+
+ if (comm->treeThreshold > 0) {
+ // Compute tree depth. Not an exact value but a good approximation in most
+ // cases and consistent across nodes
+ tree->depth = nranks/nMasters + log2(nMasters);
+
+ // Find my master : go backwards in the ring to find my root
+ int master = 0;
+ for (int i = 0; i<nranks; i++) {
+ int r = ring->userRanks[(nranks-i)%nranks];
+ if (treeMasters[r]) {
+ master = r;
+ break;
+ }
+ }
- NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+0, &ring->recv.transport, ring));
- NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+1, &ring->send.transport, ring));
- NCCLCHECK(transportCreateProxy(0, ring, comm));
- NCCLCHECK(transportCreateProxy(1, ring, comm));
+ int ranks[nMasters];
+ int i = 0, masterIndex = -1;
+ // Build binary tree
+ for (int r=0; r<nranks; r++) {
+ // Create index table
+ if (r == master) masterIndex = i;
+ if (treeMasters[r]) ranks[i++] = r;
+ }
+ int btreeUp, btreeDown0, btreeDown1;
+ int u0, d0_0, d0_1, u1, d1_0, d1_1;
+ NCCLCHECK(ncclGetDtree(nMasters, masterIndex, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
+ if (channelId < DIVUP(comm->nChannels, 2)) {
+ btreeUp = u0; btreeDown0 = d0_0; btreeDown1 = d0_1;
+ } else {
+ btreeUp = u1; btreeDown0 = d1_0; btreeDown1 = d1_1;
+ }
+
+ //
+ // Now build the full tree, combining the intra-node ring and the
+ // inter-node binary tree.
+ //
+
+ if (rank == master) {
+ int nDown = 0;
+ if (btreeUp != -1) tree->up = ranks[btreeUp];
+ if (treeMasters[next] == 0) tree->down[nDown++] = next;
+ if (btreeDown0 != -1) tree->down[nDown++] = ranks[btreeDown0];
+ if (btreeDown1 != -1) tree->down[nDown++] = ranks[btreeDown1];
+ } else {
+ tree->up = prev;
+ if (treeMasters[next] == 0) tree->down[0] = next;
+ }
+ }
+
+ TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
return ncclSuccess;
}
-static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
+static ncclResult_t fillConnect(struct ncclPeerInfo* peerInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) {
for (int r=0; r<nranks; r++) {
connectTransport[r] = -1;
for (int t=0; t<NTRANSPORTS; t++) {
- NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, allInfo[rank].tinfo+t, allInfo[r].tinfo+t));
+ NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, peerInfo+rank, peerInfo+r));
if (connectValue[r] > 0) {
connectTransport[r] = t;
break;
@@ -330,11 +475,6 @@ static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank,
return ncclSuccess;
}
-static void swap(void* mem1, void* mem2, int size) {
- char tmp[size];
- memcpy(tmp, mem1, size); memcpy(mem1, mem2, size); memcpy(mem2, tmp, size);
-}
-
#define MAXWIDTH 20
#define PREFIXLEN 15
#define STRLENGTH (PREFIXLEN+5*MAXWIDTH)
@@ -380,9 +520,9 @@ void dumpLine(int* values, int nranks, const char* prefix) {
static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
for (int r=0; r<nrings; r++) {
char prefix[30];
- /*sprintf(prefix, "[%d] Ring %d Prev : ", rank, r);
+ /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
dumpLine(prev+r*nranks, nranks, prefix);
- sprintf(prefix, "[%d] Ring %d Next : ", rank, r);
+ sprintf(prefix, "[%d] Channel %d Next : ", rank, r);
dumpLine(next+r*nranks, nranks, prefix);*/
int current = rank;
@@ -390,7 +530,7 @@ static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int
rings[r*nranks+i] = current;
current = next[r*nranks+current];
}
- sprintf(prefix, "Ring %02d : ", r);
+ sprintf(prefix, "Channel %02d : ", r);
if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
if (current != rank) {
WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
@@ -488,140 +628,274 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct
return ncclSuccess;
}
+static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) {
+ TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv);
+ uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */
+ struct ncclConnect connect;
+ struct ncclConnector* conn;
+ for (int i=0; i<nrecv; i++) {
+ int peer = peerRecv[i];
+ if (peer == -1) continue;
+ conn = &channel->peers[peer].recv;
+ if (conn->connected) { ++nSkippedRecv; continue; }
+ NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
+ NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ }
+ for (int i=0; i<nsend; i++) {
+ int peer = peerSend[i];
+ if (peer == -1) continue;
+ conn = &channel->peers[peer].send;
+ if (conn->connected) { ++nSkippedSend; continue; }
+ NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id));
+ NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ }
+ for (int i=0; i<nsend; i++) {
+ int peer = peerSend[i];
+ if (peer == -1) continue;
+ conn = &channel->peers[peer].send;
+ if (conn->connected) {++nSkippedSend; continue; }
+ NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ NCCLCHECK(conn->transportComm->connect(&connect, conn));
+ conn->connected = 1;
+ }
+ for (int i=0; i<nrecv; i++) {
+ int peer = peerRecv[i];
+ if (peer == -1) continue;
+ conn = &channel->peers[peer].recv;
+ if (conn->connected) {++nSkippedRecv; continue; }
+ NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
+ NCCLCHECK(conn->transportComm->connect(&connect, conn));
+ conn->connected = 1;
+ }
+ TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
+ return ncclSuccess;
+}
+
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
+ // We use 3 AllGathers
+ // 1. { peerInfo, comm }
+ // 2. ConnectTransport[nranks], ConnectValue[nranks]
+ // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
+
int rank = comm->rank;
int nranks = comm->nRanks;
- void* commState;
- NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState));
+ TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
+ NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
- struct ncclInfo* allInfo;
- NCCLCHECK(ncclCalloc(&allInfo, nranks));
- NCCLCHECK(fillInfo(allInfo+rank, rank));
- NCCLCHECK(bootstrapAllGather(commState, allInfo, sizeof(struct ncclInfo)));
+ // AllGather1 - begin
+ struct {
+ struct ncclPeerInfo peerInfo;
+ struct ncclComm* comm;
+ } *allGather1Data;
+
+ NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
+ allGather1Data[rank].comm = comm;
+ NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
+ NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
+
+ NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
+ for (int i = 0; i < nranks; i++) {
+ memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
+ }
+ // AllGather1 data is used again below
+ // AllGather1 - end
+
+ // AllGather2 - begin
+ size_t allGather2DataRowSize = sizeof(int)*nranks + sizeof(ncclTvalue_t)*nranks;
+ void *allGather2Data;
+ NCCLCHECK(ncclCalloc((char **)&allGather2Data, allGather2DataRowSize*nranks));
+ int *myTransportRow = (int *)((char *)allGather2Data + allGather2DataRowSize*rank);
+ ncclTvalue_t *myValueRow = (ncclTvalue_t *)(myTransportRow + nranks);
+
+ NCCLCHECK(fillConnect(comm->peerInfo, nranks, rank, myTransportRow, myValueRow));
+ NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather2Data, allGather2DataRowSize));
int* connectTransport;
ncclTvalue_t* connectValue;
NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks));
NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks));
+ for (int i = 0; i < nranks; i++) {
+ memcpy(connectTransport + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize, sizeof(int)*nranks);
+ memcpy(connectValue + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize + nranks*sizeof(int), sizeof(ncclTvalue_t)*nranks);
+ }
+ free(allGather2Data);
+ // AllGather2 - end
- NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
- NCCLCHECK(bootstrapAllGather(commState, connectTransport, nranks*(sizeof(int))));
- NCCLCHECK(bootstrapAllGather(commState, connectValue, nranks*(sizeof(ncclTvalue_t))));
//if (rank == 0) dumpMatrix(connectTransport, nranks);
//if (rank == 0) dumpMatrixTvalue(connectValue, nranks);
// Get my rings
int nrings;
- int* prev, *next;
- NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
- NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
+ int* prev, *next, *treeIn, *treeOut;
+ NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
comm->nThreads = getDefaultThreads();
- NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next));
+ NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
+ TRACE(NCCL_INIT, "rank %d nranks %d - BUILD %d RINGS", rank, nranks, nrings);
+ assert(nrings <= MAXCHANNELS);
free(connectTransport);
free(connectValue);
+ // AllGather3 - begin
+ struct {
+ int nThreads;
+ int nrings;
+ int cudaCompCap;
+ int prev[MAXCHANNELS];
+ int next[MAXCHANNELS];
+ } *allGather3Data;
+
+ NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
+ allGather3Data[rank].nThreads = comm->nThreads;
+ allGather3Data[rank].nrings = nrings;
+ allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
+ for (int r=0; r<nrings; r++) {
+ allGather3Data[rank].prev[r] = *(prev+r*nranks+rank);
+ allGather3Data[rank].next[r] = *(next+r*nranks+rank);
+ }
+ NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
+
// Find max nThreads
- int allData[nranks];
- allData[rank] = comm->nThreads;
- NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++)
- comm->nThreads = std::max(allData[i], comm->nThreads);
- if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads);
+ comm->nThreads = std::max(allGather3Data[i].nThreads, comm->nThreads);
// Determine the minimum CUDA Compute capability of all GPUs
- int myCompCap = ncclCudaCompCap();
+ int myCompCap = allGather3Data[rank].cudaCompCap;
int minCompCap = myCompCap;
- allData[rank] = myCompCap;
- NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
- for (int i=0; i<nranks; i++)
- minCompCap = std::min(allData[i], minCompCap);
- if (rank == 0) INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
+ for (int i = 0; i < nranks; i++)
+ minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap);
+
+ // Determine thread threshold across all GPUs
+ int nnodes = 0;
+ for (int r=0; r<nranks; r++) nnodes += treeIn[r];
+ comm->threadThreshold = ncclThreadThreshold(minCompCap, nnodes);
// Find min nrings across ranks
- allData[rank] = nrings;
- NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int)));
for (int i=0; i<nranks; i++)
- nrings = std::min(allData[i], nrings);
-
- // Exchange data with others to build complete rings
- comm->nRings = nrings;
- for (int r=0; r<nrings; r++) {
- NCCLCHECK(bootstrapAllGather(commState, prev+r*nranks, sizeof(int)));
- NCCLCHECK(bootstrapAllGather(commState, next+r*nranks, sizeof(int)));
+ nrings = std::min(allGather3Data[i].nrings, nrings);
+ comm->nChannels = nrings;
+
+ // Unpack the per ring prev/next arrays
+ for (int i = 0; i < nranks; i++) {
+ for (int r = 0; r < nrings; r++) {
+ prev[r*nranks+i] = allGather3Data[i].prev[r];
+ next[r*nranks+i] = allGather3Data[i].next[r];
+ }
}
+ free(allGather3Data);
+ // AllGather3 - end
+
int *rings;
- NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
+ NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next));
free(prev);
free(next);
+ TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d RINGS", rank, nranks, nrings);
// Connect with prev/next for each ring
- struct ncclConnect *connectData;
- NCCLCHECK(ncclCalloc(&connectData, 2*nranks));
+ struct ncclConnect *connect;
+ NCCLCHECK(ncclCalloc(&connect, 2));
for (int r=0; r<nrings; r++) {
- int* ringRanks = rings+r*nranks;
- struct ncclRing *ring = comm->rings+r;
- NCCLCHECK(setupRing(comm, r, rank, nranks, ringRanks, allInfo, connectData+2*rank));
- int prev_offset = ring->userRanks[nranks-1]*2+1;
- int next_offset = ring->userRanks[1]*2;
- NCCLCHECK(bootstrapAllGather(commState, connectData, sizeof(struct ncclConnect)*2));
- NCCLCHECK(ring->send.transport->send.connect(connectData+next_offset, &ring->send));
- NCCLCHECK(ring->recv.transport->recv.connect(connectData+prev_offset, &ring->recv));
- }
- free(connectData);
- free(rings);
- free(allInfo);
+ struct ncclChannel* channel = comm->channels+r;
+ NCCLCHECK(setupChannel(comm, r, rank, nranks, rings+r*nranks, treeIn+r*nranks));
+ NCCLCHECK(p2pSetup(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
+ NCCLCHECK(p2pSetup(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up));
+ NCCLCHECK(p2pSetup(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down));
+ }
+ if (comm->treeThreshold > 0) {
+ char line[1024];
+ line[0]='\0';
+ for (int c=0; c<nrings; c++) {
+ struct ncclTree* tree = &comm->channels[c].tree;
+ snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d/%d/%d",
+ c, tree->up, rank, tree->down[0], tree->down[1], tree->down[2]);
+ }
+ line[1023] = '\0';
+ INFO(NCCL_INIT, "Trees%s", line);
+ }
+ if (rank == 0) {
+ char treeline[64];
+ snprintf(treeline, 64, "enabled up to size %ld", comm->treeThreshold);
+ INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees %s", comm->nThreads, minCompCap,
+ comm->treeThreshold == 0 ? "disabled" :
+ comm->treeThreshold == 0x7fffffffffffffff ? "enabled for all sizes" :
+ treeline);
+ }
- // Intra-process barrier setup
- struct rankInfo {
- uint64_t hostHash;
- uint64_t pidHash;
- struct ncclComm* comm;
- } rankInfos[nranks];
- rankInfos[rank].hostHash = getHostHash();
- rankInfos[rank].pidHash = getPidHash();
- rankInfos[rank].comm = comm;
- NCCLCHECK(bootstrapAllGather(commState, rankInfos, sizeof(struct rankInfo)));
+ TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, nrings);
+ free(connect);
+ free(rings);
+ free(treeIn);
+ free(treeOut);
- // Compute intra ranks
+ // Compute intra ranks (using AllGather1 data)
int intraRank0 = -1, intraRank = -1, intraRanks = 0;
- int multiNode = 0;
- for (int r=0; r<nranks; r++) {
- if ((rankInfos[r].hostHash == rankInfos[rank].hostHash) &&
- (rankInfos[r].pidHash == rankInfos[rank].pidHash)) {
- if (intraRanks == 0) intraRank0 = r;
- if (r == rank) intraRank = intraRanks;
+ for (int i = 0; i < nranks; i++) {
+ if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
+ (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
+ if (intraRanks == 0) intraRank0 = i;
+ if (i == rank) intraRank = intraRanks;
intraRanks++;
- } else if (rankInfos[r].hostHash != rankInfos[rank].hostHash) {
- multiNode = 1;
}
}
TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
- rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
- if (intraRank == -1 || intraRank0 == -1 || rankInfos[intraRank0].comm == NULL) {
+ rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+ if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
- rank, rankInfos[rank].hostHash, intraRank, intraRanks, intraRank0);
+ rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
return ncclInternalError;
}
- NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, rankInfos[intraRank0].comm));
+ NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
- // Determine thread threshold across all GPUs
- comm->threadThreshold = ncclThreadThreshold(minCompCap, multiNode);
+ // Done with AllGather1 data
+ free(allGather1Data);
- // Barrier
- bootstrapClose(commState);
+ if (nnodes) NCCLCHECK(transportCreateProxy(comm));
+
+ TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
return ncclSuccess;
}
-bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) {
- char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- if (cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != cudaSuccess) return false;
- if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false;
- if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) {
- WARN("Failed to set CPU affinity");
- return false;
+static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
+ CPU_ZERO_S(sizeof(cpu_set_t), mask);
+ char* cudaPath;
+ NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+ char path[PATH_MAX];
+ strncpy(path, cudaPath, PATH_MAX-1);
+ snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus");
+ path[PATH_MAX-1] = '\0';
+ int fd;
+ SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
+ char affinityStr[sizeof(cpu_set_t)*2];
+ int r = read(fd, affinityStr, sizeof(cpu_set_t)*2);
+ if (r > 0)
+ NCCLCHECK(ncclStrToCpuset(affinityStr, mask));
+ close(fd);
+ free(cudaPath);
+ return ncclSuccess;
+}
+
+static ncclResult_t setCpuAffinity(int cudaDev) {
+ // Work within the envelope we were provided
+ cpu_set_t mask;
+ SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
+
+ // Find the subpart that is local to our GPU
+ cpu_set_t gpuMask;
+ NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
+ cpu_set_t finalMask;
+ CPU_AND(&finalMask, &mask, &gpuMask);
+
+ // If those are not disjoint, try to stay local
+ if (CPU_COUNT(&finalMask)) {
+ char affinityStr[sizeof(cpu_set_t)*2];
+ NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
+ INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr);
+ SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
}
- return true;
+ return ncclSuccess;
}
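As a rough illustration of the affinity logic above (hypothetical masks, not from the patch):

/* Suppose sched_getaffinity() reports CPUs 0-31 for the process and the GPU's
 * local_cpus sysfs entry maps to CPUs 16-31 (its NUMA node). The intersection
 * is CPUs 16-31, so the thread is pinned there. If the two masks were
 * disjoint (CPU_COUNT(&finalMask) == 0), the existing affinity is kept. */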
ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
@@ -633,9 +907,8 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
// Make sure all host memory allocation are close to the GPU
int cudaDev;
- nvmlDevice_t nvmlDevice;
CUDACHECK(cudaGetDevice(&cudaDev));
- SetCpuAffinity(cudaDev, &nvmlDevice);
+ NCCLCHECK(setCpuAffinity(cudaDev));
ncclResult_t res;
NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
@@ -645,7 +918,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
- INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks);
+ INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->nvmlDev);
return ncclSuccess;
cleanup:
@@ -664,8 +937,6 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
NCCLCHECK(ncclInit());
if (myrank == 0) showVersion();
- INFO(NCCL_INIT,"rank %d nranks %d", myrank, nranks);
-
// Make sure the CUDA runtime is initialized.
CUDACHECK(cudaFree(NULL));
@@ -685,7 +956,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
}
static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) {
- struct ncclInfo* allInfo;
+ struct ncclPeerInfo* allInfo;
NCCLCHECK(ncclCalloc(&allInfo, nranks));
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
@@ -699,12 +970,14 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
for (int rank=0; rank<nranks; rank++)
NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank));
- int* prev, *prevFinal, *next, *nextFinal;
- NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS));
- NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXRINGS));
- NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS));
- NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXRINGS));
- int nrings = MAXRINGS;
+ int* prev, *prevFinal, *next, *nextFinal, *treeIn, *treeOut;
+ NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS));
+ int nrings = MAXCHANNELS;
int nthreads=0;
int myCompCap = ncclCudaCompCap();
int minCompCap = myCompCap;
@@ -713,7 +986,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
int nringsRank;
int nthreadsRank = getDefaultThreads();
myCompCap = ncclCudaCompCap();
- NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next));
+ NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut));
nrings = std::min(nrings, nringsRank);
nthreads = std::max(nthreads, nthreadsRank);
minCompCap = std::min(minCompCap, myCompCap);
@@ -728,11 +1001,10 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
free(prev);
free(next);
- INFO(NCCL_INIT,"Using %d threads", nthreads);
- INFO(NCCL_INIT,"Min Comp Cap %d", minCompCap);
+ INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees disabled", nthreads, minCompCap);
int* rings;
- NCCLCHECK(ncclCalloc(&rings, nranks*MAXRINGS));
+ NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal));
free(prevFinal);
free(nextFinal);
@@ -741,7 +1013,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
int threadThreshold = ncclThreadThreshold(minCompCap, 0);
for (int rank=0; rank<nranks; rank++) {
- comms[rank]->nRings = nrings;
+ comms[rank]->nChannels = nrings;
comms[rank]->nThreads = nthreads;
comms[rank]->threadThreshold = threadThreshold;
}
@@ -751,26 +1023,32 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
int* ringRanks = rings+r*nranks;
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
- NCCLCHECK(setupRing(comms[rank], r, rank, nranks, ringRanks, allInfo, connect+2*rank));
- }
- // RingExchange connect information
- for (int rank=0; rank<nranks; rank++) {
- // Swap rank->prev and prevRank->next
- struct ncclRing *ring = comms[rank]->rings+r;
- int prevRank = ring->userRanks[nranks-1];
- struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1;
- struct ncclConnect* rankPrevConnect = connect+2*rank;
- swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect));
+ struct ncclChannel* channel = comms[rank]->channels+r;
+ struct ncclRing *ring = &channel->ring;
+ NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn));
+ // Make sure we don't use trees, we cannot use them with initAll
+ comms[rank]->treeThreshold = 0;
+ int prev = channel->ring.prev = ring->userRanks[nranks-1];
+ int next = channel->ring.next = ring->userRanks[1];
+ struct ncclConnector* recv = &channel->peers[prev].recv;
+ struct ncclConnector* send = &channel->peers[next].send;
+ NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+rank*2+0, recv, channel->buffSize, channel->id));
+ NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id));
}
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
- struct ncclRing *ring = comms[rank]->rings+r;
- NCCLCHECK(ring->send.transport->send.connect(connect+2*rank+1, &ring->send));
- NCCLCHECK(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv));
+ struct ncclChannel* channel = comms[rank]->channels+r;
+ struct ncclRing *ring = &channel->ring;
+ struct ncclConnector* recv = &channel->peers[ring->prev].recv;
+ struct ncclConnector* send = &channel->peers[ring->next].send;
+ NCCLCHECK(recv->transportComm->connect(connect+ring->prev*2+1, recv));
+ NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send));
}
}
- free(rings);
free(allInfo);
+ free(rings);
+ free(treeIn);
+ free(treeOut);
return ncclSuccess;
}
@@ -794,7 +1072,6 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
int savedDevice;
int rank, cudaDev;
ncclComm_t comm = NULL;
- nvmlDevice_t nvmlDevice;
int ncclDevList[ndev];
for (int i=0; i<ndev; i++) {
ncclDevList[i] = devlist ? devlist[i] : i;
@@ -812,7 +1089,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
cudaDev = ncclDevList[rank];
CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup);
- SetCpuAffinity(cudaDev, &nvmlDevice);
+ NCCLCHECK(setCpuAffinity(cudaDev));
NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup);
comms[rank] = comm;
@@ -848,27 +1125,50 @@ final:
return res;
}
-NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
-ncclResult_t ncclCommDestroy(ncclComm_t comm) {
- if (comm == NULL)
- return ncclSuccess;
+static ncclResult_t commDestroy(ncclComm_t comm) {
int savedDevice;
CUDACHECK(cudaGetDevice(&savedDevice));
int commDevice = comm->cudaDev;
+ int rank = comm->rank;
if (savedDevice != commDevice) {
CUDACHECK(cudaSetDevice(commDevice));
}
+ TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError);
+
+ CUDACHECK(cudaStreamSynchronize(comm->groupStream));
+ NCCLCHECK(transportDestroyProxy(comm));
NCCLCHECK(commFree(comm));
if (savedDevice != commDevice)
CUDACHECK(cudaSetDevice(savedDevice));
+ INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
+
return ncclSuccess;
}
+NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm);
+ncclResult_t ncclCommDestroy(ncclComm_t comm) {
+ if (comm == NULL)
+ return ncclSuccess;
+
+ return commDestroy(comm);
+}
+
+NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
+ncclResult_t ncclCommAbort(ncclComm_t comm) {
+ if (comm == NULL)
+ return ncclSuccess;
+
+ // Ask anything that might still be running on the device to quit
+ *comm->abortFlag = 1;
+
+ return commDestroy(comm);
+}
+
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
const char* ncclGetErrorString(ncclResult_t code) {
switch (code) {
@@ -882,6 +1182,39 @@ const char* ncclGetErrorString(ncclResult_t code) {
}
}
+NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) {
+ NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm"));
+ NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError"));
+
+ // Check device reported error
+ static ncclDevError_t printedDevErr = ncclDevSuccess;
+ switch(*comm->fatalDevError) {
+ case ncclDevSuccess :
+ break;
+ case ncclDevAssertedMismatch :
+ if (printedDevErr != ncclDevAssertedMismatch) {
+ WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+ printedDevErr = ncclDevAssertedMismatch;
+ }
+ if (comm->fatalError == ncclSuccess) {
+ comm->fatalError = ncclInvalidUsage;
+ }
+ break;
+ case ncclDevSuspectedMismatch :
+ if (printedDevErr != ncclDevSuspectedMismatch) {
+ WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank);
+ printedDevErr = ncclDevSuspectedMismatch;
+ }
+ break;
+ default:
+ WARN("Unknown device error %d", *comm->fatalDevError);
+ return ncclInternalError;
+ }
+ *asyncError = comm->fatalError;
+ return ncclSuccess;
+}
+
NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count);
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
NCCLCHECK(PtrCheck(comm, "CommCount", "comm"));
diff --git a/src/misc/checks.cu b/src/misc/checks.cu
new file mode 100644
index 0000000..a07e577
--- /dev/null
+++ b/src/misc/checks.cu
@@ -0,0 +1,69 @@
+/*************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "checks.h"
+
+static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) {
+ cudaPointerAttributes attr;
+ cudaError_t err = cudaPointerGetAttributes(&attr, pointer);
+ if (err != cudaSuccess || attr.devicePointer == NULL) {
+ WARN("%s : %s is not a valid pointer", opname, ptrname);
+ return ncclInvalidArgument;
+ }
+#if CUDART_VERSION >= 10000
+ if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+#else
+ if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) {
+#endif
+ WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev);
+ return ncclInvalidArgument;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) {
+ if (ptr == NULL) {
+ WARN("%s : %s argument is NULL", opname, ptrname);
+ return ncclInvalidArgument;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ArgsCheck(struct ncclInfo* info) {
+ NCCLCHECK(PtrCheck(info->comm, info->opName, "comm"));
+ // First, the easy ones
+ if (info->root < 0 || info->root >= info->comm->nRanks) {
+ WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks);
+ return ncclInvalidArgument;
+ }
+ if (info->datatype < 0 || info->datatype >= ncclNumTypes) {
+ WARN("%s : invalid type %d", info->opName, info->datatype);
+ return ncclInvalidArgument;
+ }
+ // Type is OK, compute nbytes. Convert Allgather/Broadcast calls to chars.
+ info->nBytes = info->count * ncclTypeSize(info->datatype);
+ if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) {
+ info->count = info->nBytes;
+ info->datatype = ncclInt8;
+ }
+ if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
+
+ if (info->op < 0 || info->op >= ncclNumOps) {
+ WARN("%s : invalid reduction operation %d", info->opName, info->op);
+ return ncclInvalidArgument;
+ }
+
+ if (info->comm->checkPointers) {
+ // Check CUDA device pointers
+ if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
+ NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
+ }
+ if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
+ NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
+ }
+ }
+ return ncclSuccess;
+}
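As a worked example of the normalization in ArgsCheck (hypothetical sizes, not from the patch): an ncclAllGather of 1024 floats per rank on a 4-rank communicator is rewritten as follows.

/* ncclAllGather(sendbuff, recvbuff, 1024, ncclFloat, comm, stream) on 4 ranks:
 *   nBytes  = 1024 * sizeof(float) =  4096   // bytes contributed by this rank
 *   count   = 4096, datatype = ncclInt8      // AllGather is type-agnostic
 *   nBytes *= nRanks               = 16384   // total bytes each rank receives
 */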
diff --git a/src/misc/enqueue.cu b/src/misc/enqueue.cu
deleted file mode 100644
index 80846dd..0000000
--- a/src/misc/enqueue.cu
+++ /dev/null
@@ -1,248 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "enqueue.h"
-#include "common_coll.h"
-#include "param.h"
-
-#include "collectives/collectives.h"
-
-#define NCCL_FUNC4(coll, op, dtype) \
- (void*)NCCL_KERN_NAME(coll, op, dtype), \
- (void*)NCCL_KERN_NAME(coll##LL, op, dtype)
-
-// Must be consistent with ncclDataType_t
-#define NCCL_FUNCS3A(coll, op) \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, u8), \
- (void*)NCCL_FUNC4(coll, op, i32), \
- (void*)NCCL_FUNC4(coll, op, u32), \
- (void*)NCCL_FUNC4(coll, op, i64), \
- (void*)NCCL_FUNC4(coll, op, u64), \
- (void*)NCCL_FUNC4(coll, op, f16), \
- (void*)NCCL_FUNC4(coll, op, f32), \
- (void*)NCCL_FUNC4(coll, op, f64)
-#define NCCL_FUNCS3B(coll, op) \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8), \
- (void*)NCCL_FUNC4(coll, op, i8)
-
-// Must be consistent with ncclRedOp_t
-#define NCCL_FUNCS2A(coll) \
- NCCL_FUNCS3A(coll, sum ), \
- NCCL_FUNCS3A(coll, prod), \
- NCCL_FUNCS3A(coll, max ), \
- NCCL_FUNCS3A(coll, min )
-#define NCCL_FUNCS2B(coll) \
- NCCL_FUNCS3B(coll, copy), \
- NCCL_FUNCS3B(coll, copy), \
- NCCL_FUNCS3B(coll, copy), \
- NCCL_FUNCS3B(coll, copy)
-
-// Must be consistent with the ncclFuncSet enum
-static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = {
- NCCL_FUNCS2B(ncclBroadcast),
- NCCL_FUNCS2A(ncclReduce),
- NCCL_FUNCS2B(ncclAllGather),
- NCCL_FUNCS2A(ncclReduceScatter),
- NCCL_FUNCS2A(ncclAllReduce)
-};
-
-ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) {
-#if CUDART_VERSION >= 9000
- if (cgMode & 0x01) {
- CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
- // These flags are to reduce the latency of using this API
- cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync));
- return ncclSuccess;
- }
-#endif
- int savedDev;
- CUDACHECK(cudaGetDevice(&savedDev));
- for (int i = 0; i < numDevices; i++) {
- struct cudaLaunchParams* params = paramsList+i;
- CUDACHECK(cudaSetDevice(cudaDevs[i]));
- CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
- }
- CUDACHECK(cudaSetDevice(savedDev));
- return ncclSuccess;
-}
-
-ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) {
- params->gridDim.x = std::min((int) params->gridDim.x, comm->nRings);
-
- // Set active = 2 for the last operation
- for (int r=0; r<params->gridDim.x; r++) {
- struct ncclRing* ring = comm->rings+r;
- ring->collectives[(ring->collStart+ring->collCount-1)%NCCL_MAX_OPS].active = 2;
- }
-
- // Find the first operation, choose the kernel accordingly and pass it
- // as the first argument.
- struct ncclColl* coll = comm->rings[0].collectives+comm->rings[0].collStart;
- memcpy(&comm->args, coll, sizeof(struct ncclColl));
- // As we pass that coll directly, we can free it immediately.
- coll->active = 0;
-
- params->func = ncclKerns[coll->funcIndex];
- return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) {
- volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
- int val = *ptr;
- bool done = false;
- while (done == false) {
- if (val >= comm->intraRanks) {
- WARN("Trying to launch too many collectives");
- return ncclInvalidUsage;
- }
- if (val+1 == comm->intraRanks) {
- // Reset the barrier.
- comm->intraBarrier[comm->intraPhase^1] = 0;
- *isLast = 1;
- return ncclSuccess;
- }
- done = __sync_bool_compare_and_swap(ptr, val, val+1);
- val++;
- }
- *isLast = 0;
- return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) {
- volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
- int val = *ptr;
- if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) {
- WARN("Trying to launch too many collectives");
- return ncclInternalError;
- }
- return ncclSuccess;
-}
-
-ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
- volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase);
- while (*ptr < comm->intraRanks) pthread_yield();
- comm->intraPhase ^= 1;
- return ncclSuccess;
-}
-
-ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) {
- if (comm->nRanks == 1) return ncclSuccess;
- struct cudaLaunchParams* params = comm->myParams;
-
- NCCLCHECK(setupLaunch(comm, params));
-
- // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
- if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
- // Enqueue event in user stream
- CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream));
- // Create dependency between user stream and internal NCCL stream
- CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0));
- params->stream = comm->groupStream;
- } else {
- if (comm->userStream != params->stream) {
- // Stream changed from last call, create dependency against last NCCL kernel launch
- CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
- }
- params->stream = comm->userStream;
- }
-
- int isLast = 0;
- NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
-
- if (isLast) {
- if (comm->launchMode == ncclComm::GROUP) {
- // I'm the last. Launch all operations.
- NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode));
- }
- NCCLCHECK(ncclCpuBarrierLast(comm));
- }
- return ncclSuccess;
-}
-
-ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
- if (comm->nRanks == 1) return ncclSuccess;
- // We can't print the CG mode before the first barrier happened.
- if (comm->rank == 0 && *comm->intraCGMode & 0x10) {
- *comm->intraCGMode ^= 0x10;
- INFO(NCCL_INIT,"Launch mode %s%s%s",
- comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel",
- *comm->intraCGMode ? "/CGMD" : "",
- (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
- }
-
- NCCLCHECK(ncclCpuBarrierOut(comm));
-
- struct cudaLaunchParams *params = comm->myParams;
- if (comm->launchMode == ncclComm::PARALLEL) {
- CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream));
- }
- // Start the network proxies as soon as the kernel has been launched. We can't
- // perform any CUDA call between the two or having a cudaFree between the CUDA
- // launch and the transportStartProxies call could cause a deadlock.
- // Also, starting the proxies after the CUDA launch seems to be better for
- // performance (latency).
- for (int r=0; r<params->gridDim.x; r++) {
- struct ncclRing* ring = comm->rings+r;
- ring->collStart = ring->collFifoTail;
- ring->collCount = 0;
- }
- params->gridDim.x = params->blockDim.x = 0;
- NCCLCHECK(transportStartProxies(comm));
- return ncclSuccess;
-}
-
-ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
- struct cudaLaunchParams *params = comm->myParams;
- // Enqueue event after NCCL kernel
- CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream));
- // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL
- if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) {
- // Create dependency between NCCL internal stream and user stream
- CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0));
- }
- comm->userStreamSet = false;
- return ncclSuccess;
-}
-
-ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff,
- void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root,
- ncclComm_t comm, cudaStream_t stream) {
- if (comm == NULL) return ncclInvalidArgument;
- // Launch asynchronously if needed
- if (ncclAsyncMode()) {
- ncclResult_t ret = ncclSuccess;
- int savedDev = -1;
- if (comm->checkPointers) {
- CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end);
- CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, end);
- }
- // Check arguments
- NCCLCHECKGOTO(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName), ret, end);
- // Always register comm even in case of error to make sure ncclGroupEnd
- // cleans it up.
- NCCLCHECK(ncclAsyncColl(comm));
- NCCLCHECKGOTO(func(sendbuff, recvbuff, count, type, op, root, comm, stream), ret, end);
-end:
- if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev));
- ncclAsyncErrCheck(ret);
- return ret;
- } else {
- NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName));
- NCCLCHECK(func(sendbuff, recvbuff, count, type, op, root, comm, stream));
- NCCLCHECK(ncclBarrierEnqueue(comm));
- NCCLCHECK(ncclBarrierEnqueueWait(comm));
- NCCLCHECK(ncclEnqueueEvents(comm));
- return ncclSuccess;
- }
-}
diff --git a/src/misc/group.cu b/src/misc/group.cu
index 1716a75..c428a22 100644
--- a/src/misc/group.cu
+++ b/src/misc/group.cu
@@ -179,13 +179,13 @@ group_cleanup:
// an atomic operation, we need to cancel all operations.
for (int i=0; i<ncclGroupIndex; i++) {
struct ncclComm* comm = ncclGroupArgs[i].coll.comm;
- for (int r=0; r<comm->nRings; r++) {
- struct ncclRing* ring = comm->rings+r;
- for (int i=0; i<ring->collCount; i++) {
- ring->collectives[(ring->collStart + i)%NCCL_MAX_OPS].active = 0;
+ for (int c=0; c<comm->nChannels; c++) {
+ struct ncclChannel* channel = comm->channels+c;
+ for (int i=0; i<channel->collCount; i++) {
+ channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
}
- ring->collFifoTail = ring->collStart;
- ring->collCount = 0;
+ channel->collFifoTail = channel->collStart;
+ channel->collCount = 0;
}
comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0;
comm->userStreamSet = false;
diff --git a/src/misc/nvmlwrap.cu b/src/misc/nvmlwrap.cu
index d9407f4..635f332 100644
--- a/src/misc/nvmlwrap.cu
+++ b/src/misc/nvmlwrap.cu
@@ -16,14 +16,14 @@ static nvmlReturn_t (*nvmlInternalInit)(void);
static nvmlReturn_t (*nvmlInternalShutdown)(void);
static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-static nvmlReturn_t (*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device);
-static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device);
static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci);
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
nvmlNvLinkCapability_t capability, unsigned int *capResult);
+static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber);
+
ncclResult_t wrapNvmlSymbols(void) {
if (nvmlState == nvmlInitialized)
@@ -70,10 +70,9 @@ ncclResult_t wrapNvmlSymbols(void) {
LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
- LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity);
- LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity);
LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo);
+ LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
@@ -86,9 +85,8 @@ teardown:
nvmlInternalShutdown = NULL;
nvmlInternalDeviceGetHandleByPciBusId = NULL;
nvmlInternalDeviceGetIndex = NULL;
- nvmlInternalDeviceSetCpuAffinity = NULL;
- nvmlInternalDeviceClearCpuAffinity = NULL;
nvmlInternalDeviceGetPciInfo = NULL;
+ nvmlInternalDeviceGetMinorNumber = NULL;
nvmlInternalDeviceGetNvLinkState = NULL;
nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
nvmlInternalDeviceGetNvLinkCapability = NULL;
@@ -155,46 +153,28 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) {
- if (nvmlInternalDeviceSetCpuAffinity == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclInternalError;
- }
- // Workaround : it seems SetCpuAffinity is not thread safe.
- static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
- pthread_mutex_lock(&lock);
- nvmlReturn_t ret = nvmlInternalDeviceSetCpuAffinity(device);
- pthread_mutex_unlock(&lock);
- if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceSetCpuAffinity() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
-ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) {
- if (nvmlInternalInit == NULL) {
+ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
+ if (nvmlInternalDeviceGetPciInfo == NULL) {
WARN("lib wrapper not initialized.");
return ncclInternalError;
}
- nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device);
+ nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceClearCpuAffinity() failed: %s ",
+ WARN("nvmlDeviceGetPciInfo() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) {
- if (nvmlInternalDeviceGetPciInfo == NULL) {
+ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) {
+ if (nvmlInternalDeviceGetMinorNumber == NULL) {
WARN("lib wrapper not initialized.");
return ncclInternalError;
}
- nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci);
+ nvmlReturn_t ret = nvmlInternalDeviceGetMinorNumber(device, minorNumber);
if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceGetPciInfo() failed: %s ",
+ WARN("nvmlDeviceGetMinorNumber() failed: %s ",
nvmlInternalErrorString(ret));
return ncclSystemError;
}
@@ -208,8 +188,9 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link
}
nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive);
if (ret != NVML_SUCCESS) {
- INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
- nvmlInternalErrorString(ret));
+ if (ret != NVML_ERROR_NOT_SUPPORTED)
+ INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
+ nvmlInternalErrorString(ret));
return ncclSystemError;
}
return ncclSuccess;
diff --git a/src/misc/rings.cu b/src/misc/rings.cu
index a5d4616..a7b122c 100644
--- a/src/misc/rings.cu
+++ b/src/misc/rings.cu
@@ -160,7 +160,10 @@ static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankTo
while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) {
current[transport] = 0;
transport++;
- if (transport == NTRANSPORTS) { free(p2pConnected); return ncclInternalError; }
+ if (transport == NTRANSPORTS) {
+ WARN("Error : Could not find transport to connect next group\n");
+ free(p2pConnected);
+ return ncclInternalError; }
}
curRank = rank;
current[transport]++;
@@ -179,8 +182,20 @@ ncclResult_t getEnvThreads(int* nthreads) {
return ncclSuccess;
}
+static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) {
+ if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS;
+ for (int r=nrings; r<newNrings; r++) {
+ for (int i=0; i<nranks; i++) {
+ a[r*nranks+i] = a[(r-nrings)*nranks+i];
+ b[r*nranks+i] = b[(r-nrings)*nranks+i];
+ c[r*nranks+i] = c[(r-nrings)*nranks+i];
+ d[r*nranks+i] = d[(r-nrings)*nranks+i];
+ }
+ }
+ return newNrings;
+}
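For instance (a hypothetical call, not from the patch), duplicating 2 rings into 4 on an 8-rank job:

/* copyRings(2, 4, 8, prev, next, treeIn, treeOut) copies row 0 of each array
 * into row 2 and row 1 into row 3, so ring r and ring r+nrings describe the
 * same ordering; the ring count, capped at MAXCHANNELS (4 here), is returned. */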
/* Main ring creation function */
-ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next) {
+ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) {
*nrings = 0;
if (nranks == 1) return ncclSuccess;
@@ -191,6 +206,12 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
if (ret == ncclSuccess && *nrings > 0) {
if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
NCCLCHECK(getEnvThreads(nthreads));
+ for (int r = 0; r<*nrings; r++) {
+ for (int i = 0; i<nranks; i++) {
+ if (transports[i*nranks+prev[i]] == 2) treeIn[i] = 1;
+ if (transports[i*nranks+next[i]] == 2) treeOut[i] = 1;
+ }
+ }
return ncclSuccess;
}
if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
@@ -210,8 +231,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
int minScore = NCCL_MAX_SCORE;
int nringsTmp;
int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
- NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXRINGS));
- NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXRINGS));
+ NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS));
+ NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS));
NCCLCHECK(ncclCalloc(&idxToRank, nranks));
NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
NCCLCHECK(ncclCalloc(&groups, nranks));
@@ -220,8 +241,8 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
int nThreads;
do {
nThreads = *nthreads;
- for (int i=0; i<nranks*MAXRINGS; i++) prevTmp[i] = nextTmp[i] = -1;
- nringsTmp = MAXRINGS;
+ for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1;
+ nringsTmp = MAXCHANNELS;
// Loop over transports to connect groups
for (int t=NTRANSPORTS-1; t>=0; t--) {
for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;
@@ -282,6 +303,11 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
for (int i=0; i<nidx; i++) {
if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
+ if (t == NTRANSPORTS-1) {
+ // Save node-level masters for trees
+ treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
+ treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
+ }
}
}
//for (int r=0; r<nringsTmp; r++) {
@@ -316,6 +342,15 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
*nthreads = nThreads;
+ /* Duplicate the rings in case of multinode+NVLink */
+ int nnodes = 0;
+ for (int r=0; r<nranks; r++) nnodes += treeIn[r];
+ int nvlink;
+ NCCLCHECK(ncclNvlinkGpu(&nvlink));
+ if (nnodes > 1 && nvlink) {
+ *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut);
+ }
+
if (*nrings == 0) {
WARN("Could not create rings, falling back on simple ring");
*nrings = 1;
@@ -329,9 +364,9 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
minNrings = 0;
}
- if (minNrings > MAXRINGS) {
- if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXRINGS, MAXRINGS);
- minNrings = MAXRINGS;
+ if (minNrings > MAXCHANNELS) {
+ if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS);
+ minNrings = MAXCHANNELS;
}
if (maxNrings > 0 && maxNrings <= *nrings) {
if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
@@ -341,13 +376,7 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int*
if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
if (minNrings > 0 && minNrings > *nrings) {
if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
- for (int r=*nrings; r<MAXRINGS && r <minNrings; r++) {
- for (int i=0; i<nranks; i++) {
- prev[r*nranks+i] = prev[(r-*nrings)*nranks+i];
- next[r*nranks+i] = next[(r-*nrings)*nranks+i];
- }
- }
- *nrings = minNrings;
+ *nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut);
}
}
diff --git a/src/misc/trees.cu b/src/misc/trees.cu
new file mode 100644
index 0000000..e53ea0b
--- /dev/null
+++ b/src/misc/trees.cu
@@ -0,0 +1,108 @@
+/*************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "net.h"
+#include "param.h"
+
+#define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank)
+
+/* Btree which alternates leaves and nodes.
+ * Assumes root is 0, which conveniently builds a tree on powers of two,
+ * (because we have pow2-1 ranks) which lets us manipulate bits.
+ * Find first non-zero bit, then :
+ * Find the parent :
+ * xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below)
+ * xx11[0] -> xx10[0] (3,7,11 below)
+ * Find the children :
+ * xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13)
+ * xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13)
+ *
+ * Illustration :
+ *     0---------------8
+ *              ______/ \______
+ *             4               12
+ *           /   \            /  \
+ *         2       6        10    \
+ *        / \     / \      /  \    \
+ *       1   3   5   7    9   11   13
+ */
+ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) {
+ int up, down0, down1;
+ int bit;
+ for (bit=1; bit<nranks; bit<<=1) {
+ if (bit & rank) break;
+ }
+
+ if (rank == 0) {
+ *u = -1;
+ *d0 = nranks > 1 ? bit >> 1 : -1;
+ *d1 = -1;
+ return ncclSuccess;
+ }
+
+ up = (rank ^ bit) | (bit << 1);
+ if (up >= nranks) up = (rank ^ bit);
+ *u = up;
+
+ int lowbit = bit >> 1;
+ // down0 is always within bounds
+ down0 = lowbit == 0 ? -1 : rank-lowbit;
+
+ down1 = lowbit == 0 ? -1 : rank+lowbit;
+ // Make sure down1 is within bounds
+ while (down1 >= nranks) {
+ down1 = lowbit == 0 ? -1 : rank+lowbit;
+ lowbit >>= 1;
+ }
+ *d0 = down0; *d1 = down1;
+
+ return ncclSuccess;
+}
+
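To make the bit rules above concrete, here is a small standalone sketch (a hypothetical test program, not part of the patch) that re-derives the parent/children with the same arithmetic and prints the 14-rank tree from the illustration, e.g. rank 6 -> up 4, down 5/7 and rank 12 -> up 8, down 10/13:

#include <stdio.h>

static void btree(int nranks, int rank, int* u, int* d0, int* d1) {
  int bit = 1;
  while (bit < nranks && !(bit & rank)) bit <<= 1;  // lowest set bit of rank
  if (rank == 0) { *u = -1; *d0 = nranks > 1 ? bit >> 1 : -1; *d1 = -1; return; }
  int up = (rank ^ bit) | (bit << 1);               // flip the bit, set the next one
  *u = up < nranks ? up : (rank ^ bit);             // fall back if out of bounds
  int lowbit = bit >> 1;
  *d0 = lowbit ? rank - lowbit : -1;                // always within bounds
  int down1 = lowbit ? rank + lowbit : -1;
  while (down1 >= nranks) {                         // shrink until within bounds
    down1 = lowbit ? rank + lowbit : -1;
    lowbit >>= 1;
  }
  *d1 = down1;
}

int main() {
  for (int r = 0; r < 14; r++) {
    int u, d0, d1;
    btree(14, r, &u, &d0, &d1);
    printf("rank %2d : up %2d down %2d %2d\n", r, u, d0, d1);
  }
  return 0;
}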
+/* Build a double binary tree. Take the previous tree for the first tree.
+ * For the second tree, we use a mirror tree (if nranks is odd)
+ *
+ * 8---------0---------5
+ * ______/ \______ _____/ \______
+ * 4 12 1 9
+ * / \ / \ / \
+ * 2 6 10 3 7 10
+ * / \ / \ / \ / \ / \ / \
+ * 1 3 5 7 9 11 2 4 6 8 11 12
+ *
+ * or shift it by one rank (if nranks is even)
+ *
+ * 8---------0--------------9
+ * ______/ \ ______/ \
+ * 4 \ 5 \
+ * / \ \ / \ \
+ * 2 6 10 3 7 11
+ * / \ / \ / \ / \ / \ / \
+ * 1 3 5 7 9 11 2 4 6 8 10 1
+ */
+ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* s1, int* d1_0, int* d1_1) {
+ // First tree ... use a btree
+ ncclGetBtree(nranks, rank, s0, d0_0, d0_1);
+ // Second tree ... mirror or shift
+ if (nranks % 2 == 0) {
+ // shift
+ int shiftrank = (rank-1+nranks) % nranks;
+ int u, d0, d1;
+ ncclGetBtree(nranks, shiftrank, &u, &d0, &d1);
+ *s1 = u == -1 ? -1 : (u+1) % nranks;
+ *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks;
+ *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks;
+ } else {
+ // mirror
+ int u, d0, d1;
+ ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1);
+ *s1 = u == -1 ? -1 : nranks-1-u;
+ *d1_0 = d0 == -1 ? -1 : nranks-1-d0;
+ *d1_1 = d1 == -1 ? -1 : nranks-1-d1;
+ }
+ return ncclSuccess;
+}
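A quick usage sketch (hypothetical values, relying only on the function above): for 4 ranks the even-nranks "shift" case yields tree 0 as 0 -> 2 -> {1,3} and tree 1 as 1 -> 3 -> {2,0}, so no rank is an interior node in both trees.

for (int rank = 0; rank < 4; rank++) {
  int u0, d0_0, d0_1, u1, d1_0, d1_1;
  NCCLCHECK(ncclGetDtree(4, rank, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1));
  // rank 0: tree0 up -1 down  2/-1, tree1 up  3 down -1/-1
  // rank 1: tree0 up  2 down -1/-1, tree1 up -1 down  3/-1
  // rank 2: tree0 up  0 down  1/ 3, tree1 up  3 down -1/-1
  // rank 3: tree0 up  2 down -1/-1, tree1 up  1 down  2/ 0
}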
diff --git a/src/misc/utils.cu b/src/misc/utils.cu
index d8e3aec..c618e71 100644
--- a/src/misc/utils.cu
+++ b/src/misc/utils.cu
@@ -11,6 +11,24 @@
#include <string.h>
#include <stdarg.h>
+#include "nvmlwrap.h"
+#include "core.h"
+
+// Convert a logical cudaDev index to the NVML device minor number
+ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) {
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ nvmlDevice_t nvmlDevice;
+ unsigned int dev;
+ *nvmlDev = -1;
+ CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
+ NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice));
+ NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev));
+
+ *nvmlDev = dev;
+
+ return ncclSuccess;
+}
+
ncclResult_t getHostName(char* hostname, int maxlen) {
if (gethostname(hostname, maxlen) != 0) {
strncpy(hostname, "unknown", maxlen);
diff --git a/src/nccl.h.in b/src/nccl.h.in
index 7227625..985274e 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -68,14 +68,24 @@ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId
ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
-/* Frees resources associated with communicator object. */
+/* Frees resources associated with communicator object, but waits for any operations
+ * that might still be running on the device. */
ncclResult_t ncclCommDestroy(ncclComm_t comm);
ncclResult_t pncclCommDestroy(ncclComm_t comm);
+/* Frees resources associated with communicator object and aborts any operations
+ * that might still be running on the device. */
+ncclResult_t ncclCommAbort(ncclComm_t comm);
+ncclResult_t pncclCommAbort(ncclComm_t comm);
+
/* Returns a human-readable error message. */
const char* ncclGetErrorString(ncclResult_t result);
const char* pncclGetErrorString(ncclResult_t result);
+/* Checks whether the comm has encountered any asynchronous errors */
+ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+
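A minimal sketch of how an application might use the two new calls together (hypothetical helper; `comm` and `stream` are supplied by the caller):

#include <sched.h>
#include <cuda_runtime.h>
#include "nccl.h"

/* Wait for work previously queued on `stream` while watching `comm` for
 * asynchronous errors; abort the communicator if one is reported. */
static ncclResult_t waitAndCheck(ncclComm_t comm, cudaStream_t stream) {
  while (cudaStreamQuery(stream) == cudaErrorNotReady) {
    ncclResult_t asyncError;
    ncclResult_t ret = ncclCommGetAsyncError(comm, &asyncError);
    if (ret != ncclSuccess) return ret;
    if (asyncError != ncclSuccess) {
      ncclCommAbort(comm);   // device-side work will be asked to quit
      return asyncError;
    }
    sched_yield();           // avoid spinning a CPU core at full speed
  }
  return ncclSuccess;
}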
/* Gets the number of ranks in the communicator clique. */
ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
diff --git a/src/ring.cu b/src/ring.cu
deleted file mode 100644
index fede793..0000000
--- a/src/ring.cu
+++ /dev/null
@@ -1,70 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "ring.h"
-#include "param.h"
-
-NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
-
-ncclResult_t initRing(struct ncclComm* comm, int ringid) {
- struct ncclRing* ring = comm->rings+ringid;
- ring->id = ringid;
-
- // Setup intermediate buffering
- ring->buffSize = ncclParamBuffsize();
-
- const int sendSize = ring->devMemSendSize = sizeof(struct ncclSendMem);
- struct ncclSendMem* sendMem;
- NCCLCHECK(ncclCudaCalloc((char**)&sendMem, sendSize));
- ring->devMemSend = sendMem;
-
- const int recvSize = ring->devMemRecvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
- struct ncclRecvMem* recvMem;
- NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize));
- ring->devMemRecv = recvMem;
-
- TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize);
-
- // Pre-configure send/recv pointers. Those are the default, they may change later.
- ring->recv.conn.buff = recvMem->buff;
- ring->recv.conn.llBuff = recvMem->llBuff;
- ring->recv.conn.tail = &recvMem->tail;
- ring->recv.conn.opCount = &recvMem->opCount;
- ring->recv.conn.direct = 0;
- ring->send.conn.head = &sendMem->head;
- ring->send.conn.llHead = &sendMem->llHead;
- ring->send.conn.direct = 0;
- ring->send.conn.llStep = 0;
- ring->send.conn.llLastCleaning = 0;
-
- // Ring index to user rank table.
- NCCLCHECK(ncclCudaCalloc(&ring->devUserRanks, comm->nRanks));
- NCCLCHECK(ncclCalloc(&ring->userRanks, comm->nRanks));
-
- // Per-ring operation list.
- NCCLCHECK(ncclCudaHostAlloc((void**)&ring->collectives, (void**)&ring->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS));
- return ncclSuccess;
-}
-
-ncclResult_t freeRing(struct ncclRing* ring) {
- // Intermediate buffering
- CUDACHECK(cudaFree(ring->devMemSend));
- CUDACHECK(cudaFree(ring->devMemRecv));
-
- // Index to rank table
- free(ring->userRanks);
- CUDACHECK(cudaFree(ring->devUserRanks));
-
- // Operation list
- NCCLCHECK(ncclCudaHostFree(ring->collectives));
-
- // Free transport proxy resources
- if (ring->send.transportResources) NCCLCHECK(ring->send.transport->send.free(ring->send.transportResources));
- NCCLCHECK(transportDestroyProxy(&ring->send));
- if (ring->recv.transportResources) NCCLCHECK(ring->recv.transport->recv.free(ring->recv.transportResources));
- NCCLCHECK(transportDestroyProxy(&ring->recv));
- return ncclSuccess;
-}
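
Both the deleted ring code above and the reworked net transport below size their receive memory as a fixed header followed by a variable-length buffer, hence the offsetof(struct ncclRecvMem, buff)+buffSize expressions. A standalone sketch of that idiom, with a simplified, hypothetical field layout:

#include <stddef.h>
#include <stdlib.h>

struct recvMem {
  unsigned long long tail;
  unsigned long long opCount;
  char buff[1];              /* variable-length data area */
};

static struct recvMem* allocRecvMem(size_t buffSize) {
  /* Allocate the header plus buffSize bytes of payload in one block;
   * sizeof(struct recvMem) would miscount the trailing array. */
  return (struct recvMem*)calloc(1, offsetof(struct recvMem, buff) + buffSize);
}
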
diff --git a/src/transport.cu b/src/transport.cu
index 7c13d5c..1436a5b 100644
--- a/src/transport.cu
+++ b/src/transport.cu
@@ -1,11 +1,10 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
-#include "common_coll.h"
extern struct ncclTransport p2pTransport;
extern struct ncclTransport shmTransport;
@@ -17,74 +16,16 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = {
netTransport,
};
-static void FifoPullArgs(struct transportProxyInfo* info, struct ncclProxyArgs *args) {
- struct ncclProxyArgs *fifoArgs = info->argsFifo + (info->argsFifoHead % TRANSPORT_PROXY_FIFO_SIZE);
- pthread_mutex_lock(&info->mutex);
- while (fifoArgs->active == 0)
- pthread_cond_wait(&info->cond, &info->mutex);
- __sync_synchronize();
- memcpy(args, fifoArgs, sizeof(struct ncclProxyArgs));
- __sync_synchronize();
- fifoArgs->active = 0;
- pthread_cond_signal(&info->cond);
- pthread_mutex_unlock(&info->mutex);
- info->argsFifoHead++;
-}
-
-static struct ncclProxyArgs* FifoGetNextArgs(struct transportProxyInfo* info) {
- if (info == NULL) return NULL;
- struct ncclProxyArgs* fifoArgs = info->argsFifo + (info->argsFifoTail % TRANSPORT_PROXY_FIFO_SIZE);
- pthread_mutex_lock(&info->mutex);
- while (fifoArgs->active == 1)
- pthread_cond_wait(&info->cond, &info->mutex);
- pthread_mutex_unlock(&info->mutex);
- info->argsFifoTail++;
- return fifoArgs;
-}
-
-static void FifoPushArgs(struct transportProxyInfo* info) {
- if (info == NULL) return;
-
- struct ncclProxyArgs* fifoArgs = info->argsFifo + ((info->argsFifoTail-1) % TRANSPORT_PROXY_FIFO_SIZE);
- if (fifoArgs->active == 0) return;
-
- pthread_mutex_lock(&info->mutex);
- pthread_cond_signal(&info->cond);
- pthread_mutex_unlock(&info->mutex);
-}
-
-static void WaitProxyReady(struct transportProxyInfo* info) {
- pthread_mutex_lock(&info->mutex);
- while (info->proxyReady == 0)
- pthread_cond_wait(&info->cond, &info->mutex);
- pthread_mutex_unlock(&info->mutex);
-}
-
-static void SetProxyReady(struct transportProxyInfo* info) {
- pthread_mutex_lock(&info->mutex);
- info->proxyReady = 1;
- pthread_cond_signal(&info->cond);
- pthread_mutex_unlock(&info->mutex);
-}
-
-static void StopProxy(struct transportProxyInfo* info) {
- struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);
- fifoArgs->active = -1;
- FifoPushArgs(info);
-}
-
#define RECV 0
#define SEND 1
-static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks) {
- enum proxyMode mode = proxyPatternMode(pattern);
- if (mode == proxyRing) return true;
+static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) {
+ if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
/* In chains, one rank does not need a proxy. Let's figure out which one it is */
- int root = proxyPatternRoot(pattern);
// Which index in the reorganized rings should we compare root against */
const int myrank = 0, nextrank = 1, prevrank = nranks-1;
- int index = mode == proxyFrom ?
+ int index = pattern == ncclPatternPipelineFrom ?
/* no recv / no send if root = */
/* bcast */ (type == RECV ? myrank : nextrank ):
/* reduce */ (type == RECV ? prevrank : myrank );
@@ -92,96 +33,216 @@ static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks)
return (root != rank);
}
-static void SaveProxy(struct ncclConnector* connector, struct ncclProxyArgs* args, int needProxy) {
- struct transportProxyInfo* info = connector->proxyInfo;
- if (info == NULL) return;
- struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info);
- args->needProxy = needProxy;
- __sync_synchronize();
- memcpy(fifoArgs, args, sizeof(struct ncclProxyArgs));
- __sync_synchronize();
- fifoArgs->active = 1;
+enum { proxyRecv=0, proxySend=1 };
+
+#define PROXYARGS_ALLOCATE_SIZE 32
+struct ncclProxyPool {
+ struct ncclProxyPool *next;
+ struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
+};
+
+ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
+ struct ncclProxyState* state = &comm->proxyState;
+ struct ncclProxyArgs* elem;
+ pthread_mutex_lock(&state->mutex);
+ if (state->pool == NULL) {
+ // Allocate a new pool of elements
+ struct ncclProxyPool* newPool;
+ NCCLCHECK(ncclCalloc(&newPool, 1));
+ struct ncclProxyArgs* newElems = newPool->elems;
+ // Chain newly allocated elements
+ for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
+ if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
+ }
+ // Add them all to the pool list
+ state->pool = newElems;
+ // Save the pool memory block for later resource release
+ newPool->next = state->pools;
+ state->pools = newPool;
+ }
+ elem = state->pool;
+ state->pool = state->pool->next;
+ pthread_mutex_unlock(&state->mutex);
+ elem->next = elem->nextPeer = NULL;
+ *argsptr = elem;
+ return ncclSuccess;
}
-ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t nbytes, int pattern, struct ncclComm* comm) {
- int llMode, nrings, nthreads;
- ncclGetCollResource(comm, nbytes, &nrings, &nthreads, &llMode);
- nbytes = llMode ? nbytes * 2 : nbytes;
- substeps = llMode ? 1 : substeps;
- subchunks = llMode ? NCCL_LL_CHUNKS : subchunks;
- int buffSize = llMode ? NCCL_LL_BUFF_SIZE : comm->rings[0].buffSize;
-
- int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow
- int nsteps = nstepsPerRound * nrounds * substeps;
- TRACE(NCCL_NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm);
- TRACE(NCCL_NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm);
- for (int r=0; r<nrings; r++) {
- struct ncclRing* ring = comm->rings+((comm->myParams->gridDim.x+r)%comm->nRings);
- struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 };
- SaveProxy(&ring->recv, &args, NeedProxy(RECV, pattern, ring, comm->nRanks));
- SaveProxy(&ring->send, &args, NeedProxy(SEND, pattern, ring, comm->nRanks));
+static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) {
+ struct ncclComm* comm = connector->comm;
+ struct ncclProxyState* state = &comm->proxyState;
+ pthread_mutex_lock(&state->mutex);
+ if (connector->proxyAppend == NULL) {
+ // Nothing running for that peer. Add to the circular list
+ if (state->ops == NULL) {
+ // Create the list
+ args->next = args;
+ state->ops = args;
+ } else {
+ // Insert element in the list
+ args->next = state->ops->next;
+ state->ops->next = args;
+ }
+ connector->proxyAppend = args;
+ } else {
+ // There is an active operation already for that peer.
+ // Add it to the per-peer list
+ connector->proxyAppend->nextPeer = args;
+ connector->proxyAppend = args;
}
+ pthread_mutex_unlock(&state->mutex);
+}
+
+template <int type>
+static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
+ if (peer < 0) return ncclSuccess;
+
+ struct ncclPeer* peerComm = args->channel->peers+peer;
+ struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
+ if (connector->transportComm->proxy == NULL) return ncclSuccess;
+
+ struct ncclProxyArgs* op;
+ NCCLCHECK(transportAllocateProxyArgs(connector->comm, &op));
+ memcpy(op, args, sizeof(struct ncclProxyArgs));
+ op->connector = connector;
+ op->progress = connector->transportComm->proxy;
+ op->state = ncclProxyOpReady;
+ ProxyAppend(connector, op);
return ncclSuccess;
}
-ncclResult_t transportStartProxies(ncclComm* comm) {
- for (int r=0; r<comm->nRings; r++) {
- FifoPushArgs(comm->rings[r].send.proxyInfo);
- FifoPushArgs(comm->rings[r].recv.proxyInfo);
+ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks) {
+ if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
+ struct ncclRing* ring = &args->channel->ring;
+ if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxyRecv>(ring->prev, args));
+ if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy<proxySend>(ring->next, args));
+ }
+ if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
+ // Tree up
+ struct ncclTree* tree = &args->channel->tree;
+ for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args));
+ NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+ }
+ if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
+ // Tree down
+ struct ncclTree* tree = &args->channel->tree;
+ for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
+ NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
}
- pthread_yield(); // Let other threads run
return ncclSuccess;
}
-void* persistentThread(void *opaqueInfo) {
- struct transportProxyInfo* info = (struct transportProxyInfo*)opaqueInfo;
- // We need to initialize the context before launching any NCCL cuda kernel,
- // otherwise we would create it during the first cudaMemcpyAsync inside the
- // proxy function and that would cause a deadlock
- cudaSetDevice(info->comm->cudaDev);
- // Signal the main thread the context is created and it can proceed.
- SetProxyReady(info);
+void* persistentThread(void *comm_) {
+ struct ncclComm* comm = (struct ncclComm*)comm_;
+ struct ncclProxyState* state = &comm->proxyState;
+ struct ncclProxyArgs* op = NULL;
+ ncclResult_t ret = ncclSuccess;
+ int idle = 1;
+ int idleSpin = 0;
while (1) {
- struct ncclProxyArgs args;
- FifoPullArgs(info, &args);
- if (args.active == -1) {
- // Main thread asked to stop
+ do {
+ if (*comm->abortFlag) return NULL;
+ if (op == NULL) {
+ pthread_mutex_lock(&state->mutex);
+ op = state->ops;
+ if (op == NULL) {
+ if (state->stop) {
+ // No more commands to process and proxy has been requested to stop
+ pthread_mutex_unlock(&state->mutex);
+ return NULL;
+ }
+ pthread_cond_wait(&state->cond, &state->mutex);
+ }
+ pthread_mutex_unlock(&state->mutex);
+ }
+ } while (op == NULL);
+ op->idle = 0;
+ if (op->state != ncclProxyOpNone) ret = op->progress(op);
+ if (ret != ncclSuccess) {
+ comm->fatalError = ret;
+ INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
return NULL;
}
- ncclResult_t res = info->func(&args);
- if (res != ncclSuccess) {
- WARN("%s:%d -> %d [Proxy thread error]", __FILE__, __LINE__, res);
+ idle &= op->idle;
+ pthread_mutex_lock(&state->mutex);
+ if (!idle) idleSpin = 0;
+ struct ncclProxyArgs *next = op->next;
+ if (next->state == ncclProxyOpNone) {
+ struct ncclProxyArgs *freeOp = next;
+ if (next->nextPeer) {
+ // Replace next by its next per-peer element.
+ next = next->nextPeer;
+ if (op != freeOp) {
+ next->next = freeOp->next;
+ op->next = next;
+ } else {
+ next->next = next;
+ }
+ } else {
+ // Remove next from circular list
+ next->connector->proxyAppend = NULL;
+ if (op != freeOp) {
+ next = next->next;
+ op->next = next;
+ } else {
+ next = NULL;
+ }
+ }
+ if (freeOp == state->ops) state->ops = next;
+ freeOp->next = state->pool;
+ state->pool = freeOp;
}
+ op = next;
+ if (op == state->ops) {
+ if (idle == 1) {
+ if (++idleSpin == 10) {
+ sched_yield();
+ idleSpin = 0;
+ }
+ }
+ idle = 1;
+ }
+ pthread_mutex_unlock(&state->mutex);
}
}
-ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm) {
- struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send;
- threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy);
- if (proxyfunc) {
- TRACE(NCCL_NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm);
- struct transportProxyInfo* info;
- NCCLCHECK(ncclCalloc(&info, 1));
- connector->proxyInfo = info;
- info->comm = comm;
- info->cond = PTHREAD_COND_INITIALIZER;
- info->mutex = PTHREAD_MUTEX_INITIALIZER;
- info->func = proxyfunc;
- info->argsFifoHead = info->argsFifoTail = 0;
- info->proxyReady = 0;
- pthread_create(&connector->proxyInfo->thread, NULL, persistentThread, info);
- // Wait for thread to initialize its CUDA context.
- WaitProxyReady(info);
+ncclResult_t transportStartProxy(struct ncclComm* comm) {
+ pthread_mutex_lock(&comm->proxyState.mutex);
+ if (comm->proxyState.ops != NULL)
+ pthread_cond_signal(&comm->proxyState.cond);
+ pthread_mutex_unlock(&comm->proxyState.mutex);
+ return ncclSuccess;
+}
+
+ncclResult_t transportCreateProxy(struct ncclComm* comm) {
+ if (!comm->proxyThread) {
+ comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
+ comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER;
+ comm->proxyState.ops = NULL;
+ pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
}
return ncclSuccess;
}
-ncclResult_t transportDestroyProxy(struct ncclConnector* connector) {
- if (connector->proxyInfo) {
- StopProxy(connector->proxyInfo);
- pthread_join(connector->proxyInfo->thread, NULL);
- free(connector->proxyInfo);
- connector->proxyInfo = NULL;
+ncclResult_t transportDestroyProxy(struct ncclComm* comm) {
+ struct ncclProxyState* state = &comm->proxyState;
+
+ // Request the proxy to stop and then wake it
+ pthread_mutex_lock(&state->mutex);
+ state->stop = true;
+ pthread_cond_signal(&state->cond);
+ pthread_mutex_unlock(&state->mutex);
+ if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
+
+  // Free any memory allocated for the proxy arg pools
+ pthread_mutex_lock(&state->mutex);
+ struct ncclProxyState* proxyState = &comm->proxyState;
+ while (proxyState->pools != NULL) {
+ struct ncclProxyPool *next = proxyState->pools->next;
+ free(proxyState->pools);
+ proxyState->pools = next;
}
+ pthread_mutex_unlock(&state->mutex);
+
return ncclSuccess;
}
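
All proxy operations now flow through a single per-communicator thread fed from a recycled pool of ncclProxyArgs (see transportAllocateProxyArgs above). The allocation pattern, reduced to its essentials with hypothetical names, looks as follows: elements are carved out of fixed-size blocks and recycled through a free list instead of being malloc'd per operation.

#include <stdlib.h>
#include <pthread.h>

#define POOL_BLOCK 32

struct elem  { struct elem* next;  /* payload omitted */ };
struct block { struct block* next; struct elem elems[POOL_BLOCK]; };

struct pool {
  pthread_mutex_t mutex;
  struct elem*  freeList;  /* elements ready for reuse */
  struct block* blocks;    /* backing allocations, freed at teardown */
};

static struct elem* poolGet(struct pool* p) {
  pthread_mutex_lock(&p->mutex);
  if (p->freeList == NULL) {
    /* Grow by one block and chain its elements onto the free list. */
    struct block* b = (struct block*)calloc(1, sizeof(struct block));
    if (b == NULL) { pthread_mutex_unlock(&p->mutex); return NULL; }
    for (int i = 0; i + 1 < POOL_BLOCK; i++) b->elems[i].next = b->elems + i + 1;
    p->freeList = b->elems;
    b->next = p->blocks;
    p->blocks = b;
  }
  struct elem* e = p->freeList;
  p->freeList = e->next;
  pthread_mutex_unlock(&p->mutex);
  e->next = NULL;
  return e;
}
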
diff --git a/src/transport/net.cu b/src/transport/net.cu
index 9c366b3..06a6e23 100644
--- a/src/transport/net.cu
+++ b/src/transport/net.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,11 +9,17 @@
#include "nvmlwrap.h"
#include "net.h"
#include "param.h"
-#include "nvlink.h"
+#include "topo.h"
#include <cuda_runtime.h>
#include <assert.h>
#define NET_MAX_IFS 16
+#define NET_MAX_GPUS 32
+
+// Cache GPU-NIC distances to avoid re-computing them
+#define NET_TVALUE_UNKNOWN 0ULL
+static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN };
+static int ncclNetNDev;
// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit)
#define NET_BITS_PER_IF 3
@@ -28,13 +34,9 @@ static ncclTvalue_t getTvalue(short* distances, int ndev) {
}
return tvalue;
}
-
-struct netInfo {
- int rank;
- int ndev;
- ncclTvalue_t tValue;
- short distances[NET_MAX_IFS];
-};
+static int getScore(ncclTvalue_t tvalue, int dev) {
+ return (tvalue >> (dev*NET_BITS_PER_IF)) & NET_BITS_PER_IF_MASK;
+}
struct netConnectInfo {
ncclNetHandle_t netHandle;
@@ -46,11 +48,13 @@ struct netSendResources {
struct ncclRecvMem* hostRecvMem;
struct ncclSendMem* devHostSendMem;
struct ncclRecvMem* devHostRecvMem;
- struct ncclSendMem* hostDevMem;
int netDev;
int useGdr;
- struct ncclRecvMem* devNetMem;
- uint64_t llStep;
+ int buffSize;
+ void* mhandle;
+ void* llMhandle;
+ struct ncclRecvMem* devRecvMem;
+ uint64_t step;
uint64_t llLastCleaning;
};
@@ -61,50 +65,70 @@ struct netRecvResources {
struct ncclRecvMem* hostRecvMem;
struct ncclSendMem* devHostSendMem;
struct ncclRecvMem* devHostRecvMem;
- struct ncclRecvMem* hostDevMem;
int netDev;
int useGdr;
- uint64_t llStep;
+ int buffSize;
+ void* mhandle;
+ void* llMhandle;
+ struct ncclRecvMem* devRecvMem;
+ uint64_t step;
uint64_t llLastCleaning;
};
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
- struct netInfo* info = (struct netInfo*)opaqueInfo;
- static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large");
- info->rank = rank;
- NCCLCHECK(ncclNetDevices(&info->ndev));
- if (info->ndev == 0) {
+static ncclResult_t netDistance(int cudaDev, int dev, short* distance) {
+ char* cudaPath = NULL;
+ char* nicPath = NULL;
+ ncclResult_t err;
+ NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+ err = ncclNetPciPath(dev, &nicPath);
+ *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
+ if (nicPath) free(nicPath);
+ if (cudaPath) free(cudaPath);
+ return ncclSuccess;
+}
+
+static ncclResult_t netDevices(int* ndev, short** distances) {
+ NCCLCHECK(ncclNetDevices(ndev));
+ if (*ndev == 0) {
WARN("Error : Network returned 0 device");
return ncclSystemError;
}
- if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS;
+ if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS;
- // Find distance with current GPU
- int cudaDev;
- cudaGetDevice(&cudaDev);
- char* cudaPath;
- NCCLCHECK(getCudaPath(cudaDev, &cudaPath));
+ *distances = (short*)malloc(*ndev*sizeof(short));
+ if (*distances == NULL) return ncclSystemError;
+ // Find distance with current GPU
+ int cudaDev, nvmlDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
char line[1024];
- sprintf(line, "CUDA Dev %d, %s NIC distance : ", cudaDev, ncclNetName());
- for (int d=0; d<info->ndev; d++) {
- char* nicPath;
- ncclResult_t err = ncclNetPciPath(d, &nicPath);
- info->distances[d] = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath);
- sprintf(line+strlen(line), " %s", pathDists[info->distances[d]]);
- if (err == ncclSuccess) free(nicPath);
+ sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName());
+ for (int d=0; d<*ndev; d++) {
+ NCCLCHECK(netDistance(cudaDev, d, *distances+d));
+ sprintf(line+strlen(line), " %s", pathDists[(*distances)[d]]);
}
INFO(NCCL_INIT|NCCL_NET, "%s", line);
- free(cudaPath);
return ncclSuccess;
}
/* Determine if we can communicate with the peer */
-ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
- struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
- ret[0] = getTvalue(myInfo->distances, myInfo->ndev);
+ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
+ int cudaDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ ret[0] = ncclNetTvalues[cudaDev];
+ if (ret[0] == NET_TVALUE_UNKNOWN) {
+ if (cudaDev >= NET_MAX_GPUS) {
+ WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS);
+ return ncclInternalError;
+ }
+ int nDev;
+ short* distances;
+ NCCLCHECK(netDevices(&nDev, &distances));
+ ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev);
+ ncclNetNDev = nDev;
+ free(distances);
+ }
return ncclSuccess;
}
@@ -196,45 +220,51 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
return ncclSuccess;
}
-int getDev(int ringId, int nDev, short* distances) {
- int minDistance = PATH_SOC;
- for (int d=0; d<nDev; d++) if (distances[d] < minDistance) minDistance = distances[d];
+int getDev(int cudaDev, int ringId) {
+ ncclTvalue_t tvalues = ncclNetTvalues[cudaDev];
+
+ int dev = 0;
+ int maxScore = 0;
+ for (int d=0; d<ncclNetNDev; d++) if (getScore(tvalues,d) > maxScore) maxScore = getScore(tvalues,d);
int skip = ringId+1;
while (skip) {
- for (int d=0; d<nDev; d++) {
- if (distances[d] == minDistance) {
+ for (int d=0; d<ncclNetNDev; d++) {
+ if (getScore(tvalues, d) == maxScore) {
skip--;
- if (skip == 0) return d;
+ if (skip == 0) { dev = d; goto end; }
}
}
}
- return 0;
+end:
+ return dev;
}
NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
-static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGdr) {
+static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
*useGdr = 0;
- int cudaDev;
+ int cudaDev, nvmlDev;
CUDACHECK(cudaGetDevice(&cudaDev));
+ NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
if (read) { // For reads (sends) only enable under certain conditions
int gdrReadParam = ncclParamNetGdrRead();
if (gdrReadParam == 0) return ncclSuccess;
- else if (gdrReadParam < 0) { // default : enable only on DGX2
- char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev));
- int nvlinks = getNumNvlinks(busId);
- if (nvlinks < CONNECT_NVSWITCH || ncclCudaCompCap() < 7) return ncclSuccess;
+ if (gdrReadParam < 0) {
+ int nvlink;
+ NCCLCHECK(ncclNvlinkGpu(&nvlink));
+ if (!nvlink) return ncclSuccess;
}
}
// Check if we are close enough that it makes sense to enable GDR
int netGdrLevel = ncclParamNetGdrLevel();
+ short distance;
+ NCCLCHECK(netDistance(cudaDev, dev, &distance));
if (distance >= netGdrLevel) {
- INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, dev, distance, netGdrLevel);
+ INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel);
return ncclSuccess;
}
@@ -243,51 +273,59 @@ static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGd
NCCLCHECK(ncclNetPtrSupport(dev, &flags));
if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
*useGdr = 1;
- INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d >= %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read);
+ INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read);
return ncclSuccess;
}
/* Determine if we will use this transport for this peer and return connect
* information for this peer */
-ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
struct netSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
- ring->send.transportResources = resources;
+ send->transportResources = resources;
+
+ int cudaDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ resources->netDev = getDev(cudaDev, channelId);
+ NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr));
- struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
- resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
- NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 1, &resources->useGdr));
+ int sendSize = sizeof(struct ncclSendMem);
+ NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
- int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+ int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
if (resources->useGdr) {
- NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size));
+ NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
}
+ NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
+ resources->buffSize = buffSize;
- NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, size));
- NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, size));
-
+ INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev,
+ resources->useGdr ? "/GDRDMA" : "");
return ncclSuccess;
}
-ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
+ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
struct netRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
- ring->recv.transportResources = resources;
+ recv->transportResources = resources;
- struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo;
- resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances);
- NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 0, &resources->useGdr));
+ int cudaDev;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ resources->netDev = getDev(cudaDev, channelId);
+ NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr));
int sendSize = sizeof(struct ncclSendMem);
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
- int recvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+ int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+ if (resources->useGdr) {
+ NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+ }
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize));
+ resources->buffSize = buffSize;
- struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo;
- INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
- resources->useGdr ? "/GDRDMA" : "",
- (resources->hostDevMem != NULL) ? "/GDCopy" : "");
+ INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev,
+ resources->useGdr ? "/GDRDMA" : "");
struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
return ncclSuccess;
@@ -297,27 +335,28 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
// Setup device pointers
struct netSendResources* resources = (struct netSendResources*)send->transportResources;
- if (resources->useGdr) {
- send->conn.buff = resources->devNetMem->buff;
- // We don't use devMem for llMode because the CPU has to read the data
- send->conn.llBuff = resources->devHostRecvMem->llBuff;
- } else {
- send->conn.buff = resources->devHostRecvMem->buff;
- send->conn.llBuff = resources->devHostRecvMem->llBuff;
- }
+ // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
+ struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+ send->conn.buff = recvMem->buff;
+ send->conn.llBuff = resources->devHostRecvMem->llBuff;
+
+ // Head/Tail/Opcount/Fifos are always on host
send->conn.tail = &resources->devHostRecvMem->tail;
- send->conn.opCount = &resources->devHostRecvMem->opCount;
+ send->conn.opCountRem = &resources->devHostRecvMem->opCount;
send->conn.fifo = resources->devHostRecvMem->sizesFifo;
- send->conn.llFifo = resources->devHostRecvMem->llSizesFifo;
-
- if (resources->hostDevMem == NULL) {
- send->conn.head = &resources->devHostSendMem->head;
- send->conn.llHead = &resources->devHostSendMem->llHead;
- }
+ send->conn.head = &resources->devHostSendMem->head;
+ send->conn.opCountLoc = &resources->devHostSendMem->opCount;
+ for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
// Connect to remote peer
struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
+
+ NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->buff, resources->buffSize,
+ resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
+ NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff,
+ NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle));
+
return ncclSuccess;
}
@@ -326,32 +365,37 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
// Setup device pointers
struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
- recv->conn.head = &resources->devHostSendMem->head;
- recv->conn.llHead = &resources->devHostSendMem->llHead;
-
- if (resources->useGdr == 0) {
- recv->conn.buff = resources->devHostRecvMem->buff;
- recv->conn.llBuff = resources->devHostRecvMem->llBuff;
- }
+ // Intermediate buffering on GPU for GPU Direct RDMA
+ struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem;
+ recv->conn.buff = recvMem->buff;
+ recv->conn.llBuff = recvMem->llBuff;
- if (resources->hostDevMem == NULL) {
- recv->conn.tail = &resources->devHostRecvMem->tail;
- recv->conn.opCount = &resources->devHostRecvMem->opCount;
- }
+ // Head/Tail/Opcount are always on host
+ recv->conn.tail = &resources->devHostRecvMem->tail;
+ recv->conn.opCountLoc = &resources->devHostRecvMem->opCount;
+ recv->conn.head = &resources->devHostSendMem->head;
+ recv->conn.opCountRem = &resources->devHostSendMem->opCount;
- // Finish connection establishment
+ // Finish connection establishment from remote peer
NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
+ NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->buff, resources->buffSize,
+ resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle));
+ NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE,
+ resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle));
+
return ncclSuccess;
}
ncclResult_t netSendFree(void* transportResources) {
struct netSendResources* resources = (struct netSendResources*)transportResources;
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+ NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle));
+ NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle));
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
if (resources->useGdr)
- CUDACHECK(cudaFree(resources->devNetMem));
+ CUDACHECK(cudaFree(resources->devRecvMem));
NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
free(resources);
return ncclSuccess;
@@ -360,196 +404,166 @@ ncclResult_t netSendFree(void* transportResources) {
ncclResult_t netRecvFree(void* transportResources) {
struct netRecvResources* resources = (struct netRecvResources*)transportResources;
NCCLCHECK(ncclCudaHostFree(resources->hostSendMem));
+ NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle));
+ NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle));
NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem));
+ if (resources->useGdr)
+ CUDACHECK(cudaFree(resources->devRecvMem));
NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
free(resources);
return ncclSuccess;
}
ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
- struct ncclRing* ring = args->ring;
- struct netSendResources* resources = (struct netSendResources*) (ring->send.transportResources);
- const int llMode = args->llMode;
-
- volatile uint64_t* prevTail = &resources->hostRecvMem->tail;
- struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem;
- uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head;
- struct ncclRecvMem* localMem = resources->useGdr ? resources->devNetMem : resources->hostRecvMem;
- char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff;
- int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
- volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo;
- int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
- int sliceSize = buffSize / args->substeps;
-
- assert(args->substeps <= SIZES_FIFO_SIZE);
-
- uint64_t head = llMode ? resources->llStep : 0ULL;
- uint64_t tail = llMode ? resources->llStep : 0ULL;
- uint64_t end = head + args->nsteps;
-
- int idle = 0;
- void* requests[args->substeps];
-
- if (!args->needProxy) goto nextColl;
-
- TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
- TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
-
- // Update in case we skipped some collectives
- if (llMode == 0) resources->hostRecvMem->opCount = args->opCount;
-
- while (head < end) {
- idle++;
- if (llMode) {
- if (tail < end && tail < head + args->substeps) {
- int slot = tail%args->substeps;
- int size = sizesFifo[slot];
- if (size != 0) {
- if (size == -1) size = 0;
- uint32_t flag = tail + 1;
- int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
- size = nFifoLines * sizeof(union ncclLLFifoLine);
- union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+slot*sliceSize);
- for (int i=0; i<nFifoLines; i++) {
- volatile uint32_t *f1 = &lines[i].flag1;
- volatile uint32_t *f2 = &lines[i].flag2;
- while (f1[0] != flag || f2[0] != flag);
+ struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources);
+ if (args->state == ncclProxyOpReady) {
+ // Update opCount
+ resources->hostRecvMem->opCount = args->opCount;
+
+ // Round to next multiple of sliceSteps
+ resources->step = ROUNDUP(resources->step, args->chunkSteps);
+ args->head = resources->step;
+ args->tail = resources->step;
+ args->end = args->head + args->nsteps;
+ args->state = ncclProxyOpProgress;
+ }
+ if (args->state == ncclProxyOpProgress) {
+ args->idle = 1;
+ if (args->head < args->end) {
+ if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) {
+ volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
+ if (args->llMode) {
+ int buffSlot = args->tail%NCCL_STEPS;
+ int size = sizesFifo[buffSlot];
+ if (size != -1) {
+ uint32_t flag = args->tail + 1;
+ int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+ size = nFifoLines * sizeof(union ncclLLFifoLine);
+ union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
+ int ready = 1;
+ for (int i=0; i<nFifoLines; i++) {
+ volatile uint32_t *f1 = &lines[i].flag1;
+ volatile uint32_t *f2 = &lines[i].flag2;
+ if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
+ }
+ if (ready) {
+ NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, resources->llMhandle, args->requests+buffSlot));
+ if (args->requests[buffSlot] != NULL) {
+ sizesFifo[buffSlot] = -1;
+                // Make sure the size is reset (to -1) before we advance the tail.
+ __sync_synchronize();
+ args->tail += args->sliceSteps;
+ args->idle = 0;
+ }
+ }
}
- NCCLCHECK(ncclNetIsend(resources->netSendComm, lines, size, ptrType, requests+slot));
- if (requests[slot] != NULL) {
- sizesFifo[slot] = size;
- tail++;
- idle = 0;
+ } else if (args->tail < resources->hostRecvMem->tail) {
+ struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+ int stepSize = args->channel->buffSize/NCCL_STEPS;
+ // Send through network
+ int buffSlot = args->tail%NCCL_STEPS;
+ NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot));
+ if (args->requests[buffSlot] != NULL) {
+ sizesFifo[buffSlot] = -1;
+              // Make sure the size is reset (to -1) before we advance the tail.
+ __sync_synchronize();
+ args->tail += args->sliceSteps;
+ args->idle = 0;
}
}
}
- } else while (tail < *prevTail) {
- // Send through network
- int slot = tail%args->substeps;
- NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+slot*sliceSize, sizesFifo[slot], ptrType, requests+slot));
- if (requests[slot] != NULL) {
- tail++;
- idle = 0;
- }
- }
- if (head < tail) {
- int done;
- int slot = head%args->substeps;
- NCCLCHECK(ncclNetTest(requests[slot], &done, NULL));
- if (done) {
- if (llMode) {
- sizesFifo[slot] = 0;
- // Make sure size is reset to zero before we update the head.
- __sync_synchronize();
+ if (args->head < args->tail) {
+ int done;
+ int buffSlot = args->head%NCCL_STEPS;
+ NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
+ if (done) {
+ args->head += args->sliceSteps;
+ resources->hostSendMem->head = args->head;
+ args->idle = 0;
}
- head++;
- *prevHead = head;
- idle = 0;
}
}
- if (idle) transportProxyIdle(idle);
+ if (args->head == args->end) {
+ resources->step = args->end;
+ args->idle = 0;
+ args->state = ncclProxyOpDone;
+ }
}
-
- // Reset
- if (llMode == 0) *prevTail = 0;
-
-nextColl:
- if (llMode) {
- resources->llStep += args->nsteps;
- // Don't forget to ack otherwise the GPU won't be able to push data.
- *prevHead = resources->llStep;
- if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- memset(localBuff, 0, NCCL_LL_BUFF_SIZE);
- resources->llStep += NCCL_LL_CHUNKS;
- *prevHead = resources->llStep;
- resources->llLastCleaning = resources->llStep;
+ if (args->state == ncclProxyOpDone) {
+ union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff;
+ if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+ for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step;
+ resources->step += NCCL_STEPS;
+ resources->hostSendMem->head = resources->step;
+ resources->llLastCleaning = resources->step;
}
+ args->state = ncclProxyOpNone;
}
return ncclSuccess;
}
ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
- struct ncclRing* ring = args->ring;
- struct netRecvResources* resources = (struct netRecvResources*) (ring->recv.transportResources);
- int llMode = args->llMode;
-
- volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head;
- struct ncclRecvMem* localMem = resources->useGdr ? ring->devMemRecv : resources->hostRecvMem;
- char* localBuff = llMode ? localMem->llBuff : localMem->buff;
- char* nextBuff = (resources->useGdr == 0 && resources->hostDevMem) ? resources->hostDevMem->buff : NULL;
- int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
- uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail;
-
- int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize;
- int sliceSize = buffSize / args->substeps;
-
- uint64_t head = llMode ? resources->llStep : 0ULL;
- uint64_t tail = llMode ? resources->llStep : 0ULL;
- uint64_t end = head + args->nsteps;
-
- int idle = 0;
- void* requests[args->substeps];
-
- if (!args->needProxy) goto nextColl;
-
- TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode);
- TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType);
-
- if (llMode == 0) {
- // Waiting for next opCount is only needed before writing nextTail.
- uint64_t* nextOpCount = resources->hostDevMem ? &resources->hostDevMem->opCount : &resources->hostRecvMem->opCount;
- transportProxyWait([=] { return *nextOpCount >= args->opCount; });
+ struct netRecvResources* resources = (struct netRecvResources*) (args->connector->transportResources);
+ if (args->state == ncclProxyOpReady) {
+ // Update opCount
+ resources->hostSendMem->opCount = args->opCount;
+
+ // Round to next multiple of sliceSteps
+ resources->step = ROUNDUP(resources->step, args->chunkSteps);
+ args->head = resources->step;
+ args->tail = resources->step;
+ args->end = args->head + args->nsteps;
+ args->state = ncclProxyOpProgress;
}
-
- while (head < end) {
- idle++;
- if ((tail < head + args->substeps) && (tail < *nextHead + args->substeps) && (tail < end)) {
- int slot = tail%args->substeps;
- NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+slot*sliceSize, sliceSize, ptrType, requests+slot));
- if (requests[slot] != NULL) {
- tail++;
- idle = 0;
+ if (args->state == ncclProxyOpProgress) {
+ args->idle = 1;
+ int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS;
+ if (args->head < args->end) {
+ struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+ char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff;
+ void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle;
+ if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
+ int buffSlot = args->tail%NCCL_STEPS;
+ int sliceSize = stepSize * args->sliceSteps;
+ NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot));
+ if (args->requests[buffSlot] != NULL) {
+ args->tail += args->sliceSteps;
+ args->idle = 0;
+ }
}
- }
- if (tail > head) {
- int done;
- int slot = head%args->substeps;
- int size;
- NCCLCHECK(ncclNetTest(requests[slot], &done, &size));
- if (done) {
- if (nextBuff) memcpy(nextBuff+slot*sliceSize, localBuff+slot*sliceSize, size);
- head++;
- if (llMode == 0) {
- if (ptrType == NCCL_PTR_CUDA) ncclNetFlush(resources->netRecvComm, localBuff+slot*sliceSize, size);
- *nextTail = head;
+ if (args->tail > args->head) {
+ int buffSlot = args->head%NCCL_STEPS;
+ int done, size;
+ NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size));
+ if (done) {
+ args->head += args->sliceSteps;
+ if (args->llMode == 0) {
+ if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle);
+ resources->hostRecvMem->tail = args->head;
+ }
+ args->idle = 0;
}
- idle = 0;
}
}
- if (idle) transportProxyIdle(idle);
- }
-
- // Wait for last ack and reset
- if (llMode == 0) {
- transportProxyWait([=] { return *nextHead == head; });
- *nextHead = 0;
+ if (args->head == args->end) {
+ resources->step = args->end;
+ args->idle = 0;
+ args->state = ncclProxyOpDone;
+ }
}
-
-nextColl:
- if (llMode) {
- resources->llStep += args->nsteps;
- if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
- resources->llStep += NCCL_LL_CHUNKS;
- while (*nextHead < resources->llStep);
- resources->llLastCleaning = resources->llStep;
+ if (args->state == ncclProxyOpDone) {
+ if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) {
+ resources->step += NCCL_STEPS;
+ while (resources->hostSendMem->head < resources->step);
+ resources->llLastCleaning = resources->step;
}
+ args->state = ncclProxyOpNone;
}
return ncclSuccess;
}
struct ncclTransport netTransport = {
"NET",
- netFillInfo,
netCanConnect,
netGetRings,
{ netSendSetup, netSendConnect, netSendFree, netSendProxy },
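
With the mhandle-based net API, each connection registers its staging buffers once at connect time and passes the opaque handle to every transfer, instead of resolving a memory region per isend/irecv. A minimal sketch of that pattern using the internal wrappers; the conn struct and the connSetup()/connSend() names are illustrative, and the internal headers providing NCCLCHECK, ncclNetRegMr and ncclNetIsend are assumed.

#include "core.h"   /* NCCLCHECK */
#include "net.h"    /* ncclNetRegMr / ncclNetIsend wrappers */

struct conn { void* sendComm; void* mhandle; char* buff; int buffSize; };

static ncclResult_t connSetup(struct conn* c, int useGdr) {
  /* Register the whole staging buffer once; CUDA memory when GDR is used. */
  NCCLCHECK(ncclNetRegMr(c->sendComm, c->buff, c->buffSize,
                         useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &c->mhandle));
  return ncclSuccess;
}

static ncclResult_t connSend(struct conn* c, int offset, int size, void** request) {
  /* Every send reuses the cached handle; no per-call registration. */
  NCCLCHECK(ncclNetIsend(c->sendComm, c->buff + offset, size, c->mhandle, request));
  return ncclSuccess;
}
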
diff --git a/src/transport/net_ib.cu b/src/transport/net_ib.cu
index 18e158d..f7c574b 100644
--- a/src/transport/net_ib.cu
+++ b/src/transport/net_ib.cu
@@ -32,6 +32,7 @@ static int ncclNIbDevs = -1;
struct ncclIbDev {
int device;
uint8_t port;
+ uint8_t link;
ibv_context* context;
char devName[MAXNAMESIZE];
};
@@ -97,7 +98,6 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
WARN("NET/IB : No IP interface found.");
return ncclInternalError;
}
- INFO(NCCL_INIT|NCCL_NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName);
// Detect IB cards
int nIbDevs;
@@ -113,47 +113,59 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
for (int d=0; d<nIbDevs; d++) {
struct ibv_context * context;
- if (ncclSuccess != wrap_ibv_open_device(&context, devices[d])) {
+ if (ncclSuccess != wrap_ibv_open_device(&context, devices[d]) || context == NULL) {
WARN("NET/IB : Unable to open device %s", devices[d]->name);
continue;
}
int found = 0;
- if (context) {
- struct ibv_device_attr devAttr;
- if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
- WARN("NET/IB : Unable to query device %s", devices[d]->name);
+ struct ibv_device_attr devAttr;
+ if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
+ WARN("NET/IB : Unable to query device %s", devices[d]->name);
+ if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
+ continue;
+ }
+ for (int port = 1; port <= devAttr.phys_port_cnt; port++) {
+ struct ibv_port_attr portAttr;
+ if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) {
+ WARN("NET/IB : Unable to query port %d", port);
continue;
}
- for (int port = 1; port <= devAttr.phys_port_cnt; port++) {
- struct ibv_port_attr portAttr;
- if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) {
- WARN("NET/IB : Unable to query port %d", port);
- continue;
- }
- if (portAttr.state != IBV_PORT_ACTIVE) continue;
- if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
- && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
-
- // check against user specified HCAs/ports
- if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
- continue;
- }
- INFO(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
- portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
- ncclIbDevs[ncclNIbDevs].device = d;
- ncclIbDevs[ncclNIbDevs].port = port;
- ncclIbDevs[ncclNIbDevs].context = context;
- strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
- ncclNIbDevs++;
- found++;
- pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
- }
+ if (portAttr.state != IBV_PORT_ACTIVE) continue;
+ if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND
+ && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue;
- if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } }
+ // check against user specified HCAs/ports
+ if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) {
+ continue;
+ }
+ TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
+ portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+ ncclIbDevs[ncclNIbDevs].device = d;
+ ncclIbDevs[ncclNIbDevs].port = port;
+ ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
+ ncclIbDevs[ncclNIbDevs].context = context;
+ strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
+ ncclNIbDevs++;
+ found++;
+ pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
}
+ if (found == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
}
if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; };
}
+ if (ncclNIbDevs == 0) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found.");
+ } else {
+ char line[1024];
+ line[0] = '\0';
+ for (int d=0; d<ncclNIbDevs; d++) {
+ snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName,
+ ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+ }
+ line[1023] = '\0';
+ char addrline[1024];
+ INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s ; OOB %s:%s", line, ncclIbIfName, socketToString(&ncclIbIfAddr.sa, addrline));
+ }
pthread_mutex_unlock(&ncclIbLock);
}
return ncclSuccess;
@@ -205,11 +217,12 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
*supportedTypes = NCCL_PTR_HOST;
- int cudaDev;
+ int cudaDev, nvmlDev;
CUDACHECK(cudaGetDevice(&cudaDev));
+ NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
if (ncclIbGdrSupport(dev) != ncclSuccess) {
- INFO(NCCL_INIT|NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (no module or not supported by GPU)", cudaDev, ncclIbDevs[dev].devName);
+ INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d '%s' (no module or not supported by GPU)", cudaDev, nvmlDev, dev, ncclIbDevs[dev].devName);
return ncclSuccess;
}
*supportedTypes |= NCCL_PTR_CUDA;
@@ -242,23 +255,15 @@ struct ncclIbHandle {
union socketAddress connectAddr;
};
-struct ncclIbMr {
- struct ibv_mr* mr;
- int refcnt;
-};
-
struct ncclIbVerbs {
struct ibv_pd* pd;
struct ibv_cq* cq;
- struct ncclIbMr mrPool[MAX_REQUESTS];
- int mrRotation;
};
struct ncclIbRequest {
int used;
int type;
struct ncclIbVerbs* verbs;
- struct ncclIbMr * ibMr;
int done;
int size;
int free;
@@ -278,12 +283,12 @@ struct ncclIbSendFifo {
};
struct ncclIbSendComm {
+ struct ncclIbVerbs verbs;
struct ncclIbSendFifo fifo[MAX_REQUESTS];
struct ncclIbRequest reqs[MAX_REQUESTS];
uint32_t fifoHead;
int fd;
int ready;
- struct ncclIbVerbs verbs;
struct ibv_qp* qp;
struct ibv_mr* fifoMr;
};
@@ -307,11 +312,11 @@ struct ncclIbRemFifo {
};
struct ncclIbRecvComm {
+ struct ncclIbVerbs verbs;
struct ncclIbRemFifo remFifo;
struct ncclIbRequest reqs[MAX_REQUESTS];
int fd;
int ready;
- struct ncclIbVerbs verbs;
struct ibv_qp* qp;
struct ncclIbGpuFlush gpuFlush;
};
@@ -434,13 +439,13 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
// RoCE support
qpInfo.lid = portAttr.lid;
if (qpInfo.lid) { // IB
- INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
+ INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid);
} else { // RoCE
union ibv_gid gid;
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
qpInfo.spn = gid.global.subnet_prefix;
qpInfo.iid = gid.global.interface_id;
- INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
+ INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
}
NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo)));
@@ -537,7 +542,6 @@ ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest**
r->used = 1;
r->type = 0;
r->verbs = NULL;
- r->ibMr = NULL;
r->done = 0;
r->size = -1;
r->free = 0;
@@ -583,57 +587,34 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size);
#define REG_ALIGN (4096)
-// Cache previous MRs to avoid registering/unregistering for each Isend/Irecv
-ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct ncclIbMr** mrRet) {
+ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+ struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
uint64_t addr = (uint64_t)data;
- int elem = -1;
assert(size > 0);
- // Look for an already existing MR
- for (int i=0; i<MAX_REQUESTS; i++) {
- if (verbs->mrPool[i].mr == NULL) continue;
- uint64_t regAddr = (uint64_t)verbs->mrPool[i].mr->addr;
- uint64_t regSize = (uint64_t)verbs->mrPool[i].mr->length;
- if (regAddr <= addr && addr+size <= regAddr+regSize) {
- *mrRet = verbs->mrPool+i;
- verbs->mrPool[i].refcnt++;
- return ncclSuccess;
- }
- }
-
- // Find an unused element
- if (elem == -1) {
- elem = (verbs->mrRotation++);
- for (int i=0; i<MAX_REQUESTS; i++) {
- elem %= MAX_REQUESTS;
- if (verbs->mrPool[elem].refcnt > 0) elem++; else break;
- }
- if (verbs->mrPool[elem].refcnt > 0) {
- WARN("NET/IB : memory register : no MR available");
- return ncclInternalError;
- }
- }
-
- assert(elem < MAX_REQUESTS);
- assert(verbs->mrPool[elem].refcnt == 0);
-
// Deregister / register
uint64_t regAddr = addr & (~(REG_ALIGN-1));
uint64_t regSize = addr+size - regAddr;
regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN;
- if (verbs->mrPool[elem].mr) NCCLCHECK(wrap_ibv_dereg_mr(verbs->mrPool[elem].mr));
- NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
- *mrRet = verbs->mrPool+elem;
- verbs->mrPool[elem].refcnt++;
- TRACE(NCCL_INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey);
+ struct ibv_mr* mr;
+ NCCLCHECK(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+ *mhandle = (void*)mr;
+ TRACE(NCCL_INIT,"regAddr %lx size %ld rkey %x", regAddr, regSize, mr->rkey);
return ncclSuccess;
}
-ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** request) {
+ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
+ NCCLCHECK(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm));
if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
+ struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+
// Wait for the receiver to have posted the corresponding receive
volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS);
volatile uint32_t * readyPtr = &slot->ready;
@@ -641,7 +622,6 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void**
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
- req->type = type;
req->verbs = &comm->verbs;
req->size = size;
@@ -654,8 +634,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void**
wr.sg_list = NULL;
wr.num_sge = 0;
} else {
- NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
- sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+ sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey;
wr.sg_list = &sge;
wr.num_sge = 1;
}
@@ -720,14 +699,15 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t
return ncclSuccess;
}
-ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void** request) {
+ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm));
if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
+ struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
- req->type = type;
req->verbs = &comm->verbs;
req->size = size;
@@ -739,10 +719,8 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void**
if (size == 0) {
wr.sg_list = NULL;
wr.num_sge = 0;
- req->ibMr = NULL;
} else {
- NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr));
- sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey;
+ sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey;
wr.sg_list = &sge;
wr.num_sge = 1;
}
@@ -752,25 +730,25 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void**
*request = req;
// Post to FIFO to notify sender
- NCCLCHECK(ncclIbPostFifo(comm, req->ibMr->mr->rkey, (uint64_t)data, size));
+ NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size));
return ncclSuccess;
}
-ncclResult_t ncclIbFlush(void* recvComm, void* data, int size) {
+ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess;
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(comm->reqs, &req));
req->verbs = &comm->verbs;
- NCCLCHECK(ncclIbGetMr(&comm->verbs, data, 1, &req->ibMr));
+ struct ibv_mr* mr = (struct ibv_mr*)mhandle;
struct ibv_send_wr wr;
memset(&wr, 0, sizeof(wr));
wr.wr_id = (uint64_t)req;
wr.wr.rdma.remote_addr = (uint64_t)data;
- wr.wr.rdma.rkey = req->ibMr->mr->rkey;
+ wr.wr.rdma.rkey = mr->rkey;
wr.sg_list = &comm->gpuFlush.sge;
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_READ;
@@ -800,32 +778,31 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
}
int wrDone = 0;
- struct ibv_wc wc;
- NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 1, &wc, &wrDone));
+ struct ibv_wc wcs[4];
+ NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone));
if (wrDone == 0) return ncclSuccess;
- if (wc.status != IBV_WC_SUCCESS) {
- WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc.status, wc.opcode, wc.byte_len, wc.vendor_err);
- return ncclSystemError;
- }
+ for (int w=0; w<wrDone; w++) {
+ struct ibv_wc *wc = wcs+w;
+ if (wc->status != IBV_WC_SUCCESS) {
+ WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
+ return ncclSystemError;
+ }
- struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc.wr_id;
- if (doneReq) {
- if (wc.opcode == IBV_WC_RECV) {
- doneReq->size = wc.byte_len;
+ struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc->wr_id;
+ if (doneReq) {
+ if (wc->opcode == IBV_WC_RECV) {
+ doneReq->size = wc->byte_len;
#if USE_RDMA_WRITE
- } else if (wc.opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
- doneReq->size = wc.imm_data;
+ } else if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+ doneReq->size = wc->imm_data;
#endif
- }
- if (doneReq->ibMr != NULL) {
- doneReq->ibMr->refcnt--;
- if (doneReq->ibMr->refcnt < 0) WARN("NET/IB : doneReq %p MR %p refcount now %d", doneReq, doneReq->ibMr, doneReq->ibMr->refcnt);
- }
- doneReq->done = 1;
- if (doneReq->free == 1) {
- // This is an internal (FIFO post) req. Free it immediately.
- doneReq->used = 0;
+ }
+ doneReq->done = 1;
+ if (doneReq->free == 1) {
+ // This is an internal (FIFO post) req. Free it immediately.
+ doneReq->used = 0;
+ }
}
}
}
@@ -837,12 +814,6 @@ ncclResult_t ncclIbCloseSend(void* sendComm) {
close(comm->fd);
if (comm->qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qp));
if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
- for (int i=0; i<MAX_REQUESTS; i++) {
- if (comm->verbs.mrPool[i].mr != NULL) {
- if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : TX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt);
- NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr));
- }
- }
NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
free(comm);
}
@@ -859,12 +830,6 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) {
if (comm->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->gpuFlush.hostMr));
}
if (comm->remFifo.mr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remFifo.mr));
- for (int i=0; i<MAX_REQUESTS; i++) {
- if (comm->verbs.mrPool[i].mr != NULL) {
- if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : RX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt);
- NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr));
- }
- }
NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
free(comm);
}
@@ -889,6 +854,8 @@ ncclNet_t ncclNetIb = {
ncclIbListen,
ncclIbConnect,
ncclIbAccept,
+ ncclIbRegMr,
+ ncclIbDeregMr,
ncclIbIsend,
ncclIbIrecv,
ncclIbFlush,
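
/* The hunks above drop the per-request MR pool and make the opaque mhandle an
 * ibv_mr* that is registered once per buffer and passed to Isend/Irecv/Flush.
 * As a rough sketch only (the actual ncclIbRegMr body sits outside these hunks;
 * the helper names below are illustrative assumptions, not patch code), the
 * registration reduces to a single ibverbs call: */
#include <infiniband/verbs.h>

static struct ibv_mr* exampleRegMr(struct ibv_pd* pd, void* data, size_t size) {
  // Local write is needed for receives; remote read/write lets the peer use
  // the rkey posted through the FIFO (see ncclIbPostFifo above).
  return ibv_reg_mr(pd, data, size,
                    IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
}

static int exampleDeregMr(struct ibv_mr* mr) {
  return ibv_dereg_mr(mr);  // returns 0 on success, errno otherwise
}
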
diff --git a/src/transport/net_socket.cu b/src/transport/net_socket.cu
index 1efee15..0464b43 100644
--- a/src/transport/net_socket.cu
+++ b/src/transport/net_socket.cu
@@ -27,10 +27,19 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
pthread_mutex_lock(&ncclSocketLock);
if (ncclNetIfs == -1) {
ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
- INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs);
if (ncclNetIfs <= 0) {
WARN("NET/Socket : no interface found");
return ncclInternalError;
+ } else {
+ char line[1024];
+ char addrline[1024];
+ line[0] = '\0';
+ for (int i=0; i<ncclNetIfs; i++) {
+ snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE,
+ socketToString(&ncclNetIfAddrs[i].sa, addrline));
+ }
+ line[1023] = '\0';
+ INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
}
}
pthread_mutex_unlock(&ncclSocketLock);
@@ -113,7 +122,7 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
union socketAddress localAddr;
char ifName[MAX_IF_NAME_SIZE];
if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
- WARN("No usable listening interface found");
+ WARN("NET/Socket : No usable listening interface found");
return ncclSystemError;
}
// pass the local address back
@@ -205,21 +214,24 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
return ncclSuccess;
}
-ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int type, void** request) {
- if (type != NCCL_PTR_HOST) return ncclInternalError;
+ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) {
+ return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess;
+}
+ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
+
+ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_SEND, data, size, comm->fd, (struct ncclSocketRequest**)request));
return ncclSuccess;
}
-ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, int type, void** request) {
- if (type != NCCL_PTR_HOST) return ncclInternalError;
+ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_RECV, data, size, comm->fd, (struct ncclSocketRequest**)request));
return ncclSuccess;
}
-ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size) {
+ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) {
// We don't support CUDA pointers, so we don't need a flush operation
return ncclInternalError;
}
@@ -243,6 +255,8 @@ ncclNet_t ncclNetSocket = {
ncclSocketListen,
ncclSocketConnect,
ncclSocketAccept,
+ ncclSocketRegMr,
+ ncclSocketDeregMr,
ncclSocketIsend,
ncclSocketIrecv,
ncclSocketFlush,
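
/* Both transports now export regMr/deregMr entries in ncclNet_t, so a caller
 * registers a buffer once and reuses the returned mhandle for every transfer
 * on it. A minimal usage sketch against the socket-transport signatures shown
 * above (comm setup elided; sendComm, buffer and bytes are placeholders, not
 * code from this patch): */
ncclResult_t exampleSocketSend(void* sendComm, void* buffer, int bytes) {
  void* mhandle = NULL;
  void* request = NULL;
  int done = 0, size = 0;
  // Register once; the socket transport only accepts host pointers.
  NCCLCHECK(ncclSocketRegMr(sendComm, buffer, bytes, NCCL_PTR_HOST, &mhandle));
  // The same mhandle is passed to every Isend/Irecv on this buffer.
  NCCLCHECK(ncclSocketIsend(sendComm, buffer, bytes, mhandle, &request));
  do { NCCLCHECK(ncclSocketTest(request, &done, &size)); } while (!done);
  NCCLCHECK(ncclSocketDeregMr(sendComm, mhandle));
  return ncclSuccess;
}
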
diff --git a/src/transport/p2p.cu b/src/transport/p2p.cu
index 6c4626a..9f3e0b6 100644
--- a/src/transport/p2p.cu
+++ b/src/transport/p2p.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,18 +11,9 @@
#include "param.h"
#include <unistd.h>
#include <cuda_runtime.h>
-#include "nvmlwrap.h"
#include <ctype.h>
#include "nvlink.h"
-struct p2pInfo {
- int rank;
- int cudaDev;
- uint64_t hostHash;
- uint64_t pidHash;
- char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-};
-
struct p2pConnectInfo {
int direct;
union {
@@ -31,36 +22,40 @@ struct p2pConnectInfo {
};
};
-#include <sys/types.h>
+struct p2pSendResources {
+ struct ncclSendMem* devMem;
+ void* ipcPtr;
+};
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t p2pFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
- struct p2pInfo* info = (struct p2pInfo*)opaqueInfo;
- static_assert(sizeof(struct p2pInfo) <= sizeof(ncclTinfo_t), "p2p Info too large");
- info->rank = rank;
- CUDACHECK(cudaGetDevice(&info->cudaDev));
- info->hostHash=getHostHash();
- info->pidHash=getPidHash();
-
- // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
- // cudaDev is a CUDA runtime dev number which could be different from the
- // NVML device number. Then we get the busID from NVML to be sure it is
- // consistent with NVML remote PCI bus Ids.
- CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev));
- nvmlDevice_t nvmlDevice;
- NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice));
- nvmlPciInfo_t pciInfo;
- NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo));
- strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE);
- return ncclSuccess;
-}
+struct p2pRecvResources {
+ struct ncclRecvMem* devMem;
+ void* ipcPtr;
+};
+
+#include <sys/types.h>
NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
+/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
+static int busIdToCudaDev(const char* busId) {
+ int ndev;
+ if (cudaGetDeviceCount(&ndev) != cudaSuccess)
+ return -1;
+ for (int i = 0; i < ndev; i++) {
+ char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ if (cudaDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess)
+ return -1;
+ if (strcmp(busId, devBusId) == 0) {
+ return i;
+ }
+ }
+ // BusId was not found in our locally visible CUDA devices
+ return -1;
+}
+
/* Determine if we can communicate with the peer through p2p */
-ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
+ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
// Do not use P2P across root complexes by default (provided CUDA permits it)
int p2pLevel = PATH_SOC;
if (ncclParamP2pDisable() == 1) p2pLevel = 0;
@@ -70,23 +65,26 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
if (p2pLevel == 0) return ncclSuccess;
- struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
- struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
-
// Rule out different nodes
if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess;
+ // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+ int peerCudaDev = busIdToCudaDev(peerInfo->busId);
+ if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process
+
+ TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
+
// Do not detect topology if we're on the same GPU. Note this is not really supported.
- if (myInfo->cudaDev == peerInfo->cudaDev) {
+ if (myInfo->cudaDev == peerCudaDev) {
*ret = 1 + PATH_SOC;
return ncclSuccess;
}
// See if CUDA can do P2P
int p2p;
- if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) {
- INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d and dev %d",
- myInfo->cudaDev, peerInfo->cudaDev);
+ if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != cudaSuccess) {
+ INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)",
+ myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev);
return ncclSuccess;
}
if (p2p == 0) return ncclSuccess;
@@ -102,7 +100,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin
char* myPath;
char* peerPath;
ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath);
- ncclResult_t err2 = getCudaPath(peerInfo->cudaDev, &peerPath);
+ ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath);
if (err1 == ncclSuccess && err2 == ncclSuccess) {
int distance = pciDistance(myPath, peerPath);
if (distance < p2pLevel) {
@@ -174,8 +172,8 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR
static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) {
if (nrings == 0) return 0;
// Duplicate the existing rings to fill up to newNrings
- if (newNrings > MAXRINGS) {
- newNrings = MAXRINGS;
+ if (newNrings > MAXCHANNELS) {
+ newNrings = MAXCHANNELS;
}
for (int r=nrings; r<newNrings; r++) {
for (int i=0; i<nranks; i++) rings[r*nranks+i] = rings[(r%nrings)*nranks+i];
@@ -191,7 +189,6 @@ int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nrin
if (connect) {
inTheRing[rings[0]] = 1;
nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, rings[1], nranks-2, connect);
- nrings = copyRings(nranks, rings, nrings, nringsMax);
} else {
rings[0] = 0;
nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, 0, nranks-1, connect);
@@ -209,9 +206,9 @@ static inline int findConnect(int nranks, int* ranks) {
int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) {
if (nrings == 0) return 0;
- if (nrings > MAXRINGS) {
- WARN("Max rings reached, limiting to %d", MAXRINGS);
- nrings = MAXRINGS;
+ if (nrings > MAXCHANNELS) {
+ WARN("Max rings reached, limiting to %d", MAXCHANNELS);
+ nrings = MAXCHANNELS;
}
// Find existing constraints / connections
int connect = 0;
@@ -239,9 +236,9 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
if (compNrings && compNrings < nrings && nranks <= 4) {
// Try to oversubscribe to get a better result
- int *rings2 = (int *)malloc(sizeof(int)*MAXRINGS*nranks);
- if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXRINGS*nranks); return 0; }
- for (int i=0; i<MAXRINGS*nranks; i++) rings2[i] = -1;
+ int *rings2 = (int *)malloc(sizeof(int)*MAXCHANNELS*nranks);
+ if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXCHANNELS*nranks); return 0; }
+ for (int i=0; i<MAXCHANNELS*nranks; i++) rings2[i] = -1;
int nThreads = *nthreads;
int compNrings2 = p2pComputeRingsNvLink(values, nranks, rings2, nrings, prev, next, 1, &nThreads);
if (compNrings2 > compNrings*2) {
@@ -255,7 +252,6 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin
// Duplicate the rings for direct NVLink
compNrings = copyRings(nranks, rings, compNrings, compNrings*2);
- if (ncclCudaCompCap() == 6) *nthreads /= 2;
return compNrings;
}
@@ -367,8 +363,8 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings,
ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
if (*nringsRet == 0) return ncclSuccess;
int *rings;
- NCCLCHECK(ncclCalloc(&rings, MAXRINGS*nranks));
- for (int i=0; i<MAXRINGS*nranks; i++) rings[i] = -1;
+ NCCLCHECK(ncclCalloc(&rings, MAXCHANNELS*nranks));
+ for (int i=0; i<MAXCHANNELS*nranks; i++) rings[i] = -1;
int nrings = *nringsRet;
// NVswitch
@@ -446,39 +442,47 @@ end:
} while (0)
/* Send: Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
- struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
- struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+ struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+
+ struct p2pSendResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ send->transportResources = resources;
+ const int sendSize = sizeof(struct ncclSendMem);
+ NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize));
+
struct p2pConnectInfo info;
if (myInfo->pidHash == peerInfo->pidHash) {
info.direct = 1;
- info.directPtr = ring->devMemSend;
+ info.directPtr = resources->devMem;
if (myInfo->cudaDev == peerInfo->cudaDev) {
- INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank);
+ INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank);
} else {
// Enable P2P access
cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
- WARN("failed to peer with device %d: %d %s",
- peerInfo->cudaDev, err, cudaGetErrorString(err));
+ WARN("failed to peer with device %d(=%d): %d %s",
+ peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer",
- ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
}
} else {
+ // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+ int peerCudaDev = busIdToCudaDev(peerInfo->busId);
info.direct = 0;
// Map IPC and enable P2P access
- cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemSend);
+ cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
if (err != cudaSuccess) {
- WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
- myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+ WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
+ myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC",
- ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
//TRACE_DUMP_IPC(&info.devIpc);
}
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -487,13 +491,19 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
}
/* Create and return connect structures for this peer to connect to me */
-ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
- struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo;
- struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo;
+ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo,
+ struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) {
+
+ struct p2pRecvResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ recv->transportResources = resources;
+ const int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+ NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize));
+
struct p2pConnectInfo info;
if (myInfo->pidHash == peerInfo->pidHash) {
info.direct = 1;
- info.directPtr = ring->devMemRecv;
+ info.directPtr = resources->devMem;
if (myInfo->cudaDev == peerInfo->cudaDev) {
TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank);
} else {
@@ -502,22 +512,24 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
if (err == cudaErrorPeerAccessAlreadyEnabled) {
cudaGetLastError();
} else if (err != cudaSuccess) {
- WARN("failed to peer with device %d: %d %s",
- peerInfo->cudaDev, err, cudaGetErrorString(err));
+ WARN("failed to peer with device %d(=%d): %d %s",
+ peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
- TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
}
} else {
+ // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
+ int peerCudaDev = busIdToCudaDev(peerInfo->busId);
info.direct = 0;
// Map IPC and enable P2P access
- cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemRecv);
+ cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem);
if (err != cudaSuccess) {
- WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s",
- myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err));
+ WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s",
+ myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err));
return ncclInternalError;
}
- TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev);
//TRACE_DUMP_IPC(&info.devIpc);
}
static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
@@ -527,22 +539,16 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo
/* Connect/Send to this peer */
static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
- void** resources = &send->transportResources;
+ struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
struct ncclRecvMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
if (info->direct) {
remDevMem = (struct ncclRecvMem*)(info->directPtr);
send->conn.direct = 1;
- *resources = NULL;
} else {
- void* remPtr = NULL;
//TRACE_DUMP_IPC(&info->devIpc);
- cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
- void** ipcPtrSave;
- NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
- *resources = ipcPtrSave;
- *ipcPtrSave = remPtr;
- remDevMem = (struct ncclRecvMem*)remPtr;
+ cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+ remDevMem = (struct ncclRecvMem*)resources->ipcPtr;
if (err != cudaSuccess) {
WARN("failed to open CUDA IPC handle : %d %s",
err, cudaGetErrorString(err));
@@ -553,30 +559,26 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
send->conn.buff = remDevMem->buff;
send->conn.llBuff = remDevMem->llBuff;
send->conn.tail = &remDevMem->tail;
- send->conn.opCount = &remDevMem->opCount;
- // send->conn->head should have been set to devMemSend already
+ send->conn.opCountRem = &remDevMem->opCount;
+ send->conn.head = &resources->devMem->head;
+ send->conn.ptrExchange = &resources->devMem->ptrExchange;
+ send->conn.opCountLoc = &resources->devMem->opCount;
return ncclSuccess;
}
/* Connect/Recv from this peer */
ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
- void** resources = &recv->transportResources;
+ struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
struct ncclSendMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
if (info->direct) {
remDevMem = (struct ncclSendMem*)(info->directPtr);
recv->conn.direct = 1;
recv->conn.ptrExchange = &remDevMem->ptrExchange;
- *resources = NULL;
} else {
- void* remPtr = NULL;
//TRACE_DUMP_IPC(&info->devIpc);
- cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
- void** ipcPtrSave;
- NCCLCHECK(ncclCalloc(&ipcPtrSave, 1));
- *resources = ipcPtrSave;
- *ipcPtrSave = remPtr;
- remDevMem = (struct ncclSendMem*)remPtr;
+ cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
+ remDevMem = (struct ncclSendMem*)resources->ipcPtr;
if (err != cudaSuccess) {
WARN("failed to open CUDA IPC handle : %d %s",
err, cudaGetErrorString(err));
@@ -584,28 +586,35 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
}
}
- // recv->conn->buff should have been set to devMemRecv already
- // recv->conn->tail should have been set to devMemRecv already
- // recv->conn->opCount should have been set to devMemRecv already
+ recv->conn.buff = resources->devMem->buff;
+ recv->conn.llBuff = resources->devMem->llBuff;
+ recv->conn.tail = &resources->devMem->tail;
+ recv->conn.opCountLoc = &resources->devMem->opCount;
recv->conn.head = &remDevMem->head;
- recv->conn.llHead = &remDevMem->llHead;
+ recv->conn.opCountRem = &remDevMem->opCount;
return ncclSuccess;
}
-ncclResult_t p2pFree(void* resources) {
- if (resources != NULL) {
- void** ipcPtrSave = (void**) resources;
- CUDACHECK(cudaIpcCloseMemHandle(*ipcPtrSave));
- free(resources);
- }
+ncclResult_t p2pSendFree(void* resources) {
+ struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
+ if (sendRes->ipcPtr)
+ CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
+ CUDACHECK(cudaFree(sendRes->devMem));
+ return ncclSuccess;
+}
+
+ncclResult_t p2pRecvFree(void* resources) {
+ struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
+ if (recvRes->ipcPtr)
+ CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
+ CUDACHECK(cudaFree(recvRes->devMem));
return ncclSuccess;
}
struct ncclTransport p2pTransport = {
"P2P",
- p2pFillInfo,
p2pCanConnect,
p2pGetRings,
- { p2pSendSetup, p2pSendConnect, p2pFree, NULL },
- { p2pRecvSetup, p2pRecvConnect, p2pFree, NULL }
+ { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
+ { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
};
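
/* For the cross-process case, p2pSendSetup/p2pRecvSetup export the freshly
 * allocated devMem through a CUDA IPC handle, and the connect functions map it
 * with cudaIpcOpenMemHandle. Stripped of NCCL's bookkeeping, the round trip
 * looks roughly like this (a sketch with illustrative names, not patch code): */
#include <cuda_runtime.h>

// Exporter: allocate device memory and produce a handle to ship to the peer
// (NCCL carries it inside struct p2pConnectInfo).
cudaError_t exportBuffer(void** devPtr, size_t bytes, cudaIpcMemHandle_t* handle) {
  cudaError_t err = cudaMalloc(devPtr, bytes);
  if (err != cudaSuccess) return err;
  return cudaIpcGetMemHandle(handle, *devPtr);
}

// Importer: map the peer's allocation into this process's address space.
cudaError_t importBuffer(void** remotePtr, cudaIpcMemHandle_t handle) {
  return cudaIpcOpenMemHandle(remotePtr, handle, cudaIpcMemLazyEnablePeerAccess);
}

// Teardown mirrors p2pSendFree/p2pRecvFree: cudaIpcCloseMemHandle the mapping
// on the importer, then cudaFree the allocation on the exporter.
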
diff --git a/src/transport/shm.cu b/src/transport/shm.cu
index 317f652..56e0242 100644
--- a/src/transport/shm.cu
+++ b/src/transport/shm.cu
@@ -12,13 +12,6 @@
#include <unistd.h>
#include <cuda_runtime.h>
-struct shmInfo {
- int rank;
- int cudaDev;
- uint64_t hostHash;
- uint64_t pidHash;
-};
-
struct shmSendConnectInfo {
uint64_t pidHash;
int id;
@@ -51,24 +44,10 @@ struct shmRecvResources {
struct ncclRecvMem* devHostMem;
};
-/* Fill information necessary to exchange between ranks to choose whether or not
- * to use this transport */
-ncclResult_t shmFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
- struct shmInfo* info = (struct shmInfo*)opaqueInfo;
- static_assert(sizeof(struct shmInfo) <= sizeof(ncclTinfo_t), "shm Info too large");
- info->rank = rank;
- CUDACHECK(cudaGetDevice(&info->cudaDev));
- info->hostHash=getHostHash();
- info->pidHash=getPidHash();
- return ncclSuccess;
-}
-
NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
/* Determine if we can communicate with the peer */
-ncclResult_t shmCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) {
- struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
- struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo;
+ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) {
*ret = ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) ? 0 : 1;
return ncclSuccess;
}
@@ -88,7 +67,7 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid)
}
ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) {
- if (*nringsRet == MAXRINGS) *nringsRet = 1;
+ if (*nringsRet == MAXCHANNELS) *nringsRet = 1;
int nGroups = groups[nranks-1] + 1;
int starts[nGroups];
int ends[nGroups];
@@ -156,43 +135,40 @@ ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t*
#define MAX_SHM_NAME_LEN 1024
/* Create and return connect structures for this peer to connect to me */
-ncclResult_t shmSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
- struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
- struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo;
+ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
struct shmSendResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
- ring->send.transportResources = resources;
+ send->transportResources = resources;
struct shmRecvConnectInfo info;
char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
+ sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
- INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
- info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
+ INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev);
+ info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo));
return ncclSuccess;
}
-ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) {
- struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo;
+ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
struct shmRecvResources* resources;
NCCLCHECK(ncclCalloc(&resources, 1));
- ring->recv.transportResources = resources;
+ recv->transportResources = resources;
struct shmSendConnectInfo info;
char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank);
- info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize;
+ sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank);
+ info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize;
TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
- info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
+ info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash;
static_assert(sizeof(struct shmSendConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo));
return ncclSuccess;
@@ -216,10 +192,10 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
send->conn.buff = resources->devRemHostMem->buff;
send->conn.llBuff = resources->devRemHostMem->llBuff;
send->conn.tail = &resources->devRemHostMem->tail;
- send->conn.opCount = &resources->devRemHostMem->opCount;
+ send->conn.opCountRem = &resources->devRemHostMem->opCount;
send->conn.head = &resources->devHostMem->head;
- send->conn.llHead = &resources->devHostMem->llHead;
+ send->conn.opCountLoc = &resources->devHostMem->opCount;
return ncclSuccess;
}
@@ -235,12 +211,12 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
NCCLCHECK(shmUnlink(shmName));
recv->conn.head = &resources->devRemHostMem->head;
- recv->conn.llHead = &resources->devRemHostMem->llHead;
+ recv->conn.opCountRem = &resources->devRemHostMem->opCount;
recv->conn.buff = resources->devHostMem->buff;
recv->conn.llBuff = resources->devHostMem->llBuff;
recv->conn.tail = &resources->devHostMem->tail;
- recv->conn.opCount = &resources->devHostMem->opCount;
+ recv->conn.opCountLoc = &resources->devHostMem->opCount;
return ncclSuccess;
}
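
/* Both sides rely on a shmOpen helper (defined elsewhere in the tree, not in
 * this diff) that creates the named segment and hands back a host pointer plus
 * a device-visible alias (the devHostMem used above). A hedged sketch of that
 * conceptual shape, create-side only, with illustrative names and simplified
 * error handling: */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cuda_runtime.h>

static int exampleShmOpen(const char* name, size_t size, void** hostPtr, void** devPtr) {
  int fd = shm_open(name, O_CREAT | O_RDWR, 0600);
  if (fd < 0) return -1;
  if (ftruncate(fd, size) != 0) { close(fd); return -1; }
  void* p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  close(fd);  // the mapping keeps the segment alive
  if (p == MAP_FAILED) return -1;
  // Pin the mapping and obtain a device pointer the GPU can dereference.
  if (cudaHostRegister(p, size, cudaHostRegisterMapped) != cudaSuccess) return -1;
  if (cudaHostGetDevicePointer(devPtr, p, 0) != cudaSuccess) return -1;
  *hostPtr = p;
  return 0;
}
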
@@ -262,7 +238,6 @@ ncclResult_t shmRecvFree(void* transportResources) {
struct ncclTransport shmTransport = {
"SHM",
- shmFillInfo,
shmCanConnect,
shmGetRings,
{ shmSendSetup, shmSendConnect, shmSendFree, NULL },