diff options
author | Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> | 2020-01-07 02:48:51 +0300 |
---|---|---|
committer | Marcin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> | 2020-01-07 02:49:13 +0300 |
commit | 7bed5ac058c3ca3c4617d133f3240b6981f95428 (patch) | |
tree | 49e7c6b2e08790b5d7b9eda124ce6cd130c87f1f | |
parent | 8e3a3f7c5b520babff49cec54a866fa3eda3a3b6 (diff) | |
parent | 3899f6e0f219b0cbf58537f791b0dd104b377750 (diff) |
merge with newest master
102 files changed, 7275 insertions, 4727 deletions
diff --git a/LICENSE.txt b/LICENSE.txt index 3593a7a..e318c66 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,5 +1,5 @@ - Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -55,7 +55,7 @@ To install NCCL on the system, create a package then install it as root. Debian/Ubuntu : ```shell $ # Install tools to create debian packages -$ sudo apt install build-essential devscripts debhelper +$ sudo apt install build-essential devscripts debhelper fakeroot $ # Build NCCL deb package $ make pkg.debian.build $ ls build/pkg/deb/ @@ -89,4 +89,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g <ngpus> ## Copyright -All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. diff --git a/ext-net/dummy/Makefile b/ext-net/dummy/Makefile index d1eb4c5..efa841c 100644 --- a/ext-net/dummy/Makefile +++ b/ext-net/dummy/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/ext-net/dummy/plugin.c b/ext-net/dummy/plugin.c index f11b365..67d7d88 100644 --- a/ext-net/dummy/plugin.c +++ b/ext-net/dummy/plugin.c @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/makefiles/common.mk b/makefiles/common.mk index a81a1b5..37e81be 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -15,6 +15,7 @@ PROFAPI ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev) CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) @@ -24,8 +25,7 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) # Better define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. -CUDA8_GENCODE = -gencode=arch=compute_30,code=sm_30 \ - -gencode=arch=compute_35,code=sm_35 \ +CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \ -gencode=arch=compute_50,code=sm_50 \ -gencode=arch=compute_60,code=sm_60 \ -gencode=arch=compute_61,code=sm_61 @@ -42,9 +42,13 @@ else endif #$(info NVCC_GENCODE is ${NVCC_GENCODE}) -CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden -CXXFLAGS += -Wall -Wno-sign-compare -Wno-unused-function -NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all +CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden +CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla +CXXFLAGS += -I $(CUDA_INC) +# Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) +# 512 : 120, 640 : 96, 768 : 80, 1024 : 60 +# We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all # Use addprefix so that we can specify more than one path NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt @@ -67,7 +71,7 @@ CXXFLAGS += -O0 -g -ggdb3 endif ifneq ($(VERBOSE), 0) -NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra +NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter CXXFLAGS += -Wall -Wextra else .SILENT: diff --git a/makefiles/formatting.mk b/makefiles/formatting.mk index 4a4ab88..a543131 100644 --- a/makefiles/formatting.mk +++ b/makefiles/formatting.mk @@ -1,5 +1,5 @@ # -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/makefiles/version.mk b/makefiles/version.mk index a8c6e3a..80f6221 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 4 -NCCL_PATCH := 2 +NCCL_MINOR := 5 +NCCL_PATCH := 6 NCCL_SUFFIX := -PKG_REVISION := 1 +PKG_REVISION := 2 diff --git a/pkg/Makefile b/pkg/Makefile index 04b23da..ab6487b 100644 --- a/pkg/Makefile +++ b/pkg/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/pkg/debian/Makefile b/pkg/debian/Makefile index 439635f..0494f3e 100644 --- a/pkg/debian/Makefile +++ b/pkg/debian/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -16,13 +16,8 @@ DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN) DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES)) PKG_TIMESTAMP := $(shell date -R) -ARCH := $(shell uname -m) -PKG_ARCH ?= $(shell uname -m | sed -e "s/x86_64/amd64/g" | sed -e "s/ppc64le/ppc64el/g") -PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch) -ifeq ($(PKG_MULTIARCH),) -# Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it -PKG_MULTIARCH := $(ARCH)-linux-gnu -endif +PKG_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH) +PKG_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) prep : $(DEBTARGETS) $(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR) diff --git a/pkg/redhat/Makefile b/pkg/redhat/Makefile index ffcc973..0808478 100644 --- a/pkg/redhat/Makefile +++ b/pkg/redhat/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in index f9d83a3..5fad346 100644 --- a/pkg/redhat/nccl.spec.in +++ b/pkg/redhat/nccl.spec.in @@ -7,6 +7,7 @@ Group: Development/Libraries License: BSD URL: http://developer.nvidia.com/nccl Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz +Prereq: /sbin/ldconfig %description NCCL (pronounced "Nickel") is a stand-alone library of standard collective @@ -50,6 +51,12 @@ ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so # static install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir} +%post -p /sbin/ldconfig +%postun -p /sbin/ldconfig + +%post devel -p /sbin/ldconfig +%postun devel -p /sbin/ldconfig + %clean rm -rf $RPM_BUILD_ROOT diff --git a/pkg/srctxz/Makefile b/pkg/srctxz/Makefile index ed677fe..01cab95 100644 --- a/pkg/srctxz/Makefile +++ b/pkg/srctxz/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/pkg/srctxz/create_srctxz.sh.in b/pkg/srctxz/create_srctxz.sh.in index ae7d01f..11bdd52 100644 --- a/pkg/srctxz/create_srctxz.sh.in +++ b/pkg/srctxz/create_srctxz.sh.in @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/pkg/txz/Makefile b/pkg/txz/Makefile index fa587ef..b7d9aa5 100644 --- a/pkg/txz/Makefile +++ b/pkg/txz/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/pkg/txz/create_txz.sh.in b/pkg/txz/create_txz.sh.in index 73922e0..deae854 100644 --- a/pkg/txz/create_txz.sh.in +++ b/pkg/txz/create_txz.sh.in @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/src/Makefile b/src/Makefile index fe60b11..b11de5e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -9,34 +9,40 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h -LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \ - misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \ - transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \ - collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu +LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \ + misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \ + transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \ + collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \ + graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc ##### lib files LIBNAME := libnccl.so STATICLIBNAME := libnccl_static.a +##### pkgconfig files +PKGCONFIGFILE := nccl.pc ##### dirs BUILDDIR ?= $(abspath ../build) INCDIR := $(BUILDDIR)/include LIBDIR := $(BUILDDIR)/lib OBJDIR := $(BUILDDIR)/obj +PKGDIR := $(BUILDDIR)/lib/pkgconfig ##### target files +CUDARTLIB ?= cudart_static INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%) LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR)) LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) STATICLIBTARGET := $(STATICLIBNAME) -LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o) +PKGTARGET := $(PKGCONFIGFILE) +LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) -LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl +LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a ##### rules build : lib staticlib -lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) +lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET) staticlib : $(LIBDIR)/$(STATICLIBTARGET) @@ -53,7 +59,7 @@ $(INCDIR)/nccl.h : nccl.h.in # NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z)) @$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH))) mkdir -p $(INCDIR) - printf "Generating %-35s > %s\n" $< $@ + @printf "Generating %-35s > %s\n" $< $@ sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ @@ -77,37 +83,53 @@ $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB) ar cr $@ $(LIBOBJ) $(TMP)/*.o rm -Rf $(TMP) +$(PKGDIR)/nccl.pc : nccl.pc.in + mkdir -p $(PKGDIR) + @printf "Generating %-35s > %s\n" $< $@ + sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \ + -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ + -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ + -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ + $< > $@ + $(INCDIR)/%.h : %.h @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(INCDIR) - cp -f $< $@ + install -m 644 $< $@ $(INCDIR)/nccl_%.h : include/nccl_%.h @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(INCDIR) - cp -f $< $@ + install -m 644 $< $@ + +$(PKGDIR)/%.pc : %.pc + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(PKGDIR) + install -m 644 $< $@ -$(OBJDIR)/%.o : %.cu +$(OBJDIR)/%.o : %.cc @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` - $(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@ - @$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp) + $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@ + @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) @rm -f $(@:%.o=%.d.tmp) clean : - rm -rf ${INCDIR} ${LIBDIR} ${OBJDIR} $(MAKE) -C collectives/device clean + rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} install : lib mkdir -p $(PREFIX)/lib + mkdir -p $(PREFIX)/lib/pkgconfig mkdir -p $(PREFIX)/include - cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/ + cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/ + cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/ cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ -FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h') +FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h') # Note that formatting.mk defines a new target so in order to not overwrite the default target, # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well # as the BUILDDIR variable. diff --git a/src/bootstrap.cu b/src/bootstrap.cc index 6b1d573..11ffc35 100644 --- a/src/bootstrap.cu +++ b/src/bootstrap.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -9,48 +9,142 @@ #include "utils.h" #include "bootstrap.h" #include "net.h" +#include "socket.h" #include <unistd.h> #include <sys/types.h> -// Always use sockets for bootstrap -ncclNet_t* ncclBootstrapNet = &ncclNetSocket; +struct bootstrapNetComm { + int fd; +}; -static ncclResult_t bootstrapNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; } -static ncclResult_t bootstrapNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; } -static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; } -static ncclResult_t bootstrapNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; } -static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; } -static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; } -static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; } +/* Init functions */ +static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS]; +static union socketAddress bootstrapNetIfAddrs[MAX_IFS]; +static int bootstrapNetIfs = -1; +pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; + +ncclResult_t bootstrapNetInit() { + if (bootstrapNetIfs == -1) { + pthread_mutex_lock(&bootstrapNetLock); + if (bootstrapNetIfs == -1) { + bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS); + if (bootstrapNetIfs <= 0) { + WARN("Bootstrap : no socket interface found"); + return ncclInternalError; + } else { + char line[1024]; + char addrline[1024]; + line[0] = '\0'; + for (int i=0; i<bootstrapNetIfs; i++) { + snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, bootstrapNetIfNames+i*MAX_IF_NAME_SIZE, + socketToString(&bootstrapNetIfAddrs[i].sa, addrline)); + } + line[1023] = '\0'; + INFO(NCCL_INIT, "Bootstrap : Using%s", line); + } + } + pthread_mutex_unlock(&bootstrapNetLock); + } + return ncclSuccess; +} + +static ncclResult_t bootstrapNetNewComm(struct bootstrapNetComm** comm) { + NCCLCHECK(ncclCalloc(comm, 1)); + (*comm)->fd = -1; + return ncclSuccess; +} + +static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) { + if (dev >= bootstrapNetIfs) return ncclInternalError; + memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr)); + return ncclSuccess; +} + +/* Socket Interface Selection type */ +enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 }; + +static ncclResult_t bootstrapNetListen(int dev, ncclNetHandle_t* netHandle, void** listenComm) { + union socketAddress* connectAddr = (union socketAddress*) netHandle; + static_assert(sizeof(union socketAddress) < NCCL_NET_HANDLE_MAXSIZE, "union socketAddress size is too large"); + // if dev >= 0, listen based on dev + if (dev >= 0) { + NCCLCHECK(bootstrapNetGetSocketAddr(dev, connectAddr)); + } else if (dev == findSubnetIf) { + // handle stores a remote address + // need to find a local addr that is in the same network as the remote addr + union socketAddress localAddr; + char ifName[MAX_IF_NAME_SIZE]; + if (findInterfaceMatchSubnet(ifName, &localAddr, connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) { + WARN("NET/Socket : No usable listening interface found"); + return ncclSystemError; + } + // pass the local address back + memcpy(connectAddr, &localAddr, sizeof(localAddr)); + } // Otherwise, handle stores a local address + struct bootstrapNetComm* comm; + NCCLCHECK(bootstrapNetNewComm(&comm)); + NCCLCHECK(createListenSocket(&comm->fd, connectAddr)); + *listenComm = comm; + return ncclSuccess; +} + +static ncclResult_t bootstrapNetConnect(int dev, ncclNetHandle_t* netHandle, void** sendComm) { + union socketAddress* connectAddr = (union socketAddress*) netHandle; + struct bootstrapNetComm* comm; + NCCLCHECK(bootstrapNetNewComm(&comm)); + NCCLCHECK(connectAddress(&comm->fd, connectAddr)); + *sendComm = comm; + return ncclSuccess; +} -// Additional sync functions based on async + test for bootstrap, using host ptrs. +static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) { + struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm; + struct bootstrapNetComm* rComm; + NCCLCHECK(bootstrapNetNewComm(&rComm)); + struct sockaddr_in sockaddr; + socklen_t socklen = sizeof(struct sockaddr_in); + SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd); + *recvComm = rComm; + return ncclSuccess; +} + +static ncclResult_t bootstrapNetClose(void* opaqueComm) { + struct bootstrapNetComm* comm = (struct bootstrapNetComm*)opaqueComm; + if (comm) { + close(comm->fd); + free(comm); + } + return ncclSuccess; +} + +static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(bootstrapNetClose(sendComm)); return ncclSuccess; } +static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(bootstrapNetClose(recvComm)); return ncclSuccess; } +static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(bootstrapNetClose(listenComm)); return ncclSuccess; } + +// Additional sync functions static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) { - void* request, *mhandle; - NCCLCHECK(ncclBootstrapNet->regMr(sendComm, data, size, NCCL_PTR_HOST, &mhandle)); - NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, mhandle, &request)); - NCCLCHECK(ncclBootstrapNet->deregMr(sendComm, mhandle)); - int done = 0; - while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL)); + struct bootstrapNetComm* comm = (struct bootstrapNetComm*)sendComm; + NCCLCHECK(socketSend(comm->fd, &size, sizeof(int))); + NCCLCHECK(socketSend(comm->fd, data, size)); return ncclSuccess; } static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) { - void* request, *mhandle; - NCCLCHECK(ncclBootstrapNet->regMr(recvComm, data, size, NCCL_PTR_HOST, &mhandle)); - NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, mhandle, &request)); - NCCLCHECK(ncclBootstrapNet->deregMr(recvComm, mhandle)); - int done = 0; - while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL)); + struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm; + int recvSize; + NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int))); + if (recvSize > size) { + WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size); + return ncclInternalError; + } + NCCLCHECK(socketReceive(comm->fd, data, std::min(recvSize, size))); return ncclSuccess; } -struct extId { - ncclNetHandle_t extHandleRoot; - void* extListenComm; - uint64_t hostHash; - pid_t pid; - int fd; - pthread_t boostrapThread; -}; +ncclResult_t bootstrapNetCreateHandle(ncclNetHandle_t* netHandle, const char* str) { + union socketAddress* connectAddr = (union socketAddress*) netHandle; + NCCLCHECK(GetSocketAddrFromString(connectAddr, str)); + return ncclSuccess; +} struct extInfo { int rank; @@ -69,9 +163,8 @@ static ncclResult_t setFilesLimit() { return ncclSuccess; } -static void *bootstrapRoot(void* commId) { +static void *bootstrapRoot(void* listenComm) { struct extInfo info; - struct extId* id = (struct extId*)commId; ncclNetHandle_t *rankHandles = NULL; ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange ncclNetHandle_t zero = { 0 }; // for sanity checking @@ -83,7 +176,7 @@ static void *bootstrapRoot(void* commId) { /* Receive addresses from all ranks */ int nranks = 0, c = 0; do { - NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out); + NCCLCHECKGOTO(bootstrapNetAccept(listenComm, &tmpComm), res, out); NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out); NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out); @@ -108,22 +201,22 @@ static void *bootstrapRoot(void* commId) { memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t)); ++c; + TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks); } while (c < nranks); - TRACE(NCCL_INIT, "COLLECTED HANDLES"); + TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks); // Send the connect handle for the next rank in the AllGather ring for (int r=0; r<nranks; ++r) { int next = (r+1) % nranks; void *tmpSendComm; - NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot[r], &tmpSendComm), res, out); + NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out); NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out); NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out); } - TRACE(NCCL_INIT, "SENT OUT HANDLES"); + TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks); out: - bootstrapNetCloseListen(id->extListenComm); - free(commId); + bootstrapNetCloseListen(listenComm); if (rankHandles) free(rankHandles); if (rankHandlesRoot) free(rankHandlesRoot); @@ -131,31 +224,28 @@ out: return NULL; } -ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) { - struct extId* id = (struct extId*)commId; - id->hostHash = getHostHash(); - NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm)); - ncclUniqueId* threadIdCopy; - NCCLCHECK(ncclCalloc(&threadIdCopy, 1)); - memcpy(threadIdCopy, id, sizeof(ncclUniqueId)); - pthread_create(&id->boostrapThread, NULL, bootstrapRoot, (void *)threadIdCopy); +ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) { + ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id; + void* listenComm; + NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm)); + pthread_t thread; + pthread_create(&thread, NULL, bootstrapRoot, listenComm); return ncclSuccess; } -ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) { - static_assert(sizeof(extId) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId"); - extId* id = (extId*)out; +ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) { + static_assert(sizeof(ncclNetHandle_t) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId"); + memset(id, 0, sizeof(ncclUniqueId)); + ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id; char* env = getenv("NCCL_COMM_ID"); if (env) { - if (ncclSocketCreateHandle(&id->extHandleRoot, env) != 0) { + if (bootstrapNetCreateHandle(netHandle, env) != 0) { WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>"); return ncclInvalidArgument; } - id->pid = -1; } else { - id->pid = getpid(); - NCCLCHECK(bootstrapCreateRoot(out, false)); + NCCLCHECK(bootstrapCreateRoot(id, false)); } return ncclSuccess; @@ -178,9 +268,9 @@ struct extState { int dev; }; -ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** commState) { - struct extId* id = (struct extId*)commId; - bool idFromEnv = id->pid < 0; +ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) { + ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id; + bool idFromEnv = getenv("NCCL_COMM_ID") != NULL; struct extState* state; NCCLCHECK(ncclCalloc(&state, 1)); state->rank = rank; @@ -195,8 +285,8 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co void *tmpSendComm, *tmpRecvComm; // Pass the remote address to listen via info if (idFromEnv) { - memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t)); - memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t)); + memcpy(&info.extHandleListen, netHandle, sizeof(ncclNetHandle_t)); + memcpy(&info.extHandleListenRoot, netHandle, sizeof(ncclNetHandle_t)); } // listen will return the local address via info (specify interface type 'findSubnetIf') state->dev = idFromEnv ? findSubnetIf : 0; @@ -215,7 +305,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co } // send info on my listening socket to root - NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm)); + NCCLCHECK(bootstrapNetConnect(state->dev, netHandle, &tmpSendComm)); NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info))); NCCLCHECK(bootstrapNetCloseSend(tmpSendComm)); @@ -226,7 +316,7 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm)); NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot)); - NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm)); + NCCLCHECK(bootstrapNetConnect(state->dev, &extHandleNext, &state->extBstrapRingSendComm)); // Accept the connect request from the previous rank in the AllGather ring NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm)); @@ -269,7 +359,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) { struct extState* state = (struct extState*)commState; void* tmpSendComm; - NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], &tmpSendComm)); + NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles+peer, &tmpSendComm)); NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int))); NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size)); NCCLCHECK(bootstrapNetCloseSend(tmpSendComm)); @@ -357,3 +447,13 @@ ncclResult_t bootstrapClose(void* commState) { return ncclSuccess; } + +ncclResult_t bootstrapAbort(void* commState) { + struct extState* state = (struct extState*)commState; + bootstrapNetCloseListen(state->extBstrapListenComm); + bootstrapNetCloseSend(state->extBstrapRingSendComm); + bootstrapNetCloseRecv(state->extBstrapRingRecvComm); + free(state->peerBstrapHandles); + free(state); + return ncclSuccess; +} diff --git a/src/channel.cu b/src/channel.cc index 937e84e..b053e5b 100644 --- a/src/channel.cu +++ b/src/channel.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -47,5 +47,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources)); if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources)); } + + // Free the peer structures. + CUDACHECK(cudaFree(channel->devPeers)); + free(channel->peers); + return ncclSuccess; } diff --git a/src/collectives/all_gather.cu b/src/collectives/all_gather.cc index db21dee..348c176 100644 --- a/src/collectives/all_gather.cu +++ b/src/collectives/all_gather.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/all_reduce.cu b/src/collectives/all_reduce.cc index 1492c90..7796d5b 100644 --- a/src/collectives/all_reduce.cu +++ b/src/collectives/all_reduce.cc @@ -1,11 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "enqueue.h" -#include "collectives.h" NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); diff --git a/src/collectives/broadcast.cu b/src/collectives/broadcast.cc index 6a3d0a8..042301b 100644 --- a/src/collectives/broadcast.cu +++ b/src/collectives/broadcast.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/Makefile b/src/collectives/device/Makefile index 8e92596..001059c 100644 --- a/src/collectives/device/Makefile +++ b/src/collectives/device/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -68,4 +68,4 @@ $(DEVOBJ) : $(LIBOBJ) $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ clean: - rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(STATICLIB) test + rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB) diff --git a/src/collectives/device/all_gather.cu b/src/collectives/device/all_gather.cu index 530bf14..109c341 100644 --- a/src/collectives/device/all_gather.cu +++ b/src/collectives/device/all_gather.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index 36809c9..0ad5ba9 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -1,33 +1,33 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" template<int UNROLL, class FUNC, typename T> __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int nthreads = blockDim.x - 1; + const int nthreads = args->nThreads-WARP_SIZE; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; const int nranks = comm->nRanks; const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); - const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS; + const int chunkSize = stepSize * ALLGATHER_CHUNKSTEPS; const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC> - prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount); + ncclPrimitives<UNROLL, ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS, T, 1, 1, FUNC> + prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); @@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; @@ -129,3 +129,67 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) { template<int UNUSED, class FUNC, typename T> __device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { } + +#include "prims_ll128.h" +template<int UNUSED, class FUNC, typename T> +__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int bid = args->bid; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; + //const int rank = comm->rank; + const int nranks = comm->nRanks; + ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. + const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2; + + const ssize_t loopSize = args->nChannels*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize); + + ssize_t chunkOffset = gridOffset + bid*chunkSize; + + /////////////// begin AllGather steps /////////////// + ssize_t offset; + int nelem = min(chunkSize, size-chunkOffset); + int rankDest; + + // step 0: push data to next GPU + rankDest = ring->devUserRanks[0]; + offset = chunkOffset + rankDest * size; + + if (thisInput + chunkOffset == thisOutput + offset) { // In place + LLprims.send(thisInput+chunkOffset, nelem); + } else { + LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem); + } + + // k-2 steps: copy to next GPU + for (int j=1; j<nranks-1; ++j) { + rankDest = ring->devUserRanks[nranks-j]; + offset = chunkOffset + rankDest * size; + + LLprims.recvCopySend(thisOutput+offset, nelem); + } + + // step k-1: final store + rankDest = ring->devUserRanks[1]; + offset = chunkOffset + rankDest * size; + + LLprims.recv(thisOutput+offset, nelem); + } +} + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu index aaa96b4..85d007e 100644 --- a/src/collectives/device/all_reduce.cu +++ b/src/collectives/device/all_reduce.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index ea89a71..2449c2b 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -1,19 +1,19 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" template<int UNROLL, class FUNC, typename T> __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int nthreads = blockDim.x - 1; + const int nthreads = args->nThreads-WARP_SIZE; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -27,7 +27,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { T * __restrict__ thisOutput = (T*)args->ThisOutput; ncclPrimitives<UNROLL, ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS, T, 1, 1, FUNC> - prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount); + prims(tid, args->nThreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) { int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels)); @@ -85,23 +85,28 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { template<int UNROLL, class FUNC, typename T> __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int nthreads = blockDim.x - 1; + const int nthreads = args->nThreads-WARP_SIZE; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; - struct ncclTree* tree = &channel->tree; const ssize_t size = args->N; const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); - const int chunkSize = args->lastChunkSize; + int chunkSize = args->lastChunkSize; + const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T); const ssize_t loopSize = args->nChannels*chunkSize; + if (loopSize > size) { + chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize; + } + // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; do { + struct ncclTree* tree = &channel->treeUp; // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) - ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount); + ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { // Up ssize_t offset = gridOffset + bid*chunkSize; @@ -117,8 +122,9 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) { } while(0); do { + struct ncclTree* tree = &channel->treeDn; // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) - ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount); + ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { // Down ssize_t offset = gridOffset + bid*chunkSize; @@ -139,7 +145,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; @@ -149,6 +155,8 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { //const int rank = comm->rank; const int nranks = comm->nRanks; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); + const ssize_t minChunkSize = nthreads * (sizeof(uint64_t)) / sizeof(T); + const ssize_t loopSize = args->nChannels*nranks*chunkSize; // Compute pointers @@ -156,10 +164,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { T * __restrict__ thisOutput = (T*)args->ThisOutput; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - if (size-gridOffset < loopSize) { - chunkSize = args->lastChunkSize; - } - ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize; + chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize); /////////////// begin AllReduce steps /////////////// ssize_t offset; @@ -168,7 +173,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { // step 0: push data to next GPU slice = ring->devUserRanks[nranks-1]; - offset = chunkOffset + slice * chunkSize; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.send(thisInput+offset, nelem); @@ -176,7 +181,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { // k-2 steps: reduce and copy to next GPU for (int j=2; j<nranks; ++j) { slice = ring->devUserRanks[nranks-j]; - offset = chunkOffset + slice * chunkSize; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.recvReduceSend(thisInput+offset, nelem); @@ -185,7 +190,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { // step k-1: reduce this buffer and data, which will produce the final // result that we store in this data and push to the next GPU slice = ring->devUserRanks[0]; - offset = chunkOffset + slice * chunkSize; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem); @@ -193,7 +198,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { // k-2 steps: copy to next GPU for (int j=1; j<nranks-1; ++j) { slice = ring->devUserRanks[nranks-j]; - offset = chunkOffset + slice * chunkSize; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); LLprims.recvCopySend(thisOutput+offset, nelem); @@ -201,7 +206,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { // Make final copy from buffer to dest. slice = ring->devUserRanks[1]; - offset = chunkOffset + slice * chunkSize; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; nelem = min(chunkSize, size-offset); // Here we need to copy from buffer to this output. @@ -214,18 +219,23 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = args->nThreads; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; - struct ncclTree* tree = &channel->tree; const ssize_t size = args->N; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); + const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T); const ssize_t loopSize = args->nChannels*chunkSize; + if (loopSize > size) { + chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize; + } + // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; do { + struct ncclTree* tree = &channel->treeUp; // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) ncclLLPrimitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { @@ -243,6 +253,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { } while(0); do { + struct ncclTree* tree = &channel->treeDn; // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) ncclLLPrimitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { @@ -259,3 +270,141 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { } } while(0); } + +#include "prims_ll128.h" +template<int UNUSED, class FUNC, typename T> +__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int bid = args->bid; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; + //const int rank = comm->rank; + const int nranks = comm->nRanks; + ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. + const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2; + + const ssize_t loopSize = args->nChannels*nranks*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + chunkSize = min(DIVUP(size-gridOffset, args->nChannels*nranks*minChunkSize)*minChunkSize, chunkSize); + + /////////////// begin AllReduce steps /////////////// + ssize_t offset; + int nelem; + int slice; + + // step 0: push data to next GPU + slice = ring->devUserRanks[nranks-1]; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + nelem = min(chunkSize, size-offset); + + LLprims.send(thisInput+offset, nelem); + + // k-2 steps: reduce and copy to next GPU + for (int j=2; j<nranks; ++j) { + slice = ring->devUserRanks[nranks-j]; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + nelem = min(chunkSize, size-offset); + + LLprims.recvReduceSend(thisInput+offset, nelem); + } + + // step k-1: reduce this buffer and data, which will produce the final + // result that we store in this data and push to the next GPU + slice = ring->devUserRanks[0]; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + nelem = min(chunkSize, size-offset); + + LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem); + + // k-2 steps: copy to next GPU + for (int j=1; j<nranks-1; ++j) { + slice = ring->devUserRanks[nranks-j]; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + nelem = min(chunkSize, size-offset); + + LLprims.recvCopySend(thisOutput+offset, nelem); + } + + // Make final copy from buffer to dest. + slice = ring->devUserRanks[1]; + offset = gridOffset + (slice*args->nChannels+bid) * chunkSize; + nelem = min(chunkSize, size-offset); + + // Here we need to copy from buffer to this output. + LLprims.recv(thisOutput+offset, nelem); + } +} + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int nthreads = args->nThreads; + const int bid = args->bid; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclTree* treeUp = &channel->treeUp; + struct ncclTree* treeDn = &channel->treeDn; + const ssize_t size = args->N; + ssize_t chunkSize = args->lastChunkSize; + const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/8; + const ssize_t loopSize = args->nChannels*chunkSize; + int nthreadsSplit = NCCL_LL128_SPLIT(nthreads); + + if (loopSize > size) { + chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize; + } + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + if (treeUp->up == -1) { + // ReduceAndBroadcast : max number of recv is 3, max number of send is 3 + ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, NCCL_MAX_TREE_ARITY> LLprims(tid, nthreads, treeUp->down, treeDn->down, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem); + } + } else { + if (tid < nthreadsSplit) { + // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) + ncclLL128Primitives<T, FUNC, NCCL_MAX_TREE_ARITY, 1> LLprims(tid, nthreadsSplit, treeUp->down, &treeUp->up, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Up + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (treeUp->down[0] == -1) { + LLprims.send(thisInput+offset, nelem); + } else { + LLprims.recvReduceSend(thisInput+offset, nelem); + } + } + } else { + // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) + ncclLL128Primitives<T, FUNC, 1, NCCL_MAX_TREE_ARITY> LLprims(tid-nthreadsSplit, nthreads-nthreadsSplit, &treeDn->up, treeDn->down, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Down + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (treeDn->down[0] == -1) { + LLprims.recv(thisOutput+offset, nelem); + } else { + LLprims.recvCopySend(thisOutput+offset, nelem); + } + } + } + } +} diff --git a/src/collectives/device/broadcast.cu b/src/collectives/device/broadcast.cu index b83ee70..8c8dbb6 100644 --- a/src/collectives/device/broadcast.cu +++ b/src/collectives/device/broadcast.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index fb18312..de8b989 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -1,19 +1,19 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" template<int UNROLL, class FUNC, typename T> __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int nthreads = blockDim.x - 1; + const int nthreads = args->nThreads-WARP_SIZE; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -29,7 +29,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) { T * __restrict__ thisOutput = (T*)args->ThisOutput; ncclPrimitives<UNROLL, BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS, T, 1, 1, FUNC> - prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); + prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); @@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; @@ -100,3 +100,51 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) { template<int UNUSED, class FUNC, typename T> __device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { } + +#include "prims_ll128.h" +template<int UNUSED, class FUNC, typename T> +__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int bid = args->bid; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; + const int rank = ring->devUserRanks[0]; + const int nextRank = ring->devUserRanks[1]; + const int root = args->root; + + ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + + const ssize_t loopSize = args->nChannels*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize); + ssize_t offset = gridOffset + bid*chunkSize; + + int nelem = min(chunkSize, size-offset); + if (rank == root) { + if (thisInput == thisOutput) { + LLprims.send(thisInput+offset, nelem); + } else { + LLprims.copySend(thisInput + offset, thisOutput + offset, nelem); + } + } else if (nextRank == root) { + LLprims.recv(thisOutput + offset, nelem); + } else { + LLprims.recvCopySend(thisOutput + offset, nelem); + } + } +} + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index e4aecbd..46eb9f5 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,9 +7,8 @@ #ifndef NCCL_DEVICE_COMMON_H_ #define NCCL_DEVICE_COMMON_H_ -#include "../collectives.h" -#include "core.h" -#include "nccl.h" +#include "collectives.h" +#include "devcomm.h" // Exit If Abort Barrier across CTA: make sure all threads exit consistently // Each thread sets a predicate to true if abort == 1 @@ -31,17 +30,19 @@ extern __device__ ncclKern_t ncclFuncs[]; static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) { int* d = (int*)dst; int* s = (int*)src; - // When aggregation is effective, if some threads have aborted inside the LL kernel, - // make sure the rest of the threads abort as well - exitIfAbortBarrier(0); for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o]; - __syncthreads(); } -static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid) { +static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* hostColl, int tid, struct ncclDevComm* comm) { + // Check whether the last operation was aborted and make sure all threads exit + int abort = tid == 0 ? *(comm->abortFlag) : 0; + exitIfAbortBarrier(abort); load_parallel(localColl, hostColl, sizeof(struct ncclColl), tid); + __syncthreads(); if (tid == 0) hostColl->active = 0; } +extern __device__ volatile uint64_t* ncclShmem; + /* Functions for aggregation case */ #define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \ @@ -51,13 +52,14 @@ __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args) { \ #if NCCL_OP == 0 /* Kernels with the first operation inlined */ #define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \ -__launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ int tid = threadIdx.x; \ int bid = blockIdx.x; \ + __shared__ volatile uint64_t shmem[NCCL_LL128_SHMEM_SIZE]; \ + ncclShmem = shmem; \ __shared__ struct ncclColl localColl; \ \ - struct ncclComm* comm = firstColl.args.comm; \ + struct ncclDevComm* comm = firstColl.args.comm; \ struct ncclChannel* channel = comm->channels+bid; \ struct ncclColl* c; \ if (bid == 0) { \ @@ -65,7 +67,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ c = &firstColl; \ } else { \ c = &localColl; \ - load_coll(c, channel->devCollectives+channel->collFifoHead, tid); \ + load_coll(c, channel->devCollectives+channel->collFifoHead, tid, comm); \ } \ while (1) { \ if (tid < c->args.nThreads) { \ @@ -84,7 +86,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ \ /* Load next collective operation*/ \ c = &localColl; /* for bid 0 */ \ - load_coll(c, channel->devCollectives+nextIndex, tid); \ + load_coll(c, channel->devCollectives+nextIndex, tid, comm); \ } \ } #else @@ -93,13 +95,14 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ // Only generate inline kernels for LL #define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \ - IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \ - IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \ + IMPL_COLL_FUNC(coll##LL128, op, ncclFunc, dtype, ctype) \ + IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ + IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, al, NCCL_PROTO_LL)) \ #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \ - IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \ - IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1) + IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \ + IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) #if NCCL_TYPE == 0 #define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index e1fb096..aa1e936 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,7 +7,7 @@ #ifndef NCCL_COMMON_KERNEL_H_ #define NCCL_COMMON_KERNEL_H_ -#include "core.h" +#include "devcomm.h" #include <cstdio> #include <cstdint> @@ -263,8 +263,6 @@ __device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthread } } -#define WARP_SIZE 32 - template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS> __device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t, int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS], diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index ea06b68..034fe96 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -1,20 +1,23 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "collectives.h" #include "common.h" +__device__ volatile uint64_t* ncclShmem; + #define NCCL_FUNC5(coll, op, dtype) \ - NCCL_COLL_NAME(coll, op, dtype), \ - NCCL_COLL_NAME(coll##LL, op, dtype) + NCCL_COLL_NAME(coll##LL, op, dtype), \ + NCCL_COLL_NAME(coll##LL128, op, dtype), \ + NCCL_COLL_NAME(coll, op, dtype) #define NCCL_FUNC4(coll, op, dtype) \ - NCCL_FUNC5(coll##Ring, op, dtype), \ - NCCL_FUNC5(coll##Tree, op, dtype) + NCCL_FUNC5(coll##Tree, op, dtype), \ + NCCL_FUNC5(coll##Ring, op, dtype) // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(coll, op) \ @@ -50,7 +53,7 @@ NCCL_FUNCS3B(coll, copy), \ NCCL_FUNCS3B(coll, copy) -// Must be consistent with ncclColl_t +// Must be consistent with ncclFunc_t #define NCCL_FUNCS() { \ NCCL_FUNCS2B(ncclBroadcast), \ NCCL_FUNCS2A(ncclReduce), \ @@ -59,7 +62,7 @@ NCCL_FUNCS2A(ncclAllReduce) } // Must be consistent with the ncclFuncSet enum -__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = { +__device__ ncclKern_t ncclFuncs[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { // Don't try to initialize the host shadow copy of this device-side global // variable. There is no host pointer to a device-side function, which // confuses clang. This will be fixed in the next clang release. diff --git a/src/collectives/device/gen_rules.sh b/src/collectives/device/gen_rules.sh index 3942c8c..4413213 100755 --- a/src/collectives/device/gen_rules.sh +++ b/src/collectives/device/gen_rules.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/src/collectives/device/op128.h b/src/collectives/device/op128.h new file mode 100644 index 0000000..9405dc2 --- /dev/null +++ b/src/collectives/device/op128.h @@ -0,0 +1,36 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef OP128_H_ +#define OP128_H_ + +inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) { + asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" + : "=l"(v0), "=l"(v1) : "l"(ptr)); +} + +inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) { + asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};" + :: "l"(v0), "l"(v1), "l"(ptr)); +} + +inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) { + uint64_t* shmemAsmPtr; + asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr)); + return shmemAsmPtr; +} + +inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) { + asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];" + : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr)); +} + +inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) { + asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};" + :: "l"(v0), "l"(v1), "l"(shmemAsmPtr)); +} + +#endif diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index c5aaf54..aa3d20d 100644 --- a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -37,20 +37,32 @@ class ncclPrimitives { private: const int tid; const int nthreads; + const int wid; + const int stepSize; int nrecv = 0; int nsend = 0; - const int stepSize; - struct ncclConnInfo* recvConn[NRECV]; - struct ncclConnInfo* sendConn[NSEND]; - volatile uint64_t* waitPtr; + struct ncclConnInfo* recvConn = NULL; + volatile uint64_t* recvConnHeadPtr = NULL; + uint64_t recvConnHead; + volatile uint64_t* recvConnTailPtr = NULL; + uint64_t recvConnTail; + uint64_t recvConnTailCache; // Cache last seen value + + struct ncclConnInfo* sendConn = NULL; + volatile int* sendConnFifoPtr = NULL; + volatile uint64_t* sendConnTailPtr = NULL; + uint64_t sendConnTail; + volatile uint64_t* sendConnHeadPtr = NULL; + uint64_t sendConnHead; + uint64_t sendConnHeadCache; // Cache last seen value + uint64_t recvStep[NRECV]; uint64_t sendStep[NSEND]; - uint64_t sendConnHead[NSEND]; const T* recvDirectBuff[NRECV]; T* sendDirectBuff[NSEND]; const T* recvBuff[NRECV]; T* sendBuff[NSEND]; - struct ncclComm* comm; + struct ncclDevComm* comm; inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } @@ -60,15 +72,18 @@ class ncclPrimitives { inline __device__ void barrier() { asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); } + inline __device__ void subBarrier() { + asm volatile ("bar.sync 2, %0;" :: "r"(nthreads-WARP_SIZE)); + } uint32_t mismatch = 0; const uint64_t opCount; - inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) { + inline __device__ void checkMismatch(struct ncclConnInfo* conn) { if (mismatch) { // In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch *(comm->fatalDevError) = ncclDevAssertedMismatch; - } else if (remoteOpCount && *remoteOpCount > opCount) { + } else if (conn && *conn->opCountRem > opCount) { mismatch += 1; } } @@ -76,49 +91,55 @@ class ncclPrimitives { uint32_t spins = 0; uint32_t abort = 0; - inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) { + inline __device__ int checkAbort(int i, int send) { spins++; - if (spins == SPINS_BEFORE_CHECK_ABORT) { + if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) { abort = *(comm->abortFlag); - checkMismatch(remoteOpCount); + if (wid == i) checkMismatch(send ? sendConn : recvConn); spins = 0; } return abort; } - inline __device__ void waitRecv(int i) { + inline __device__ void waitSend(int nbytes) { spins = 0; mismatch = 0; - recvStep[i] += SLICESTEPS; - if (tid == i) { - while (*(waitPtr) < recvStep[i]) { - if (checkAbort(recvConn[i]->opCountRem)) break; + if (sendConnHeadPtr) { + while (sendConnHeadCache + NCCL_STEPS < sendConnHead + SLICESTEPS) { + sendConnHeadCache = *sendConnHeadPtr; + if (checkAbort(wid, 1)) break; + } + if (sendConnFifoPtr) { + sendConnFifoPtr[sendConnHead%NCCL_STEPS] = nbytes; } + sendConnHead += SLICESTEPS; } } - inline __device__ void waitSend(int i) { + inline __device__ void waitRecv() { spins = 0; mismatch = 0; - sendStep[i] += SLICESTEPS; - if (tid == WARP_SIZE+i) { - while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) { - sendConnHead[i] = *waitPtr; - if (checkAbort(sendConn[i]->opCountRem)) break; + if (recvConnTailPtr) { + while (recvConnTailCache < recvConnTail + SLICESTEPS) { + recvConnTailCache = *recvConnTailPtr; + if (checkAbort(wid, 0)) break; } + recvConnTail += SLICESTEPS; } } - inline __device__ void postRecv(int i) { - *(recvConn[i]->head) = recvStep[i] += SLICESTEPS; + inline __device__ void incRecv(int i) { + recvStep[i] += SLICESTEPS; } - - inline __device__ void postSend(int i) { - *(sendConn[i]->tail) = sendStep[i] += SLICESTEPS; + inline __device__ void postRecv() { + if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += SLICESTEPS; } - inline __device__ void postSendSize(int i, int size) { - if (sendConn[i]->fifo) sendConn[i]->fifo[sendStep[i]%NCCL_STEPS] = size; + inline __device__ void incSend(int i) { + sendStep[i] += SLICESTEPS; + } + inline __device__ void postSend() { + if (sendConnTailPtr) *sendConnTailPtr = sendConnTail += SLICESTEPS; } template <int DIRECTRECV> @@ -131,11 +152,22 @@ class ncclPrimitives { return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i); } + template <int DIRECTRECV> + inline __device__ int directRecvInc(int i, int directInc, int sliceInc) { + return DIRECTRECV && recvDirectBuff[i] ? directInc : sliceInc; + } + + template <int DIRECTSEND> + inline __device__ int directSendInc(int i, int directInc, int sliceInc) { + return DIRECTSEND && sendDirectBuff[i] ? directInc : sliceInc; + } + template <int DIRECTRECV, int DIRECTSEND, int RECV, int SEND, int SRC, int DST> inline __device__ void GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) { int offset = 0; - int sliceSize = stepSize * SLICESTEPS; + int sliceSize = stepSize*SLICESTEPS; + int dataSize = max(DIVUP(nelem, 16*SLICESPERCHUNK)*16, sliceSize/32); const T* srcs[RECV*NRECV+SRC]; srcs[0] = SRC ? srcPtr : directRecvPtr<DIRECTRECV>(0, directOffset); @@ -151,101 +183,126 @@ class ncclPrimitives { for (int i=1; i<NSEND && i<nsend; i++) dsts[DST+i] = directSendPtr<DIRECTSEND>(i, directOffset); } - #pragma unroll 1 + bool syncThread = tid >= nthreads-WARP_SIZE; + + #pragma unroll for (int slice=0; slice<SLICESPERCHUNK; ++slice) { - int realSize = max(0, min(sliceSize, nelem-offset)); - if (tid < nthreads) { - FOR_SEND(waitSend); - FOR_RECV(waitRecv); + int realSize = max(0, min(dataSize, nelem-offset)); + if (!syncThread) { + if (SEND) waitSend(realSize*sizeof(T)); + if (RECV) waitRecv(); if (realSize > 0) { - barrier(); + subBarrier(); if (DIRECTRECV && recvDirectBuff[0]) { // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy if (SEND) { - ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads, 1, srcs, nsend, dsts+1, realSize); + ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, NSEND>(tid, nthreads-WARP_SIZE, 1, srcs, nsend, dsts+1, realSize); } } else { - ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize); + ReduceOrCopyMulti<UNROLL, FUNC, T, RECV+SRC, RECV*NRECV+SRC, SEND+DST, SEND*NSEND+DST>(tid, nthreads-WARP_SIZE, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize); } } - exitIfAbortBarrier(abort); - } else { - exitIfAbortBarrier(abort); - FOR_SEND(postSendSize, realSize*sizeof(T)); - if (SEND) __threadfence_system(); - FOR_SEND(postSend); - FOR_RECV(postRecv); } - for (int i=0; i<RECV*NRECV+SRC; i++) srcs[i] += sliceSize; - for (int i=0; i<SEND*NSEND+DST; i++) dsts[i] += sliceSize; - offset += sliceSize; + barrier(); + FOR_SEND(incSend); + FOR_RECV(incRecv); + if (syncThread) { + if (SEND) { + if (realSize > 0 && wid == 0) __threadfence_system(); + __syncwarp(); + postSend(); + } + if (RECV) postRecv(); + } + srcs[0] += SRC ? realSize : directRecvInc<DIRECTRECV>(0, realSize, sliceSize); + for (int i=1-SRC; i<RECV*NRECV; i++) srcs[SRC+i] += sliceSize; + dsts[0] += DST ? realSize : directSendInc<DIRECTSEND>(0, realSize, sliceSize); + for (int i=1-DST; i<SEND*NSEND; i++) dsts[DST+i] += directSendInc<DIRECTSEND>(i, realSize, sliceSize); + offset += realSize; } } __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i, T* directBuff) { - recvConn[i] = conn; - recvBuff[i] = (const T*)recvConn[i]->buff; - recvStep[i] = recvConn[i]->step; + recvBuff[i] = (const T*)conn->buff; + recvStep[i] = conn->step; recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS); - // Return credits in case we rounded up. - if (tid == nthreads) *recvConn[i]->head = recvStep[i]; - if (tid == i) { - waitPtr = recvConn[i]->tail; - *(recvConn[i]->opCountLoc) = opCount; - } recvDirectBuff[i] = NULL; - if (directBuff && recvConn[i]->direct) { + if (directBuff && conn->direct) { recvDirectBuff[i] = directBuff; - if (tid == 0) *recvConn[i]->ptrExchange = directBuff; + if (tid == 0) *conn->ptrExchange = directBuff; } + if (wid == i) recvConn = conn; + if (wid == i) recvConnTail = recvConnHead = recvStep[i]; // Make sure we set this after rounding up nrecv++; } + __device__ __forceinline__ void loadRecvSync() { + if (tid >= WARP_SIZE && tid < 2*WARP_SIZE && wid<nrecv) { + recvConnTailPtr = recvConn->tail; + recvConnTailCache = *recvConnTailPtr; + } + if (tid >= nthreads-WARP_SIZE && wid < nrecv) { + recvConnHeadPtr = recvConn->head; + // Return credits in case we rounded up. + *recvConnHeadPtr = recvConnHead; + // Update opCount in case we skipped some operations + *(recvConn->opCountLoc) = opCount; + } + } __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) { - sendConn[i] = conn; - sendBuff[i] = (T*)sendConn[i]->buff; - sendStep[i] = sendConn[i]->step; + sendBuff[i] = (T*)conn->buff; + sendStep[i] = conn->step; sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS); - if (tid == WARP_SIZE+i) { - waitPtr = sendConn[i]->head; - sendConnHead[i] = *waitPtr; - *(sendConn[i]->opCountLoc) = opCount; - } sendDirectBuff[i] = NULL; - if (directBuff && sendConn[i]->direct) { - void* volatile* ptr = sendConn[i]->ptrExchange; + if (directBuff && conn->direct) { + void* volatile* ptr = conn->ptrExchange; while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL); - __syncthreads(); + barrier(); if (tid == 0) *ptr = NULL; } + if (wid == i) sendConn = conn; + if (wid == i) sendConnTail = sendConnHead = sendStep[i]; // Make sure we set this after rounding up nsend++; } + __device__ __forceinline__ void loadSendSync() { + if (tid < nsend) { + sendConnHeadPtr = sendConn->head; + sendConnHeadCache = *sendConnHeadPtr; + sendConnFifoPtr = sendConn->fifo; + *(sendConn->opCountLoc) = opCount; + } + if (tid >= nthreads-WARP_SIZE && wid<nsend) { + sendConnTailPtr = sendConn->tail; + } + } - __device__ __forceinline__ void saveRecvConn(int i) { - if (tid == i) { - recvConn[i]->step = recvStep[i]; + __device__ __forceinline__ void saveRecvSync() { + if (tid >= nthreads-WARP_SIZE && wid < nrecv) { + recvConn->step = recvConnHead; + *(recvConn->opCountLoc) = opCount+1; __threadfence_system(); - *(recvConn[i]->opCountLoc) += 1; } } - __device__ __forceinline__ void saveSendConn(int i) { - if (tid == WARP_SIZE+i) { - sendConn[i]->step = sendStep[i]; + __device__ __forceinline__ void saveSendSync() { + if (tid < nsend) { + sendConn->step = sendConnHead; + *(sendConn->opCountLoc) = opCount+1; __threadfence_system(); - *(sendConn[i]->opCountLoc) += 1; } } public: __device__ __forceinline__ - ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) - : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) { - // Make sure step is updated before we read it - __syncthreads(); + ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), stepSize(stepSize), opCount(opCount) { + // Make sure step is updated before we read it. + barrier(); for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff); for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff); + loadRecvSync(); + loadSendSync(); } __device__ __forceinline__ void @@ -305,283 +362,13 @@ class ncclPrimitives { } __device__ __forceinline__ ~ncclPrimitives() { - // Save steps for next collective. Have thread 0 do it to be compatible - // with the way LL works. - for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i); - for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i); + // Save steps for the next operation + saveRecvSync(); + saveSendSync(); } }; -template <typename T, class FUNC, int NRECV, int NSEND> -class ncclLLPrimitives { - private: - const int tid; - const int nthreads; - int nrecv = 0; - int nsend = 0; - struct ncclConnInfo* recvConn[NRECV]; - struct ncclConnInfo* sendConn[NSEND]; - volatile uint64_t* waitPtr; - volatile uint64_t* postPtr; - volatile int* fifoPtr; - uint64_t recvStep[NRECV]; - uint64_t sendStep[NSEND]; - uint64_t sendConnHead; - union ncclLLFifoLine* recvBuff[NRECV]; - union ncclLLFifoLine* sendBuff[NSEND]; - struct ncclComm* comm; - - inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } - inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } - inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } - inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } - inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; } - inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; } - - // Exit If Abort Barrier : make sure all threads exit consistently - // Each thread sets a predicate to true if val == 1 - // all CTA's threads enter the barrier and do a popc on their predicates being True - // If any of the thread's predicate was True, all the threads call exit() - inline __device__ void exitIfAbortLocalBarrier() { - uint32_t popc; - asm ("{"); - asm volatile (" .reg .pred barr_pred;"); - asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort)); - asm volatile (" bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads)); - asm ("}"); - if (popc) { - // Make sure threads not participating in the operation get the abort and all threads exit - exitIfAbortBarrier(1); - } - } - - inline __device__ void barrier() { - asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); - } - - uint32_t mismatch = 0; - const uint64_t opCount; - - inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) { - if (mismatch > 20) { - // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch - // Note that we are not using _threadfence_system in LL so the error cannot be asserted - *(comm->fatalDevError) = ncclDevSuspectedMismatch; - } else if (remoteOpCount && *remoteOpCount > opCount) { - mismatch += 1; - } - } - - uint32_t spins = 0; - uint32_t abort = 0; - - inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) { - spins++; - if (spins == SPINS_BEFORE_CHECK_ABORT) { - abort = *(comm->abortFlag); - checkMismatch(remoteOpCount); - spins = 0; - } - return abort; - } - - inline __device__ void waitSend(int i, int nbytes) { - spins = 0; - mismatch = 0; - if (tid == WARP_SIZE+i) { - while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) { - sendConnHead = *waitPtr; - if (checkAbort(sendConn[i]->opCountRem)) break; - } - if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes; - } - } +#include "prims_ll.h" +//#include "prims_ll128.h" - inline __device__ void postRecv(int i) { - recvStep[i]++; - if (tid == i) *postPtr = recvStep[i]; - } - - inline __device__ void postSend(int i) { - sendStep[i]++; - } - - __device__ uint64_t readLL(int i, int offset) { - union ncclLLFifoLine* src = recvPtr(i) + offset; - uint32_t flag = recvFlag(i); - uint32_t data1, flag1, data2, flag2; - spins = 0; - mismatch = 0; - do { - asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); - if (checkAbort(recvConn[i]->opCountRem)) break; - } while ((flag1 != flag) || (flag2 != flag)); - uint64_t val64 = data1 + (((uint64_t)data2) << 32); - return val64; - } - - __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { - asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); - } - - // Using memcpy handles misaligned pointers. - __device__ uint64_t readAL(uint64_t* src) { - uint64_t val; - memcpy((char*)&val, (char*)src, sizeof(uint64_t)); - return val; - } - - __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) { - memcpy((char*)dst, (char*)&val, nbytes); - } - - template <int RECV, int SEND, int SRC, int DST> - __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) { - uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T); - FOR_SEND(waitSend, nbytes*2); - barrier(); - uint32_t npack = DIVUP(nbytes, sizeof(uint64_t)); - uint64_t* srcPack = (uint64_t*)srcPtr; - uint64_t* dstPack = (uint64_t*)dstPtr; - // Do multiples of 64 bits - #pragma unroll 2 - for (int offset=tid; offset<npack; offset+=nthreads) { - // Recv : local, then intra-node, then inter-node - uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset); - if (RECV) { - if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val); - for (int i=1; i<NRECV && i<nrecv; i++) { - val = MULTI<FUNC, T>()(readLL(i, offset), val); - } - } - - // Send : inter-node, then intra-node, then local - if (SEND) { - for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i)); - storeLL(sendPtr(0)+offset, val, sendFlag(0)); - } - if (DST) { - if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) { - // Last incomplete word - storeAL(dstPack+offset, val, nbytes & 0x7); - } else { - storeAL(dstPack+offset, val, sizeof(uint64_t)); - } - } - } - exitIfAbortLocalBarrier(); - FOR_RECV(postRecv); - FOR_SEND(postSend); - } - - __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { - recvConn[i] = conn; - recvBuff[i] = recvConn[i]->llBuff; - recvStep[i] = recvConn[i]->step; - if (tid == i) { - postPtr = recvConn[i]->head; - *(recvConn[i]->opCountLoc) = opCount; - } - nrecv++; - } - - __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { - sendConn[i] = conn; - sendBuff[i] = sendConn[i]->llBuff; - sendStep[i] = sendConn[i]->step; - if (tid == WARP_SIZE+i) { - waitPtr = sendConn[i]->head; - fifoPtr = sendConn[i]->fifo; - sendConnHead = *waitPtr; - *(sendConn[i]->opCountLoc) = opCount; - } - nsend++; - } - - __device__ __forceinline__ void saveRecvConn(int i) { - if (tid == i) { - recvConn[i]->step = recvStep[i]; - *(recvConn[i]->opCountLoc) += 1; - __threadfence_block(); - } - } - - __device__ __forceinline__ void saveSendConn(int i) { - if (tid == WARP_SIZE+i) { - sendConn[i]->step = sendStep[i]; - *(sendConn[i]->opCountLoc) += 1; - __threadfence_block(); - } - } - - __device__ __forceinline__ void llSendCleaning(int i) { - if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - /* Reset all flags */ - static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); - static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); - for (int s=0; s<NCCL_STEPS; s++) { - waitSend(i, 0); - for (int o=tid; o<NCCL_LL_SLICE_LINES; o+=nthreads) { - const union ncclLLFifoLine resetLine = { 0, sendFlag(i), 0, sendFlag(i) }; - sendPtr(i)[o].i4 = resetLine.i4; - } - } - if (tid == 0) sendConn[i]->llLastCleaning = sendStep[i]; - } - } - - __device__ __forceinline__ void llRecvCleaning(int i) { - if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - recvStep[i] += NCCL_STEPS; - if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i]; - } - } - - public: - __device__ __forceinline__ - ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) - : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) { - // Make sure step is updated before we read it. - barrier(); - - for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i); - for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i); - } - - __device__ void send(const T* src, int nelem) { - return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem); - } - - __device__ void recv(T* dst, int nelem) { - return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem); - } - - __device__ void recvReduceSend(const T* src, int nelem) { - return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem); - } - - __device__ void recvReduceCopy(const T* src, T* dst, int nelem) { - return LLGenericOp<1, 0, 1, 1>(src, dst, nelem); - } - - __device__ void copySend(const T* src, T* dst, int nelem) { - return LLGenericOp<0, 1, 1, 1>(src, dst, nelem); - } - - __device__ void recvCopySend(T* dst, int nelem) { - return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem); - } - - __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) { - return LLGenericOp<1, 1, 1, 1>(src, dst, nelem); - } - - __device__ __forceinline__ ~ncclLLPrimitives() { - for (int i=0; i<NSEND && i<nsend; i++) llSendCleaning(i); - for (int i=0; i<NRECV && i<nrecv; i++) llRecvCleaning(i); - // Save steps for the next operation - for (int i=0; i<NRECV && i<nrecv; i++) saveRecvConn(i); - for (int i=0; i<NSEND && i<nsend; i++) saveSendConn(i); - } -}; #endif diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h new file mode 100644 index 0000000..f919493 --- /dev/null +++ b/src/collectives/device/prims_ll.h @@ -0,0 +1,259 @@ +template <typename T, class FUNC, int NRECV, int NSEND> +class ncclLLPrimitives { + private: + const int tid; + const int nthreads; + const int wid; + int nrecv = 0; + int nsend = 0; + struct ncclConnInfo* recvConn = NULL; + volatile uint64_t* recvConnHeadPtr = NULL; + uint64_t recvConnHead; + + struct ncclConnInfo* sendConn = NULL; + volatile int* sendConnFifoPtr = NULL; + volatile uint64_t* sendConnHeadPtr = NULL; + uint64_t sendConnHead; + uint64_t sendConnHeadCache; // Cache last seen value + + uint64_t recvStep[NRECV]; + uint64_t sendStep[NSEND]; + union ncclLLFifoLine* recvBuff[NRECV]; + union ncclLLFifoLine* sendBuff[NSEND]; + struct ncclDevComm* comm; + + inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } + inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } + inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } + inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } + inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); } + inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } + + inline __device__ void barrier() { + asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); + } + + uint32_t mismatch = 0; + const uint64_t opCount; + + inline __device__ void checkMismatch(struct ncclConnInfo* conn) { + if (mismatch > 20) { + // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch + // Note that we are not using _threadfence_system in LL so the error cannot be asserted + *(comm->fatalDevError) = ncclDevSuspectedMismatch; + } else if (conn && *conn->opCountRem > opCount) { + mismatch += 1; + } + } + + uint32_t spins = 0; + uint32_t abort = 0; + + inline __device__ int checkAbort(int i, int send) { + spins++; + if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) { + abort = *(comm->abortFlag); + if (wid == i) checkMismatch(send ? sendConn : recvConn); + spins = 0; + } + return abort; + } + + inline __device__ void waitSend(int nbytes) { + spins = 0; + mismatch = 0; + if (sendConnHeadPtr) { + while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { + sendConnHeadCache = *sendConnHeadPtr; + if (checkAbort(wid, 1)) break; + } + if (sendConnFifoPtr) { + int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes; + sendConnFifoPtr[sendConnHead%NCCL_STEPS] = size; + } + sendConnHead += 1; + } + barrier(); + } + + inline __device__ void incRecv(int i) { + recvStep[i] += 1; + } + inline __device__ void postRecv() { + barrier(); + if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1; + } + + inline __device__ void incSend(int i, int offset) { + // LL Cleanup : write all flags in the slice to make sure we don't have + // data corruption when flag loops over. + if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) { + for (int o = offset; o<NCCL_LL_SLICE_LINES; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i)); + } + sendStep[i]++; + } + + __device__ uint64_t readLL(int i, int offset) { + union ncclLLFifoLine* src = recvPtr(i) + offset; + uint32_t flag = recvFlag(i); + uint32_t data1, flag1, data2, flag2; + spins = 0; + mismatch = 0; + do { + asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); + if (checkAbort(i, 0)) break; + } while ((flag1 != flag) || (flag2 != flag)); + uint64_t val64 = data1 + (((uint64_t)data2) << 32); + return val64; + } + + __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { + asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); + } + + // Using memcpy handles misaligned pointers. + __device__ uint64_t readAL(uint64_t* src) { + uint64_t val; + memcpy((char*)&val, (char*)src, sizeof(uint64_t)); + return val; + } + + __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) { + memcpy((char*)dst, (char*)&val, nbytes); + } + + template <int RECV, int SEND, int SRC, int DST> + __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) { + uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T); + uint32_t npack = DIVUP(nbytes, sizeof(uint64_t)); + uint64_t* srcPack = (uint64_t*)srcPtr; + uint64_t* dstPack = (uint64_t*)dstPtr; + int offset = tid; + + // Always waitSend in case of cleanup + if (SEND) waitSend(npack*sizeof(union ncclLLFifoLine)); + + // Do multiples of 64 bits + #pragma unroll 2 + for (; offset<npack; offset+=nthreads) { + // Recv : local, then intra-node, then inter-node + uint64_t val = SRC ? readAL(srcPack+offset) : readLL(0, offset); + if (RECV) { + if (SRC) val = MULTI<FUNC, T>()(readLL(0, offset), val); + for (int i=1; i<NRECV && i<nrecv; i++) { + val = MULTI<FUNC, T>()(readLL(i, offset), val); + } + } + + // Send : inter-node, then intra-node, then local + if (SEND) { + for (int i=1; i<NSEND && i<nsend; i++) storeLL(sendPtr(i)+offset, val, sendFlag(i)); + storeLL(sendPtr(0)+offset, val, sendFlag(0)); + } + if (DST) { + if (((offset*sizeof(uint64_t)) ^ nbytes) < sizeof(uint64_t)) { + // Last incomplete word + storeAL(dstPack+offset, val, nbytes & 0x7); + } else { + storeAL(dstPack+offset, val, sizeof(uint64_t)); + } + } + } + FOR_RECV(incRecv); if (RECV) postRecv(); + FOR_SEND(incSend, offset); + } + + __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { + recvBuff[i] = conn->llBuff; + recvStep[i] = conn->step; + if (wid == i) recvConn = conn; + nrecv++; + } + __device__ __forceinline__ void loadRecvSync() { + if (tid >= nthreads-WARP_SIZE && wid < nrecv) { + recvConnHeadPtr = recvConn->head; + recvConnHead = recvConn->step; + // Update opCount in case we skipped some operations + *(recvConn->opCountLoc) = opCount; + } + } + + __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { + sendBuff[i] = conn->llBuff; + sendStep[i] = conn->step; + if (wid == i) sendConn = conn; + nsend++; + } + __device__ __forceinline__ void loadSendSync() { + if (tid < nsend) { + sendConnHeadPtr = sendConn->head; + sendConnHeadCache = *sendConnHeadPtr; + sendConnHead = sendConn->step; + sendConnFifoPtr = sendConn->fifo; + *(sendConn->opCountLoc) = opCount; + } + } + + __device__ __forceinline__ void saveRecvSync() { + if (tid >= nthreads-WARP_SIZE && wid < nrecv) { + recvConn->step = recvConnHead; + *(recvConn->opCountLoc) = opCount+1; + __threadfence_block(); + } + } + + __device__ __forceinline__ void saveSendSync() { + if (tid < nsend) { + sendConn->step = sendConnHead; + *(sendConn->opCountLoc) = opCount+1; + __threadfence_block(); + } + } + + public: + __device__ __forceinline__ + ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), opCount(opCount) { + // Make sure step is updated before we read it. + barrier(); + + for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i); + for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i); + loadRecvSync(); + loadSendSync(); + } + + __device__ void send(const T* src, int nelem) { + return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recv(T* dst, int nelem) { + return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceSend(const T* src, int nelem) { + return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recvReduceCopy(const T* src, T* dst, int nelem) { + return LLGenericOp<1, 0, 1, 1>(src, dst, nelem); + } + + __device__ void copySend(const T* src, T* dst, int nelem) { + return LLGenericOp<0, 1, 1, 1>(src, dst, nelem); + } + + __device__ void recvCopySend(T* dst, int nelem) { + return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) { + return LLGenericOp<1, 1, 1, 1>(src, dst, nelem); + } + + __device__ __forceinline__ ~ncclLLPrimitives() { + // Save steps for the next operation + saveRecvSync(); + saveSendSync(); + } +}; diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h new file mode 100644 index 0000000..40a8cff --- /dev/null +++ b/src/collectives/device/prims_ll128.h @@ -0,0 +1,410 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "op128.h" + +#define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1) + +template <typename T, class FUNC, int NRECV, int NSEND> +class ncclLL128Primitives { + private: + const int tid; + const int nthreads; + const int wid; + const int warp; + const bool flagThread; + int nrecv = 0; + int nsend = 0; + struct ncclConnInfo* recvConn = NULL; + volatile uint64_t* recvConnHeadPtr = NULL; + uint64_t recvConnHead; + + struct ncclConnInfo* sendConn = NULL; + volatile int* sendConnFifoPtr = NULL; + volatile uint64_t* sendConnTailPtr = NULL; + uint64_t sendConnTail; + volatile uint64_t* sendConnHeadPtr = NULL; + uint64_t sendConnHead; + uint64_t sendConnHeadCache; // Cache last seen value + + uint64_t recvStep[NRECV]; + uint64_t sendStep[NSEND]; + uint64_t* recvBuff[NRECV]; + uint64_t* sendBuff[NSEND]; + struct ncclDevComm* comm; + + volatile uint64_t* shmem; + + inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; } + inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL128_SLICE_ELEMS; } + inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } + inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } + inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; } + inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; } + + inline __device__ void barrier() { + if (NSEND>NRECV) { + asm volatile ("bar.sync 2, %0;" :: "r"(nthreads)); + } else { + asm volatile ("bar.sync 3, %0;" :: "r"(nthreads)); + } + } + + uint32_t mismatch = 0; + const uint64_t opCount; + + inline __device__ void checkMismatch(struct ncclConnInfo* conn) { + if (mismatch > 20) { + // We have seen that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch + // Note that we are not using _threadfence_system in LL so the error cannot be asserted + *(comm->fatalDevError) = ncclDevSuspectedMismatch; + } else if (conn && *conn->opCountRem > opCount) { + mismatch += 1; + } + } + + uint32_t spins = 0; + uint32_t abort = 0; + + inline __device__ int checkAbort(int i, int send) { + spins++; + if (abort == 0 && spins == SPINS_BEFORE_CHECK_ABORT) { + abort = *(comm->abortFlag); + if (wid == i) checkMismatch(send ? sendConn : recvConn); + spins = 0; + } + return abort; + } + + inline __device__ void waitSend(int nbytes) { + spins = 0; + mismatch = 0; + if (sendConnHeadPtr) { + while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { + sendConnHeadCache = *sendConnHeadPtr; + if (checkAbort(wid, 1)) break; + } + if (sendConnFifoPtr) { + sendConnFifoPtr[sendStep[wid]%NCCL_STEPS] = nbytes; + } + sendConnHead += 1; + } + } + + inline __device__ void incRecv(int i) { + recvStep[i] += 1; + } + inline __device__ void postRecv() { + if (recvConnHeadPtr) *recvConnHeadPtr = recvConnHead += 1; + } + + inline __device__ void incSend(int i) { + sendStep[i] += 1; + } + inline __device__ void postSend() { + if (sendConnTailPtr) { __threadfence(); *sendConnTailPtr = sendConnTail += 1; } + } + + template <int ELEMS_PER_THREAD> + inline __device__ void loadSrcToShmem128(int maxOffset, const uint64_t* src64Ptr) { +#if 0 + uint64_t v[ELEMS_PER_THREAD]; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + if (u*WARP_SIZE < maxOffset) load128(src64Ptr+u*WARP_SIZE, v[u], v[u+1]); + } + uint64_t* shmemAsmPtr = shmemCvtPtr(shmem); + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + storeShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]); + } +#else + uint64_t* shmemAsmPtr = shmemCvtPtr(shmem); + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + if (u*WARP_SIZE < maxOffset) { + uint64_t v0, v1; + load128(src64Ptr+u*WARP_SIZE, v0, v1); + storeShmem128(shmemAsmPtr+u*WARP_SIZE, v0, v1); + } + } +#endif + } + + inline __device__ void loadSrcToShmem(int start, int end, const T* srcPtr) { + T* shmemPtr = (T*)(shmem-2*wid); + for (int offset = start+wid; offset < end; offset += WARP_SIZE) { + shmemPtr[offset] = srcPtr[offset]; + } + } + + template <int ELEMS_PER_THREAD> + inline __device__ void storeShmemToDst128(int maxOffset, uint64_t* dst64Ptr) { + uint64_t v[ELEMS_PER_THREAD]; + uint64_t* shmemAsmPtr = shmemCvtPtr(shmem); + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + loadShmem128(shmemAsmPtr+u*WARP_SIZE, v[u], v[u+1]); + } + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + if (u*WARP_SIZE < maxOffset) store128(dst64Ptr+u*WARP_SIZE, v[u], v[u+1]); + } + } + + inline __device__ void storeShmemToDst(int start, int end, T* dstPtr) { + T* shmemPtr = (T*)(shmem-2*wid); + for (int offset = start+wid; offset < end; offset += WARP_SIZE) { + dstPtr[offset] = shmemPtr[offset]; + } + } + + #define WARP_MASK 0xffffffff + + template <int ELEMS_PER_THREAD, int RECV, int SEND, int SRC, int DST> + __device__ __forceinline__ void recvReduceSendCopy(int ll128Offset) { + uint64_t v[ELEMS_PER_THREAD]; + + /************* Data Loading : SHMEM -> REG **************/ + if (SRC) { + volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + v[u] = shmem64Ptr[u*(WARP_SIZE-2)]; + if (!flagThread) v[u+1] = shmem64Ptr[u*(WARP_SIZE-2)+1]; + } + } + /*********** End Data Loading : SHMEM -> REG ************/ + + /************************ Recv **************************/ + if (RECV) { + uint64_t flag = recvFlag(0); + uint64_t* ptr = recvPtr(0)+ll128Offset; + bool needReload; + uint64_t v0, v1; + do { + needReload = false; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + load128(ptr+u*WARP_SIZE, v0, v1); + needReload |= flagThread && (v1 != flag); + } + } while (__any_sync(WARP_MASK, needReload) && checkAbort(0, 0) == 0); + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + load128(ptr+u*WARP_SIZE, v0, v1); + v[u] = SRC ? MULTI<FUNC, T>()(v0, v[u]) : v0; + v[u+1] = SRC ? MULTI<FUNC, T>()(v1, v[u+1]) : v1; + } + + for (int i=1; i<NRECV && i<nrecv; i++) { + uint64_t flag = recvFlag(i); + uint64_t* ptr = recvPtr(i)+ll128Offset; + uint64_t v0, v1; + do { + needReload = false; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + load128(ptr+u*WARP_SIZE, v0, v1); + needReload |= flagThread && (v1 != flag); + } + } while (__any_sync(WARP_MASK, needReload) && checkAbort(i, 0) == 0); + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + load128(ptr+u*WARP_SIZE, v0, v1); + v[u] = MULTI<FUNC, T>()(v0, v[u]); + v[u+1] = MULTI<FUNC, T>()(v1, v[u+1]); + } + } + } + /********************** End Recv ************************/ + + /************************ Send **************************/ + if (SEND) { + for (int i=1; i<NSEND && i<nsend; i++) { + int flag = sendFlag(i); + uint64_t* ptr = sendPtr(i)+ll128Offset; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]); + } + } + int flag = sendFlag(0); + uint64_t* ptr = sendPtr(0)+ll128Offset; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + store128(ptr+u*WARP_SIZE, v[u], flagThread ? flag : v[u+1]); + } + } + /********************** End Send ************************/ + + /************* Data Storing : REG -> SHMEM **************/ + if (DST) { + volatile uint64_t* shmem64Ptr = shmem - (2*wid)/NCCL_LL128_LINEELEMS; + #pragma unroll + for (int u=0; u<ELEMS_PER_THREAD; u+=2) { + shmem64Ptr[u*(WARP_SIZE-2)] = v[u]; + if (!flagThread) shmem64Ptr[u*(WARP_SIZE-2)+1] = v[u+1]; + } + } + /*********** End data Storing : REG -> SHMEM ************/ + } + + #define LL128INC (WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD) + #define ELEMINC (LL128INC-(LL128INC/NCCL_LL128_LINEELEMS)) + + template <int RECV, int SEND, int SRC, int DST> + __device__ void GenericOp(const T* srcPtr, T* dstPtr, int nelem) { + if (nelem <= 0) { + // Don't move any data but still increase steps and sync with prev/next + if (SEND) waitSend(0); + FOR_SEND(incSend); if (SEND) postSend(); + FOR_RECV(incRecv); if (RECV) postRecv(); + return; + } + const int nelem64 = ((nelem*sizeof(T))/(2*sizeof(uint64_t)))*2; + const uint64_t* src64Ptr = ((uint64_t*)srcPtr); + uint64_t* dst64Ptr = ((uint64_t*)dstPtr); + + int ll128Offset = LL128INC*warp+2*wid; + int elemOffset = ELEMINC*warp; + const int nwarps = nthreads/WARP_SIZE; + + if (SEND) waitSend(DIVUP(nelem*sizeof(T), ELEMINC*sizeof(uint64_t))*LL128INC*sizeof(uint64_t)); + barrier(); + + while (elemOffset*(sizeof(uint64_t)/sizeof(T)) < nelem) { + const int maxOffset128 = min(nelem64-elemOffset, (int)ELEMINC); + const int maxOffset = min(nelem-(elemOffset*((int)(sizeof(uint64_t)/sizeof(T)))), (int)(ELEMINC*(sizeof(uint64_t)/sizeof(T)))); + if (SRC) { + int done = 0; + if ((((uint64_t)srcPtr)&0xf) == 0) { + loadSrcToShmem128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, src64Ptr+elemOffset+2*wid); + done = maxOffset128*(sizeof(uint64_t)/sizeof(T)); + } + loadSrcToShmem(done, maxOffset, (T*)(src64Ptr+elemOffset)); + } + __syncwarp(); + recvReduceSendCopy<NCCL_LL128_SHMEM_ELEMS_PER_THREAD, RECV, SEND, SRC, DST>(ll128Offset); + __syncwarp(); + if (DST) { + int done = 0; + if ((((uint64_t)dstPtr)&0xf) == 0) { + storeShmemToDst128<NCCL_LL128_SHMEM_ELEMS_PER_THREAD>(maxOffset128-2*wid, dst64Ptr+elemOffset+2*wid); + done = maxOffset128*(sizeof(uint64_t)/sizeof(T)); + } + storeShmemToDst(done, maxOffset, (T*)(dst64Ptr+elemOffset)); + } + __syncwarp(); + ll128Offset += LL128INC*nwarps; + elemOffset += ELEMINC*nwarps; + } + + barrier(); + FOR_SEND(incSend); if (SEND) postSend(); + FOR_RECV(incRecv); if (RECV) postRecv(); + } + + __device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) { + recvBuff[i] = conn->ll128Buff; + recvStep[i] = conn->step; + if (wid == i) recvConn = conn; + nrecv++; + } + __device__ __forceinline__ void loadRecvSync() { + if (tid >= nthreads-WARP_SIZE && wid < nrecv) { + recvConnHeadPtr = recvConn->head; + recvConnHead = recvConn->step; + // Update opCount in case we skipped some operations + *(recvConn->opCountLoc) = opCount; + } + } + + __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { + sendBuff[i] = conn->ll128Buff; + sendStep[i] = conn->step; + if (wid == i) sendConn = conn; + nsend++; + } + __device__ __forceinline__ void loadSendSync() { + if (tid < nsend) { + sendConnHeadPtr = sendConn->head; + sendConnHeadCache = *sendConnHeadPtr; + sendConnHead = sendConn->step; + sendConnFifoPtr = sendConn->fifo; + *(sendConn->opCountLoc) = opCount; + } + if (tid >= nthreads-WARP_SIZE && wid<nsend) { + if (sendConn->fifo) { + sendConnTailPtr = sendConn->tail; + sendConnTail = sendConn->step; + } + } + } + + __device__ __forceinline__ void saveRecvSync() { + if (tid >= nthreads-WARP_SIZE && wid < nrecv) { + recvConn->step = recvConnHead; + *(recvConn->opCountLoc) = opCount+1; + __threadfence_block(); + } + } + + __device__ __forceinline__ void saveSendSync() { + if (tid < nsend) { + sendConn->step = sendConnHead; + *(sendConn->opCountLoc) = opCount+1; + __threadfence_block(); + } + } + + public: + __device__ __forceinline__ + ncclLL128Primitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE), flagThread((tid%8)==7), opCount(opCount), shmem(ncclShmem+(threadIdx.x/WARP_SIZE)*NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE+2*wid) { + // Make sure step is updated before we read it. + barrier(); + + for (int i=0; i<NRECV && recvPeers[i] >= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i); + for (int i=0; i<NSEND && sendPeers[i] >= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i); + loadRecvSync(); + loadSendSync(); + } + + __device__ void send(const T* src, int nelem) { + return GenericOp<0, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recv(T* dst, int nelem) { + return GenericOp<1, 0, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceSend(const T* src, int nelem) { + return GenericOp<1, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recvReduceCopy(const T* src, T* dst, int nelem) { + return GenericOp<1, 0, 1, 1>(src, dst, nelem); + } + + __device__ void copySend(const T* src, T* dst, int nelem) { + return GenericOp<0, 1, 1, 1>(src, dst, nelem); + } + + __device__ void recvCopySend(T* dst, int nelem) { + return GenericOp<1, 1, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) { + return GenericOp<1, 1, 1, 1>(src, dst, nelem); + } + + __device__ __forceinline__ ~ncclLL128Primitives() { + // Save steps for the next operation + saveRecvSync(); + saveSendSync(); + } +}; diff --git a/src/collectives/device/reduce.cu b/src/collectives/device/reduce.cu index 1ef66d4..a2caac5 100644 --- a/src/collectives/device/reduce.cu +++ b/src/collectives/device/reduce.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index 302d053..0680abe 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -1,19 +1,19 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" template<int UNROLL, class FUNC, typename T> __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int nthreads = blockDim.x - 1; + const int nthreads = args->nThreads-WARP_SIZE; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -30,7 +30,7 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) { T * __restrict__ thisOutput = (T*)args->ThisOutput; ncclPrimitives<UNROLL, REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS, T, 1, 1, FUNC> - prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); + prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); @@ -55,7 +55,7 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; @@ -93,3 +93,48 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) { template<int UNUSED, class FUNC, typename T> __device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { } + +#include "prims_ll128.h" +template<int UNUSED, class FUNC, typename T> +__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int bid = args->bid; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; + const int rank = comm->rank; + const int nranks = comm->nRanks; + const int prevRank = ring->devUserRanks[nranks-1]; + const int root = args->root; + + ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + + const ssize_t loopSize = args->nChannels*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize); + ssize_t offset = gridOffset + bid*chunkSize; + + int nelem = min(chunkSize, size-offset); + if (prevRank == root) { + LLprims.send(thisInput+offset, nelem); + } else if (rank == root) { + LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); + } else { + LLprims.recvReduceSend(thisInput+offset, nelem); + } + } +} + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/device/reduce_scatter.cu b/src/collectives/device/reduce_scatter.cu index 10857ed..8b45299 100644 --- a/src/collectives/device/reduce_scatter.cu +++ b/src/collectives/device/reduce_scatter.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index c70c845..1985148 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -1,25 +1,25 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" template<int UNROLL, class FUNC, typename T> __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; - const int nthreads = blockDim.x - 1; + const int nthreads = args->nThreads-WARP_SIZE; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; const int nranks = comm->nRanks; const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); - const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS; + const int chunkSize = stepSize * REDUCESCATTER_CHUNKSTEPS; const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; // Compute pointers @@ -27,7 +27,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) { T * __restrict__ thisOutput = (T*)args->ThisOutput; ncclPrimitives<UNROLL, REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS, T, 1, 1, FUNC> - prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); + prims(tid, args->nThreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); @@ -69,7 +69,7 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; @@ -121,3 +121,64 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) { template<int UNUSED, class FUNC, typename T> __device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { } + +#include "prims_ll128.h" +template<int UNUSED, class FUNC, typename T> +__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int bid = args->bid; + const int nthreads = args->nThreads; + struct ncclDevComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; + //const int rank = comm->rank; + const int nranks = comm->nRanks; + ssize_t chunkSize = (NCCL_LL128_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T)); + // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere. + const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2; + + const ssize_t loopSize = args->nChannels*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + chunkSize = min(DIVUP(size-gridOffset, args->nChannels*minChunkSize)*minChunkSize, chunkSize); + + ssize_t chunkOffset = gridOffset + bid*chunkSize; + + /////////////// begin ReduceScatter steps /////////////// + ssize_t offset; + int nelem = min(chunkSize, size-chunkOffset); + int rankDest; + + // step 0: push data to next GPU + rankDest = ring->devUserRanks[nranks-1]; + offset = chunkOffset + rankDest * size; + + LLprims.send(thisInput+offset, nelem); + + // k-2 steps: reduce and copy to next GPU + for (int j=2; j<nranks; ++j) { + rankDest = ring->devUserRanks[nranks-j]; + offset = chunkOffset + rankDest * size; + + LLprims.recvReduceSend(thisInput+offset, nelem); + } + + // step k-1: reduce this buffer and data, which will produce the final + // result that we store in this data + rankDest = ring->devUserRanks[0]; + offset = chunkOffset + rankDest * size; + + LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem); + } +} + +template<int UNUSED, class FUNC, typename T> +__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { } diff --git a/src/collectives/reduce.cu b/src/collectives/reduce.cc index 302d4bc..67f2fae 100644 --- a/src/collectives/reduce.cu +++ b/src/collectives/reduce.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/reduce_scatter.cu b/src/collectives/reduce_scatter.cc index 4ee77ef..5ad7f5f 100644 --- a/src/collectives/reduce_scatter.cu +++ b/src/collectives/reduce_scatter.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/debug.cc b/src/debug.cc new file mode 100644 index 0000000..03a77ae --- /dev/null +++ b/src/debug.cc @@ -0,0 +1,169 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "nccl_net.h" +#include <stdlib.h> +#include <stdarg.h> + +int ncclDebugLevel = -1; +thread_local int ncclDebugNoWarn = 0; +uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT +FILE *ncclDebugFile = stdout; +pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; + +void ncclDebugInit() { + pthread_mutex_lock(&ncclDebugLock); + if (ncclDebugLevel != -1) return; + const char* nccl_debug = getenv("NCCL_DEBUG"); + if (nccl_debug == NULL) { + ncclDebugLevel = NCCL_LOG_NONE; + } else if (strcasecmp(nccl_debug, "VERSION") == 0) { + ncclDebugLevel = NCCL_LOG_VERSION; + } else if (strcasecmp(nccl_debug, "WARN") == 0) { + ncclDebugLevel = NCCL_LOG_WARN; + } else if (strcasecmp(nccl_debug, "INFO") == 0) { + ncclDebugLevel = NCCL_LOG_INFO; + } else if (strcasecmp(nccl_debug, "ABORT") == 0) { + ncclDebugLevel = NCCL_LOG_ABORT; + } else if (strcasecmp(nccl_debug, "TRACE") == 0) { + ncclDebugLevel = NCCL_LOG_TRACE; + } + + /* Parse the NCCL_DEBUG_SUBSYS env var + * This can be a comma separated list such as INIT,COLL + * or ^INIT,COLL etc + */ + char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS"); + if (ncclDebugSubsysEnv != NULL) { + int invert = 0; + if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; } + ncclDebugMask = invert ? ~0ULL : 0ULL; + char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv); + char *subsys = strtok(ncclDebugSubsys, ","); + while (subsys != NULL) { + uint64_t mask = 0; + if (strcasecmp(subsys, "INIT") == 0) { + mask = NCCL_INIT; + } else if (strcasecmp(subsys, "COLL") == 0) { + mask = NCCL_COLL; + } else if (strcasecmp(subsys, "P2P") == 0) { + mask = NCCL_P2P; + } else if (strcasecmp(subsys, "SHM") == 0) { + mask = NCCL_SHM; + } else if (strcasecmp(subsys, "NET") == 0) { + mask = NCCL_NET; + } else if (strcasecmp(subsys, "GRAPH") == 0) { + mask = NCCL_GRAPH; + } else if (strcasecmp(subsys, "TUNING") == 0) { + mask = NCCL_TUNING; + } else if (strcasecmp(subsys, "ALL") == 0) { + mask = NCCL_ALL; + } + if (mask) { + if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask; + } + subsys = strtok(NULL, ","); + } + free(ncclDebugSubsys); + } + + /* Parse and expand the NCCL_DEBUG_FILE path and + * then create the debug file. But don't bother unless the + * NCCL_DEBUG level is > VERSION + */ + const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE"); + if (ncclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) { + int c = 0; + char debugFn[PATH_MAX+1] = ""; + char *dfn = debugFn; + while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) { + if (ncclDebugFileEnv[c++] != '%') { + *dfn++ = ncclDebugFileEnv[c-1]; + continue; + } + switch (ncclDebugFileEnv[c++]) { + case '%': // Double % + *dfn++ = '%'; + break; + case 'h': // %h = hostname + char hostname[1024]; + getHostName(hostname, 1024, '.'); + dfn += snprintf(dfn, PATH_MAX, "%s", hostname); + break; + case 'p': // %p = pid + dfn += snprintf(dfn, PATH_MAX, "%d", getpid()); + break; + default: // Echo everything we don't understand + *dfn++ = '%'; + *dfn++ = ncclDebugFileEnv[c-1]; + break; + } + } + *dfn = '\0'; + if (debugFn[0] != '\0') { + FILE *file = fopen(debugFn, "w"); + if (file != NULL) { + INFO(NCCL_ALL,"DEBUG file is '%s'", debugFn); + ncclDebugFile = file; + } + } + } + +#ifdef ENABLE_TRACE + ncclEpoch = std::chrono::high_resolution_clock::now(); +#endif + pthread_mutex_unlock(&ncclDebugLock); +} + +/* Common logging function used by the INFO, WARN and TRACE macros + * Also exported to the dynamically loadable Net transport modules so + * they can share the debugging mechanisms and output files + */ +void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) { + if (ncclDebugLevel == -1) ncclDebugInit(); + if (ncclDebugNoWarn == 1 && level == NCCL_LOG_WARN) level = NCCL_LOG_INFO; + + char hostname[1024]; + getHostName(hostname, 1024, '.'); + int cudaDev; + cudaGetDevice(&cudaDev); + + char buffer[1024]; + size_t len = 0; + pthread_mutex_lock(&ncclDebugLock); + if (ncclDebugNoWarn && ncclDebugLevel == NCCL_LOG_WARN) printf("WARN -> INFO\n"); + if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN) + len = snprintf(buffer, sizeof(buffer), + "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line); + else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask)) + len = snprintf(buffer, sizeof(buffer), + "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); +#ifdef ENABLE_TRACE + else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) { + auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; + double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000; + len = snprintf(buffer, sizeof(buffer), + "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line); + } +#endif + if (len) { + va_list vargs; + va_start(vargs, fmt); + (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); + va_end(vargs); + fprintf(ncclDebugFile,"%s\n", buffer); + fflush(ncclDebugFile); + } + pthread_mutex_unlock(&ncclDebugLock); + + // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort() + if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) { + fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", + hostname, getpid(), gettid(), cudaDev, filefunc, line); + abort(); + } +} diff --git a/src/enqueue.cu b/src/enqueue.cc index d283223..2239865 100644 --- a/src/enqueue.cu +++ b/src/enqueue.cc @@ -1,23 +1,21 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "enqueue.h" -#include "checks.h" -#include "param.h" - -#include "collectives/collectives.h" +#include "argcheck.h" // Only generate inline kernels for LL #define NCCL_FUNC5(coll, op, dtype) \ (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \ + (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \ (void*)NCCL_KERN_NAME(coll##LL, op, dtype) #define NCCL_FUNC4(coll, op, dtype) \ - (void*)NCCL_FUNC5(coll##Ring, op, dtype), \ - (void*)NCCL_FUNC5(coll##Tree, op, dtype) + (void*)NCCL_FUNC5(coll##Tree, op, dtype), \ + (void*)NCCL_FUNC5(coll##Ring, op, dtype) // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(coll, op) \ @@ -54,7 +52,7 @@ NCCL_FUNCS3B(coll, copy) // Must be consistent with the ncclFuncSet enum -static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = { +static void* const ncclKerns[NCCL_NUM_FUNCTIONS*ncclNumOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = { NCCL_FUNCS2B(ncclBroadcast), NCCL_FUNCS2A(ncclReduce), NCCL_FUNCS2B(ncclAllGather), @@ -87,7 +85,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par } ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) { - params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels); + params->gridDim.x = std::min<unsigned>(params->gridDim.x, comm->nChannels); // Set active = 2 for the last operation for (int r=0; r<params->gridDim.x; r++) { @@ -207,6 +205,7 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { channel->collCount = 0; } params->gridDim.x = params->blockDim.x = 0; + comm->lastOpCount = comm->opCount; NCCLCHECK(transportStartProxy(comm)); return ncclSuccess; } @@ -228,20 +227,70 @@ ncclResult_t ncclEnqueueEvents(ncclComm_t comm) { /* Enqueueing system : computation of kernel and proxy operations parameters */ /*****************************************************************************/ -static ncclResult_t getPatternInfo(struct ncclInfo* info) { - if (info->coll == ncclCollBroadcast) info->pattern = ncclPatternPipelineFrom; - else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo; - else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing; - else if (info->coll == ncclCollAllReduce) { - if (info->nBytes <= info->comm->treeThreshold) - info->pattern = ncclPatternTreeUpDown; - else - info->pattern = ncclPatternRingTwice; +// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction +// factor is not ideal but works quite well. Powers of two, 64 B to 1 GB. +static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = { + { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .5, .5, .6, .7, .8, .9, .9, 1.0, 1.0, 1.0 }, + { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .8, .7, .7, .7, .6, .6, .7, .7, .8, .8, .9, .9, 1.0 }, + { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .5, .5, .6, .6, .7, .8, .9 } +}; + +static ncclResult_t getAlgoInfo(struct ncclInfo* info) { + struct ncclComm* comm = info->comm; + float minTime = 3600000.0; // Hopefully no operation will take an hour to complete. + // Find algorithm / protocol. + info->algorithm = -1; + info->protocol = -1; + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + float bw = comm->bandwidths[info->coll][a][p]; + if (bw == 0) continue; + int logSize = log2i(info->nBytes>>6); + if (a == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[p][logSize]; + float time = comm->latencies[info->coll][a][p] + (info->nBytes) / (1000 * bw); + if (time < minTime) { + info->algorithm = a; + info->protocol = p; + minTime = time; + } + } } - else { - WARN("Unknown collective %d", info->coll); + if (info->algorithm == -1 || info->protocol == -1) { + WARN("Error : no algorithm/protocol available"); return ncclInternalError; } + //if (comm->rank == 0) INFO(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %d", info->nBytes, info->algorithm, info->protocol, minTime); + TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); + + int nc = comm->nChannels; + int nt = comm->maxThreads[info->protocol]; + int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol]; + while (info->nBytes < nc*nt*threadThreshold) { + if (nc >= 2) nc--; + else if ((nt % 128) == 0) nt/=2; + else break; + } + if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync + info->nChannels = nc; + info->nThreads = nt; + return ncclSuccess; +} + +static ncclResult_t getPatternInfo(struct ncclInfo* info) { + switch (info->coll) { + case ncclCollBroadcast: + info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeDown : ncclPatternPipelineFrom; break; + case ncclCollReduce: + info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUp : ncclPatternPipelineTo; break; + case ncclCollReduceScatter: + case ncclCollAllGather: + info->pattern = ncclPatternRing; break; + case ncclCollAllReduce: + info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break; + default: + WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm); + return ncclInternalError; + } return ncclSuccess; } @@ -264,40 +313,9 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { return ncclSuccess; } -static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) { - // Compute thresholds and limits that users can override - int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD); - int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads); - - // First compute nThreads - int nt = NCCL_LL_MIN_NTHREADS; - while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2; - - // Then compute nChannels - int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold); - if (nc == 0) nc = 1; - if (nc > info->comm->nChannels) nc = info->comm->nChannels; - - // Check if we have a fixed LL threshold, otherwise compute it. - int perThreadThreshold = info->comm->threadThreshold; - if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4; - ssize_t llThreshold = info->comm->llThreshold >= 0 ? - info->comm->llThreshold : - nc*nt*info->nchunksPerLoop*perThreadThreshold; - - if (info->nBytes <= llThreshold) { - *llMode = 1; - *nChannels = nc; - *nThreads = nt; - } else { - *llMode = 0; - *nChannels = info->comm->nChannels; - *nThreads = info->comm->nThreads+1; - } -} - static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) { // Set nstepsPerLoop and nchunksPerLoop + NCCLCHECK(getAlgoInfo(info)); NCCLCHECK(getPatternInfo(info)); NCCLCHECK(getLoopInfo(info)); @@ -307,48 +325,52 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo coll->args.ThisOutput = info->recvbuff; coll->args.comm = info->comm->devComm; coll->args.opCount = info->comm->opCount; + coll->args.nChannels = info->nChannels; + coll->args.nThreads = info->nThreads; - // Compute llMode, nChannels, nThreads - int llMode; - getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode); - - int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0; - coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode); + coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol); - int stepSize = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS; - int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps; - int sliceSteps = (llMode|treeMode) ? 1 : info->sliceSteps; + int stepSize = (info->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : info->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS; + int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; + int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1; int chunkSize = stepSize*chunkSteps; // Compute lastChunkSize - if (treeMode == 1 && llMode == 0) { + if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_SIMPLE) { if (info->pattern == ncclPatternTreeUpDown) { // Optimize chunkSize / nSteps - while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2; - while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2; - while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2; + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*8 && chunkSize > 131072) chunkSize /= 2; + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth*4 && chunkSize > 65536) chunkSize /= 2; + while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].treeUp.depth && chunkSize > 32768) chunkSize /= 2; } // Use lastChunkSize as chunkSize coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); - } else if (llMode == 1) { + } else if (info->protocol == NCCL_PROTO_LL) { int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t); - const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; - coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop); - ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t)); + const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; + coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop); + ALIGN_SIZE(coll->args.lastChunkSize, info->nThreads*sizeof(uint64_t)); coll->args.lastChunkSize /= ncclTypeSize(info->datatype); + } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) { + int nstepsInter = 1+log2i(info->comm->nNodes); + while (info->nBytes / (info->nChannels*chunkSize) < nstepsInter*4 && chunkSize > 32768) chunkSize /= 2; + // Use lastChunkSize as chunkSize + coll->args.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype)); } // Compute nSteps for proxies - size_t nBytes = llMode ? info->nBytes*2 : info->nBytes; - - int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize))); + int chunkEffectiveSize = chunkSize; + if (info->protocol == NCCL_PROTO_LL) chunkEffectiveSize /= 2; + if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS; + //if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol); + int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize))); proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps; proxyArgs->sliceSteps = sliceSteps; proxyArgs->chunkSteps = chunkSteps; - proxyArgs->llMode = llMode; + proxyArgs->protocol = info->protocol; proxyArgs->opCount = info->comm->opCount; - TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p", - coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads, + TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p", + coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads, nLoops, proxyArgs->nsteps, info->comm); return ncclSuccess; } @@ -365,7 +387,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) { memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs)); NCCLCHECK(computeColl(info, &coll, &proxyArgs)); - info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads); + info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, coll.args.nThreads); if (info->comm->userStreamSet == false) { info->comm->userStream = info->stream; info->comm->userStreamSet = true; @@ -401,7 +423,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) { channel->collFifoTail = opIndex; channel->collCount++; } - /*if (llMode == 0)*/ info->comm->opCount++; + info->comm->opCount++; return ncclSuccess; } diff --git a/src/graph/connect.cc b/src/graph/connect.cc new file mode 100644 index 0000000..af481d2 --- /dev/null +++ b/src/graph/connect.cc @@ -0,0 +1,268 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "graph.h" +#include "trees.h" +#include "rings.h" + +/******************************************************************/ +/********************* Internode connection ***********************/ +/******************************************************************/ + +ncclResult_t ncclTopoPreset(struct ncclComm* comm, + struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, + struct ncclTopoRanks* topoRanks) { + int rank = comm->rank; + int localRanks = comm->localRanks; + int nChannels = comm->nChannels; + + for (int c=0; c<nChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + channel->ring.prev = channel->ring.next = -1; + channel->treeUp.up = -1; + for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1; + channel->treeDn.up = -1; + for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1; + + int* ringIntra = ringGraph->intra+c*localRanks; + int* treeIntra = treeGraph->intra+c*localRanks; + + for (int i=0; i<localRanks; i++) { + if (ringIntra[i] == rank) { + topoRanks->ringRecv[c] = ringIntra[0]; + topoRanks->ringSend[c] = ringIntra[localRanks-1]; + channel->ring.prev = (i == 0) ? -1 : ringIntra[i-1]; + channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1]; + } + if (treeIntra[i] == rank) { + int recvIndex = 0, sendIndex = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; + int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks; + + // Tree loop always flows in the same direction. Other trees are symmetric, i.e. + // up/down go in reverse directions + int sym = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP ? 0 : 1; + + // Down tree is common + topoRanks->treeDnRecv[c] = treeIntra[recvIndex]; + topoRanks->treeDnSend[c] = treeIntra[sendIndex]; + channel->treeDn.up = treeIntra[prev]; + channel->treeDn.down[0] = treeIntra[next]; + // Up tree depends on the pattern + topoRanks->treeUpRecv[c] = sym ? topoRanks->treeDnSend[c] : topoRanks->treeDnRecv[c]; + topoRanks->treeUpSend[c] = sym ? topoRanks->treeDnRecv[c] : topoRanks->treeDnSend[c]; + channel->treeUp.down[0] = sym ? channel->treeDn.down[0] : channel->treeDn.up ; + channel->treeUp.up = sym ? channel->treeDn.up : channel->treeDn.down[0]; + } + } + topoRanks->ringPrev[c] = channel->ring.prev; + topoRanks->ringNext[c] = channel->ring.next; + } + // Duplicate channels rings/trees + struct ncclChannel* channel0 = comm->channels; + struct ncclChannel* channel1 = channel0+nChannels; + memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel)); + return ncclSuccess; +} + +static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext, int* firstRanks) { + int nChannels = comm->nChannels; + int nNodes = comm->nNodes; + for (int c=0; c<nChannels; c++) { + int* recv = ringRecv+c*comm->nRanks; + int* send = ringSend+c*comm->nRanks; + int* prev = ringPrev+c*comm->nRanks; + int* next = ringNext+c*comm->nRanks; + struct ncclChannel* channel0 = comm->channels+c; + struct ncclChannel* channel1 = channel0+nChannels; + for (int n=0; n<nNodes; n++) { + int recvRank = recv[firstRanks[n]]; + int prevSendRank = send[firstRanks[(n-1+nNodes)%nNodes]]; + prev[recvRank] = prevSendRank; + if (comm->rank == recvRank) { + channel0->ring.prev = prevSendRank; + channel1->ring.prev = prevSendRank; + } + int sendRank = send[firstRanks[n]]; + int nextRecvRank = recv[firstRanks[(n+1)%nNodes]]; + next[sendRank] = nextRecvRank; + if (comm->rank == sendRank) { + channel0->ring.next = nextRecvRank; + channel1->ring.next = nextRecvRank; + } + } + TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next); + TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next); + } + return ncclSuccess; +} + +static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstRanks) { + for (int n=0; n<nNodes; n++) indexes[n] = ranks[firstRanks[n]]; + return ncclSuccess; +} + +static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) { + if (u0 != -1) tree0->up = indexes[u0]; + if (u1 != -1) tree1->up = indexes[u1]; + return ncclSuccess; +} + +static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) { + int x = 0; + if (down[x] >= 0) x++; + if (down[x] >= 0) { + WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]); + return ncclInternalError; + } + if (r0 != -1) down[x++] = indexes[r0]; + if (r1 != -1) down[x++] = indexes[r1]; + return ncclSuccess; +} + +static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) { + NCCLCHECK(addRanksDown(tree0->down, indexes, d0_0, d0_1)); + NCCLCHECK(addRanksDown(tree1->down, indexes, d1_0, d1_1)); + return ncclSuccess; +} + +static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) { + if (tree->down[0] == upRank) tree->down[0] = -1; + if (rank == upRank) tree->up = -1; + return ncclSuccess; +} + +static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks) { + const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node; + int* indexesSend, *indexesRecv; + NCCLCHECK(ncclCalloc(&indexesSend, nNodes)); + NCCLCHECK(ncclCalloc(&indexesRecv, nNodes)); + + // Compute tree depth. Not an exact value but a good approximation in most + // cases + int depth = comm->nRanks/nNodes - 1 + log2i(nNodes); + + int u0, d0_0, d0_1, u1, d1_0, d1_1; + NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1)); + for (int c=0; c<nChannels; c++) { + struct ncclChannel* channel0 = comm->channels+c; + struct ncclChannel* channel1 = channel0+nChannels; + NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks)); + NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks)); + NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node])); + NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node])); + int root = indexesSend[node]; + if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1)); + if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1)); + NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks)); + NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks)); + NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node])); + NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? root : indexesRecv[node])); + if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1)); + if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1)); + TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c, channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]); + TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]); + TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c, channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]); + TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]); + channel0->treeUp.depth = channel1->treeUp.depth = depth; + } + free(indexesSend); + free(indexesRecv); + return ncclSuccess; +} + +// Legacy naming +NCCL_PARAM(MinNrings, "MIN_NRINGS", -2); +NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2); +// New naming +NCCL_PARAM(MinNchannels, "MIN_NCHANNELS", -2); +NCCL_PARAM(MaxNchannels, "MAX_NCHANNELS", -2); + +int ncclMinNchannels() { + int minNchannels = 0; + if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings(); + if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels(); + if (minNchannels > MAXCHANNELS) { + WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS); + minNchannels = MAXCHANNELS; + } + if (minNchannels < 0) minNchannels = 0; + return minNchannels; +} +int ncclMaxNchannels() { + int maxNchannels = MAXCHANNELS; + if (ncclParamMaxNrings() != -2) maxNchannels = ncclParamMaxNrings(); + if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels(); + if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS; + if (maxNchannels < 1) { + WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels); + maxNchannels = 1; + } + return maxNchannels; +} + +ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings) { + // Gather data from all ranks + int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend; + int nranks = comm->nRanks; + int nChannels = comm->nChannels; + NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS)); + for (int i=0; i<nranks; i++) { + for (int c=0; c<nChannels;c++) { + ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c]; + ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c]; + ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c]; + ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c]; + treeUpRecv[c*nranks+i] = allTopoRanks[i]->treeUpRecv[c]; + treeUpSend[c*nranks+i] = allTopoRanks[i]->treeUpSend[c]; + treeDnRecv[c*nranks+i] = allTopoRanks[i]->treeDnRecv[c]; + treeDnSend[c*nranks+i] = allTopoRanks[i]->treeDnSend[c]; + } + } + + // Connect rings and trees. This should also duplicate the channels. + NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks)); + NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks)); + + // Duplicate ringPrev/ringNext for ncclBuildRing + memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); + memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int)); + + // Duplication should be complete now + nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2); + + // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS. + // We permit combining max, then min, to only use the first channels, then duplicate them. + nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels); + int c; + for (c=nChannels; c<ncclMinNchannels(); c++) { + memcpy(ringPrev+c*nranks, ringPrev+(c-nChannels)*nranks, nranks*sizeof(int)); + memcpy(ringNext+c*nranks, ringNext+(c-nChannels)*nranks, nranks*sizeof(int)); + memcpy(comm->channels+c, comm->channels+c-nChannels, sizeof(struct ncclChannel)); + } + nChannels = comm->nChannels = c; + + // Create rings array and check all is fine + NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext)); + + free(ringRecv); + free(ringSend); + free(ringPrev); + free(ringNext); + free(treeUpRecv); + free(treeUpSend); + free(treeDnRecv); + free(treeDnSend); + + return ncclSuccess; +} diff --git a/src/graph/paths.cc b/src/graph/paths.cc new file mode 100644 index 0000000..ce1772c --- /dev/null +++ b/src/graph/paths.cc @@ -0,0 +1,363 @@ +/************************************************************************* + * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "graph.h" +#include "topo.h" +#include "comm.h" +#include "net.h" + +// Pre-compute GPU->NIC, GPU->GPU and NIC->GPU paths + +struct ncclTopoNodeList { + struct ncclTopoNode* list[NCCL_TOPO_MAX_NODES]; + int count; +}; + +static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* node, int t, int64_t id, struct ncclTopoLinkList** path) { + for (int i=0; i<system->nodes[t].count; i++) { + if (system->nodes[t].nodes[i].id == id) { + *path = node->paths[t]+i; + return ncclSuccess; + } + } + WARN("Could not find node of type %d id %lx\n", t, id); + return ncclInternalError; +} + +static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) { + if (baseNode->paths[baseNode->type] == NULL) { + NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count)); + } + + // breadth-first search to set all paths to that node in the system + struct ncclTopoNodeList nodeList; + struct ncclTopoNodeList nextNodeList; + nodeList.count = 1; nodeList.list[0] = baseNode; + nextNodeList.count = 0; + struct ncclTopoLinkList* basePath; + NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath)); + basePath->count = 0; + basePath->width = LOC_WIDTH; + basePath->type = LINK_LOC; + + while (nodeList.count) { + nextNodeList.count = 0; + for (int n=0; n<nodeList.count; n++) { + struct ncclTopoNode* node = nodeList.list[n]; + struct ncclTopoLinkList* path; + NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path)); + for (int l=0; l<node->nlinks; l++) { + struct ncclTopoLink* link = node->links+l; + struct ncclTopoNode* remNode = link->remNode; + if (remNode->paths[baseNode->type] == NULL) { + NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count)); + } + struct ncclTopoLinkList* remPath; + NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath)); + int width = std::min(path->width, link->width); + if (remPath->width < width) { + // Find reverse link + for (int l=0; l<remNode->nlinks; l++) { + if (remNode->links[l].remNode == node) { + remPath->list[0] = remNode->links+l; + break; + } + } + if (remPath->list[0] == NULL) { + WARN("Failed to find reverse path from remNode id %d type %d nlinks %d to node id %d type %d", + remNode->id, remNode->type, remNode->nlinks, node->id, node->type); + return ncclInternalError; + } + // Copy the rest of the path + for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i]; + remPath->count = path->count + 1; + remPath->width = width; + + // Consider the path is QPI when going through the CPU + // Also don't consider LINK_NET as we only care about the NIC->GPU path. + int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type; + remPath->type = std::max(path->type, type); + + // Add to the list for the next iteration if not already in the list + // Disallow GPUs as intermediate steps for now + if (remNode->type != GPU) { + int i; + for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break; + if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode; + } + } + } + } + memcpy(&nodeList, &nextNodeList, sizeof(nodeList)); + } + return ncclSuccess; +} + +static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* node) { + char line[1024]; +#ifdef ENABLE_TRACE + INFO(NCCL_GRAPH, "Paths from %s/%lX :", topoNodeTypeStr[node->type], node->id); +#else + sprintf(line, "%s/%lX :", topoNodeTypeStr[node->type], node->id); + int offset = strlen(line); +#endif + for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) { + if (node->paths[t] == NULL) continue; + for (int n = 0; n<system->nodes[t].count; n++) { +#ifdef ENABLE_TRACE + line[0] = 0; + int offset = 0; + for (int i=0; i<node->paths[t][n].count; i++) { + struct ncclTopoLink* link = node->paths[t][n].list[i]; + struct ncclTopoNode* remNode = link->remNode; + sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id); + offset = strlen(line); + } + INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width); +#else + sprintf(line+offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type); + offset = strlen(line); +#endif + } + } +#ifndef ENABLE_TRACE + INFO(NCCL_GRAPH, "%s", line); +#endif +} + +ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system) { + for (int i=0; i<system->nodes[GPU].count; i++) { + printNodePaths(system, system->nodes[GPU].nodes+i); + } + for (int i=0; i<system->nodes[NET].count; i++) { + printNodePaths(system, system->nodes[NET].nodes+i); + } + return ncclSuccess; +} + +static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu) { + // Find the closest CPU to a GPU + int minHops = 0; + int localCpu = -1; + struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[gpu].paths[CPU]; + for (int c=0; c<system->nodes[CPU].count; c++) { + int hops = paths[c].count; + if (minHops == 0 || hops < minHops) { + localCpu = c; + minHops = hops; + } + } + if (localCpu == -1) { + WARN("Error : could not find CPU close to GPU %d", gpu); + return ncclInternalError; + } + *retCpu = localCpu; + return ncclSuccess; +} + +static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int i1, int t2, int i2) { + struct ncclTopoNode* cpuNode = system->nodes[CPU].nodes+c; + struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1; + + int l=0; + // Node 1 -> CPU + for (int i=0; i<srcNode->paths[CPU][c].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[CPU][c].list[i]; + // CPU -> Node 2 + for (int i=0; i<cpuNode->paths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i]; + + // Update path characteristics + srcNode->paths[t2][i2].count = l; + srcNode->paths[t2][i2].type = LINK_QPI; + srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width); + return ncclSuccess; +} + +// Remove/free paths for a given type +static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) { + for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) { + for (int n=0; n<system->nodes[t].count; n++) { + struct ncclTopoNode* node = system->nodes[t].nodes+n; + free(node->paths[nodeType]); + node->paths[nodeType] = NULL; + } + } +} + +ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) { + // Precompute paths between GPUs/NICs. + + // Remove everything in case we're re-computing + for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t); + + // Set direct paths from/to CPUs. We need them in many cases. + for (int c=0; c<system->nodes[CPU].count; c++) { + NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system)); + } + + // Set direct paths from/to GPUs. + for (int g=0; g<system->nodes[GPU].count; g++) { + // Compute paths to GPU g + NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system)); + + if (peerInfos == NULL) continue; + // Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM + struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank; + for (int p=0; p<system->nodes[GPU].count; p++) { + if (p == g) continue; + struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank; + int p2p; + NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo)); + if (p2p == 0) { + int shm; + NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo)); + if (shm == 1) { + // We cannot use GPU Direct, so we need all traffic to go through a CPU + int cpu; + NCCLCHECK(getLocalCpu(system, g, &cpu)); + NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g)); + } else { + // We cannot communicate with that peer. + system->nodes[GPU].nodes[p].paths[GPU][g].count = 0; + } + } + } + } + + // Set direct paths from/to NICs. + for (int n=0; n<system->nodes[NET].count; n++) { + struct ncclTopoNode* netNode = system->nodes[NET].nodes+n; + NCCLCHECK(ncclTopoSetPaths(netNode, system)); + + if (peerInfos == NULL) continue; + for (int g=0; g<system->nodes[GPU].count; g++) { + if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) == 0) { + // We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths + // to go through a CPU + int localCpu; + NCCLCHECK(getLocalCpu(system, g, &localCpu)); + NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g)); + NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n)); + } + } + } + + return ncclSuccess; +} + +ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) { + int *domains; + int64_t *ids; + NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count)); + NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count)); + int myDomain = 0; + for (int g=0; g<system->nodes[GPU].count; g++) { + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + domains[g] = g; + ids[g] = gpu->id; + for (int p=0; p<g; p++) { + if (gpu->paths[GPU][p].count > 0) { + domains[g] = std::min(domains[g], domains[p]); + } + } + if (gpu->rank == comm->rank) myDomain = domains[g]; + } + + int ngpus = system->nodes[GPU].count; + for (int i=0; i<ngpus; i++) { + if (domains[i] == myDomain) continue; + struct ncclTopoNode* gpu = NULL; + int g; + for (g=0; g<system->nodes[GPU].count /* This one varies over the loops */; g++) { + gpu = system->nodes[GPU].nodes+g; + if (gpu->id == ids[i]) break; else gpu=NULL; + } + if (gpu == NULL) { + WARN("Could not find id %lx", ids[i]); + free(domains); + free(ids); + return ncclInternalError; + } + + // Remove GPUs I can't access (even indirectly) from my view of the node + for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) { + for (int n=0; n<system->nodes[t].count; n++) { + struct ncclTopoNode* node = system->nodes[t].nodes+n; + if (node == gpu) continue; + for (int l=0; l<node->nlinks; l++) { + while (l<node->nlinks && node->links[l].remNode == gpu) { + if (l<node->nlinks-1) + memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink)); + node->nlinks--; + } + if (l<node->nlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) { + node->links[l].remNode--; + } + } + } + } + if (g != system->nodes[GPU].count-1) + memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode)); + system->nodes[GPU].count--; + } + + comm->localRanks = system->nodes[GPU].count; + if (system->nodes[GPU].count == comm->nRanks) { + // Trim network + ncclTopoRemovePathType(system, NET); + system->nodes[NET].count = 0; + } + free(domains); + free(ids); + return ncclSuccess; +} + +static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) { + int nvlSpeed = 0; + int nvlPeers = 0; + int pciSpeed = 0; + for (int l=0; l<node->nlinks; l++) { + if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width; + if (node->links[l].remNode->type == GPU) nvlPeers++; else nvlPeers = 2; + if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width; + } + *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed)); + return ncclSuccess; +} + +ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) { + // Compute max speed to try to accelerate the search. + system->maxSpeed = LOC_WIDTH; + + for (int g=0; g<system->nodes[GPU].count; g++) { + NCCLCHECK(getGpuSpeed(system->nodes[GPU].nodes+g, &system->maxSpeed)); + } + if (system->nodes[NET].count) { + // Try to assign one NIC per GPU + int netMaxSpeed = 0; + int netMaxSpeedCount = 0; + for (int n=0; n<system->nodes[NET].count; n++) { + int maxSpeed = 0; + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + for (int g=0; g<system->nodes[GPU].count; g++) { + maxSpeed = std::max(maxSpeed, net->paths[GPU][g].width); + } + if (maxSpeed > netMaxSpeed) { + netMaxSpeed = maxSpeed; + netMaxSpeedCount = 1; + } else if (maxSpeed == netMaxSpeed) { + netMaxSpeedCount++; + } + } + system->maxSpeed = std::min(system->maxSpeed, netMaxSpeedCount*NET_WIDTH); + } + return ncclSuccess; +} + +void ncclTopoFree(struct ncclTopoSystem* system) { + for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t); + free(system); +} diff --git a/src/graph/rings.cc b/src/graph/rings.cc new file mode 100644 index 0000000..5aacbb5 --- /dev/null +++ b/src/graph/rings.cc @@ -0,0 +1,57 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" + +#define MAXWIDTH 20 +#define PREFIXLEN 15 +#define STRLENGTH (PREFIXLEN+5*MAXWIDTH) +void dumpLine(int* values, int nranks, const char* prefix) { + int prefixlen = strlen(prefix); + char line[STRLENGTH+1]; + line[STRLENGTH] = '\0'; + memset(line, ' ', STRLENGTH); + strncpy(line, prefix, PREFIXLEN); + for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]); + INFO(NCCL_INIT,"%s", line); +} + +ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { + for (int r=0; r<nrings; r++) { + char prefix[30]; + /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r); + dumpLine(prev+r*nranks, nranks, prefix); + sprintf(prefix, "[%d] Channel %d Next : ", rank, r); + dumpLine(next+r*nranks, nranks, prefix);*/ + + int current = rank; + for (int i=0; i<nranks; i++) { + rings[r*nranks+i] = current; + current = next[r*nranks+current]; + } + sprintf(prefix, "Channel %02d/%02d : ", r, nrings); + if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix); + if (current != rank) { + WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank); + return ncclInternalError; + } + // Check that all ranks are there + for (int i=0; i<nranks; i++) { + int found = 0; + for (int j=0; j<nranks; j++) { + if (rings[r*nranks+j] == i) { + found = 1; + break; + } + } + if (found == 0) { + WARN("Error : ring %d does not contain rank %d", r, i); + return ncclInternalError; + } + } + } + return ncclSuccess; +} diff --git a/src/graph/rings.h b/src/graph/rings.h new file mode 100644 index 0000000..c52b1ca --- /dev/null +++ b/src/graph/rings.h @@ -0,0 +1,7 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next); diff --git a/src/graph/search.cc b/src/graph/search.cc new file mode 100644 index 0000000..3a8b4e7 --- /dev/null +++ b/src/graph/search.cc @@ -0,0 +1,594 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "graph.h" +#include "topo.h" + +static ncclResult_t ncclTopoFollowPath(struct ncclTopoGraph* graph, struct ncclTopoLinkList* path, struct ncclTopoNode** node, int width, int typeSave) { + if (path->count == 0) return ncclSuccess; + + *node = NULL; + if (width > 0) { + if (path->type > graph->type) return ncclSuccess; + graph->type = std::max(graph->type, path->type); + graph->nHops += path->count; + } else { + graph->type = typeSave; + graph->nHops -= path->count; + } + + for (int i=0; i<path->count; i++) { + if (path->list[i]->width < width) { + // Can't follow this path, rewind and exit + for (int j=0; j<i; j++) path->list[j]->width += width; + return ncclSuccess; + } + path->list[i]->width -= width; + } + *node = path->list[path->count-1]->remNode; + return ncclSuccess; +} + +static int gpuPciWidth(struct ncclTopoNode* gpu) { + for (int l=0; l<gpu->nlinks; l++) { + struct ncclTopoLink* gpuLink = gpu->links+l; + if (gpuLink->type != LINK_PCI) continue; + struct ncclTopoNode* pci = gpuLink->remNode; + for (int l=0; l<pci->nlinks; l++) { + struct ncclTopoLink* pciLink = pci->links+l; + if (pciLink->remNode != gpu) continue; + return std::min(gpuLink->width, pciLink->width); + } + } + return -1; +} + +/* Choose the order in which we try next GPUs. This is critical for the search + to quickly converge to the best solution even if it eventually times out. */ +struct ncclGpuScore { + int g; // Retain the index + int startIndex; // Least important + int intraNhops; + int intraWidth; + int interNhops; + int interPciWidth; + int interWidth; // Most important +}; + +static int cmpScore(const void * g1, const void * g2) { + struct ncclGpuScore *s1 = (struct ncclGpuScore*)g1; + struct ncclGpuScore *s2 = (struct ncclGpuScore*)g2; + int d; + if ((d = (s2->interWidth - s1->interWidth))) return d; + if ((d = (s2->interPciWidth - s1->interPciWidth))) return d; + if ((d = (s1->interNhops - s2->interNhops))) return d; + if ((d = (s2->intraWidth - s1->intraWidth))) return d; + if ((d = (s1->intraNhops - s2->intraNhops))) return d; + return s1->startIndex - s2->startIndex; +} + +static int cmpIntraScores(struct ncclGpuScore* scores, int count) { + int intraWidth = scores[0].intraWidth; + int intraNhops = scores[0].intraNhops; + for (int i=1; i<count; i++) { + if (scores[i].intraWidth != intraWidth || scores[i].intraNhops != intraNhops) return 1; + } + return 0; +} + +static ncclResult_t getNetPaths(struct ncclTopoSystem* system, const uint64_t flag, struct ncclTopoLinkList** netPaths) { + for (int n=0; n<system->nodes[NET].count; n++) { + if (system->nodes[NET].nodes[n].used & flag) { + *netPaths=system->nodes[NET].nodes[n].paths[GPU]; + return ncclSuccess; + } + } + return ncclInternalError; +} + +ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* gpu, int* next, int* countPtr, int sortNet) { + const uint64_t flag = 1ULL<<(graph->nChannels); + int ngpus = system->nodes[GPU].count; + struct ncclTopoLinkList* paths = gpu->paths[GPU]; + struct ncclTopoLinkList* netPaths = NULL; + if (sortNet) NCCLCHECK(getNetPaths(system, flag, &netPaths)); + + struct ncclGpuScore scores[NCCL_TOPO_MAX_NODES]; + memset(scores, 0, ngpus*sizeof(struct ncclGpuScore)); + int start = gpu-system->nodes[GPU].nodes; + int count = 0; + for (int i=1; i<ngpus; i++) { + int g = (start+i)%ngpus; + if (paths[g].count == 0) continue; // There is no path to that GPU + if (system->nodes[GPU].nodes[g].used & flag) continue; + scores[count].g = g; + scores[count].startIndex = i; + scores[count].intraNhops = paths[g].count; + scores[count].intraWidth = paths[g].width; + if (netPaths) { + scores[count].interNhops = netPaths[g].count; + scores[count].interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g); + scores[count].interWidth = netPaths[g].width; + } + count++; + } + + // Sort GPUs + qsort(scores, count, sizeof(struct ncclGpuScore), cmpScore); + + // Check if all have the same intra-node score in which case we go reverse for sortNet = -1 + if (sortNet == -1 && cmpIntraScores(scores, count) == 0) { + for (int i=0; i<count; i++) next[i] = scores[count-1-i].g; + } else { + for (int i=0; i<count; i++) next[i] = scores[i].g; + } + *countPtr = count; + return ncclSuccess; +} + +ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time); + +#define NCCL_SEARCH_TIMEOUT (1ULL<<20) // This should get contain all search within a second or so. + +#define FORCED_ORDER_PCI 1 +#define FORCED_ORDER_REPLAY 2 + +ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int step, int* g) { + *g = -1; + if (graph->nChannels == 0) return ncclInternalError; + int ngpus = system->nodes[GPU].count; + int nextRank = graph->intra[(graph->nChannels-1)*ngpus+step+1]; + for (int i=0; i<ngpus; i++) if (system->nodes[GPU].nodes[i].rank == nextRank) { + *g = i; + return ncclSuccess; + } + if (*g == -1) return ncclInternalError; + return ncclSuccess; +} + +ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time); + +ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoLinkList* paths, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time, int g, int speed) { + int typeSave = graph->type; + const uint64_t flag = 1ULL<<(graph->nChannels); + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, speed, typeSave)); + if (gpu) { + gpu->used ^= flag; + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, maxSpeed, time)); + gpu->used ^= flag; + if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, -speed, typeSave)); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) { + // 0. When we are trying to increase speedIntra, do not copy if the solution has less channels + // since it would likely impact the rings algorithms too. + if (graph->speedIntra > graph->speedInter && graph->nChannels < refGraph->nChannels) return ncclSuccess; + + // 1. Try to get better bandwidth + if (graph->nChannels*graph->speedIntra < refGraph->nChannels*refGraph->speedIntra) return ncclSuccess; + if (graph->nChannels*graph->speedIntra > refGraph->nChannels*refGraph->speedIntra) { + *copy = 1; + return ncclSuccess; + } + // 2. Give an advantage when all channels are the same + if (graph->nChannels > 1 && graph->sameChannels && refGraph->sameChannels == 0) { + *copy = 1; + return ncclSuccess; + } + // 3. Less hops + if (graph->nHops < refGraph->nHops) *copy = 1; + return ncclSuccess; +} + +ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time) { + if ((*time) <= 0) return ncclSuccess; + (*time)--; + + int ngpus = system->nodes[GPU].count; + if (step == ngpus) { + // Determine whether we found a better solution or not + int copy = 0; + int sameChannels = graph->sameChannels; + if (graph->nChannels > 0) { + int* intra = graph->intra+graph->nChannels*ngpus; + for (int g=0; g<ngpus; g++) if (intra[g] != intra[g-ngpus]) graph->sameChannels = 0; + } + graph->nChannels++; + NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, ©)); + if (copy) { + memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph)); + if (graph->nChannels*graph->speedIntra == maxSpeed) *time = -1; + } + if (graph->nChannels < MAXCHANNELS/2) { + NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, maxSpeed, time)); + } + graph->nChannels--; + graph->sameChannels = sameChannels; + return ncclSuccess; + } + graph->intra[graph->nChannels*ngpus+step] = gpu->rank; + if (step == backToNet) { + // first get back to NIC + if (system->nodes[NET].count) { + int maxWidth = 0; + struct ncclTopoLinkList* paths = gpu->paths[NET]; + for (int n=0; n<system->nodes[NET].count; n++) { + if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue; + maxWidth = std::max(paths[n].width, maxWidth); + } + for (int n=0; n<system->nodes[NET].count; n++) { + if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue; + if (paths[n].width == maxWidth) { + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + int typeSave = graph->type; + NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, graph->speedInter, typeSave)); + if (net) { + graph->inter[graph->nChannels*2+1] = net->id; + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, maxSpeed, time)); + NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, -graph->speedInter, typeSave)); + } + } + } + } + } else if (step < system->nodes[GPU].count-1) { + // Go to next GPU + struct ncclTopoLinkList* paths = gpu->paths[GPU]; + int next[NCCL_TOPO_MAX_NODES]; + int count; + if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order + next[0] = step+1; + count = 1; + } else if (forcedOrder == FORCED_ORDER_REPLAY) { // Try last channel order + NCCLCHECK(ncclTopoReplayGetGpu(system, graph, step, next)); + count = 1; + } else { // Normal search + NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 )); + } + for (int i=0; i<count; i++) { + int g = next[i]; + int nvlink = graph->nvlink; + graph->nvlink &= paths[g].type <= LINK_NVL ? 1 : 0; + int speed = graph->speedIntra; + if (paths[g].type == LINK_QPI) speed = INTEL_P2P_OVERHEAD(speed); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, step+1, backToNet, backToFirstRank, forcedOrder, maxSpeed, time, g, speed)); + graph->nvlink = nvlink; + } + } else if (step == backToFirstRank) { + // Find first GPU and loop back to it + int g; + int rank = graph->intra[graph->nChannels*ngpus]; + for (g=0; g<ngpus; g++) { + if (system->nodes[GPU].nodes[g].rank == rank) break; + } + if (g == ngpus) { + WARN("Could not find GPU with rank %d\n", rank); + return ncclInternalError; + } + struct ncclTopoLinkList* paths = gpu->paths[GPU]; + struct ncclTopoNode* firstGpu = system->nodes[GPU].nodes+g; + int typeSave = graph->type; + NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, graph->speedIntra, typeSave)); + if (firstGpu) { + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, maxSpeed, time)); + NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, -graph->speedIntra, typeSave)); + } + } else { + // Next path + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, maxSpeed, time)); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int maxSpeed, int* time) { + const uint64_t flag = 1ULL<<(graph->nChannels); + const int speed = graph->speedInter; + for (int n=0; n<system->nodes[NET].count; n++) { + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + struct ncclTopoNode* gpu; + if (net->used == 0) { + graph->inter[graph->nChannels*2] = net->id; + for (int i=0; i<system->nodes[NET].count; i++) { + if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag; + } + struct ncclTopoLinkList* paths = net->paths[GPU]; + + // First try the PCI order to set a reference + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, speed)); + // Then try to replay the last channel + if (graph->nChannels > 0) { + int g; + NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, g, speed)); + } + + // Then try the most local GPUs + int maxWidth = 0, minHops = 0xfffffff; + for (int g=0; g<system->nodes[GPU].count; g++) { + if (paths[g].width > maxWidth) { + maxWidth = paths[g].width; + minHops = paths[g].count; + } else if (paths[g].width == maxWidth && paths[g].count < minHops) { + minHops = paths[g].count; + } + } + if (maxWidth >= speed) { + // In the first loop, avoid using GPUs in both directions between channels (one channel + // sending from that GPU and one channel receiving to that GPU), since that usually leads + // to lower BW. + for (int tryGpuBidir=0; tryGpuBidir<2; tryGpuBidir++) { + for (int g=0; g<system->nodes[GPU].count; g++) { + if (paths[g].width == maxWidth && paths[g].count == minHops) { + gpu = system->nodes[GPU].nodes+g; + int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1; + if (tryGpuBidir == gpuUsed) { + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, 0, maxSpeed, time, g, speed)); + } + } + } + } + } + for (int i=0; i<system->nodes[NET].count; i++) { + if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag; + } + } + } + return ncclSuccess; +} + +/* Search Patterns + * + * Intra-node + * Ring : GPU a -> GPU b -> .. -> GPU x -> GPU a + * (=Split Tree Loop) + * Tree : GPU a -> GPU b -> .. -> GPU x + * (=Split Tree) + * + * Inter-node + * Ring : NET n -> GPU a -> GPU b -> .. -> GPU x -> NET n (or m if crossNic) + * Tree : NET n -> GPU a -> GPU b -> .. -> GPU x + * `--> NET n (or m if crossNic) + * Split Tree : NET n -> GPU a -> GPU b -> .. -> GPU x + * `--> NET n (or m if crossNic) + * Split Tree Loop : NET n -> GPU a -> GPU b -> .. -> GPU x -> GPU a + * `--> NET n (or m if crossNic) + */ +ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) { + if (system->nodes[NET].count) { + if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1; + else if (pattern == NCCL_TOPO_PATTERN_TREE) *backToNet = 0; + else *backToNet = 1; + if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1; + else *backToFirstRank = -1; + } else { + *backToNet = -1; + if (pattern == NCCL_TOPO_PATTERN_RING || pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) *backToFirstRank = system->nodes[GPU].count-1; + else *backToFirstRank = -1; + } + return ncclSuccess; +} + +ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time) { + int backToNet, backToFirstRank; + NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank)); + if (system->nodes[NET].count) { + // Start from NET + ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, maxSpeed, time); + } else { + // Start from GPU 0 + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, graph->speedIntra)); + if (graph->nChannels > 0) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, 0, graph->speedIntra)); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, 0, maxSpeed, time, 0, graph->speedIntra)); + } + return ncclSuccess; +} + +/* Parse user defined rings. Format is like : + * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0" + * Rings with a non-matching number of ranks are ignored so we can provide + * rings for multiple cases. + */ +#define MAX_ENV_RANKS 512 +static ncclResult_t parseGraph(const char* str, int* nChannelsRet, int ngpus, int* channels) { + int ranks[MAX_ENV_RANKS]; + int nChannels = 0; + int rank = 0; + int offset = 0; + int status = 0; // 0 : between numbers, 1 : inside number + do { + int digit = str[offset] - '0'; + if (digit >= 0 && digit <= 9) { + if (status == 0) { + ranks[rank] = digit; + status = 1; + } else { + ranks[rank] = ranks[rank]*10+digit; + } + } else { + if (status == 1) { + rank++; + if (rank == MAX_ENV_RANKS) goto end; + } + status = 0; + if (str[offset] == '|' || str[offset] == '\0') { + // Ignore if ngpus doesn't match + if (rank != ngpus) goto newchannel; + + for (int r=0; r<ngpus; r++) { + int rank = ranks[r]; + // Ignore if ranks are out of bounds + if (rank < 0 || rank >= ngpus) goto newchannel; + // Ignore if ranks are duplicate + for (int i=0; i<r; i++) + if (ranks[i] == rank) goto newchannel; + + channels[nChannels*ngpus+r] = rank; + } + nChannels++; +newchannel: + rank = 0; + } + } + } while (str[offset++] != 0); +end: + *nChannelsRet = nChannels; + return ncclSuccess; +} + +ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { + int ngpus = system->nodes[GPU].count; + int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0; + graph->speedIntra = graph->speedInter = 0; + if (graph->crossNic == 2) graph->crossNic = 0; + graph->nvlink = 0; + graph->type = LINK_LOC; + graph->nChannels = 0; + graph->sameChannels = 1; + + char* str = getenv("NCCL_GRAPH"); + if (str) { + NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra)); + for (int i=0; i<graph->nChannels*ngpus; i++) { + // Translate gpu numbers into ranks + graph->intra[i] = system->nodes[GPU].nodes[graph->intra[i]].rank; + } + // TODO : let user specify NICs + graph->inter[0] = graph->inter[1] = 0; + graph->speedIntra = graph->speedInter = PCI_WIDTH+2; + graph->nvlink = 0; + if (graph->pattern == NCCL_TOPO_PATTERN_RING) { + // Reverse the loop + for (int c=0; c<graph->nChannels; c++) { + for (int i=0; i<=ngpus/2; i++) { + int tmp = graph->intra[ngpus*c+i]; + graph->intra[ngpus*c+i] = graph->intra[ngpus*c+(ngpus-i)%ngpus]; + graph->intra[ngpus*c+ngpus-i] = tmp; + } + } + } + if (graph->nChannels) return ncclSuccess; + } + + if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; + + struct ncclTopoGraph tmpGraph; + memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); + int bestSpeed = 0; + + // First try crossnic, then decrease speed and finally increase speedIntra. + tmpGraph.speedIntra = tmpGraph.speedInter = system->maxWidth; + int maxSpeed = system->maxSpeed; + tmpGraph.pattern = graph->pattern; + +search: + int time = NCCL_SEARCH_TIMEOUT; + tmpGraph.nvlink = 1; + tmpGraph.nChannels = 0; + tmpGraph.sameChannels = 1; + NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, maxSpeed, &time)); +#if 0 + printf("Pattern %d, crossNic %d, Speed %d/%d, type %d -> nChannels %dx%d/%d %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.type, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : ""); + for (int c=0; c<graph->nChannels; c++) { + printf("%2d : ", c); + for (int g=0; g<ngpus; g++) { + printf("%d ", graph->intra[c*ngpus+g]); + } + printf("\n"); + } +#endif + if (time == -1) goto done; + // We already have a solution and we timed out so lower speed will just timeout as well + if (time == 0 && graph->nChannels > 0) goto done; + if ((graph->nChannels > 0) && (bestSpeed == 0)) bestSpeed = graph->speedIntra; + + if (tmpGraph.speedIntra == tmpGraph.speedInter) { + // First pass, we don't have a solution yet ; try to go slower. + + // Try a simpler tree + if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) { + tmpGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE; + goto search; + } + if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) { + tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE; + goto search; + } + tmpGraph.pattern = graph->pattern; + + if (tmpGraph.type < LINK_QPI) { + tmpGraph.type += 1; + goto search; + } + tmpGraph.type = graph->type; + + if (crossNic && tmpGraph.crossNic == 0) { + // Try again with crossNic if permitted + tmpGraph.crossNic = crossNic; + goto search; + } + tmpGraph.crossNic = graph->crossNic; + + // Try to reduce speed per channel + tmpGraph.speedIntra = tmpGraph.speedInter -= 3; + if (tmpGraph.speedIntra >= bestSpeed/2 && tmpGraph.speedIntra >= 3) goto search; + } + +done: + // We have a solution now. See if we can increase speedIntra + if (tmpGraph.speedIntra == tmpGraph.speedInter) { + time = -1; + memcpy(&tmpGraph, graph, sizeof(tmpGraph)); + } + if (time != 0 && tmpGraph.pattern != NCCL_TOPO_PATTERN_RING && tmpGraph.speedIntra == graph->speedIntra) { + // Try to increase the intra speed only but keeping nChannels the same + tmpGraph.speedIntra += 3; + maxSpeed = tmpGraph.speedIntra * graph->nChannels; + if (tmpGraph.speedIntra <= tmpGraph.speedInter*2) goto search; + } + + if (graph->nChannels == 0) { + WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern); + for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].rank; + graph->inter[0] = graph->inter[1] = 0; + graph->speedIntra = graph->speedInter = 3; + graph->nvlink = 0; + graph->nChannels = 1; + } + return ncclSuccess; +} + +ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { + INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %d/%d, nvlink %d, type %d, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, graph->nvlink, graph->type, graph->sameChannels); + int ngpus = system->nodes[GPU].count; + + char line[1024]; + for (int c=0; c<graph->nChannels; c++) { + sprintf(line, "%2d :", c); + int offset = strlen(line); + if (system->nodes[NET].count > 0) { + sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c]); + offset = strlen(line); + } + for (int i=0; i<ngpus; i++) { + sprintf(line+offset, " %s/%d", topoNodeTypeStr[GPU], graph->intra[ngpus*c+i]); + offset = strlen(line); + } + if (system->nodes[NET].count > 0) { + sprintf(line+offset, " %s/%d", topoNodeTypeStr[NET], graph->inter[2*c+1]); + offset = strlen(line); + } + INFO(NCCL_GRAPH, "%s", line); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* dev) { + *dev = graph->inter[(channelId%graph->nChannels)*2+dir]; + return ncclSuccess; +} diff --git a/src/graph/topo.cc b/src/graph/topo.cc new file mode 100644 index 0000000..a1b3209 --- /dev/null +++ b/src/graph/topo.cc @@ -0,0 +1,641 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "graph.h" +#include "topo.h" +#include "comm.h" +#include "nvmlwrap.h" +#include "net.h" +#include <sys/stat.h> +#include <fcntl.h> + +#define BUSID_SIZE (sizeof("0000:00:00.0")) +#define BUSID_REDUCED_SIZE (sizeof("0000:00")) + +const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" }; + +const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; +const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" }; + +/******************************************************************/ +/******************* Graph Creation Functions *********************/ +/******************************************************************/ +static int getNumaId(char *path) { + char npath[PATH_MAX]; + snprintf(npath, PATH_MAX, "%s/numa_node", path); + npath[PATH_MAX-1] = '\0'; + + int numaId = -1; + FILE *file = fopen(npath, "r"); + if (file == NULL) return -1; + if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; } + fclose(file); + + return numaId; +} + +static ncclResult_t getPciPath(char* busId, char** path) { + for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]); + char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0"; + memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1); + memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1); + *path = realpath(busPath, NULL); + if (*path == NULL) { + WARN("Could not find real path of %s", busPath); + return ncclSystemError; + } + return ncclSuccess; +} + +// Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000. +ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) { + char* str = path+offset; + // Remove trailing "/" + if (*str == '/') str--; + // Find next / + while (*str != '/') str--; + str++; + NCCLCHECK(busIdToInt64(str, id)); + return ncclSuccess; +} + +static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) { + *index = -1; + for (int i=0; i<system->nodes[GPU].count; i++) { + if (system->nodes[GPU].nodes[i].id == id) { + *index = i; + } + } + return ncclSuccess; +} + + +static ncclResult_t getPath(int64_t id, char** path) { + char busId[] = "0000:00:00.0"; + NCCLCHECK(int64ToBusId(id, busId)); + NCCLCHECK(getPciPath(busId, path)); + return ncclSuccess; +} + +ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) { + char busId[BUSID_SIZE]; + CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev)); + NCCLCHECK(getPciPath(busId, path)); + return ncclSuccess; +} + + +int interCpuWidth = 0; +int cpuPciWidth = 0; + +static ncclResult_t getCpuWidths() { + // Check if already detected + if (interCpuWidth + cpuPciWidth) return ncclSuccess; + + // Defaults + char cpu[256]; + sprintf(cpu, "Generic"); + cpuPciWidth = interCpuWidth = PCI_WIDTH; + +#ifdef __PPC__ + sprintf(cpu, "ppc64"); + interCpuWidth = P9_WIDTH; +#endif +#ifdef __x86_64__ + sprintf(cpu, "x86_64"); + union { + struct { + // CPUID 0 String register order + uint32_t ebx; + uint32_t edx; + uint32_t ecx; + }; + char vendor[12]; + } cpuid0; + + asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0)); + if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel"); + + if (strcmp(cpu, "Intel") == 0) { + union { + struct { + int steppingId:4; + int model:4; + int familyId:4; + int processorType:2; + int resv0:2; + int extModelId:4; + int modelId:8; + int resv1:4; + }; + uint32_t val; + } cpuid1; + asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1)); + if (cpuid1.familyId == 6 && cpuid1.modelId >= 0x55) { // Skylake + sprintf(cpu, "Intel/Skylake (or later)"); + interCpuWidth = SKL_QPI_WIDTH; + } else { + interCpuWidth = QPI_WIDTH; + } + } +#endif + INFO(NCCL_GRAPH, "%s CPU (PCI %d, InterCpu %d)", cpu, cpuPciWidth, interCpuWidth); + return ncclSuccess; +} + +static ncclResult_t ncclTopoGetInterCpuWidth(int* width) { + NCCLCHECK(getCpuWidths()); + *width = interCpuWidth; + return ncclSuccess; +} +static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) { + NCCLCHECK(getCpuWidths()); + *width = cpuPciWidth; + return ncclSuccess; +} +static ncclResult_t ncclTopoGetPciWidth(int* width) { + *width = PCI_WIDTH; + return ncclSuccess; +} +static ncclResult_t ncclTopoGetNetWidth(int* width) { + *width = NET_WIDTH; + return ncclSuccess; +} + +enum ncclNvLinkDeviceType { + ncclNvLinkDeviceUnknown, + ncclNvLinkDeviceGpu, + ncclNvLinkDeviceSwitch, + ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea) +}; + +static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) { + char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class"; + memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1); + char* rPath = realpath(classPath, NULL); + int fd; + if ((fd = open(rPath, O_RDONLY)) == -1) { + // Could not find device. It might be because we're in a VM and + // we don't see the whole machine. This is handled silently so + // we don't want to print an INFO error. + TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno)); + return ncclSystemError; + } + free(rPath); + char pciClass[9]; + strncpy(pciClass, "0x000000", 9); + int len; + SYSCHECKVAL(read(fd, pciClass, 8), "read", len); + SYSCHECK(close(fd), "close"); + if (strcmp(pciClass, "0x068000") == 0) { + // PCI device is of type "Bridge / Other Bridge Device" (NVswitch) + *type = ncclNvLinkDeviceSwitch; + } else if (strcmp(pciClass, "0x068001") == 0) { + // PCI device is of type "Bridge: IBM Device 04ea" + *type = ncclNvLinkDeviceBridge; + } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla) + || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce) + *type = ncclNvLinkDeviceGpu; + } else { + *type = ncclNvLinkDeviceUnknown; + } + return ncclSuccess; +} + +ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) { + struct ncclTopoNode* cpuNode = NULL; + for (int c=0; c<system->nodes[CPU].count; c++) { + if (system->nodes[CPU].nodes[c].id == numaId) cpuNode = system->nodes[CPU].nodes+c; + } + if (cpuNode == NULL) { // Create CPU + NCCLCHECK(ncclTopoCreateNode(system, &cpuNode, CPU, numaId)); + } + NCCLCHECK(ncclTopoConnectNodes(node, cpuNode, linkType, linkWidth)); + NCCLCHECK(ncclTopoConnectNodes(cpuNode, node, linkType, linkWidth)); + return ncclSuccess; +} + +ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) { + struct ncclTopoNode* nvsNode = NULL; + + int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH; + for (int g=0; g<system->nodes[GPU].count; g++) { + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + int cudaMajor, cudaMinor; + NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor)); + int maxNvLinks, width; + if (cudaMajor < 6) { + maxNvLinks = 0; + width = 0; + } else if (cudaMajor == 6) { + maxNvLinks = 4; + width = PASCAL_NVLINK_WIDTH; + } else { + maxNvLinks = 6; + width = VOLTA_NVLINK_WIDTH; + } + + int nvlinks = 0; + for (int l=0; l<maxNvLinks; ++l) { + // Check whether we can use this NVLink for P2P + unsigned canP2P; + if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDevs[g], l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue; + + // Make sure the Nvlink is up. The previous call should have trained the link. + nvmlEnableState_t isActive; + if ((wrapNvmlDeviceGetNvLinkState(nvmlDevs[g], l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue; + + // Try to figure out what's on the other side of the NVLink + nvmlPciInfo_t remoteProc; + if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevs[g], l, &remoteProc) != ncclSuccess) continue; + + // Make a lower case copy of the bus ID for calling ncclDeviceType + // PCI system path is in lower case + char* p = remoteProc.busId; + char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) { + lowerId[c] = tolower(p[c]); + if (p[c] == 0) break; + } + + enum ncclNvLinkDeviceType type; + NCCLCHECK(ncclDeviceType(lowerId, &type)); + if (type == ncclNvLinkDeviceGpu) { + int64_t remoteId; + NCCLCHECK(busIdToInt64(lowerId, &remoteId)); + int peer; + NCCLCHECK(idToIndex(system, remoteId, &peer)); + if (peer != -1) { + NCCLCHECK(ncclTopoConnectNodes(gpu, system->nodes[GPU].nodes+peer, LINK_NVL, width)); + nvlinks++; + } + } else if (type == ncclNvLinkDeviceBridge) { + // Nvlink between GPU and CPU (PPC) + // Since the remote bridge does not have a valid numa_node, assume we + // are connected to the closest CPU. + char* path; + NCCLCHECK(getPath(gpu->id, &path)); + int numaId = getNumaId(path); + free(path); + NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width)); + nvlinks++; + } else { // Nvswitch + if (type == ncclNvLinkDeviceUnknown) { + // The NVLink is up but we couldn't find the PCI device on the other + // side. Assume it's an NVswitch outside a VM. + if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId); + } + if (nvsNode == NULL) { // Create nvswitch + NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0)); + } + NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH)); + NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH)); + nvlinks++; + } + } + minNvlinks = std::min(minNvlinks, nvlinks); + minWidth = std::min(minWidth, width); + } + int pciWidth; + NCCLCHECK(ncclTopoGetPciWidth(&pciWidth)); + system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth; + system->maxWidth = minNvlinks ? minWidth : pciWidth; + return ncclSuccess; +} + +ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) { + struct ncclTopoNode* lastNode = endNode; + int pciWidth; + NCCLCHECK(ncclTopoGetPciWidth(&pciWidth)); + // Find intermediate PCI switches + int slashCount = 0; + int offsetRC = 0; + while (offsetRC < strlen(path)) { + if (path[offsetRC] == '/') slashCount++; + if (slashCount == 4) break; + offsetRC++; + } + int offset = strlen(path); + slashCount = 0; + while (--offset > offsetRC) { + if (path[offset] == '/') { + slashCount++; + // Find if already existing + if ((slashCount%2) == 0) { + int64_t pciId; + NCCLCHECK(pciPathToInt64(path, offset, offsetRC, &pciId)); + for (int p=0; p<system->nodes[PCI].count; p++) { + if (system->nodes[PCI].nodes[p].id == pciId) { + // Found our PCI switch. Attach and stop since the rest should already + // be connected + NCCLCHECK(ncclTopoConnectNodes(system->nodes[PCI].nodes+p, lastNode, LINK_PCI, pciWidth)); + NCCLCHECK(ncclTopoConnectNodes(lastNode, system->nodes[PCI].nodes+p, LINK_PCI, pciWidth)); + return ncclSuccess; + } + } + struct ncclTopoNode* pciNode; + NCCLCHECK(ncclTopoCreateNode(system, &pciNode, PCI, pciId)); + NCCLCHECK(ncclTopoConnectNodes(pciNode, lastNode, LINK_PCI, pciWidth)); + NCCLCHECK(ncclTopoConnectNodes(lastNode, pciNode, LINK_PCI, pciWidth)); + lastNode = pciNode; + } + } + } + // Then attach to a CPU node + int numaId = getNumaId(path); + int width; + NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width)); + NCCLCHECK(ncclTopoConnectCpu(system, numaId, lastNode, LINK_PCI, width)); + return ncclSuccess; +} + +// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports. +#include <glob.h> +#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid" +uint64_t getIbGuid(char* path) { + uint64_t guid = 0ULL; + char guidPath[PATH_MAX]; + snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path); + // PATH has a wildcard in it so use glob() + glob_t globbuf; + glob(guidPath, 0, NULL, &globbuf); + if (globbuf.gl_pathc > 0) + strncpy(guidPath, globbuf.gl_pathv[0], PATH_MAX); + globfree(&globbuf); + guidPath[PATH_MAX-1] = '\0'; + FILE *file = fopen(guidPath, "r"); + if (file != NULL) { + uint64_t a, b, c, d; + if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) != EOF) { + guid = (a << 48) + (b << 32) + (c<<16) + d; + TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid); + } + fclose(file); + } + return guid; +} + +struct netInfo { + char* path; + int64_t nic; + uint64_t asic; + int port; + int net; +}; + +ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) { + for (int n=0; n<ndev; n++) { + struct netInfo* info = netInfos+n; + uint64_t ibGuid; + info->nic = n; + info->asic = n; + info->port = 0; + info->net = n; + if (info->path && (ibGuid = getIbGuid(info->path)) != 0) { + info->asic = ibGuid; + + // Ignore PCI subdevice when computing the ID to merge multi-port cards + // and make them use the same PCI link. + char* path = strdup(info->path); + path[strlen(path)-1]='0'; + NCCLCHECK(pciPathToInt64(path, strlen(path), 0, &info->nic)); + free(path); + + // Same PCI path -> different ports of the same NIC + for (int i=0; i<n; i++) if (netInfos[i].nic == info->nic) info->port++; + + // Same GUID -> same network links as the other NIC + for (int i=0; i<n; i++) if (netInfos[i].asic == info->asic && netInfos[i].port == info->port) info->net = netInfos[i].net; + } + INFO(NCCL_GRAPH, "%s -> %x/%lx/%d/%d", info->path, info->nic, info->asic, info->port, info->net); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) { + for (int g=0; g<system->nodes[GPU].count; g++) { + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + char* path; + NCCLCHECK(getPath(gpu->id, &path)); + NCCLCHECK(ncclTopoCreatePciPath(system, gpu, path)); + free(path); + } + + // Connect the NICs + int netDevCount; + NCCLCHECK(ncclNetDevices(&netDevCount)); + int netWidth; + NCCLCHECK(ncclTopoGetNetWidth(&netWidth)); + + struct netInfo* netInfos; + NCCLCHECK(ncclCalloc(&netInfos, netDevCount)); + + for (int n=0; n<netDevCount; n++) { + ncclResult_t res = ncclNetPciPath(n, &netInfos[n].path); + if (res != ncclSuccess) netInfos[n].path = NULL; + } + + NCCLCHECK(ncclTopoComputeNetInfo(netInfos, netDevCount)); + + for (int n=0; n<netDevCount; n++) { + struct netInfo* info = netInfos+n; + // Create NIC and attach it to the PCI tree + struct ncclTopoNode* nicNode = NULL; + for (int i=0; i<system->nodes[NIC].count; i++) { + if (system->nodes[NIC].nodes[i].id == info->nic) { + nicNode = system->nodes[NIC].nodes+i; + break; + } + } + if (!nicNode) { + NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic)); + if (info->path) { + // Create the PCI path + NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path)); + } else { + // This is probably a virtual NIC. Just attach it directly to CPU 0 + int width; + NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width)); + NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width)); + } + } + free(info->path); + + // Create the network side + struct ncclTopoNode* netNode; + NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, n)); + + // Use rank to store the net information + netNode->rank = info->net; + + NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth)); + NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth)); + } + free(netInfos); + + // And connect all CPU nodes together + for (int n=0; n<system->nodes[CPU].count; n++) { + for (int p=0; p<system->nodes[CPU].count; p++) { + if (n == p) continue; + int width; + NCCLCHECK(ncclTopoGetInterCpuWidth(&width)); + NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width)); + } + } + return ncclSuccess; +} + +static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) { + if (node->type == GPU) { + sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank); + } else { + sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id); + } + INFO(NCCL_GRAPH, "%s", line); + for (int i=0; i<offset; i++) line[i] = ' '; + + for (int l=0; l<node->nlinks; l++) { + struct ncclTopoLink* link = node->links+l; + if (link->type == LINK_LOC) continue; + if (link->remNode != prevNode) { + sprintf(line+offset, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width); + int nextOffset = strlen(line); + if (link->type == LINK_PCI) { + NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset)); + } else { + if (link->remNode->type == NET) { + sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->rank); + } else { + sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id); + } + INFO(NCCL_GRAPH, "%s", line); + } + } + } + return ncclSuccess; +} + +ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) { + INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed); + char line[1024]; + for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0)); + INFO(NCCL_GRAPH, "=========================================="); + NCCLCHECK(ncclTopoPrintPaths(s)); + return ncclSuccess; +} + +static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode* upNode) { + // Shift all links to have upLink as last link + if (upNode) { + int l=0; + while (node->links[l].remNode != upNode) l++; + struct ncclTopoLink upLink; + memcpy(&upLink, node->links+l, sizeof(struct ncclTopoLink)); + while (node->links[l+1].remNode) { + memcpy(node->links+l, node->links+l+1, sizeof(struct ncclTopoLink)); + l++; + } + memcpy(node->links+l, &upLink, sizeof(struct ncclTopoLink)); + } + + // Recursively sort the PCI tree + for (int l=0; l<node->nlinks; l++) { + struct ncclTopoLink* link = node->links+l; + if (link->type == LINK_PCI && link->remNode != upNode) NCCLCHECK(ncclTopoSort(link->remNode, node)); + } + return ncclSuccess; +} + +// We want the graph to be organized to ease/accelerate traversal : +// 1. NVLinks (already the case) +// 2. PCI down +// 3. PCI up +// 4. QPI (already the case) +ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) { + for (int n=0; n<system->nodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL)); + return ncclSuccess; +} + +ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { + struct ncclTopoSystem* s; + NCCLCHECK(ncclCalloc(&s, 1)); + nvmlDevice_t* nvmlDevs; + int g = 0; + NCCLCHECK(ncclCalloc(&nvmlDevs, comm->nRanks)); + for (int r=0; r<comm->nRanks; r++) { + if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) { + // Consider the GPU as outside of our node if we can't see it through NVML. + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId)); + if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+g) != ncclSuccess) continue; + g++; + struct ncclTopoNode* gpuNode; + NCCLCHECK(ncclTopoCreateNode(s, &gpuNode, GPU, comm->peerInfo[r].busId)); + gpuNode->rank = r; + } + } + + NCCLCHECK(ncclTopoConnectNVLink(nvmlDevs, s)); + NCCLCHECK(ncclTopoConnectPCI(s)); + + free(nvmlDevs); + NCCLCHECK(ncclTopoSortSystem(s)); + *system = s; + return ncclSuccess; +} + +ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink) { + int g1, g2; + NCCLCHECK(idToIndex(system, busId1, &g1)); + NCCLCHECK(idToIndex(system, busId2, &g2)); + *nvlink = g1 != -1 && g2 != -1 && system->nodes[GPU].nodes[g1].paths[GPU][g2].type == LINK_NVL; + return ncclSuccess; +} + +ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) { + int g; + NCCLCHECK(idToIndex(system, busId, &g)); + for (int i=0; i<system->nodes[GPU].count; i++) { + if (i == g) continue; + if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) { + *nvlink = 1; + return ncclSuccess; + } + } + *nvlink = 0; + return ncclSuccess; +} + +static int pathDistance(struct ncclTopoLinkList* links) { + int distance = PATH_PIX; + if (links->count > 2) distance = PATH_PXB; + for (int l=0; l<links->count; l++) { + // PHB if we go through 1 CPU, SYS if we go through 2 CPUs + if (links->list[l]->remNode->type == CPU) distance = (distance == PATH_PHB) ? PATH_SYS : PATH_PHB; + } + return distance; +} + +ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance) { + int g1, g2; + NCCLCHECK(idToIndex(system, busId1, &g1)); + NCCLCHECK(idToIndex(system, busId2, &g2)); + *distance = pathDistance(system->nodes[GPU].nodes[g1].paths[GPU]+g2); + return ncclSuccess; +} + +ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) { + int g; + NCCLCHECK(idToIndex(system, busId, &g)); + *distance = pathDistance(system->nodes[GPU].nodes[g].paths[NET]+netDev); + return ncclSuccess; +} + +ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count) { + *count = system->nodes[CPU].count; + return ncclSuccess; +} diff --git a/src/graph/topo.h b/src/graph/topo.h new file mode 100644 index 0000000..6b8a2f9 --- /dev/null +++ b/src/graph/topo.h @@ -0,0 +1,138 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TOPO_H_ +#define NCCL_TOPO_H_ + +#include "graph.h" +#include "core.h" + +#define LOC_WIDTH 5000 +#define PASCAL_NVLINK_WIDTH 18 +#define VOLTA_NVLINK_WIDTH 21 +#define PCI_WIDTH 12 // PCI Gen3 x16 +#define QPI_WIDTH 8 +#define SKL_QPI_WIDTH 12 +#define P9_WIDTH 32 +#define NET_WIDTH 12 // 100Gbit + +// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, to GPU +// to GPU traffic consumed more PCI bandwidth. +#define INTEL_P2P(speed) (speed*9/12) +#define INTEL_P2P_OVERHEAD(speed) (speed*12/9) + +#define NCCL_TOPO_NODE_TYPES 6 +#define GPU 0 +#define PCI 1 +#define NVS 2 +#define CPU 3 // Actually NUMA domains +#define NIC 4 +#define NET 5 +extern const char* topoNodeTypeStr[]; + +#define LINK_LOC 0 +#define LINK_NVL 1 +#define LINK_PCI 2 +#define LINK_QPI 3 +#define LINK_NET 4 +extern const char* topoLinkTypeStr[]; + +struct ncclTopoNode; +struct ncclTopoLink { + int type; + int width; + struct ncclTopoNode* remNode; +}; +#define NCCL_TOPO_MAX_LINKS 32 +#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES) +#define SELECT_PATH 1 +#define SELECT_LAST 2 + +#define NET_GDR_MASK 0x70000000 + +struct ncclTopoLinkList { + struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS]; + int count; + int width; + int type; +}; + +struct ncclTopoNode { + int type; + int64_t id; + int rank; + int nlinks; + struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS]; + // Pre-computed paths to GPUs and NICs + struct ncclTopoLinkList* paths[NCCL_TOPO_NODE_TYPES]; + // Used during search + uint64_t used; +}; + +struct ncclTopoNodeSet { + int count; + struct ncclTopoNode nodes[NCCL_TOPO_MAX_NODES]; +}; + +struct ncclTopoSystem { + struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]; + int maxSpeed; + int maxWidth; + int searchInitDone; +}; + +static ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) { + for (int i=0; i<system->nodes[type].count; i++) { + if (system->nodes[type].nodes[i].id == id) { + *node = system->nodes[type].nodes+i; + return ncclSuccess; + } + } + if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) { + WARN("Error : tried to create too many nodes of type %d\n", type); + return ncclInternalError; + } + struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count; + system->nodes[type].count++; + n->type = type; + n->id = id; + if (type == GPU) { + // Create link to itself (used in some corner cases) + n->nlinks=1; + n->links[0].type = LINK_LOC; + n->links[0].remNode = n; + n->links[0].width = LOC_WIDTH; + } + *node = n; + return ncclSuccess; +} + +static ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, int width) { + // Aggregate links into higher width for NVLink + struct ncclTopoLink* link; + for (link = node->links; link->remNode; link++) { + if (link->remNode == remNode && link->type == type) break; + } + if (link->remNode == NULL) node->nlinks++; + link->type = type; + link->remNode = remNode; + link->width += width; + + // Sort links in BW descending order + struct ncclTopoLink linkSave; + memcpy(&linkSave, link, sizeof(struct ncclTopoLink)); + while (link != node->links) { + if ((link-1)->width >= linkSave.width) break; + memcpy(link, link-1, sizeof(struct ncclTopoLink)); + link--; + } + memcpy(link, &linkSave, sizeof(struct ncclTopoLink)); + return ncclSuccess; +} + +ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); + +#endif diff --git a/src/misc/trees.cu b/src/graph/trees.cc index e53ea0b..722e61b 100644 --- a/src/misc/trees.cu +++ b/src/graph/trees.cc @@ -1,12 +1,10 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "net.h" -#include "param.h" +#include "nccl.h" #define RANK_TO_INDEX(r) (rank > root ? rank-1 : rank) diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc new file mode 100644 index 0000000..87afb2f --- /dev/null +++ b/src/graph/tuning.cc @@ -0,0 +1,212 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "devcomm.h" +#include "comm.h" +#include "topo.h" + +NCCL_PARAM(Nthreads, "NTHREADS", -2); +NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2); + +static int getNthreads(const char* name, int env, int min, int max, int def) { + int nt = env; + if (nt > 0) { + if (nt % WARP_SIZE != 0) { + WARN("Invalid %s %d (must be a multiple of %d)", name, nt, WARP_SIZE); + nt = max; + } else if (nt > max) { + WARN("Invalid %s %d (maximum %d).", name, nt, max); + nt = max; + } else if (nt < min) { + WARN("Invalid %s %d (minimum %d).", name, nt, min); + nt = min; + } + } else { + nt = def; + } + return nt; +} + +ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) { + int def, set; + if (str[0] == '^') { + def = 1; set = 0; str++; + } else { + def = 0; set = 1; + } + for (int i=0; i<nelems; i++) list[i] = def; + char* tokStr = strdup(str); + char* tmpStr; + char* token = strtok_r(tokStr, ",", &tmpStr); + while (token) { + for (int i=0; i<nelems; i++) + if (strcasecmp(token, elems[i]) == 0) list[i] = set; + token = strtok_r(NULL, ",", &tmpStr); + } + free(tokStr); + return ncclSuccess; +} + +static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" }; +static const char* ncclAlgoStr[] = { "Tree", "Ring" }; +static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" }; + +// Latencies in us, Bandwidths in GB/s +// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } +static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 3.6, 8.4 } }; + +// NVLink, PCI, Network +#define NCCL_HW_NVLINK 0 +#define NCCL_HW_PCI 1 +#define NCCL_HW_NET 2 +// Tree/Simple is the latency a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network). +static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = +{ /* NVLINK */ + { /* Tree (LL/LL128/Simple)*/ { .5, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { .4, 2.5, 5.7 } }, + /* PCI */ + { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 } }, + /* NET */ + { /* Tree (LL/LL128/Simple)*/ { 5.0, 7.5, 50 }, /* Ring (LL/LL128/Simple)*/ { .9, 2.5, 6.6 } } +}; + +// LL128 max BW for the different collectives +static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 }; + +ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph) { + int simpleDefaultThreads = (treeGraph->speedIntra*treeGraph->nChannels <= 12) ? 256 : NCCL_MAX_NTHREADS; + comm->maxThreads[NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads); + comm->maxThreads[NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS); + comm->maxThreads[NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS); + + INFO(NCCL_INIT, "Threads per block : %d/%d/%d", comm->maxThreads[NCCL_PROTO_LL], comm->maxThreads[NCCL_PROTO_LL128], comm->maxThreads[NCCL_PROTO_SIMPLE]); + + if (comm->nRanks <= 1) return ncclSuccess; + + struct ncclTopoGraph* graphs[2] = { treeGraph, ringGraph }; + int intraHw[2], hw[2]; + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->nvlink ? NCCL_HW_NVLINK : NCCL_HW_PCI; + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET; + + for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) { + int nsteps = coll == ncclCollAllReduce ? 2*(comm->nRanks-1) : + coll == ncclCollReduceScatter || coll == ncclCollAllGather ? comm->nRanks-1 : + comm->nRanks; + + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + if (coll != ncclCollAllReduce && a == NCCL_ALGO_TREE) continue; + + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + int speed = comm->nNodes <= 2 ? graphs[a]->speedIntra : graphs[a]->speedInter; + float busBw = graphs[a]->nChannels * speed * 1.0; + + // Various model refinements + if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/4.0; + if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw*120.0/128.0, ll128MaxBw[coll]); + if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, comm->nNodes > 1 ? 70.0 : 90.0); + if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.0; + if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0; + + // Convert bus BW to algorithm BW + float ratio = a == NCCL_ALGO_TREE ? .5 : (1.0 * comm->nRanks) / nsteps; + comm->bandwidths[coll][a][p] = busBw * ratio; + + comm->latencies[coll][a][p] = baseLat[a][p]; + if (a == NCCL_ALGO_RING) { + float lat = hwLat[hw[a]][a][p]; + if ((coll == ncclCollReduce || coll == ncclCollBroadcast)) { + if (ringGraph->sameChannels) { + comm->latencies[coll][a][p] += lat; + } else { + if (p == NCCL_PROTO_SIMPLE) lat = hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling + comm->latencies[coll][a][p] += nsteps*lat; + } + } else { + comm->latencies[coll][a][p] += nsteps*lat; + } + } else { + float intraLat = hwLat[intraHw[a]][a][p]; + float interLat = hwLat[NCCL_HW_NET][a][p]; + comm->latencies[coll][a][p] += + 2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat); + } + } + } + } + + // Protocols/Algorithms enable/disable, and user overrides. + // All are enabled except ll128 which is enabled by default only in certain cases. + int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; + int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1 }; + + const char *protoStr = getenv("NCCL_PROTO"); + if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable)); + const char *algoStr = getenv("NCCL_ALGO"); + if (algoStr) NCCLCHECK(parseList(algoStr, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable)); + + for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + int pEnable = protoEnable[p]; + if (pEnable == 2 && p == NCCL_PROTO_LL128) { + // Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption. + pEnable = (graphs[a]->type <= LINK_PCI) && graphs[a]->nvlink && minCompCap == 70 && maxCompCap == 70 ? 1 : 0; + } + if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; + } + + if (comm->rank == 0) { + char line[1024]; + int offset = 0; + sprintf(line, "Latency/AlgBw |"); + offset = strlen(line); + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + sprintf(line+offset, " %4s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]); + offset = strlen(line); + } + } + INFO(NCCL_TUNING, "%s", line); + for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) { + sprintf(line, "%13s |", ncclFuncStr[c]); + offset = strlen(line); + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + sprintf(line+offset, "%7.1f/%5.1f|", comm->latencies[c][a][p], comm->bandwidths[c][a][p]); + offset = strlen(line); + } + } + INFO(NCCL_TUNING, "%s", line); + } + } + + // Set per-thread amount of work before we increase nThreads and nChannels + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + comm->threadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD; + comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD; + comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD; + } + comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= comm->nRanks; + + // Override defaults with user env + char* str = getenv("NCCL_THREAD_THRESHOLDS"); + if (str) { + ssize_t t[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2}}; + sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2); + for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) { + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p]; + } + } + } + + INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld", + comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL], + comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128], + comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], + comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL], + comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128], + comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]); + return ncclSuccess; +} diff --git a/src/misc/group.cu b/src/group.cc index c428a22..9bf8ac9 100644 --- a/src/misc/group.cu +++ b/src/group.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -51,11 +51,6 @@ struct ncclAsyncArgs { thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS]; -ncclResult_t ncclSetDevice(int cudaDev) { - CUDACHECK(cudaSetDevice(cudaDev)); - return ncclSuccess; -} - #define CHECK(a) do { \ if ((args->ret = (a)) != ncclSuccess) { \ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ @@ -65,15 +60,14 @@ ncclResult_t ncclSetDevice(int cudaDev) { void* ncclAsyncThreadMain(void* args_) { struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_; - CHECK(ncclSetDevice(args->init.cudaDev)); - CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank)); + CHECK(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev)); return args; } -ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) { +ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev) { if (ncclGroupIndex >= MAX_ASYNC_OPS) { WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS); - return ncclAsyncErrCheck(ncclInternalError); + return ncclAsyncErrCheck(ncclInvalidUsage); } int index = ncclGroupIndex++; struct ncclAsyncArgs* args = ncclGroupArgs+index; @@ -84,8 +78,6 @@ ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm args->init.ndev = ndev; memcpy(&args->init.commId, &commId, sizeof(commId)); args->init.myrank = myrank; - // We need to use threads for Init - pthread_create(ncclGroupThreads+index, NULL, ncclAsyncThreadMain, args); return ncclSuccess; } @@ -97,7 +89,7 @@ ncclResult_t ncclAsyncColl(ncclComm_t comm) { } if (ncclGroupIndex >= MAX_ASYNC_OPS) { WARN("Too many async operations in progress, max is %d", MAX_ASYNC_OPS); - return ncclAsyncErrCheck(ncclInternalError); + return ncclAsyncErrCheck(ncclInvalidUsage); } ncclGroupIndex++; args->funcType = ASYNC_FUNC_COLL; @@ -118,12 +110,20 @@ ncclResult_t ncclGroupEnd() { int savedDev; CUDACHECK(cudaGetDevice(&savedDev)); int done = ncclGroupIndex; - int doneArray[ncclGroupIndex]; + int doneArray[MAX_ASYNC_OPS]; for (int i=0; i<ncclGroupIndex; i++) doneArray[i] = 0; ncclResult_t ret = ncclGroupError; if (ret != ncclSuccess) goto group_cleanup; + /* Launch async ncclCommInitRank */ + for (int i=0; i<ncclGroupIndex; i++) { + struct ncclAsyncArgs* args = ncclGroupArgs+i; + if (args->funcType == ASYNC_FUNC_INIT) { + pthread_create(ncclGroupThreads+i, NULL, ncclAsyncThreadMain, args); + } + } + /* Collectives are done in three steps : * 1. Barrier Check In. Only the last call may call cudaLaunchKernel[cooperative] * 2. Barrier Wait. No CUDA call is permitted @@ -166,8 +166,8 @@ ncclResult_t ncclGroupEnd() { if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) { int err = pthread_tryjoin_np(ncclGroupThreads[i], NULL); if (err == EBUSY) continue; - if (err != 0) { ret = ncclSystemError; goto end; } - if (args->ret != ncclSuccess) { ret = args->ret; goto end; } + if (err != 0) ret = ncclSystemError; + if (args->ret != ncclSuccess) ret = args->ret; doneArray[i] = 1; done--; } @@ -175,20 +175,47 @@ ncclResult_t ncclGroupEnd() { } goto end; group_cleanup: - // At least one call in the group failed. Since we want to make that group - // an atomic operation, we need to cancel all operations. - for (int i=0; i<ncclGroupIndex; i++) { - struct ncclComm* comm = ncclGroupArgs[i].coll.comm; - for (int c=0; c<comm->nChannels; c++) { - struct ncclChannel* channel = comm->channels+c; - for (int i=0; i<channel->collCount; i++) { - channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0; + if (ret != ncclSuccess) { + // At least one call in the group failed. Since we want to make that group + // an atomic operation, we need to cancel all operations. + for (int i=0; i<ncclGroupIndex; i++) { + struct ncclAsyncArgs* args = ncclGroupArgs+i; + if (args->funcType == ASYNC_FUNC_INIT && doneArray[i] == 0) { + if (args->init.newcomm) NCCLCHECK(ncclCommDestroy(*args->init.newcomm)); + *args->init.newcomm = NULL; + } else { + struct ncclComm* comm = args->coll.comm; + for (int c=0; c<comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + for (int i=0; i<channel->collCount; i++) { + channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0; + } + channel->collFifoTail = channel->collStart; + channel->collCount = 0; + } + /* Cancel all proxy ops : mark them as ncclProxyOpNone and they should be freed later on */ + struct ncclProxyState* state = &comm->proxyState; + struct ncclProxyArgs *op, *start; + pthread_mutex_lock(&state->mutex); + op = start = state->ops; + while (op) { + if (op->opCount >= comm->lastOpCount) op->state = ncclProxyOpNone; + struct ncclProxyArgs* peerOp = op->nextPeer; + while (peerOp) { + if (peerOp->opCount >= comm->lastOpCount) peerOp->state = ncclProxyOpNone; + peerOp = peerOp->nextPeer; + } + op = op->next; + if (op == start) break; + } + comm->opCount = comm->lastOpCount; + pthread_cond_signal(&state->cond); + pthread_mutex_unlock(&state->mutex); + + comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0; + comm->userStreamSet = false; } - channel->collFifoTail = channel->collStart; - channel->collCount = 0; } - comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0; - comm->userStreamSet = false; } end: ncclGroupError = ncclSuccess; diff --git a/src/include/alloc.h b/src/include/alloc.h new file mode 100644 index 0000000..bcdbd18 --- /dev/null +++ b/src/include/alloc.h @@ -0,0 +1,51 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ALLOC_H_ +#define NCCL_ALLOC_H_ + +#include "nccl.h" +#include "checks.h" +#include <sys/mman.h> + +static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { + CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped)); + memset(*ptr, 0, size); + *devPtr = *ptr; + return ncclSuccess; +} + +static inline ncclResult_t ncclCudaHostFree(void* ptr) { + CUDACHECK(cudaFreeHost(ptr)); + return ncclSuccess; +} + +template <typename T> +static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { + void* p = malloc(nelem*sizeof(T)); + if (p == NULL) { + WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); + return ncclSystemError; + } + memset(p, 0, nelem*sizeof(T)); + *ptr = (T*)p; + return ncclSuccess; +} + +template <typename T> +static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) { + CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T))); + CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T))); + return ncclSuccess; +} + +template <typename T> +static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { + CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault)); + return ncclSuccess; +} + +#endif diff --git a/src/include/argcheck.h b/src/include/argcheck.h new file mode 100644 index 0000000..8d8b74e --- /dev/null +++ b/src/include/argcheck.h @@ -0,0 +1,16 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ARGCHECK_H_ +#define NCCL_ARGCHECK_H_ + +#include "core.h" +#include "info.h" + +ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); +ncclResult_t ArgsCheck(struct ncclInfo* info); + +#endif diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index a1aaf50..a7d6be9 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -9,6 +9,7 @@ #include "nccl.h" +ncclResult_t bootstrapNetInit(); ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv); ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out); ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState); @@ -16,4 +17,5 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size); ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size); ncclResult_t bootstrapClose(void* commState); +ncclResult_t bootstrapAbort(void* commState); #endif diff --git a/src/include/channel.h b/src/include/channel.h index 76c5e8a..e2da325 100644 --- a/src/include/channel.h +++ b/src/include/channel.h @@ -1,12 +1,12 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #ifndef NCCL_CHANNEL_H_ #define NCCL_CHANNEL_H_ -#include "core.h" +#include "comm.h" ncclResult_t initChannel(struct ncclComm* comm, int channelid); ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks); diff --git a/src/include/checks.h b/src/include/checks.h index bf7750e..50737b0 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -1,10 +1,73 @@ /************************************************************************* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#ifndef NCCL_CHECKS_H_ +#define NCCL_CHECKS_H_ -ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); -ncclResult_t ArgsCheck(struct ncclInfo* info); +#include "debug.h" + +// Check CUDA calls +#define CUDACHECK(cmd) do { \ + cudaError_t e = cmd; \ + if( e != cudaSuccess ) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ + return ncclUnhandledCudaError; \ + } \ +} while(false) + +#define CUDACHECKGOTO(cmd, res, label) do { \ + cudaError_t e = cmd; \ + if( e != cudaSuccess ) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ + res = ncclUnhandledCudaError; \ + goto label; \ + } \ +} while(false) + +#include <errno.h> +// Check system calls +#define SYSCHECK(call, name) do { \ + int retval; \ + SYSCHECKVAL(call, name, retval); \ +} while (false) + +#define SYSCHECKVAL(call, name, retval) do { \ + SYSCHECKSYNC(call, name, retval); \ + if (retval == -1) { \ + WARN("Call to " name " failed : %s", strerror(errno)); \ + return ncclSystemError; \ + } \ +} while (false) + +#define SYSCHECKSYNC(call, name, retval) do { \ + retval = call; \ + if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ + } else { \ + break; \ + } \ +} while(true) + +// Propagate errors up +#define NCCLCHECK(call) do { \ + ncclResult_t res = call; \ + if (res != ncclSuccess) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + return res; \ + } \ +} while (0); + +#define NCCLCHECKGOTO(call, res, label) do { \ + res = call; \ + if (res != ncclSuccess) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ +} while (0); + +#endif diff --git a/src/collectives/collectives.h b/src/include/collectives.h index e6b19cb..69c8e74 100644 --- a/src/collectives/collectives.h +++ b/src/include/collectives.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,7 +7,10 @@ #ifndef NCCL_COLLECTIVES_H_ #define NCCL_COLLECTIVES_H_ -#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll)) +#include "core.h" +#include "info.h" + +#define FUNC_INDEX(coll, redop, dtype, al, pr) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)) #define NCCL_COLL_NAME(coll, op, dtype) \ coll##_##op##_##dtype @@ -22,7 +25,8 @@ #define DECL_COLL4(coll, op, dtype) \ DECL_COLL5(coll, op, dtype) \ - DECL_COLL5(coll##LL, op, dtype) + DECL_COLL5(coll##LL, op, dtype) \ + DECL_COLL5(coll##LL128, op, dtype) #define DECL_COLL3(coll, op, dtype) \ DECL_COLL4(coll##Ring, op, dtype) \ diff --git a/src/include/comm.h b/src/include/comm.h new file mode 100644 index 0000000..7164dc0 --- /dev/null +++ b/src/include/comm.h @@ -0,0 +1,141 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_COMM_H_ +#define NCCL_COMM_H_ + +#include "transport.h" + +#if CUDART_VERSION < 9000 +struct cudaLaunchParams { + void *func; + dim3 gridDim; + dim3 blockDim; + void **args; + size_t sharedMem; + cudaStream_t stream; +}; +#endif + +#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ + +#define CACHE_LINE_SIZE 128 +#define MEM_ALIGN 4096 +#define CUDA_IPC_MIN 2097152UL + +// Channels / LL tuning +#define NCCL_LL_THREAD_THRESHOLD 8 +#define NCCL_LL128_THREAD_THRESHOLD 8 +#define NCCL_SIMPLE_THREAD_THRESHOLD 64 + +struct ncclSendMem { + union { + struct { + uint64_t head; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; + void* ptrExchange; + char pad2[CACHE_LINE_SIZE-sizeof(void*)]; + uint64_t opCount; + }; + char pad3[MEM_ALIGN]; + }; +}; + +struct ncclRecvMem { + union { + struct { + uint64_t tail; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; + uint64_t opCount; + char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; + int sizesFifo[NCCL_STEPS]; + }; + char pad4[MEM_ALIGN]; + }; + ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; + uint64_t ll128Buff[NCCL_LL128_BUFF_ELEMS]; + char buff[1]; // Actually larger than that +}; + +struct ncclComm { + struct ncclChannel channels[MAXCHANNELS]; + + struct ncclPeerInfo* peerInfo; + struct ncclTopoSystem* topo; + + void* bootstrap; + + int rank; // my rank in the communicator + int nRanks; // number of GPUs in communicator + int cudaDev; // my cuda device index + int64_t busId; // my PCI bus ID in int format + + int node; + int nNodes; + int localRanks; + + enum { GROUP, PARALLEL } launchMode; + cudaStream_t userStream; + bool userStreamSet; + cudaEvent_t doneEvent; + bool checkPointers; + + // Counter to make sure collectives match (needed for bcast/reduce + // where syncs are not symmetric). + uint64_t opCount; + uint64_t lastOpCount; + + // Channels for collectives + int nChannels; + + // Only nvlink is used for inter-GPU communication + int nvlink; + + // Algorithm/Protocols thresholds + ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + int maxThreads[NCCL_NUM_PROTOCOLS]; + + // An internal CUDA stream for NCCL kernel CGMD launches + int groupCudaStream; + cudaStream_t groupStream; + + // Whether there has been a fatal error in this communicator. + ncclResult_t fatalError; + + // Error reported by GPU + volatile ncclDevError_t* fatalDevError; + + // Flag to ask NCCL kernels to abort + volatile uint32_t *abortFlag; + + // Device side of the communicator + struct ncclDevComm *devComm; + // Host copy of the devComm (to free CUDA allocs) + struct ncclDevComm hostDevComm; + + // Intra-process sync + int intraRank; + int intraRanks; + int* intraBarrier; + int intraPhase; + + // Storage for deferred intra-process launch + struct cudaLaunchParams * intraParams; + struct cudaLaunchParams *myParams; + int* intraCudaDevs; + int* intraCGMode; // Whether we can use CUDA9 CGMD or not + int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not + struct ncclColl args; + void* argsptr; + + // Global proxy thread + pthread_t proxyThread; + struct ncclProxyState proxyState; +}; + +#endif diff --git a/src/include/core.h b/src/include/core.h index d57d271..250f43b 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,385 +7,12 @@ #ifndef NCCL_CORE_H_ #define NCCL_CORE_H_ -#define NCCL_MAX_OPS 2048 -#define NCCL_STEPS 8 - -#include "nccl.h" -#include "transport.h" -#include "debug.h" -#include <cstdio> -#include <algorithm> // std::min/std::max +#include <pthread.h> #include <unistd.h> #include <stdlib.h> -#include <cuda_runtime.h> - -#if CUDART_VERSION < 9000 -struct cudaLaunchParams { - void *func; - dim3 gridDim; - dim3 blockDim; - void **args; - size_t sharedMem; - cudaStream_t stream; -}; -#endif - -#define MAXCHANNELS 16 -#define MAXTHREADS 256 -#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ - -// Channels / LL tuning -#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings -#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL -#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs -#define NCCL_LL_MAX_NTHREADS MAXTHREADS -#define NCCL_LL_MIN_NTHREADS 64 - -#define DIVUP(x, y) \ - (((x)+(y)-1)/(y)) -#define ROUNDUP(x, y) \ - (DIVUP((x), (y))*(y)) - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -union ncclLLFifoLine { - /* Flags have to be *after* data, because otherwise, an incomplete receive - from the network may receive the flag but not the data. - Note this is assuming that either we receive contiguous chunks of data - (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */ - struct { - uint32_t data1; - uint32_t flag1; - uint32_t data2; - uint32_t flag2; - }; - uint64_t v[2]; - int4 i4; -}; - -typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t; - -typedef enum { - ncclPatternRing, - ncclPatternRingTwice, - ncclPatternPipelineFrom, - ncclPatternPipelineTo, - ncclPatternTreeUp, - ncclPatternTreeDown, - ncclPatternTreeUpDown -} ncclPattern_t; - -typedef enum { - ncclDevSuccess, - ncclDevAssertedMismatch, - ncclDevSuspectedMismatch -} ncclDevError_t; - -// Used to pass NCCL call information between functions -struct ncclInfo { - ncclColl_t coll; - const char* opName; - // NCCL Coll Args - const void* sendbuff; - void* recvbuff; - size_t count; - ncclDataType_t datatype; - ncclRedOp_t op; - int root; - ncclComm_t comm; - cudaStream_t stream; - // Algorithm details - int chunkSteps; - int sliceSteps; - // Computed later - ncclPattern_t pattern; - size_t nBytes; - int nstepsPerLoop; - int nchunksPerLoop; -}; - -struct ncclConnInfo { - // Regular comm mechanism - char *buff; // Local for recv, remote for send - uint64_t *tail; // Local for recv, remote for send - uint64_t *head; // Local for send, remote for recv - uint64_t *opCountLoc; // opCount of local rank - uint64_t *opCountRem; // opCount of remote rank - - int direct; // Direct communication - void **ptrExchange; // Pointer exchange for direct communication - - int *fifo; // Size fifo for proxy - - uint64_t step; // Keep where we are - - // Low latency mechanism - union ncclLLFifoLine *llBuff; // Local for recv, remote for send - uint64_t llLastCleaning; -}; - -struct ncclConnector { - int connected; - struct ncclProxyArgs *proxyAppend; - struct ncclTransportComm* transportComm; - void* transportResources; // Host-side resources - struct ncclConnInfo conn; - struct ncclComm *comm; -}; - -#define CACHE_LINE_SIZE 128 -#define MEM_ALIGN 4096 -#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */ - -#define NUM_LINES_PER_THREAD 8 -#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) -#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS) -#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine)) -#define NCCL_LL_CLEAN_FREQ 0x10000000 - -struct ncclSendMem { - union { - struct { - uint64_t head; - char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - void* ptrExchange; - char pad2[CACHE_LINE_SIZE-sizeof(void*)]; - uint64_t opCount; - }; - char pad3[MEM_ALIGN]; - }; -}; - -struct ncclRecvMem { - union { - struct { - uint64_t tail; - char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - uint64_t opCount; - char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; - int sizesFifo[NCCL_STEPS]; - }; - char pad4[MEM_ALIGN]; - }; - ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; - char buff[1]; // Actually larger than that -}; - -struct ncclRing { - // Shortcuts for userRanks[1] and userRanks[n-1] - int prev; - int next; - - // Maps an internal nccl index to user-specified rank order. This is necessary - // since we need to know how the user expects data to be ordered across - // devices. Ordered from current device. - int* userRanks; - int* devUserRanks; -}; - -#define NCCL_MAX_TREE_ARITY 3 -struct ncclTree { - int depth; - int up; - int down[NCCL_MAX_TREE_ARITY]; -}; - -struct ncclPeer { - struct ncclConnector send; - struct ncclConnector recv; -}; - -struct ncclChannel { - union { - struct { - struct ncclRing ring; - struct ncclTree tree; - - int id; - int nthreads; - int buffSize; - - // Communication structures - struct ncclPeer* peers; - struct ncclPeer* devPeers; - - // Operation list for aggregation - struct ncclColl* collectives; - struct ncclColl* devCollectives; - int collStart; - int collCount; - int collFifoHead; // Only used by GPU - int collFifoTail; // Only used by CPU - }; - int data[0x80]; - }; -}; -static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); - -/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */ -/* to make sure reads to host from the CUDA kernel are aligned. */ -/* Make sure to adjust padding at the end of ncclColl. */ -struct CollectiveArgs { - struct ncclComm* comm; - uint64_t opCount; - - // local and remote input, output, and buffer - const void * ThisInput; - void * ThisOutput; - - // general parameters - size_t N; - uint32_t root; - uint8_t bid; - uint8_t nChannels; - uint16_t nThreads; - - int lastChunkSize; -}; -struct ncclColl { - union { - struct { - struct CollectiveArgs args; - uint16_t funcIndex; - uint16_t nextIndex; - uint8_t active; - }; - int data[0x10]; - }; -}; -static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size"); - -struct ncclComm { - struct ncclChannel channels[MAXCHANNELS]; - - struct ncclPeerInfo* peerInfo; - - void* bootstrap; - - int rank; // my rank in the communicator - int nRanks; // number of GPUs in communicator - int cudaDev; // my cuda device index - int nvmlDev; // my NVML device number - - enum { GROUP, PARALLEL } launchMode; - cudaStream_t userStream; - bool userStreamSet; - cudaEvent_t doneEvent; - bool checkPointers; - - // Counter to make sure collectives match (needed for bcast/reduce - // where syncs are not symmetric). - uint64_t opCount; - - // Channels for collectives - int nChannels; - int nThreads; - - // Low-latency algorithm threshold - ssize_t llThreshold; - ssize_t threadThreshold; - - // Tree algorithm threshold - ssize_t treeThreshold; - - // An internal CUDA stream for NCCL kernel CGMD launches - int groupCudaStream; - cudaStream_t groupStream; - - // Whether there has been a fatal error in this communicator. - ncclResult_t fatalError; - - // Error reported by GPU - volatile ncclDevError_t* fatalDevError; - - // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped) - // On device: this pointer has been obtained from cudaHostGetDevicePointer() - volatile uint32_t *abortFlag; - - // Device copy of the communicator - struct ncclComm *devComm; - - // Intra-process sync - int intraRank; - int intraRanks; - int* intraBarrier; - int intraPhase; - - // Storage for deferred intra-process launch - struct cudaLaunchParams * intraParams; - struct cudaLaunchParams *myParams; - int* intraCudaDevs; - int* intraCGMode; // Whether we can use CUDA9 CGMD or not - int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not - struct ncclColl args; - void* argsptr; - - // Global proxy thread - pthread_t proxyThread; - struct ncclProxyState proxyState; -}; - -// Check CUDA calls -#define CUDACHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ - return ncclUnhandledCudaError; \ - } \ -} while(false) - -#define CUDACHECKGOTO(cmd, res, label) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ - res = ncclUnhandledCudaError; \ - goto label; \ - } \ -} while(false) - -#include <errno.h> -// Check system calls -#define SYSCHECK(call, name) do { \ - int retval; \ - SYSCHECKVAL(call, name, retval); \ -} while (false) - -#define SYSCHECKVAL(call, name, retval) do { \ - SYSCHECKSYNC(call, name, retval); \ - if (retval == -1) { \ - WARN("Call to " name " failed : %s", strerror(errno)); \ - return ncclSystemError; \ - } \ -} while (false) - -#define SYSCHECKSYNC(call, name, retval) do { \ - retval = call; \ - if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ - INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ - } else { \ - break; \ - } \ -} while(true) - -// Propagate errors up -#define NCCLCHECK(call) do { \ - ncclResult_t res = call; \ - if (res != ncclSuccess) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ - return res; \ - } \ -} while (0); - -#define NCCLCHECKGOTO(call, res, label) do { \ - res = call; \ - if (res != ncclSuccess) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ - goto label; \ - } \ -} while (0); +#include <stdint.h> +#include <algorithm> // For std::min/std::max +#include "nccl.h" #ifdef PROFAPI #define NCCL_API(ret, func, args...) \ @@ -403,10 +30,6 @@ struct ncclComm { ret func(args) #endif // end PROFAPI -int ncclCudaCompCap(); -ncclResult_t ncclNvlinkGpu(int* nvlink); -int64_t ncclTreeThreshold(); - static __inline__ int ncclTypeSize(ncclDataType_t type) { switch (type) { case ncclInt8: @@ -427,42 +50,22 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) { } } -#include <sys/mman.h> -static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { - CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped)); - memset(*ptr, 0, size); - *devPtr = *ptr; - return ncclSuccess; -} - -static inline ncclResult_t ncclCudaHostFree(void* ptr) { - CUDACHECK(cudaFreeHost(ptr)); - return ncclSuccess; -} +#define NCCL_NUM_FUNCTIONS 5 +typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t; -template <typename T> -static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { - void* p = malloc(nelem*sizeof(T)); - if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); - return ncclSystemError; - } - memset(p, 0, nelem*sizeof(T)); - *ptr = (T*)p; - return ncclSuccess; -} +#define NCCL_NUM_ALGORITHMS 2 // Tree/Ring +#define NCCL_ALGO_TREE 0 +#define NCCL_ALGO_RING 1 -template <typename T> -static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) { - CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T))); - CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T))); - return ncclSuccess; -} +#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 +#define NCCL_PROTO_LL 0 +#define NCCL_PROTO_LL128 1 +#define NCCL_PROTO_SIMPLE 2 -template <typename T> -static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { - CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault)); - return ncclSuccess; -} +#include "debug.h" +#include "checks.h" +#include "alloc.h" +#include "utils.h" +#include "param.h" #endif // end include guard diff --git a/src/include/cpuset.h b/src/include/cpuset.h index f70d1d8..98b93de 100644 --- a/src/include/cpuset.h +++ b/src/include/cpuset.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/debug.h b/src/include/debug.h index 3acdf8c..89b6e42 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,15 +7,14 @@ #ifndef NCCL_DEBUG_H_ #define NCCL_DEBUG_H_ -#include <pthread.h> +#include "core.h" + #include <stdio.h> #include <chrono> -#include <unistd.h> #include <sys/syscall.h> #include <limits.h> #include <string.h> -#include "nccl.h" #include "nccl_net.h" #define gettid() (pid_t) syscall(SYS_gettid) @@ -24,10 +23,17 @@ extern int ncclDebugLevel; extern uint64_t ncclDebugMask; extern pthread_mutex_t ncclDebugOutputLock; extern FILE *ncclDebugFile; -extern ncclResult_t getHostName(char* hostname, int maxlen); -extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev); +extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); + +void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...); -extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...); +// Let code temporarily downgrade WARN into INFO +extern thread_local int ncclDebugNoWarn; +#define NOWARN(a, ret) do { \ + ncclDebugNoWarn = 1; \ + ret = a; \ + ncclDebugNoWarn = 0; \ +} while (0) #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) @@ -39,101 +45,4 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch; #define TRACE(...) #endif -#include <stdlib.h> - -static inline void initDebug() { - const char* nccl_debug = getenv("NCCL_DEBUG"); - if (nccl_debug == NULL) { - ncclDebugLevel = NCCL_LOG_NONE; - } else if (strcasecmp(nccl_debug, "VERSION") == 0) { - ncclDebugLevel = NCCL_LOG_VERSION; - } else if (strcasecmp(nccl_debug, "WARN") == 0) { - ncclDebugLevel = NCCL_LOG_WARN; - } else if (strcasecmp(nccl_debug, "INFO") == 0) { - ncclDebugLevel = NCCL_LOG_INFO; - } else if (strcasecmp(nccl_debug, "ABORT") == 0) { - ncclDebugLevel = NCCL_LOG_ABORT; - } else if (strcasecmp(nccl_debug, "TRACE") == 0) { - ncclDebugLevel = NCCL_LOG_TRACE; - } - - /* Parse the NCCL_DEBUG_SUBSYS env var - * This can be a comma separated list such as INIT,COLL - * or ^INIT,COLL etc - */ - char* nccl_debug_subsys = getenv("NCCL_DEBUG_SUBSYS"); - if (nccl_debug_subsys != NULL) { - char *subsys = strtok(nccl_debug_subsys, ","); - while (subsys != NULL) { - int invert = 0; - uint64_t mask = 0; - if (subsys[0] == '^') { invert = 1; subsys++; } - if (strcasecmp(subsys, "INIT") == 0) { - mask = NCCL_INIT; - } else if (strcasecmp(subsys, "COLL") == 0) { - mask = NCCL_COLL; - } else if (strcasecmp(subsys, "P2P") == 0) { - mask = NCCL_P2P; - } else if (strcasecmp(subsys, "SHM") == 0) { - mask = NCCL_SHM; - } else if (strcasecmp(subsys, "NET") == 0) { - mask = NCCL_NET; - } else if (strcasecmp(subsys, "ALL") == 0) { - mask = NCCL_ALL; - } - if (mask) { - if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask; - } - subsys = strtok(NULL, ","); - } - } - - /* Parse and expand the NCCL_DEBUG_FILE path and - * then create the debug file. But don't bother unless the - * NCCL_DEBUG level is > VERSION - */ - const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE"); - if (ncclDebugLevel > NCCL_LOG_VERSION && nccl_debug_file != NULL) { - int c = 0; - char debug_fn[PATH_MAX+1] = ""; - char *dfn = debug_fn; - while (nccl_debug_file[c] != '\0' && c < PATH_MAX) { - if (nccl_debug_file[c++] != '%') { - *dfn++ = nccl_debug_file[c-1]; - continue; - } - switch (nccl_debug_file[c++]) { - case '%': // Double % - *dfn++ = '%'; - break; - case 'h': // %h = hostname - char hostname[1024]; - getHostName(hostname, 1024); - dfn += snprintf(dfn, PATH_MAX, "%s", hostname); - break; - case 'p': // %p = pid - dfn += snprintf(dfn, PATH_MAX, "%d", getpid()); - break; - default: // Echo everything we don't understand - *dfn++ = '%'; - *dfn++ = nccl_debug_file[c-1]; - break; - } - } - *dfn = '\0'; - if (debug_fn[0] != '\0') { - FILE *file = fopen(debug_fn, "w"); - if (file != NULL) { - INFO(NCCL_ALL,"DEBUG file is '%s'", debug_fn); - ncclDebugFile = file; - } - } - } - pthread_mutex_init(&ncclDebugOutputLock, NULL); - -#ifdef ENABLE_TRACE - ncclEpoch = std::chrono::high_resolution_clock::now(); -#endif -} - #endif diff --git a/src/include/devcomm.h b/src/include/devcomm.h new file mode 100644 index 0000000..46d236b --- /dev/null +++ b/src/include/devcomm.h @@ -0,0 +1,214 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_DEVICE_H_ +#define NCCL_DEVICE_H_ + +#include "nccl.h" +#include <stdint.h> + +#define NCCL_MAX_OPS 2048 +#define NCCL_STEPS 8 + +#define DIVUP(x, y) \ + (((x)+(y)-1)/(y)) +#define ROUNDUP(x, y) \ + (DIVUP((x), (y))*(y)) + +#define ALIGN_SIZE(size, align) \ + size = ((size + (align) - 1) / (align)) * (align); + +union ncclLLFifoLine { + /* Flags have to be *after* data, because otherwise, an incomplete receive + from the network may receive the flag but not the data. + Note this is assuming that either we receive contiguous chunks of data + (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */ + struct { + uint32_t data1; + uint32_t flag1; + uint32_t data2; + uint32_t flag2; + }; + uint64_t v[2]; + int4 i4; +}; + +#define WARP_SIZE 32 +#define MAXCHANNELS 32 +#define NCCL_MAX_NTHREADS 512 +#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS +#define NCCL_LL_LINES_PER_THREAD 8 +#define NCCL_LL_SLICE_LINES (NCCL_LL_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) +#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS) +#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine)) +#ifdef TEST_LL_CLEANUP +#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup +#define NCCL_LL_FLAG_MAX 0x100 +#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX)) +#else +#define NCCL_LL_CLEAN_MASK 0x7ffffff8 +#define NCCL_LL_FLAG(a) ((uint32_t)(a)) +#endif +// Make sure the clean mask will last for at least NCCL_NSTEPS +static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value"); + +#define NCCL_LL128_LINESIZE 128 +#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t)) +#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1) + +#define NCCL_LL128_MAX_NTHREADS 640 +#define NCCL_LL128_ELEMS_PER_THREAD 120 + +// Receiving from up to 3 sources is more compute intensive than sending +// to 3 dests. Use 70% for reduce and 30% for bcast. +#define NCCL_LL128_SPLIT(nt) ((nt*7/(10*32))*32) + +#define NCCL_LL128_SLICE_ELEMS (NCCL_LL128_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) +#define NCCL_LL128_BUFF_ELEMS (NCCL_LL128_SLICE_ELEMS*NCCL_STEPS) +#define NCCL_LL128_BUFF_SIZE (NCCL_LL128_BUFF_ELEMS*sizeof(uint64_t)) + +#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 +#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) + +struct ncclConnInfo { + // Regular comm mechanism + char *buff; // Local for recv, remote for send + uint64_t *tail; // Local for recv, remote for send + uint64_t *head; // Local for send, remote for recv + uint64_t *opCountLoc; // opCount of local rank + uint64_t *opCountRem; // opCount of remote rank + + int direct; // Direct communication + void **ptrExchange; // Pointer exchange for direct communication + + int *fifo; // Size fifo for proxy + + uint64_t step; // Keep where we are + + // Low latency mechanism + union ncclLLFifoLine *llBuff; // Local for recv, remote for send + uint64_t llLastCleaning; + + // High bandwidth, low latency protocol + uint64_t* ll128Buff; // Local for recv, remote for send +}; + +struct ncclConnector { + int connected; + struct ncclProxyArgs *proxyAppend; + struct ncclTransportComm* transportComm; + void* transportResources; // Host-side resources + struct ncclConnInfo conn; + struct ncclComm *comm; +}; + +struct ncclRing { + // Shortcuts for userRanks[1] and userRanks[n-1] + int prev; + int next; + + // Maps an internal nccl index to user-specified rank order. This is necessary + // since we need to know how the user expects data to be ordered across + // devices. Ordered from current device. + int* userRanks; + int* devUserRanks; +}; + + +#define NCCL_MAX_TREE_ARITY 3 +struct ncclTree { + int depth; + int up; + int down[NCCL_MAX_TREE_ARITY]; +}; + +struct ncclPeer { + struct ncclConnector send; + struct ncclConnector recv; +}; + +struct ncclDevComm; + +/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */ +/* to make sure reads to host from the CUDA kernel are aligned. */ +/* Make sure to adjust padding at the end of ncclColl. */ +struct CollectiveArgs { + struct ncclDevComm* comm; + uint64_t opCount; + + // local and remote input, output, and buffer + const void * ThisInput; + void * ThisOutput; + + // general parameters + size_t N; + uint32_t root; + uint8_t bid; + uint8_t nChannels; + uint16_t nThreads; + + int lastChunkSize; +}; +struct ncclColl { + union { + struct { + struct CollectiveArgs args; + uint16_t funcIndex; + uint16_t nextIndex; + uint8_t active; + }; + int data[0x10]; + }; +}; +static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size"); + +struct ncclChannel { + union { + struct { + struct ncclRing ring; + struct ncclTree treeUp; + struct ncclTree treeDn; + + int id; + int nthreads; + int buffSize; + + // Communication structures + struct ncclPeer* peers; + struct ncclPeer* devPeers; + + // Operation list for aggregation + struct ncclColl* collectives; + struct ncclColl* devCollectives; + int collStart; + int collCount; + int collFifoHead; // Only used by GPU + int collFifoTail; // Only used by CPU + }; + int data[0x80]; + }; +}; +static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); + +typedef enum { + ncclDevSuccess, + ncclDevAssertedMismatch, + ncclDevSuspectedMismatch +} ncclDevError_t; + +struct ncclDevComm { + int rank; + int nRanks; + + // Flag to ask NCCL kernels to abort + volatile uint32_t *abortFlag; + volatile ncclDevError_t *fatalDevError; + + // Channels, device side + struct ncclChannel* channels; +}; + +#endif diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 4db7094..cea486e 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,8 +7,9 @@ #ifndef NCCL_ENQUEUE_H_ #define NCCL_ENQUEUE_H_ -#include "core.h" +#include "comm.h" #include "group.h" +#include "collectives.h" ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast); diff --git a/src/include/graph.h b/src/include/graph.h new file mode 100644 index 0000000..3c8ba19 --- /dev/null +++ b/src/include/graph.h @@ -0,0 +1,94 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_GRAPH_H_ +#define NCCL_GRAPH_H_ + +#include "nccl.h" +#include "devcomm.h" +#include <limits.h> +#include <stdlib.h> +#include <ctype.h> +#include <stdio.h> + +enum ncclPathDist { + PATH_PIX = 0, + PATH_PXB = 1, + PATH_PHB = 2, + PATH_NODE = 3, + PATH_SYS = 4, + PATH_ARRAY_SIZE = 5 +}; + +extern const char* pathDists[PATH_ARRAY_SIZE]; + +ncclResult_t ncclTopoCudaPath(int cudaDev, char** path); + +struct ncclTopoSystem; +// Build the topology +ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system); +ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); +ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); + +ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info); +void ncclTopoFree(struct ncclTopoSystem* system); +ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); +ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system); + +// Query topology +ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink); +ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink); +ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance); +ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* net); +ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance); +ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count); + +#define NCCL_TOPO_MAX_NODES 256 + +#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction +#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Split tree (send/recv from different ranks) flowing in both directions +#define NCCL_TOPO_PATTERN_TREE 3 // Simple tree (send/recv from same rank) flowing in both directions +#define NCCL_TOPO_PATTERN_RING 4 // Ring +struct ncclTopoGraph { + // Input / output + int pattern; + int crossNic; + // Output + int nChannels; + int speedIntra; + int speedInter; + int type; + int nvlink; + int sameChannels; + int nHops; + int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; + int inter[MAXCHANNELS*2]; +}; +ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); + +ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); + +struct ncclTopoRanks { + int ringRecv[MAXCHANNELS]; + int ringSend[MAXCHANNELS]; + int ringPrev[MAXCHANNELS]; + int ringNext[MAXCHANNELS]; + int treeUpRecv[MAXCHANNELS]; + int treeUpSend[MAXCHANNELS]; + int treeDnRecv[MAXCHANNELS]; + int treeDnSend[MAXCHANNELS]; +}; + +ncclResult_t ncclTopoPreset(struct ncclComm* comm, + struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, + struct ncclTopoRanks* topoRanks); + +ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, + struct ncclTopoRanks** allTopoRanks, int* rings); + +ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph); + +#endif diff --git a/src/include/group.h b/src/include/group.h index 76da30f..239b05f 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -8,14 +8,14 @@ #define NCCL_GROUP_H_ #include "nccl.h" -#include "core.h" +#include "comm.h" bool ncclAsyncMode(); ncclResult_t ncclAsyncErrCheck(ncclResult_t ret); -typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank); +typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); -ncclResult_t ncclAsyncInit(ncclInitFunc_t func, int cudaDev, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank); +ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); typedef ncclResult_t(*ncclCollFunc_t)(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h index 4f3e831..0943f99 100644 --- a/src/include/ibvwrap.h +++ b/src/include/ibvwrap.h @@ -4,7 +4,7 @@ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/info.h b/src/include/info.h new file mode 100644 index 0000000..9461759 --- /dev/null +++ b/src/include/info.h @@ -0,0 +1,50 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INFO_H_ +#define NCCL_INFO_H_ + +#include "nccl.h" +#include "core.h" + +typedef enum { + ncclPatternRing, + ncclPatternRingTwice, + ncclPatternPipelineFrom, + ncclPatternPipelineTo, + ncclPatternTreeUp, + ncclPatternTreeDown, + ncclPatternTreeUpDown +} ncclPattern_t; + +// Used to pass NCCL call information between functions +struct ncclInfo { + ncclFunc_t coll; + const char* opName; + // NCCL Coll Args + const void* sendbuff; + void* recvbuff; + size_t count; + ncclDataType_t datatype; + ncclRedOp_t op; + int root; + ncclComm_t comm; + cudaStream_t stream; + // Algorithm details + int chunkSteps; + int sliceSteps; + // Computed later + int algorithm; + int protocol; + ncclPattern_t pattern; + int nChannels; + int nThreads; + size_t nBytes; + int nstepsPerLoop; + int nchunksPerLoop; +}; + +#endif diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index 89edbf5..d6ae9f8 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -15,7 +15,7 @@ #define NCCL_PTR_CUDA 0x2 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_ALL=~0} ncclDebugLogSubSys; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); @@ -80,12 +80,13 @@ typedef struct { // Finalize connection establishment after remote peer has called connectHandle ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); - // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is diff --git a/src/include/net.h b/src/include/net.h index e75e6bb..3d37c8c 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -13,16 +13,10 @@ extern ncclNet_t* ncclNet; typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; -/* Socket Interface Selection type */ -typedef enum { findSubnetIf = -1, - dontCareIf = -2 -} ncclSocketIfSl_t; - // Translation to external API static const char* ncclNetName() { return ncclNet->name; } static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; } static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; } -static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; } static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } @@ -36,7 +30,37 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; } -extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str); +#define GPU_BUF_SIZE (2*1024*1024) +static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { + int support; + NCCLCHECK(ncclNet->ptrSupport(dev, &support)); + *supportedTypes = support & ~NCCL_PTR_CUDA; + // The network supports GPU Direct RDMA ; verify the GPU supports it as well. + if (support & NCCL_PTR_CUDA) { + void *lComm = NULL, *sComm = NULL, *rComm = NULL; + ncclNetHandle_t handle; + void* gpuPtr = NULL; + void* mHandle = NULL; + ncclResult_t res; + NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), res, cleanup); + NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), res, cleanup); + NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), res, cleanup); + CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), res, cleanup); + NOWARN(ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res); + if (res != ncclSuccess) goto cleanup; + NCCLCHECKGOTO(ncclNetDeregMr(sComm, mHandle), res, cleanup); + NCCLCHECKGOTO(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res, cleanup); + NCCLCHECKGOTO(ncclNetDeregMr(rComm, mHandle), res, cleanup); + *supportedTypes |= NCCL_PTR_CUDA; +cleanup: + if (gpuPtr) cudaFree(gpuPtr); + if (rComm) ncclNetCloseRecv(rComm); + if (sComm) ncclNetCloseSend(sComm); + if (lComm) ncclNetCloseListen(lComm); + } + return ncclSuccess; +} + extern ncclNet_t ncclNetIb; extern ncclNet_t ncclNetSocket; diff --git a/src/include/nvlink.h b/src/include/nvlink.h deleted file mode 100644 index 1baf9e5..0000000 --- a/src/include/nvlink.h +++ /dev/null @@ -1,123 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NVLINK_H_ -#define NCCL_NVLINK_H_ - -#include <sys/stat.h> -#include <fcntl.h> -#include "nvmlwrap.h" -#include "topo.h" - -#define CONNECT_NVLINK 0x10 -#define CONNECT_NVSWITCH 0x100 - -enum ncclNvLinkDeviceType { - ncclNvLinkDeviceGpu, - ncclNvLinkDeviceSwitch, -}; - -static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) { - char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class"; - memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1); - char* rPath = realpath(classPath, NULL); - int fd; - SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd); - free(rPath); - char pciClass[9]; - strncpy(pciClass, "0x000000", 9); - int len; - SYSCHECKVAL(read(fd, pciClass, 8), "read", len); - SYSCHECK(close(fd), "close"); - if (strcmp(pciClass, "0x068000") == 0) { - // PCI device is of type "Bridge / Other Bridge Device" (NVswitch) - *type = ncclNvLinkDeviceSwitch; - } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla) - || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce) - *type = ncclNvLinkDeviceGpu; - } else { - // Ignore if we don't know what's on the other side. - return ncclSystemError; - } - return ncclSuccess; -} - -/* Get the maximum number of NVLinks based on the GPU generation */ -static ncclResult_t getMaxNvlinks(int* maxLinks) { - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - int ccMajor; - CUDACHECK(cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev)); - // 6 for Volta, 4 for Pascal - *maxLinks = (ccMajor > 6) ? 6 : 4; - // INFO("Device %d detected %d NVLinks", cudaDev, *maxLinks); - return ncclSuccess; -} - -static int getNvlinkGpu(const char* busId1, const char* busId2) { - // Determine if that connection is through NVLink - int links = 0; - int nvswitch_links = 0; - int maxNvLinks = ncclCudaCompCap() > 6 ? 6 : 4; - nvmlDevice_t nvmlDev; - ncclResult_t res = wrapNvmlDeviceGetHandleByPciBusId(busId1, &nvmlDev); - if (res != ncclSuccess) return 0; - - for(int l=0; l<maxNvLinks; ++l) { - // Check whether we can use this NVLink for P2P - unsigned canP2P; - if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue; - - // Make sure the Nvlink is up. The previous call should have trained the link. - nvmlEnableState_t isActive; - if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue; - - // Try to figure out what's on the other side of the NVLink - nvmlPciInfo_t remoteProc; - if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue; - - // Old versions of NVML return a lowercase PCI ID - char* p = remoteProc.busId; - for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) { - if (p[c] == 0) break; - p[c] = toupper(p[c]); - } - - if (busId2 != NULL && strncmp(busId2, remoteProc.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) == 0) { - links++; - } else { - // Make a lower case copy of the bus ID for calling ncclDeviceType - // PCI system path is in lower case - char* p = remoteProc.busId; - char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) { - if (p[c] == 0) break; - lowerId[c] = tolower(p[c]); - } - - // Determine if the remote side is NVswitch or a GPU - enum ncclNvLinkDeviceType type; - ncclResult_t ret = ncclDeviceType(lowerId, &type); - if (ret == ncclSuccess) { - if (type == ncclNvLinkDeviceSwitch) { - //TODO: we are making an assumption that all GPUs are connected to this switch - //This assumption may change for future architectures - nvswitch_links++; - } else if (type == ncclNvLinkDeviceGpu && busId2 == NULL) { - links++; - } - } else { - // The NVLink is up but we couldn't find the PCI device on the other - // side. Assume it's an NVswitch outside a VM. - if (l==0) INFO(NCCL_INIT, "Assuming NVLink is connected to NVswitch"); - nvswitch_links++; - } - } - } - return nvswitch_links ? CONNECT_NVSWITCH*nvswitch_links : CONNECT_NVLINK*links; -} - -#endif diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index 0b6198a..01bbb7f 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -9,18 +9,31 @@ #include "nccl.h" -//#define NVML_DIRECT 1 -#ifdef NVML_DIRECT -#include "nvml.h" +// The NVML library doesn't appear to be thread safe +#include <pthread.h> +extern pthread_mutex_t nvmlLock; +#define NVMLLOCK() pthread_mutex_lock(&nvmlLock) +#define NVMLUNLOCK() pthread_mutex_unlock(&nvmlLock) + +#define NVMLLOCKCALL(cmd, ret) do { \ + NVMLLOCK(); \ + ret = cmd; \ + NVMLUNLOCK(); \ +} while(false) #define NVMLCHECK(cmd) do { \ - nvmlReturn_t e = cmd; \ + nvmlReturn_t e; \ + NVMLLOCKCALL(cmd, e); \ if( e != NVML_SUCCESS ) { \ WARN("NVML failure '%s'", nvmlErrorString(e)); \ return ncclSystemError; \ } \ } while(false) +//#define NVML_DIRECT 1 +#ifdef NVML_DIRECT +#include "nvml.h" + static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; } static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; } static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; } @@ -57,6 +70,10 @@ static ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned i NVMLCHECK(nvmlDeviceGetMinorNumber(device, minorNumber)); return ncclSuccess; } +static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { + NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor)); + return ncclSuccess; +} #else // Dynamically handle dependencies on NVML @@ -139,6 +156,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber); +ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor); #endif // NVML_DIRECT diff --git a/src/include/param.h b/src/include/param.h index dd5f697..5431757 100644 --- a/src/include/param.h +++ b/src/include/param.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) { s++; strncpy(envValue, line+s, 1024); setenv(envVar, envValue, 0); - char *str = getenv(envVar); } if (line) free(line); fclose(file); diff --git a/src/include/rings.h b/src/include/rings.h deleted file mode 100644 index 43fc595..0000000 --- a/src/include/rings.h +++ /dev/null @@ -1,17 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_RINGS_H_ -#define NCCL_RINGS_H_ - -static int getDefaultThreads() { - // On Kepler, rings are doubled later. - return ncclCudaCompCap() == 3 ? 128 : 256; -} - -ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut); - -#endif diff --git a/src/include/shm.h b/src/include/shm.h index 4fb49cb..9cd9d05 100644 --- a/src/include/shm.h +++ b/src/include/shm.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/socket.h b/src/include/socket.h index fb5cfc0..96bf5db 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -18,8 +18,9 @@ #define MAX_IFS 16 #define MAX_IF_NAME_SIZE 16 -#define SLEEP_INT 1000 // sleep interval in usec -#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec) +#define SLEEP_INT 1000 // connection retry sleep interval in usec +#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) +#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) /* Common socket address storage structure for IPv4/IPv6 */ union socketAddress { @@ -41,7 +42,7 @@ static inline const char *socketToString(struct sockaddr *saddr, char *buf) { return buf; } -static inline short socketToPort(struct sockaddr *saddr) { +static inline uint16_t socketToPort(struct sockaddr *saddr) { return ntohs(saddr->sa_family == AF_INET ? ((struct sockaddr_in*)saddr)->sin_port : ((struct sockaddr_in6*)saddr)->sin6_port); } @@ -65,6 +66,9 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre #endif struct netIf userIfs[MAX_IFS]; bool searchNot = prefixList && prefixList[0] == '^'; + if (searchNot) prefixList++; + bool searchExact = prefixList && prefixList[0] == '='; + if (searchExact) prefixList++; int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); int found = 0; @@ -91,7 +95,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre } // check against user specified interfaces - if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs) ^ searchNot)) { + if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) { continue; } @@ -116,17 +120,17 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre return found; } -static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) { +static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) { /* Check family first */ int family = local_if.ifa_addr->sa_family; - if (family != remote.sa.sa_family) { + if (family != remote->sa.sa_family) { return false; } if (family == AF_INET) { struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr); struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask); - struct sockaddr_in& remote_addr = remote.sin; + struct sockaddr_in& remote_addr = remote->sin; struct in_addr local_subnet, remote_subnet; local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr; remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr; @@ -134,7 +138,7 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) { } else if (family == AF_INET6) { struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr); struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask); - struct sockaddr_in6& remote_addr = remote.sin6; + struct sockaddr_in6& remote_addr = remote->sin6; struct in6_addr& local_in6 = local_addr->sin6_addr; struct in6_addr& mask_in6 = mask->sin6_addr; struct in6_addr& remote_in6 = remote_addr.sin6_addr; @@ -159,8 +163,11 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) { } } -static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) { - char line[1024], line_a[1024]; +static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) { +#ifdef ENABLE_TRACE + char line[1024]; +#endif + char line_a[1024]; int found = 0; struct ifaddrs *interfaces, *interface; getifaddrs(&interfaces); @@ -184,13 +191,13 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd // Store the interface name strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); - INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a)); + TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr->sa), line_a)); found++; if (found == maxIfs) break; } if (found == 0) { - WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr.sa), line_a)); + WARN("Net : No interface found in the same subnet as remote address %s", socketToString(&(remoteAddr->sa), line_a)); } freeifaddrs(interfaces); return found; @@ -295,7 +302,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam // Try to find interface that is in the same subnet as the IP in comm id union socketAddress idAddr; GetSocketAddrFromString(&idAddr, commId); - nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, idAddr, ifNameMaxSize, maxIfs); + nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs); } } // Then look for anything else (but not docker or lo) @@ -322,7 +329,11 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) if (socketToPort(&localAddr->sa)) { // Port is forced by env. Make sure we get the port. int opt = 1; +#if defined(SO_REUSEPORT) SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt"); +#else + SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt"); +#endif } // localAddr port should be 0 (Any port) @@ -370,14 +381,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) { #endif int ret; - int retries = 0; + int timedout_retries = 0; + int refused_retries = 0; retry: SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret); if (ret == 0) return ncclSuccess; - if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) { - INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \ - usleep(SLEEP_INT); - goto retry; + if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) { + if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || + (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { + if (refused_retries % 1000 == 0) INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); + usleep(SLEEP_INT); + goto retry; + } } WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno)); return ncclSystemError; @@ -385,12 +400,12 @@ retry: #define NCCL_SOCKET_SEND 0 #define NCCL_SOCKET_RECV 1 -static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) { +static ncclResult_t socketProgressOpt(int op, int fd, void* ptr, int size, int* offset, int block) { int bytes = 0; char* data = (char*)ptr; do { - if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), MSG_DONTWAIT); - if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), MSG_DONTWAIT); + if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); + if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); if (op == NCCL_SOCKET_RECV && bytes == 0) { WARN("Net : Connection closed by remote peer"); return ncclSystemError; @@ -408,9 +423,13 @@ static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* off return ncclSuccess; } +static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) { + return socketProgressOpt(op, fd, ptr, size, offset, 0); +} + static ncclResult_t socketWait(int op, int fd, void* ptr, int size, int* offset) { while (*offset < size) - NCCLCHECK(socketProgress(op, fd, ptr, size, offset)); + NCCLCHECK(socketProgressOpt(op, fd, ptr, size, offset, 1)); return ncclSuccess; } diff --git a/src/include/topo.h b/src/include/topo.h deleted file mode 100644 index e824a81..0000000 --- a/src/include/topo.h +++ /dev/null @@ -1,59 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TOPO_H_ -#define NCCL_TOPO_H_ - -#include "nccl.h" -#include <limits.h> -#include <stdlib.h> -#include <ctype.h> - -#define BUSID_SIZE (sizeof("0000:00:00.0")) -#define BUSID_REDUCED_SIZE (sizeof("0000:00")) - -static ncclResult_t getCudaPath(int cudaDev, char** path) { - char busId[BUSID_SIZE]; - CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev)); - for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]); - char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0"; - memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1); - memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1); - *path = realpath(busPath, NULL); - if (*path == NULL) { - WARN("Could not find real path of %s", busPath); - return ncclSystemError; - } - return ncclSuccess; -} - -enum ncclPathDist { - PATH_PIX = 0, - PATH_PXB = 1, - PATH_PHB = 2, - PATH_SOC = 3 -}; - -static const char* pathDists[] = { "PIX", "PXB", "PHB", "SOC" }; - -static int pciDistance(char* path1, char* path2) { - int score = 0; - int depth = 0; - int same = 1; - for (int i=0; i<strlen(path1); i++) { - if (path1[i] != path2[i]) same = 0; - if (path1[i] == '/') { - depth++; - if (same == 1) score++; - } - } - if (score <= 3) return PATH_SOC; - if (score == 4) return PATH_PHB; - if (score == depth-1) return PATH_PIX; - return PATH_PXB; -} - -#endif diff --git a/src/include/transport.h b/src/include/transport.h index 6231a71..8f9bf0e 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -7,11 +7,15 @@ #ifndef NCCL_TRANSPORT_H_ #define NCCL_TRANSPORT_H_ -#include "nccl.h" -#include <stdint.h> +#include "devcomm.h" +#include "graph.h" #include "nvmlwrap.h" +#include "core.h" #define NTRANSPORTS 3 +#define TRANSPORT_P2P 0 +#define TRANSPORT_SHM 1 +#define TRANSPORT_NET 2 extern struct ncclTransport ncclTransports[]; @@ -23,21 +27,19 @@ struct ncclComm; struct ncclPeerInfo { int rank; int cudaDev; - int nvmlDev; + int gdrSupport; uint64_t hostHash; uint64_t pidHash; - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + dev_t shmDev; + int64_t busId; }; -// Used to hold the transport connection values -typedef int64_t ncclTvalue_t; - #define CONNECT_SIZE 128 struct ncclConnect { char data[CONNECT_SIZE]; }; -enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone }; +enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; struct ncclProxyArgs; typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*); @@ -50,7 +52,7 @@ struct ncclProxyArgs { int chunkSteps; int nsteps; uint64_t opCount; - int llMode; + int protocol; int state; // add component before this line -- it is left out during initialization // Internal state @@ -77,7 +79,7 @@ struct ncclProxyState { }; struct ncclTransportComm { - ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId); + ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId); ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*); ncclResult_t (*free)(void*); ncclResult_t (*proxy)(struct ncclProxyArgs*); @@ -85,8 +87,7 @@ struct ncclTransportComm { struct ncclTransport { const char name[4]; - ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*); - ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*); + ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); struct ncclTransportComm send; struct ncclTransportComm recv; }; @@ -117,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) { } } -inline void transportProxyIdle(int idle) { - sched_yield(); -} - #endif diff --git a/src/include/trees.h b/src/include/trees.h index 1a151d1..7eadd85 100644 --- a/src/include/trees.h +++ b/src/include/trees.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/utils.h b/src/include/utils.h index 5a6a588..266abca 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -10,7 +10,16 @@ #include "nccl.h" #include <stdint.h> -ncclResult_t getHostName(char* hostname, int maxlen); +int ncclCudaCompCap(); + +// PCI Bus ID <-> int64 conversion functions +ncclResult_t int64ToBusId(int64_t id, char* busId); +ncclResult_t busIdToInt64(char* busId, int64_t* id); + +ncclResult_t getBusId(int cudaDev, int64_t *busId); + +ncclResult_t getHostName(char* hostname, int maxlen, const char delim); +uint64_t getHash(const char* string, int n); uint64_t getHostHash(); uint64_t getPidHash(); @@ -20,6 +29,12 @@ struct netIf { }; int parseStringList(const char* string, struct netIf* ifList, int maxList); -bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize); +bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); + +static long log2i(long n) { + long l = 0; + while (n>>=1) l++; + return l; +} #endif diff --git a/src/init.cc b/src/init.cc new file mode 100644 index 0000000..627f6c7 --- /dev/null +++ b/src/init.cc @@ -0,0 +1,934 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl.h" +#include "channel.h" +#include "nvmlwrap.h" +#include "bootstrap.h" +#include "transport.h" +#include "group.h" +#include "net.h" +#include "enqueue.h" +#include "graph.h" +#include "argcheck.h" +#include "cpuset.h" +#include <sched.h> +#include <fcntl.h> +#include <string.h> +#include <errno.h> +#include <assert.h> +#include <dlfcn.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#define STR2(v) #v +#define STR(v) STR2(v) + +#ifdef ENABLE_TRACE +std::chrono::high_resolution_clock::time_point ncclEpoch; +#endif + +#if CUDART_VERSION >= 9020 +#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream +#else +#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream +#endif + +NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); + +NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); + +ncclNet_t* ncclNet = NULL; + +// Returns ncclInternalError if anything fails, causing that network to be ignored. +ncclResult_t initNet(ncclNet_t* net) { + int ndev; + if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError; + if (net->devices(&ndev) != ncclSuccess) return ncclInternalError; + if (ndev <= 0) return ncclSystemError; + return ncclSuccess; +} + +ncclResult_t initNetPlugin(ncclNet_t** net) { + void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL); + if (netPluginLib == NULL) { + // dlopen does not guarantee to set errno, but dlerror only gives us a + // string, so checking errno doesn't hurt to try to provide a better + // error message + if (errno == ENOENT) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so), using internal implementation"); + } else { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); + } + return ncclSuccess; + } + ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL)); + if (extNet == NULL) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol."); + goto cleanup; + } + if (initNet(extNet) == ncclSuccess) { + *net = extNet; + return ncclSuccess; + } +cleanup: + if (netPluginLib != NULL) dlclose(netPluginLib); + return ncclSuccess; +} + +ncclResult_t initNet() { + // Always initialize bootstrap network + NCCLCHECK(bootstrapNetInit()); + + NCCLCHECK(initNetPlugin(&ncclNet)); + if (ncclNet != NULL) return ncclSuccess; + if (initNet(&ncclNetIb) == ncclSuccess) { + ncclNet = &ncclNetIb; + } else { + NCCLCHECK(initNet(&ncclNetSocket)); + ncclNet = &ncclNetSocket; + } + return ncclSuccess; +} + +pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; +static bool initialized = false; +static ncclResult_t ncclInit() { + if (initialized) return ncclSuccess; + pthread_mutex_lock(&initLock); + if (!initialized) { + initEnv(); + initNet(); + initialized = true; + } + pthread_mutex_unlock(&initLock); + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclGetVersion, int* version); +ncclResult_t ncclGetVersion(int* version) { + if (version == NULL) return ncclInvalidArgument; + *version = NCCL_VERSION_CODE; + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out); +ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { + NCCLCHECK(ncclInit()); + NCCLCHECK(PtrCheck(out, "GetUniqueId", "out")); + return bootstrapGetUniqueId(out); +} + +// Prevent compiler from optimizing out these operations +#ifdef __clang__ +#define NCCL_NO_OPTIMIZE __attribute__((optnone)) +#else +#define NCCL_NO_OPTIMIZE __attribute__((optimize("O0"))) +#endif + +void NCCL_NO_OPTIMIZE commPoison(ncclComm_t comm) { + comm->rank = comm->cudaDev = comm->busId = comm->nRanks = -1; +} + +#undef NCCL_NO_OPTIMIZE + +static ncclResult_t commFree(ncclComm_t comm) { + if (comm == NULL) + return ncclSuccess; + + free(comm->peerInfo); + ncclTopoFree(comm->topo); + + if (comm->bootstrap) + NCCLCHECK(bootstrapClose(comm->bootstrap)); + + CUDACHECK(cudaFree(comm->hostDevComm.channels)); + CUDACHECK(cudaFree(comm->devComm)); + + for (int channel=0; channel<comm->nChannels; channel++) + NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks)); + + if (comm->doneEvent != NULL) + CUDACHECK(cudaEventDestroy(comm->doneEvent)); + + if (comm->launchMode == ncclComm::GROUP) { + CUDACHECK(cudaStreamDestroy(comm->groupStream)); + } + + // Last rank frees shared resources between threads + int isLast; + NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); + if (isLast) { + free(comm->intraBarrier); + free(comm->intraParams); + free(comm->intraCudaDevs); + free(comm->intraCGMode); + free(comm->intraCC); + } + CUDACHECK(cudaFreeHost((void *)comm->abortFlag)); + CUDACHECK(cudaFreeHost((void *)comm->fatalDevError)); + + // Poison comm to try and catch a double free + commPoison(comm); + + free(comm); + return ncclSuccess; +} + +static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { + if (ndev < 1) { + WARN("invalid device count (%d) requested", ndev); + return ncclInvalidArgument; + } + if (rank >= ndev || rank < 0) { + WARN("rank %d exceeds ndev=%d", rank, ndev); + return ncclInvalidArgument; + } + + // Try to create a CUDA object right away. If there is something wrong with + // the device we're on (failure cause #1) , better know it early. + cudaEvent_t doneEvent; + CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming)); + + struct ncclComm* comm; + NCCLCHECK(ncclCalloc(&comm, 1)); + + comm->rank = comm->hostDevComm.rank =rank; + comm->nRanks = comm->hostDevComm.nRanks = ndev; + cudaGetDevice(&comm->cudaDev); + NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); + TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x", comm, rank, ndev, comm->cudaDev, comm->busId); + + comm->doneEvent = doneEvent; + comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; +#if CUDART_VERSION >= 9020 + comm->groupCudaStream = ncclParamGroupCudaStream(); +#else + // Don't allow the user to overload the default setting in older CUDA builds + comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM; +#endif + comm->fatalError = ncclSuccess; + + NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t))); + *comm->fatalDevError = ncclDevSuccess; + + NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t))); + *comm->abortFlag = 0; + + comm->argsptr = &comm->args; + + *comret = comm; + return ncclSuccess; +} + +static ncclResult_t devCommSetup(ncclComm_t comm) { + // Duplicate the channels on the device + NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels)); + NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels)); + + // Copy userRanks and peers + for (int r=0; r<comm->nChannels; r++) { + NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); + NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks)); + } + + // Duplicate the dev comm on the device + NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1)); + NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1)); + return ncclSuccess; +} + +// Pre-process the string so that running "strings" on the lib can quickly reveal the version. +#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR) +static void showVersion() { + static int shown = 0; + if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) { + printf("%s\n", VERSION_STRING); + fflush(stdout); + if (ncclDebugFile != stdout) + INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files + shown = 1; + } +} + +static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { + info->rank = comm->rank; + CUDACHECK(cudaGetDevice(&info->cudaDev)); + info->hostHash=getHostHash()+commHash; + info->pidHash=getPidHash()+commHash; + + // Get the device MAJOR:MINOR of /dev/shm so we can use that + // information to decide whether we can use SHM for inter-process + // communication in a container environment + struct stat statbuf; + SYSCHECK(stat("/dev/shm", &statbuf), "stat"); + info->shmDev = statbuf.st_dev; + + info->busId = comm->busId; + int netDevs; + + NCCLCHECK(ncclNetDevices(&netDevs)); + for (int n=0; n<netDevs; n++) { + int ptrSupport; + NCCLCHECK(ncclNetPtrSupport(n, &ptrSupport)); + if (ptrSupport & NCCL_PTR_CUDA) info->gdrSupport |= (1 << n); + } + return ncclSuccess; +} + +template <int type> +static ncclResult_t selectTransport(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) { + for (int t=0; t<NTRANSPORTS; t++) { + struct ncclTransport *transport = ncclTransports+t; + struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv; + int ret = 0; + NCCLCHECK(transport->canConnect(&ret, topo, graph, myInfo, peerInfo)); + if (ret) { + connector->transportComm = transportComm; + NCCLCHECK(transportComm->setup(topo, graph, myInfo, peerInfo, connect, connector, buffSize, channelId)); + return ncclSuccess; + } + } + WARN("No transport found !"); + return ncclInternalError; +} + +static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks) { + TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); + NCCLCHECK(initChannel(comm, channelId)); + + struct ncclRing* ring = &comm->channels[channelId].ring; + // Reorganize ranks to start with rank. + int shift; + for (shift = 0; shift<nranks; shift++) { + if (ringRanks[shift] == rank) { + break; + } + } + for (int i=0; i<nranks; i++) { + ring->userRanks[i] = ringRanks[(i+shift)%nranks]; + } + return ncclSuccess; +} + +void* waitForNonNullPtr(void* p) { + volatile void** ptr = (volatile void**) p; + while (*ptr == NULL) sched_yield(); + return (void*)*ptr; +} + +ncclResult_t initParams(struct ncclComm* comm) { + struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank; + params->args = &comm->argsptr; + params->stream = NULL; + params->sharedMem = 0; + params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1; + params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1; + return ncclSuccess; +} + +// Allocate/Set Intra Process Structures and set CG options +ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) { + comm->intraRank = rank; + comm->intraRanks = ranks; + comm->intraPhase = 0; + + // Alloc shared structures + if (rank == 0) { + assert(comm == comm0); + int* bar; + NCCLCHECK(ncclCalloc(&bar, 2)); + bar[0] = bar[1] = 0; + comm->intraBarrier = bar; + NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks)); + NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks)); + int* CGMode; + NCCLCHECK(ncclCalloc(&CGMode, 1)); + *CGMode = 0x11; + comm->intraCGMode = CGMode; + int* CC; + NCCLCHECK(ncclCalloc(&CC, 1)); + *CC = ncclCudaCompCap(); + comm->intraCC = CC; + } else { + comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier); + comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams); + comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs); + comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode); + comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC); + } + comm->intraCudaDevs[comm->intraRank] = comm->cudaDev; + NCCLCHECK(initParams(comm)); + + int cgMdLaunch = 0; + + // Set CG Mode + comm->launchMode = ncclComm::GROUP; + char* str = getenv("NCCL_LAUNCH_MODE"); + if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) { + comm->launchMode = ncclComm::PARALLEL; + } + if (comm->launchMode == ncclComm::GROUP) { + CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking)); +#if CUDART_VERSION >= 9000 + if (*comm->intraCC && (ncclCudaCompCap() == *comm->intraCC)) { + // Check whether the GPU supports Cooperative Group Multi Device Launch + (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev); + } +#endif + } + + // Disable cgMdLaunch if any rank does not support it + if (cgMdLaunch == 0) { + *comm->intraCGMode = 0x10; + } + return ncclSuccess; +} + +static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { + TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); + uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */ + struct ncclConnect connect; + struct ncclConnector* conn; + for (int i=0; i<nrecv; i++) { + int peer = peerRecv[i]; + if (peer == -1) continue; + conn = &channel->peers[peer].recv; + if (conn->connected) { ++nSkippedRecv; continue; } + memset(&connect, 0, sizeof(connect)); + NCCLCHECK(selectTransport<0>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + } + for (int i=0; i<nsend; i++) { + int peer = peerSend[i]; + if (peer == -1) continue; + conn = &channel->peers[peer].send; + if (conn->connected) { ++nSkippedSend; continue; } + memset(&connect, 0, sizeof(connect)); + NCCLCHECK(selectTransport<1>(comm->topo, graph, comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + } + for (int i=0; i<nsend; i++) { + int peer = peerSend[i]; + if (peer == -1) continue; + conn = &channel->peers[peer].send; + if (conn->connected) {++nSkippedSend; continue; } + memset(&connect, 0, sizeof(connect)); + NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + NCCLCHECK(conn->transportComm->connect(&connect, conn)); + conn->connected = 1; + } + for (int i=0; i<nrecv; i++) { + int peer = peerRecv[i]; + if (peer == -1) continue; + conn = &channel->peers[peer].recv; + if (conn->connected) {++nSkippedRecv; continue; } + memset(&connect, 0, sizeof(connect)); + NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + NCCLCHECK(conn->transportComm->connect(&connect, conn)); + conn->connected = 1; + } + TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv); + return ncclSuccess; +} + +NCCL_PARAM(CrossNic, "CROSS_NIC", 2); + +static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) { + // We use 3 AllGathers + // 1. { peerInfo, comm } + // 2. ConnectTransport[nranks], ConnectValue[nranks] + // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] } + + int rank = comm->rank; + int nranks = comm->nRanks; + uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES); + TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); + NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap)); + + // AllGather1 - begin + struct { + struct ncclPeerInfo peerInfo; + struct ncclComm* comm; + } *allGather1Data; + + NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); + allGather1Data[rank].comm = comm; + struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo; + NCCLCHECK(fillInfo(comm, myInfo, commHash)); + NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data))); + + NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks)); + for (int i = 0; i < nranks; i++) { + memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo)); + if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) { + WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId); + return ncclInvalidUsage; + } + } + // AllGather1 data is used again below + // AllGather1 - end + + // Topo detection / System graph creation + NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo)); + // Compute paths between GPUs and NICs + NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); + // Remove inaccessible GPUs and unused NICs + NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm)); + // Recompute paths after trimming + NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); + // Compute max speed to accelerate search + NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo)); + // Print final topology + NCCLCHECK(ncclTopoPrint(comm->topo)); + + // Get rings and trees + struct ncclTopoGraph treeGraph; + treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE; + treeGraph.crossNic = ncclParamCrossNic(); + // We communicate only half the data between node with trees on 2 nodes. + NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph)); + NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph)); + struct ncclTopoGraph ringGraph; + ringGraph.pattern = NCCL_TOPO_PATTERN_RING; + ringGraph.crossNic = ncclParamCrossNic(); + NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph)); + NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph)); + + // AllGather3 - begin + + struct { + int cudaCompCap; + int fullCudaCompCap; + int nvlink; + int nChannels; + struct { + int sameChannels; + int speedIntra; + int speedInter; + int nvlink; + } tree; + struct { + int sameChannels; + int speedIntra; + int speedInter; + int nvlink; + } ring; + struct ncclTopoRanks topoRanks; + } *allGather3Data; + + NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); + allGather3Data[rank].cudaCompCap = ncclCudaCompCap(); + allGather3Data[rank].nvlink = treeGraph.nvlink; + allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); + allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; + allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra; + allGather3Data[rank].tree.speedInter = treeGraph.speedInter; + allGather3Data[rank].tree.nvlink = treeGraph.nvlink; + allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels; + allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra; + allGather3Data[rank].ring.speedInter = ringGraph.speedInter; + allGather3Data[rank].ring.nvlink = ringGraph.nvlink; + + NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks)); + + NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data))); + + // Determine nNodes, firstRanks, ... + int* nodesFirstRank; + NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks)); + for (int i=0; i<nranks; i++) { + int node = -1; + int firstRank = allGather3Data[i].topoRanks.ringRecv[0]; + for (int n=0; n<comm->nNodes; n++) { + if (nodesFirstRank[n] == firstRank) node = n; + } + if (node == -1) { + node = comm->nNodes++; + nodesFirstRank[node] = firstRank; + } + if (i == comm->rank) comm->node = node; + } + + // Determine the minimum CUDA Compute capability of all GPUs + int myCompCap = allGather3Data[rank].cudaCompCap; + int minCompCap = myCompCap, maxCompCap = myCompCap; + for (int i = 0; i < nranks; i++) { + minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap); + maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap); + } + + comm->nvlink = 1; + for (int i = 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink; + + int nChannelsOrig = comm->nChannels; + struct ncclTopoRanks** allTopoRanks; + NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks)); + for (int i=0; i<nranks; i++) { + allTopoRanks[i] = &allGather3Data[i].topoRanks; + // Make sure we align all ranks so that the tuning is consistent across ranks + treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels); + treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); + treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra); + treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter); + treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink); + ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); + ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra); + ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter); + ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink); + } + + if (comm->nChannels < nChannelsOrig) { + // We started duplicating channels during Preset(), so we need to move the + // duplicated channels since we have removed some. + for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel)); + } + + int *rings; + NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); + + NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings)); + + free(allTopoRanks); + free(nodesFirstRank); + free(allGather3Data); + + // AllGather3 - end + + TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels); + + NCCLCHECK(ncclSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph)); + + char line[1024]; + line[0]='\0'; + for (int c=0; c<comm->nChannels; c++) { + struct ncclTree* treeUp = &comm->channels[c].treeUp; + struct ncclTree* treeDn = &comm->channels[c].treeDn; + snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d", + c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up, + treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]); + } + line[1023] = '\0'; + INFO(NCCL_INIT, "Trees%s", line); + + // Connect with prev/next for each ring + struct ncclConnect *connect; + NCCLCHECK(ncclCalloc(&connect, 2)); + for (int c=0; c<comm->nChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks)); + if (comm->nRanks == 1) continue; + NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next)); + NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up)); + NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down)); + } + TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); + free(connect); + free(rings); + + // Compute intra ranks (using AllGather1 data) + int intraRank0 = -1, intraRank = -1, intraRanks = 0; + for (int i = 0; i < nranks; i++) { + if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) && + (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) { + if (intraRanks == 0) intraRank0 = i; + if (i == rank) intraRank = intraRanks; + intraRanks++; + } + } + TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d", + rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0); + if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) { + WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d", + rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0); + return ncclInternalError; + } + NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm)); + + // Done with AllGather1 data + free(allGather1Data); + + if (comm->nNodes) NCCLCHECK(transportCreateProxy(comm)); + + TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); + return ncclSuccess; +} + +static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) { + CPU_ZERO_S(sizeof(cpu_set_t), mask); + char* cudaPath; + NCCLCHECK(ncclTopoCudaPath(cudaDev, &cudaPath)); + char path[PATH_MAX]; + strncpy(path, cudaPath, PATH_MAX-1); + snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus"); + path[PATH_MAX-1] = '\0'; + int fd; + SYSCHECKVAL(open(path, O_RDONLY), "open", fd); + char affinityStr[sizeof(cpu_set_t)*2 + 1]; + int r = read(fd, affinityStr, sizeof(cpu_set_t)*2); + if (r > 0) { + affinityStr[r] = '\0'; + NCCLCHECK(ncclStrToCpuset(affinityStr, mask)); + } + close(fd); + free(cudaPath); + return ncclSuccess; +} + +NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0); + +static ncclResult_t setCpuAffinity(int cudaDev) { + // Query the CPU affinity set we were provided + cpu_set_t mask; + SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity"); + +#ifdef ENABLE_TRACE + { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&mask, affinityStr)); + TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr); + } +#endif + + // Find the CPUs that are local to the supplied GPU + cpu_set_t gpuMask; + NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask)); + +#ifdef ENABLE_TRACE + { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr)); + TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr); + } +#endif + + cpu_set_t finalMask; + if (ncclParamIgnoreCpuAffinity()) + // Ignore the CPU affinity set and use the GPU one instead + finalMask = gpuMask; + else + // Use a subset of the GPU affinity set + CPU_AND(&finalMask, &mask, &gpuMask); + + // If there is a non empty set, use it to set affinity + if (CPU_COUNT(&finalMask)) { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr)); + INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr); + SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity"); + } + return ncclSuccess; +} + +ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) { + cpu_set_t affinitySave; + sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); + + NCCLCHECK(wrapNvmlSymbols()); + NCCLCHECK(wrapNvmlInit()); + + // Make sure all host memory allocation are close to the GPU + CUDACHECK(cudaSetDevice(cudaDev)); + NCCLCHECK(setCpuAffinity(cudaDev)); + ncclResult_t res; + + NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); + NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); + NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); + + sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); + NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup); + + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId); + + return ncclSuccess; +cleanup: + if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap); + *newcomm = NULL; + sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); + return res; +} + +static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) { + ncclResult_t res; + char* env = getenv("NCCL_COMM_ID"); + if (env && myrank == 0) { + NCCLCHECKGOTO(bootstrapCreateRoot(&commId, true), res, end); + } + + NCCLCHECKGOTO(ncclInit(), res, end); + if (myrank == 0) showVersion(); + + // Make sure the CUDA runtime is initialized. + CUDACHECKGOTO(cudaFree(NULL), res, end); + + NCCLCHECKGOTO(PtrCheck(newcomm, "CommInitRank", "newcomm"), res, end); + if (nranks < 1 || myrank < 0 || myrank >= nranks) { + WARN("Invalid rank requested : %d/%d", myrank, nranks); + res = ncclInvalidArgument; + goto end; + } + + if (ncclAsyncMode()) { + NCCLCHECKGOTO(ncclAsyncInit(ncclCommInitRankSync, newcomm, nranks, commId, myrank, cudaDev), res, end); + } else { + NCCLCHECKGOTO(ncclCommInitRankSync(newcomm, nranks, commId, myrank, cudaDev), res, end); + } +end: + if (ncclAsyncMode()) return ncclAsyncErrCheck(res); + else return res; +} + +NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); +ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, commId, myrank, cudaDev)); + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); +ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { + NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms")); + if (ndev < 0) { + WARN("Invalid device count requested : %d", ndev); + return ncclInvalidArgument; + } + + ncclUniqueId uniqueId; + NCCLCHECK(ncclGetUniqueId(&uniqueId)); + NCCLCHECK(ncclGroupStart()); + for (int i=0; i<ndev; i++) { + // Ignore return codes .. we need to call ncclGroupEnd to clean up anyway + ncclCommInitRankDev(comms+i, ndev, uniqueId, i, devlist ? devlist[i] : i); + } + NCCLCHECK(ncclGroupEnd()); + return ncclSuccess; +} + +static ncclResult_t commDestroy(ncclComm_t comm) { + int savedDevice; +#ifdef ENABLE_TRACE + int rank = comm->rank; +#endif + CUDACHECK(cudaGetDevice(&savedDevice)); + int commDevice = comm->cudaDev; + + if (savedDevice != commDevice) { + CUDACHECK(cudaSetDevice(commDevice)); + } + + TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError); + + CUDACHECK(cudaStreamSynchronize(comm->groupStream)); + NCCLCHECK(transportDestroyProxy(comm)); + NCCLCHECK(commFree(comm)); + + if (savedDevice != commDevice) + CUDACHECK(cudaSetDevice(savedDevice)); + + TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank); + + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); +ncclResult_t ncclCommDestroy(ncclComm_t comm) { + if (comm == NULL) + return ncclSuccess; + + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %x", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId); + + // Try and prevent a double free of the comm struct (user error) + if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) { + WARN("comm %p has already been destroyed", comm); + return ncclInvalidArgument; + } + + return commDestroy(comm); +} + +NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); +ncclResult_t ncclCommAbort(ncclComm_t comm) { + if (comm == NULL) + return ncclSuccess; + + // Ask anything that might still be running on the device to quit + *comm->abortFlag = 1; + + return commDestroy(comm); +} + +NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); +const char* ncclGetErrorString(ncclResult_t code) { + switch (code) { + case ncclSuccess : return "no error"; + case ncclUnhandledCudaError : return "unhandled cuda error"; + case ncclSystemError : return "unhandled system error"; + case ncclInternalError : return "internal error"; + case ncclInvalidArgument : return "invalid argument"; + case ncclInvalidUsage : return "invalid usage"; + default : return "unknown result code"; + } +} + +NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError); +ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { + NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm")); + NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError")); + + // Check device reported error + static ncclDevError_t printedDevErr = ncclDevSuccess; + switch(*comm->fatalDevError) { + case ncclDevSuccess : + break; + case ncclDevAssertedMismatch : + if (printedDevErr != ncclDevAssertedMismatch) { + WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank); + printedDevErr = ncclDevAssertedMismatch; + } + if (comm->fatalError == ncclSuccess) { + comm->fatalError = ncclInvalidUsage; + } + break; + case ncclDevSuspectedMismatch : + if (printedDevErr != ncclDevSuspectedMismatch) { + WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank); + printedDevErr = ncclDevSuspectedMismatch; + } + break; + default: + WARN("Unknown device error %d", *comm->fatalDevError); + return ncclInternalError; + } + *asyncError = comm->fatalError; + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); +ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { + NCCLCHECK(PtrCheck(comm, "CommCount", "comm")); + NCCLCHECK(PtrCheck(count, "CommCount", "count")); + *count = comm->nRanks; + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid); +ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { + NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm")); + NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid")); + *devid = comm->cudaDev; + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank); +ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { + NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm")); + NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank")); + *rank = comm->rank; + return ncclSuccess; +} diff --git a/src/init.cu b/src/init.cu deleted file mode 100644 index 75822e6..0000000 --- a/src/init.cu +++ /dev/null @@ -1,1240 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "nccl.h" -#include "core.h" -#include "channel.h" -#include "param.h" -#include "nvmlwrap.h" -#include "rings.h" -#include "trees.h" -#include "bootstrap.h" -#include "transport.h" -#include "group.h" -#include "utils.h" -#include "net.h" -#include "checks.h" -#include "enqueue.h" -#include "topo.h" -#include "nvlink.h" -#include "cpuset.h" -#include <stdio.h> -#include <stdlib.h> -#include <sys/mman.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <sched.h> -#include <fcntl.h> -#include <unistd.h> -#include <cuda_runtime.h> -#include <string.h> -#include <errno.h> -#include <assert.h> -#include <dlfcn.h> - -#define STR2(v) #v -#define STR(v) STR2(v) - -int ncclDebugLevel; -uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT -pthread_mutex_t ncclDebugOutputLock; -FILE *ncclDebugFile = stdout; - -#ifdef ENABLE_TRACE -std::chrono::high_resolution_clock::time_point ncclEpoch; -#endif - -#if CUDART_VERSION >= 9200 -#define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream -#else -#define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream -#endif - -NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); - -NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); - -ncclNet_t* ncclNet = NULL; - -// We define this as weak to let tests redefine their own -#pragma weak ncclNvlinkGpu -ncclResult_t ncclNvlinkGpu(int* nvlink) { - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev)); - *nvlink = getNvlinkGpu(busId, NULL); - return ncclSuccess; -} -// We define this as weak to let tests redefine their own -#pragma weak ncclCudaCompCap -int ncclCudaCompCap() { - int cudaDev; - if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0; - int ccMajor; - if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0; - return ccMajor; -} -int ncclCudaFullCompCap() { - int cudaDev; - if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0; - int ccMajor, ccMinor; - if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0; - if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0; - return ccMajor*10+ccMinor; -} - -// Returns ncclInternalError if anything fails, causing that network to be ignored. -ncclResult_t initNet(ncclNet_t* net) { - int ndev; - if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError; - if (net->devices(&ndev) != ncclSuccess) return ncclInternalError; - if (ndev <= 0) return ncclSystemError; - return ncclSuccess; -} - -ncclResult_t initNetPlugin(ncclNet_t** net) { - void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL); - if (netPluginLib == NULL) { - // dlopen does not guarantee to set errno, but dlerror only gives us a - // string, so checking errno doesn't hurt to try to provide a better - // error message - if (errno == ENOENT) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so)."); - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); - } - return ncclSuccess; - } - ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL)); - if (extNet == NULL) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol."); - goto cleanup; - } - if (initNet(extNet) == ncclSuccess) { - *net = extNet; - return ncclSuccess; - } -cleanup: - if (netPluginLib != NULL) dlclose(netPluginLib); - return ncclSuccess; -} - -ncclResult_t initNet() { - // Always initialize sockets as we use it for bootstrap - NCCLCHECK(initNet(&ncclNetSocket)); - - NCCLCHECK(initNetPlugin(&ncclNet)); - if (ncclNet != NULL) return ncclSuccess; - if (initNet(&ncclNetIb) == ncclSuccess) { - ncclNet = &ncclNetIb; - } else { - ncclNet = &ncclNetSocket; - } - return ncclSuccess; -} - -NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2); -NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2); -NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", -2); - -int ncclThreadThreshold(int minCompCap, int multiNode) { - int threshold = ncclParamThreadThreshold(); - if (threshold == -2) { // user has not set this env variable - threshold = (minCompCap <= 6) ? NCCL_THREAD_THRESHOLD_PREVOLTA : NCCL_THREAD_THRESHOLD; - // multiply by 2 if running on multiple nodes - if (multiNode) { - threshold *= 2; - } - } - return threshold; -} - -pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; -static bool initialized = false; -static ncclResult_t ncclInit() { - if (initialized) return ncclSuccess; - pthread_mutex_lock(&initLock); - if (!initialized) { - initEnv(); - initDebug(); - initNet(); - initialized = true; - } - pthread_mutex_unlock(&initLock); - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclGetVersion, int* version); -ncclResult_t ncclGetVersion(int* version) { - if (version == NULL) return ncclInvalidArgument; - *version = NCCL_VERSION_CODE; - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out); -ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { - NCCLCHECK(ncclInit()); - NCCLCHECK(PtrCheck(out, "GetUniqueId", "out")); - return bootstrapGetUniqueId(out); -} - -static ncclResult_t commFree(ncclComm_t comm) { - if (comm == NULL) - return ncclSuccess; - - free(comm->peerInfo); - - if (comm->bootstrap) - NCCLCHECK(bootstrapClose(comm->bootstrap)); - - CUDACHECK(cudaFree(comm->devComm)); - - for (int channel=0; channel<comm->nChannels; channel++) - NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks)); - - if (comm->doneEvent != NULL) - CUDACHECK(cudaEventDestroy(comm->doneEvent)); - - if (comm->launchMode == ncclComm::GROUP) { - CUDACHECK(cudaStreamDestroy(comm->groupStream)); - } - - // Last rank frees shared resources between threads - int isLast; - NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); - if (isLast) { - free(comm->intraBarrier); - free(comm->intraParams); - free(comm->intraCudaDevs); - free(comm->intraCGMode); - free(comm->intraCC); - } - CUDACHECK(cudaFreeHost((void *)comm->abortFlag)); - CUDACHECK(cudaFreeHost((void *)comm->fatalDevError)); - - free(comm); - return ncclSuccess; -} - -static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { - if (ndev < 1) { - WARN("invalid device count (%d) requested", ndev); - return ncclInvalidArgument; - } - if (rank >= ndev || rank < 0) { - WARN("rank %d exceeds ndev=%d", rank, ndev); - return ncclInvalidArgument; - } - - // Try to create a CUDA object right away. If there is something wrong with - // the device we're on (failure cause #1) , better know it early. - cudaEvent_t doneEvent; - CUDACHECK(cudaEventCreateWithFlags(&doneEvent, cudaEventDisableTiming)); - - struct ncclComm* comm; - NCCLCHECK(ncclCalloc(&comm, 1)); - - comm->rank = rank; - comm->nRanks = ndev; - cudaGetDevice(&comm->cudaDev); - getNvmlDevice(comm->cudaDev, &comm->nvmlDev); - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev); - - comm->doneEvent = doneEvent; - comm->llThreshold = ncclParamLlThreshold(); - comm->treeThreshold = ncclParamTreeThreshold(); - comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; -#if CUDART_VERSION >= 9200 - comm->groupCudaStream = ncclParamGroupCudaStream(); -#else - // Don't allow the user to overload the default setting in older CUDA builds - comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM; -#endif - comm->fatalError = ncclSuccess; - - CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped)); - *comm->fatalDevError = ncclDevSuccess; - - CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped)); - *comm->abortFlag = 0; - - comm->argsptr = &comm->args; - - *comret = comm; - return ncclSuccess; -} - -static ncclResult_t devCommSetup(ncclComm_t comm) { - // Fully duplicate the comm on the device - NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1)); - // Copy the comm on the device - NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1)); - // Copy userRanks - for (int r=0; r<comm->nChannels; r++) { - NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); - NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks)); - } - // Copy the device-accessible pointer to comm->abortFlag - void *devAbortFlag; - CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0)); - CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice)); - // Copy the device-accessible pointer to comm->fatalDevError - void *devFatalError; - CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0)); - CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice)); - return ncclSuccess; -} - -// Pre-process the string so that running "strings" on the lib can quickly reveal the version. -#define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR) -static void showVersion() { - static int shown = 0; - if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) { - printf("%s\n", VERSION_STRING); - fflush(stdout); - if (ncclDebugFile != stdout) - INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files - shown = 1; - } -} - -static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) { - info->rank = rank; - CUDACHECK(cudaGetDevice(&info->cudaDev)); - NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev)) - info->hostHash=getHostHash(); - info->pidHash=getPidHash(); - - // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the - // cudaDev is a CUDA runtime dev number which could be different from the - // NVML device number. Then we get the busID from NVML to be sure it is - // consistent with NVML remote PCI bus Ids. - CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev)); - nvmlDevice_t nvmlDevice; - NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice)); - nvmlPciInfo_t pciInfo; - NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo)); - strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE); - return ncclSuccess; -} - -template <int type> -static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) { - for (int t=0; t<NTRANSPORTS; t++) { - struct ncclTransport *transport = ncclTransports+t; - struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv; - ncclTvalue_t ret = 0; - NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo)); - if (ret > 0) { - connector->transportComm = transportComm; - NCCLCHECK(transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId)); - return ncclSuccess; - } - } - WARN("No transport found !"); - return ncclInternalError; -} - -static int log2(int n) { - int l = 0; - while (n>>=1) l++; - return l; -} - -static ncclResult_t ncclTreeThreshold(int nnodes, int nranks, int nChannels, ssize_t *treeThreshold) { - int nvlink; - NCCLCHECK(ncclNvlinkGpu(&nvlink)); - float ringbw = nvlink ? 5000*nChannels : 5000; // approx, in MB/s or B/us - float ringlatinter = 6; - float treelatintra = 4; - float treelatinter = 15; - float treebw; - if (!nvlink) { - treebw = ringbw * 2 / 3; - } else { - treebw = ringbw * 3 / 4; - if (nnodes == 2) treebw *= 2; - } - float ringlat = ringlatinter*(nranks-1); - float treelat = treelatinter*log2(nnodes)+treelatintra*(nranks/nnodes-1); - if (nnodes < 2 || ringlat <= treelat) - *treeThreshold = 0; - else if (treebw > ringbw) - *treeThreshold = 0x7fffffffffffffff; - else - *treeThreshold = (ssize_t)(((ringbw*treebw/(ringbw-treebw)))*(ringlat-treelat)); - return ncclSuccess; -} - -static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks, int* treeMasters) { - TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); - NCCLCHECK(initChannel(comm, channelId)); - - struct ncclChannel* channel = comm->channels+channelId; - struct ncclRing* ring = &channel->ring; - - // Reorganize ranks to start with rank. - int shift; - for (shift = 0; shift<nranks; shift++) { - if (ringRanks[shift] == rank) { - break; - } - } - for (int i=0; i<nranks; i++) { - ring->userRanks[i] = ringRanks[(i+shift)%nranks]; - } - int prev = ring->prev = ring->userRanks[nranks-1]; - int next = ring->next = ring->userRanks[1]; - - struct ncclTree* tree = &channel->tree; - tree->up = -1; - tree->down[0] = tree->down[1] = tree->down[2] = -1; - - // - // Find per-node masters and connect them via a binary tree - // - - int nMasters = 0; - for (int r=0; r<nranks; r++) nMasters += treeMasters[r]; - if (nMasters == 0) { - nMasters = 1; - treeMasters[0] = 1; - } - - if (comm->treeThreshold == -2) - NCCLCHECK(ncclTreeThreshold(nMasters, comm->nRanks, comm->nChannels, &comm->treeThreshold)); - - if (comm->treeThreshold > 0) { - // Compute tree depth. Not an exact value but a good approximation in most - // cases and consistent across nodes - tree->depth = nranks/nMasters + log2(nMasters); - - // Find my master : go backwards in the ring to find my root - int master = 0; - for (int i = 0; i<nranks; i++) { - int r = ring->userRanks[(nranks-i)%nranks]; - if (treeMasters[r]) { - master = r; - break; - } - } - - int ranks[nMasters]; - int i = 0, masterIndex = -1; - // Build binary tree - for (int r=0; r<nranks; r++) { - // Create index table - if (r == master) masterIndex = i; - if (treeMasters[r]) ranks[i++] = r; - } - int btreeUp, btreeDown0, btreeDown1; - int u0, d0_0, d0_1, u1, d1_0, d1_1; - NCCLCHECK(ncclGetDtree(nMasters, masterIndex, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1)); - if (channelId < DIVUP(comm->nChannels, 2)) { - btreeUp = u0; btreeDown0 = d0_0; btreeDown1 = d0_1; - } else { - btreeUp = u1; btreeDown0 = d1_0; btreeDown1 = d1_1; - } - - // - // Now build the full tree, combining the intra-node ring and the - // inter-node binary tree. - // - - if (rank == master) { - int nDown = 0; - if (btreeUp != -1) tree->up = ranks[btreeUp]; - if (treeMasters[next] == 0) tree->down[nDown++] = next; - if (btreeDown0 != -1) tree->down[nDown++] = ranks[btreeDown0]; - if (btreeDown1 != -1) tree->down[nDown++] = ranks[btreeDown1]; - } else { - tree->up = prev; - if (treeMasters[next] == 0) tree->down[0] = next; - } - } - - TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); - return ncclSuccess; -} - -static ncclResult_t fillConnect(struct ncclPeerInfo* peerInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) { - for (int r=0; r<nranks; r++) { - connectTransport[r] = -1; - for (int t=0; t<NTRANSPORTS; t++) { - NCCLCHECK(ncclTransports[t].canConnect(connectValue+r, peerInfo+rank, peerInfo+r)); - if (connectValue[r] > 0) { - connectTransport[r] = t; - break; - } - } - } - return ncclSuccess; -} - -#define MAXWIDTH 20 -#define PREFIXLEN 15 -#define STRLENGTH (PREFIXLEN+5*MAXWIDTH) -void dumpMatrix(int* connectMatrix, int nranks) { - char line[STRLENGTH+1]; - line[STRLENGTH] = '\0'; - memset(line, ' ', STRLENGTH); - for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", j); - INFO(NCCL_INIT,"%s", line); - for (int i=0; i<nranks; i++) { - memset(line, ' ', STRLENGTH); - sprintf(line, "%3d ", i); - for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+4*j, " %3d", connectMatrix[i*nranks+j]); - INFO(NCCL_INIT,"%s", line); - } -} - -void dumpMatrixTvalue(ncclTvalue_t* connectMatrix, int nranks) { - char line[STRLENGTH+1]; - line[STRLENGTH] = '\0'; - memset(line, ' ', STRLENGTH); - for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4d", j); - INFO(NCCL_INIT,"%s", line); - for (int i=0; i<nranks; i++) { - memset(line, ' ', STRLENGTH); - sprintf(line, "%3d ", i); - for (int j=0; j<nranks && j<MAXWIDTH; j++) sprintf(4+line+5*j, " %4o", (int)connectMatrix[i*nranks+j]); - INFO(NCCL_INIT,"%s", line); - } -} - - -void dumpLine(int* values, int nranks, const char* prefix) { - int prefixlen = strlen(prefix); - char line[STRLENGTH+1]; - line[STRLENGTH] = '\0'; - memset(line, ' ', STRLENGTH); - strncpy(line, prefix, PREFIXLEN); - for (int i=0; i<nranks && i<MAXWIDTH; i++) sprintf(line+prefixlen+4*i, " %3d", values[i]); - INFO(NCCL_INIT,"%s", line); -} - -static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { - for (int r=0; r<nrings; r++) { - char prefix[30]; - /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r); - dumpLine(prev+r*nranks, nranks, prefix); - sprintf(prefix, "[%d] Channel %d Next : ", rank, r); - dumpLine(next+r*nranks, nranks, prefix);*/ - - int current = rank; - for (int i=0; i<nranks; i++) { - rings[r*nranks+i] = current; - current = next[r*nranks+current]; - } - sprintf(prefix, "Channel %02d : ", r); - if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix); - if (current != rank) { - WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank); - return ncclInternalError; - } - // Check that all ranks are there - for (int i=0; i<nranks; i++) { - int found = 0; - for (int j=0; j<nranks; j++) { - if (rings[r*nranks+j] == i) { - found = 1; - break; - } - } - if (found == 0) { - WARN("Error : ring %d does not contain rank %d", r, i); - return ncclInternalError; - } - } - } - return ncclSuccess; -} - -void* waitForNonNullPtr(void* p) { - volatile void** ptr = (volatile void**) p; - while (*ptr == NULL) sched_yield(); - return (void*)*ptr; -} - -ncclResult_t initParams(struct ncclComm* comm) { - struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank; - params->args = &comm->argsptr; - params->stream = NULL; - params->sharedMem = 0; - params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1; - params->gridDim.x = 0; params->gridDim.y = params->gridDim.z = 1; - return ncclSuccess; -} - -// Allocate/Set Intra Process Structures and set CG options -ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct ncclComm* comm0) { - comm->intraRank = rank; - comm->intraRanks = ranks; - comm->intraPhase = 0; - - // Alloc shared structures - if (rank == 0) { - assert(comm == comm0); - int* bar; - NCCLCHECK(ncclCalloc(&bar, 2)); - bar[0] = bar[1] = 0; - comm->intraBarrier = bar; - NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks)); - NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks)); - int* CGMode; - NCCLCHECK(ncclCalloc(&CGMode, 1)); - *CGMode = 0x11; - comm->intraCGMode = CGMode; - int* CC; - NCCLCHECK(ncclCalloc(&CC, 1)); - *CC = ncclCudaFullCompCap(); - comm->intraCC = CC; - } else { - comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier); - comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams); - comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs); - comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode); - comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC); - } - comm->intraCudaDevs[comm->intraRank] = comm->cudaDev; - NCCLCHECK(initParams(comm)); - - int cgMdLaunch = 0; - - // Set CG Mode - comm->launchMode = ncclComm::GROUP; - char* str = getenv("NCCL_LAUNCH_MODE"); - if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) { - comm->launchMode = ncclComm::PARALLEL; - } - if (comm->launchMode == ncclComm::GROUP) { - CUDACHECK(cudaStreamCreateWithFlags(&comm->groupStream, cudaStreamNonBlocking)); -#if CUDART_VERSION >= 9000 - if (*comm->intraCC && (ncclCudaFullCompCap() == *comm->intraCC)) { - // Check whether the GPU supports Cooperative Group Multi Device Launch - (void) cudaDeviceGetAttribute(&cgMdLaunch, cudaDevAttrCooperativeMultiDeviceLaunch, comm->cudaDev); - } -#endif - } - - // Disable cgMdLaunch if any rank does not support it - if (cgMdLaunch == 0) { - *comm->intraCGMode = 0x10; - } - return ncclSuccess; -} - -static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { - TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); - uint32_t nSkippedSend = 0, nSkippedRecv = 0; /* for tracing */ - struct ncclConnect connect; - struct ncclConnector* conn; - for (int i=0; i<nrecv; i++) { - int peer = peerRecv[i]; - if (peer == -1) continue; - conn = &channel->peers[peer].recv; - if (conn->connected) { ++nSkippedRecv; continue; } - NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); - NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); - } - for (int i=0; i<nsend; i++) { - int peer = peerSend[i]; - if (peer == -1) continue; - conn = &channel->peers[peer].send; - if (conn->connected) { ++nSkippedSend; continue; } - NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); - NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); - } - for (int i=0; i<nsend; i++) { - int peer = peerSend[i]; - if (peer == -1) continue; - conn = &channel->peers[peer].send; - if (conn->connected) {++nSkippedSend; continue; } - NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); - NCCLCHECK(conn->transportComm->connect(&connect, conn)); - conn->connected = 1; - } - for (int i=0; i<nrecv; i++) { - int peer = peerRecv[i]; - if (peer == -1) continue; - conn = &channel->peers[peer].recv; - if (conn->connected) {++nSkippedRecv; continue; } - NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); - NCCLCHECK(conn->transportComm->connect(&connect, conn)); - conn->connected = 1; - } - TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv); - return ncclSuccess; -} - -static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) { - // We use 3 AllGathers - // 1. { peerInfo, comm } - // 2. ConnectTransport[nranks], ConnectValue[nranks] - // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] } - - int rank = comm->rank; - int nranks = comm->nRanks; - TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks); - NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap)); - - // AllGather1 - begin - struct { - struct ncclPeerInfo peerInfo; - struct ncclComm* comm; - } *allGather1Data; - - NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); - allGather1Data[rank].comm = comm; - NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank)); - NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data))); - - NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks)); - for (int i = 0; i < nranks; i++) { - memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo)); - } - // AllGather1 data is used again below - // AllGather1 - end - - // AllGather2 - begin - size_t allGather2DataRowSize = sizeof(int)*nranks + sizeof(ncclTvalue_t)*nranks; - void *allGather2Data; - NCCLCHECK(ncclCalloc((char **)&allGather2Data, allGather2DataRowSize*nranks)); - int *myTransportRow = (int *)((char *)allGather2Data + allGather2DataRowSize*rank); - ncclTvalue_t *myValueRow = (ncclTvalue_t *)(myTransportRow + nranks); - - NCCLCHECK(fillConnect(comm->peerInfo, nranks, rank, myTransportRow, myValueRow)); - NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather2Data, allGather2DataRowSize)); - - int* connectTransport; - ncclTvalue_t* connectValue; - NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks)); - NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks)); - for (int i = 0; i < nranks; i++) { - memcpy(connectTransport + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize, sizeof(int)*nranks); - memcpy(connectValue + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize + nranks*sizeof(int), sizeof(ncclTvalue_t)*nranks); - } - free(allGather2Data); - // AllGather2 - end - - //if (rank == 0) dumpMatrix(connectTransport, nranks); - //if (rank == 0) dumpMatrixTvalue(connectValue, nranks); - - // Get my rings - int nrings; - int* prev, *next, *treeIn, *treeOut; - NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS)); - comm->nThreads = getDefaultThreads(); - NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut)); - TRACE(NCCL_INIT, "rank %d nranks %d - BUILD %d RINGS", rank, nranks, nrings); - assert(nrings <= MAXCHANNELS); - free(connectTransport); - free(connectValue); - - // AllGather3 - begin - struct { - int nThreads; - int nrings; - int cudaCompCap; - int prev[MAXCHANNELS]; - int next[MAXCHANNELS]; - } *allGather3Data; - - NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); - allGather3Data[rank].nThreads = comm->nThreads; - allGather3Data[rank].nrings = nrings; - allGather3Data[rank].cudaCompCap = ncclCudaCompCap(); - for (int r=0; r<nrings; r++) { - allGather3Data[rank].prev[r] = *(prev+r*nranks+rank); - allGather3Data[rank].next[r] = *(next+r*nranks+rank); - } - NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data))); - - // Find max nThreads - for (int i=0; i<nranks; i++) - comm->nThreads = std::max(allGather3Data[i].nThreads, comm->nThreads); - - // Determine the minimum CUDA Compute capability of all GPUs - int myCompCap = allGather3Data[rank].cudaCompCap; - int minCompCap = myCompCap; - for (int i = 0; i < nranks; i++) - minCompCap = std::min(allGather3Data[i].cudaCompCap, minCompCap); - - // Determine thread threshold across all GPUs - int nnodes = 0; - for (int r=0; r<nranks; r++) nnodes += treeIn[r]; - comm->threadThreshold = ncclThreadThreshold(minCompCap, nnodes); - - // Find min nrings across ranks - for (int i=0; i<nranks; i++) - nrings = std::min(allGather3Data[i].nrings, nrings); - comm->nChannels = nrings; - - // Unpack the per ring prev/next arrays - for (int i = 0; i < nranks; i++) { - for (int r = 0; r < nrings; r++) { - prev[r*nranks+i] = allGather3Data[i].prev[r]; - next[r*nranks+i] = allGather3Data[i].next[r]; - } - } - free(allGather3Data); - // AllGather3 - end - - int *rings; - NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); - NCCLCHECK(buildRings(nrings, rings, rank, nranks, prev, next)); - free(prev); - free(next); - TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d RINGS", rank, nranks, nrings); - - // Connect with prev/next for each ring - struct ncclConnect *connect; - NCCLCHECK(ncclCalloc(&connect, 2)); - for (int r=0; r<nrings; r++) { - struct ncclChannel* channel = comm->channels+r; - NCCLCHECK(setupChannel(comm, r, rank, nranks, rings+r*nranks, treeIn+r*nranks)); - NCCLCHECK(p2pSetup(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next)); - NCCLCHECK(p2pSetup(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up)); - NCCLCHECK(p2pSetup(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down)); - } - if (comm->treeThreshold > 0) { - char line[1024]; - line[0]='\0'; - for (int c=0; c<nrings; c++) { - struct ncclTree* tree = &comm->channels[c].tree; - snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d/%d/%d", - c, tree->up, rank, tree->down[0], tree->down[1], tree->down[2]); - } - line[1023] = '\0'; - INFO(NCCL_INIT, "Trees%s", line); - } - if (rank == 0) { - char treeline[64]; - snprintf(treeline, 64, "enabled up to size %ld", comm->treeThreshold); - INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees %s", comm->nThreads, minCompCap, - comm->treeThreshold == 0 ? "disabled" : - comm->treeThreshold == 0x7fffffffffffffff ? "enabled for all sizes" : - treeline); - } - - TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, nrings); - free(connect); - free(rings); - free(treeIn); - free(treeOut); - - // Compute intra ranks (using AllGather1 data) - int intraRank0 = -1, intraRank = -1, intraRanks = 0; - for (int i = 0; i < nranks; i++) { - if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) && - (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) { - if (intraRanks == 0) intraRank0 = i; - if (i == rank) intraRank = intraRanks; - intraRanks++; - } - } - TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d", - rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0); - if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) { - WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d", - rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0); - return ncclInternalError; - } - NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm)); - - // Done with AllGather1 data - free(allGather1Data); - - if (nnodes) NCCLCHECK(transportCreateProxy(comm)); - - TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); - return ncclSuccess; -} - -static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) { - CPU_ZERO_S(sizeof(cpu_set_t), mask); - char* cudaPath; - NCCLCHECK(getCudaPath(cudaDev, &cudaPath)); - char path[PATH_MAX]; - strncpy(path, cudaPath, PATH_MAX-1); - snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus"); - path[PATH_MAX-1] = '\0'; - int fd; - SYSCHECKVAL(open(path, O_RDONLY), "open", fd); - char affinityStr[sizeof(cpu_set_t)*2]; - int r = read(fd, affinityStr, sizeof(cpu_set_t)*2); - if (r > 0) - NCCLCHECK(ncclStrToCpuset(affinityStr, mask)); - close(fd); - free(cudaPath); - return ncclSuccess; -} - -static ncclResult_t setCpuAffinity(int cudaDev) { - // Work within the enveloppe we were provided - cpu_set_t mask; - SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity"); - - // Find the subpart that is local to our GPU - cpu_set_t gpuMask; - NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask)); - cpu_set_t finalMask; - CPU_AND(&finalMask, &mask, &gpuMask); - - // If those are not disjoint, try to stay local - if (CPU_COUNT(&finalMask)) { - char affinityStr[sizeof(cpu_set_t)*2]; - NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr)); - INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr); - SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity"); - } - return ncclSuccess; -} - -ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { - cpu_set_t affinitySave; - sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); - - NCCLCHECK(wrapNvmlSymbols()); - NCCLCHECK(wrapNvmlInit()); - - // Make sure all host memory allocation are close to the GPU - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - NCCLCHECK(setCpuAffinity(cudaDev)); - ncclResult_t res; - - NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); - NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); - NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); - - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup); - - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->nvmlDev); - - return ncclSuccess; -cleanup: - *newcomm = NULL; - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - return res; -} - -NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); -ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { - char* env = getenv("NCCL_COMM_ID"); - if (env && myrank == 0) { - NCCLCHECK(bootstrapCreateRoot(&commId, true)); - } - - NCCLCHECK(ncclInit()); - if (myrank == 0) showVersion(); - - // Make sure the CUDA runtime is initialized. - CUDACHECK(cudaFree(NULL)); - - NCCLCHECK(PtrCheck(newcomm, "CommInitRank", "newcomm")); - if (nranks < 1 || myrank < 0 || myrank >= nranks) { - WARN("Invalid rank requested : %d/%d", myrank, nranks); - return ncclInvalidArgument; - } - - if (ncclAsyncMode()) { - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - return ncclAsyncInit(ncclCommInitRankSync, cudaDev, newcomm, nranks, commId, myrank); - } else { - return ncclCommInitRankSync(newcomm, nranks, commId, myrank); - } -} - -static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) { - struct ncclPeerInfo* allInfo; - NCCLCHECK(ncclCalloc(&allInfo, nranks)); - for (int rank=0; rank<nranks; rank++) { - CUDACHECK(cudaSetDevice(devs[rank])); - NCCLCHECK(fillInfo(allInfo+rank, rank)); - } - - int* connectTransport; - ncclTvalue_t* connectValue; - NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks)); - NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks)); - for (int rank=0; rank<nranks; rank++) - NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank)); - - int* prev, *prevFinal, *next, *nextFinal, *treeIn, *treeOut; - NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&prevFinal, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&nextFinal, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS)); - int nrings = MAXCHANNELS; - int nthreads=0; - int myCompCap = ncclCudaCompCap(); - int minCompCap = myCompCap; - for (int rank=0; rank<nranks; rank++) { - CUDACHECK(cudaSetDevice(devs[rank])); - int nringsRank; - int nthreadsRank = getDefaultThreads(); - myCompCap = ncclCudaCompCap(); - NCCLCHECK(ncclGetRings(&nringsRank, &nthreadsRank, rank, nranks, connectTransport, connectValue, prev, next, treeIn, treeOut)); - nrings = std::min(nrings, nringsRank); - nthreads = std::max(nthreads, nthreadsRank); - minCompCap = std::min(minCompCap, myCompCap); - for (int ring=0; ring<nrings; ring++) { - int index = ring*nranks+rank; - prevFinal[index] = prev[index]; - nextFinal[index] = next[index]; - } - } - free(connectTransport); - free(connectValue); - free(prev); - free(next); - - INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees disabled", nthreads, minCompCap); - - int* rings; - NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); - NCCLCHECK(buildRings(nrings, rings, 0, nranks, prevFinal, nextFinal)); - free(prevFinal); - free(nextFinal); - - // Determine thread threshold across all GPUs - int threadThreshold = ncclThreadThreshold(minCompCap, 0); - - for (int rank=0; rank<nranks; rank++) { - comms[rank]->nChannels = nrings; - comms[rank]->nThreads = nthreads; - comms[rank]->threadThreshold = threadThreshold; - } - - for (int r=0; r<nrings; r++) { - struct ncclConnect connect[2*nranks]; - int* ringRanks = rings+r*nranks; - for (int rank=0; rank<nranks; rank++) { - CUDACHECK(cudaSetDevice(devs[rank])); - struct ncclChannel* channel = comms[rank]->channels+r; - struct ncclRing *ring = &channel->ring; - NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn)); - // Make sure we don't use trees, we cannot use them with initAll - comms[rank]->treeThreshold = 0; - int prev = channel->ring.prev = ring->userRanks[nranks-1]; - int next = channel->ring.next = ring->userRanks[1]; - struct ncclConnector* recv = &channel->peers[prev].recv; - struct ncclConnector* send = &channel->peers[next].send; - NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+rank*2+0, recv, channel->buffSize, channel->id)); - NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id)); - } - for (int rank=0; rank<nranks; rank++) { - CUDACHECK(cudaSetDevice(devs[rank])); - struct ncclChannel* channel = comms[rank]->channels+r; - struct ncclRing *ring = &channel->ring; - struct ncclConnector* recv = &channel->peers[ring->prev].recv; - struct ncclConnector* send = &channel->peers[ring->next].send; - NCCLCHECK(recv->transportComm->connect(connect+ring->prev*2+1, recv)); - NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send)); - } - } - free(allInfo); - free(rings); - free(treeIn); - free(treeOut); - return ncclSuccess; -} - - -NCCL_API(ncclResult_t, ncclCommInitAll, ncclComm_t* comms, int ndev, const int* devlist); -ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { - NCCLCHECK(ncclInit()); - NCCLCHECK(wrapNvmlSymbols()); - NCCLCHECK(wrapNvmlInit()); - showVersion(); - - INFO(NCCL_INIT,"nranks %d", ndev); - - NCCLCHECK(PtrCheck(comms, "CommInitAll", "comms")); - if (ndev < 1) { - WARN("Invalid device count requested : %d", ndev); - return ncclInvalidArgument; - } - - ncclResult_t res; - int savedDevice; - int rank, cudaDev; - ncclComm_t comm = NULL; - int ncclDevList[ndev]; - for (int i=0; i<ndev; i++) { - ncclDevList[i] = devlist ? devlist[i] : i; - } - - cudaGetDevice(&savedDevice); - - for(rank=0; rank<ndev; ++rank) - comms[rank] = NULL; - - cpu_set_t affinitySave; - sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); - - for (rank=0; rank<ndev; ++rank) { - cudaDev = ncclDevList[rank]; - CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup); - - NCCLCHECK(setCpuAffinity(cudaDev)); - - NCCLCHECKGOTO(commAlloc(&comm, ndev, rank), res, cleanup); - comms[rank] = comm; - - NCCLCHECKGOTO(ncclCommSetIntra(comm, rank, ndev, comms[0]), res, cleanup); - } - - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - - NCCLCHECKGOTO(initTransportsAll(comms, ncclDevList, ndev), res, cleanup); - - for(rank=0; rank<ndev; ++rank) { - cudaDev = ncclDevList[rank]; - CUDACHECKGOTO(cudaSetDevice(cudaDev), res, cleanup); - NCCLCHECKGOTO(devCommSetup(comms[rank]), res, cleanup); - } - - res = ncclSuccess; - goto final; - -cleanup: - for(rank=0; rank<ndev; ++rank) { - if(comms[rank] != NULL) { - commFree(comms[rank]); - } - } - -final: - if(wrapNvmlShutdown() != ncclSuccess) - INFO(NCCL_INIT,"NCCL did not shutdown nvml properly"); - cudaSetDevice(savedDevice); - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - return res; -} - - -static ncclResult_t commDestroy(ncclComm_t comm) { - int savedDevice; - CUDACHECK(cudaGetDevice(&savedDevice)); - int commDevice = comm->cudaDev; - int rank = comm->rank; - - if (savedDevice != commDevice) { - CUDACHECK(cudaSetDevice(commDevice)); - } - - TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError); - - CUDACHECK(cudaStreamSynchronize(comm->groupStream)); - NCCLCHECK(transportDestroyProxy(comm)); - NCCLCHECK(commFree(comm)); - - if (savedDevice != commDevice) - CUDACHECK(cudaSetDevice(savedDevice)); - - INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank); - - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); -ncclResult_t ncclCommDestroy(ncclComm_t comm) { - if (comm == NULL) - return ncclSuccess; - - return commDestroy(comm); -} - -NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); -ncclResult_t ncclCommAbort(ncclComm_t comm) { - if (comm == NULL) - return ncclSuccess; - - // Ask anything that might still be running on the device to quit - *comm->abortFlag = 1; - - return commDestroy(comm); -} - -NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); -const char* ncclGetErrorString(ncclResult_t code) { - switch (code) { - case ncclSuccess : return "no error"; - case ncclUnhandledCudaError : return "unhandled cuda error"; - case ncclSystemError : return "unhandled system error"; - case ncclInternalError : return "internal error"; - case ncclInvalidArgument : return "invalid argument"; - case ncclInvalidUsage : return "invalid usage"; - default : return "unknown result code"; - } -} - -NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError); -ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { - NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm")); - NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError")); - - // Check device reported error - static ncclDevError_t printedDevErr = ncclDevSuccess; - switch(*comm->fatalDevError) { - case ncclDevSuccess : - break; - case ncclDevAssertedMismatch : - if (printedDevErr != ncclDevAssertedMismatch) { - WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank); - printedDevErr = ncclDevAssertedMismatch; - } - if (comm->fatalError == ncclSuccess) { - comm->fatalError = ncclInvalidUsage; - } - break; - case ncclDevSuspectedMismatch : - if (printedDevErr != ncclDevSuspectedMismatch) { - WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank); - printedDevErr = ncclDevSuspectedMismatch; - } - break; - default: - WARN("Unknown device error %d", *comm->fatalDevError); - return ncclInternalError; - } - *asyncError = comm->fatalError; - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); -ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { - NCCLCHECK(PtrCheck(comm, "CommCount", "comm")); - NCCLCHECK(PtrCheck(count, "CommCount", "count")); - *count = comm->nRanks; - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid); -ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { - NCCLCHECK(PtrCheck(comm, "CommCuDevice", "comm")); - NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid")); - *devid = comm->cudaDev; - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank); -ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { - NCCLCHECK(PtrCheck(comm, "CommUserRank", "comm")); - NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank")); - *rank = comm->rank; - return ncclSuccess; -} diff --git a/src/misc/checks.cu b/src/misc/argcheck.cc index a07e577..67931f8 100644 --- a/src/misc/checks.cu +++ b/src/misc/argcheck.cc @@ -1,10 +1,11 @@ /************************************************************************* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "checks.h" +#include "argcheck.h" +#include "comm.h" static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { cudaPointerAttributes attr; diff --git a/src/misc/ibvwrap.cu b/src/misc/ibvwrap.cc index 7ac3431..f47c141 100644 --- a/src/misc/ibvwrap.cu +++ b/src/misc/ibvwrap.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/misc/nvmlwrap.cu b/src/misc/nvmlwrap.cc index 635f332..34ed0aa 100644 --- a/src/misc/nvmlwrap.cu +++ b/src/misc/nvmlwrap.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -16,6 +16,7 @@ static nvmlReturn_t (*nvmlInternalInit)(void); static nvmlReturn_t (*nvmlInternalShutdown)(void); static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device); static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index); +static nvmlReturn_t (*nvmlInternalDeviceGetHandleByIndex)(unsigned int index, nvmlDevice_t* device); static const char* (*nvmlInternalErrorString)(nvmlReturn_t r); static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci); @@ -23,7 +24,10 @@ static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t dev static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber); +static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor); +// Used to make the NVML library calls thread safe +pthread_mutex_t nvmlLock = PTHREAD_MUTEX_INITIALIZER; ncclResult_t wrapNvmlSymbols(void) { if (nvmlState == nvmlInitialized) @@ -70,12 +74,14 @@ ncclResult_t wrapNvmlSymbols(void) { LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown); LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId); LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex); + LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByIndex", nvmlInternalDeviceGetHandleByIndex); LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString); LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo); LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber); LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState); LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo); LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability); + LOAD_SYM(nvmlhandle, "nvmlDeviceGetCudaComputeCapability", nvmlInternalDeviceGetCudaComputeCapability); nvmlState = nvmlInitialized; return ncclSuccess; @@ -85,6 +91,7 @@ teardown: nvmlInternalShutdown = NULL; nvmlInternalDeviceGetHandleByPciBusId = NULL; nvmlInternalDeviceGetIndex = NULL; + nvmlInternalDeviceGetHandleByIndex = NULL; nvmlInternalDeviceGetPciInfo = NULL; nvmlInternalDeviceGetMinorNumber = NULL; nvmlInternalDeviceGetNvLinkState = NULL; @@ -130,7 +137,8 @@ ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_ WARN("lib wrapper not initialized."); return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device), ret); if (ret != NVML_SUCCESS) { WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ", nvmlInternalErrorString(ret)); @@ -144,7 +152,8 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { WARN("lib wrapper not initialized."); return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetIndex(device, index); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetIndex(device, index), ret); if (ret != NVML_SUCCESS) { WARN("nvmlDeviceGetIndex() failed: %s ", nvmlInternalErrorString(ret)); @@ -153,12 +162,28 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { return ncclSuccess; } +ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) { + if (nvmlInternalDeviceGetHandleByIndex == NULL) { + WARN("lib wrapper not initialized."); + return ncclInternalError; + } + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetHandleByIndex(index, device), ret); + if (ret != NVML_SUCCESS) { + WARN("nvmlDeviceGetHandleByIndex() failed: %s ", + nvmlInternalErrorString(ret)); + return ncclSystemError; + } + return ncclSuccess; +} + ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) { if (nvmlInternalDeviceGetPciInfo == NULL) { WARN("lib wrapper not initialized."); return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetPciInfo(device, pci); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetPciInfo(device, pci), ret); if (ret != NVML_SUCCESS) { WARN("nvmlDeviceGetPciInfo() failed: %s ", nvmlInternalErrorString(ret)); @@ -172,7 +197,8 @@ ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* min WARN("lib wrapper not initialized."); return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetMinorNumber(device, minorNumber); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetMinorNumber(device, minorNumber), ret); if (ret != NVML_SUCCESS) { WARN("nvmlDeviceGetMinorNumber() failed: %s ", nvmlInternalErrorString(ret)); @@ -186,7 +212,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link /* Do not warn, this symbol is optional. */ return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkState(device, link, isActive), ret); if (ret != NVML_SUCCESS) { if (ret != NVML_ERROR_NOT_SUPPORTED) INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ", @@ -201,7 +228,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned /* Do not warn, this symbol is optional. */ return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci), ret); if (ret != NVML_SUCCESS) { if (ret != NVML_ERROR_NOT_SUPPORTED) INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ", @@ -217,7 +245,8 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int /* Do not warn, this symbol is optional. */ return ncclInternalError; } - nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult); + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult), ret); if (ret != NVML_SUCCESS) { if (ret != NVML_ERROR_NOT_SUPPORTED) INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ", @@ -226,4 +255,19 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int } return ncclSuccess; } + +ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { + if (nvmlInternalDeviceGetNvLinkCapability == NULL) { + WARN("lib wrapper not initialized."); + return ncclInternalError; + } + nvmlReturn_t ret; + NVMLLOCKCALL(nvmlInternalDeviceGetCudaComputeCapability(device, major, minor), ret); + if (ret != NVML_SUCCESS) { + WARN("nvmlDeviceGetCudaComputeCapability() failed: %s ", + nvmlInternalErrorString(ret)); + return ncclSystemError; + } + return ncclSuccess; +} #endif diff --git a/src/misc/rings.cu b/src/misc/rings.cu deleted file mode 100644 index a7b122c..0000000 --- a/src/misc/rings.cu +++ /dev/null @@ -1,385 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "param.h" - -#define NCCL_MAX_SCORE 7 - -/* Parse user defined rings. Format is like : - * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0" - * Rings with a non-matching number of ranks are ignored so we can provide - * rings for multiple cases. - */ -#define MAX_ENV_RANKS 512 -static ncclResult_t parseRings(const char* str, int* nringsRet, int nranks, int* prev, int* next) { - int ranks[MAX_ENV_RANKS]; - int nrings = 0; - int rank = 0; - int offset = 0; - int status = 0; // 0 : between numbers, 1 : inside number - do { - int digit = str[offset] - '0'; - if (digit >= 0 && digit <= 9) { - if (status == 0) { - ranks[rank] = digit; - status = 1; - } else { - ranks[rank] = ranks[rank]*10+digit; - } - } else { - if (status == 1) { - rank++; - if (rank == MAX_ENV_RANKS) goto end; - } - status = 0; - if (str[offset] == '|' || str[offset] == '\0') { - int prevRank = ranks[rank-1]; - // Ignore rings if nranks doesn't match - if (rank != nranks) goto newring; - - for (int r=0; r<nranks; r++) { - int rank = ranks[r]; - // Ignore rings with ranks out of bounds - if (rank < 0 || rank >= nranks) goto newring; - // Ignore rings with duplicate ranks - for (int i=0; i<r; i++) - if (ranks[i] == rank) goto newring; - - next[nrings*nranks+prevRank] = rank; - prev[nrings*nranks+rank] = prevRank; - prevRank = rank; - } - nrings++; -newring: - rank = 0; - } - } - } while (str[offset++] != 0); -end: - *nringsRet = nrings; - return ncclSuccess; -} - -/* - * Ring creation algorithm - * - * First, we establish hierarchical coordinates depending on the way ranks can - * communicate. After fillCoords, we have for each rank a unique 3-int array - * { node, pci_domain, rank } corresponding to the three transports : - * { 2[NET], 1[SHM], 0[P2P] }. - * Also, we renumber ranks (to indexes) based on their growing coordinates. - * - * Then, we ask transports to connect groups together. We start with net, then - * shm, then p2p. We maintain two arrays, prev and next, where values are equal - * to -1 when ranks are not yet connected, and a rank otherwise. We never - * connect ranks outside our group, meaning that on 4 nodes of 2 sockets of 4 - * ranks, if we are rank 13, we should see something like (provided we have a - * single net interface, hence a single ring) : - * - * Connecting all nodes <13> - * 2[NET] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 -1 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1 - * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 -1 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0 - * - * Connecting P2P domains with shared memory <13> - * 1[SHM] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1 - * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0 - * - * Connecting ranks (only inside the P2P domain) <13> - * 0[P2P] : prev 31 -1 -1 -1 -1 -1 -1 -1 7 -1 -1 -1 11 12 13 14 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1 - * next -1 -1 -1 -1 -1 -1 -1 8 -1 -1 -1 12 13 14 15 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1 0 - * - * Hence, when we ask a transport to connect groups, we provide it with a subview of the ranks (except for net - * which always sees the full world). That way, P2P can bruteforce all combinations inside the node without - * risking to explode in terms of combinations, and we scale better. - * - * Finally, we loop over Network scores to try to create rings with high scores (=locality) and decrease until - * we get at least one ring. - */ - -static void recIsConnected(int rank, int* connected, int nranks, int* matrix, int transport) { - connected[rank] = 1; - for (int r=0; r<nranks; r++) { - if (connected[r] == 0 && matrix[rank*nranks+r] == transport) { - recIsConnected(r, connected, nranks, matrix, transport); - } - } -} - -static void isConnected(int rank, int* connected, int nranks, int* matrix, int transport) { - for (int r=0; r<nranks; r++) connected[r] = 0; - recIsConnected(rank, connected, nranks, matrix, transport); -} - -#define NEW_IDX(rank) do { \ - rankToIdx[rank] = idx; \ - idxToRank[idx] = rank; \ - for (int t=0; t<NTRANSPORTS; t++) coords[rank*NTRANSPORTS+t] = current[t]; \ - idx++; \ -} while (0) - -int findConnected(int rank, int* matrix, int nranks, int transport, int* coords) { - for (int r=0; r<nranks; r++) { - if (coords[r*NTRANSPORTS] == -1 && matrix[rank*nranks+r] == transport) return r; - } - return -1; -} - -static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankToIdx, int* idxToRank) { - int current[NTRANSPORTS]; - int* p2pConnected; - NCCLCHECK(ncclCalloc(&p2pConnected, nranks)); - for (int i=0; i<NTRANSPORTS; i++) current[i] = 0; - int curRank = 0, idx = 0; - while (1) { - // P2P is handled separately as there is no level below it and we need to - // cover the case of being connected to another GPU indirectly. - // So we detect all GPUs in the same P2P domain once and add them all at - // once. - isConnected(curRank, p2pConnected, nranks, matrix, 0); - for (int r=0; r<nranks; r++) { - if (p2pConnected[r]) { - NEW_IDX(r); - curRank = r; - current[0]++; - } - } - current[0] = 0; - - if (idx == nranks) { - free(p2pConnected); - return ncclSuccess; - } - - // Find next group, either connected through SHM or NET. - int rank; - int transport = 1; - while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) { - current[transport] = 0; - transport++; - if (transport == NTRANSPORTS) { - WARN("Error : Could not find transport to connect next group\n"); - free(p2pConnected); - return ncclInternalError; } - } - curRank = rank; - current[transport]++; - } -} - -NCCL_PARAM(MinNrings, "MIN_NRINGS", 0); -NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0); - -/* Users can force the number of threads with an environment variable */ -NCCL_PARAM(Nthreads, "NTHREADS", -2); -ncclResult_t getEnvThreads(int* nthreads) { - int64_t nt = ncclParamNthreads(); - if (nt != -2) - *nthreads = nt; - return ncclSuccess; -} - -static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) { - if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS; - for (int r=nrings; r<newNrings; r++) { - for (int i=0; i<nranks; i++) { - a[r*nranks+i] = a[(r-nrings)*nranks+i]; - b[r*nranks+i] = b[(r-nrings)*nranks+i]; - c[r*nranks+i] = c[(r-nrings)*nranks+i]; - d[r*nranks+i] = d[(r-nrings)*nranks+i]; - } - } - return newNrings; -} -/* Main ring creation function */ -ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) { - *nrings = 0; - - if (nranks == 1) return ncclSuccess; - - char* str = getenv("NCCL_RINGS"); - if (str && strlen(str)>0) { - int ret = parseRings(str, nrings, nranks, prev, next); - if (ret == ncclSuccess && *nrings > 0) { - if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings); - NCCLCHECK(getEnvThreads(nthreads)); - for (int r = 0; r<*nrings; r++) { - for (int i = 0; i<nranks; i++) { - if (transports[i*nranks+prev[i]] == 2) treeIn[i] = 1; - if (transports[i*nranks+next[i]] == 2) treeOut[i] = 1; - } - } - return ncclSuccess; - } - if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring"); - *nrings = 0; - } - - // Compute hierarchical topology groups, indexes, and rank<->index tables - int* coords, *globalIdxToRank, *globalRankToIdx; - NCCLCHECK(ncclCalloc(&coords, nranks*NTRANSPORTS)); - for (int i=0; i<nranks*NTRANSPORTS; i++) coords[i] = -1; - NCCLCHECK(ncclCalloc(&globalIdxToRank, nranks)); - NCCLCHECK(ncclCalloc(&globalRankToIdx, nranks)); - - NCCLCHECK(fillCoords(nranks, transports, coords, globalRankToIdx, globalIdxToRank)); - - // Start with a high score, then decrease until we find rings - int minScore = NCCL_MAX_SCORE; - int nringsTmp; - int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups; - NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&idxToRank, nranks)); - NCCLCHECK(ncclCalloc(&rankToIdx, nranks)); - NCCLCHECK(ncclCalloc(&groups, nranks)); - NCCLCHECK(ncclCalloc(&subgroups, nranks)); - - int nThreads; - do { - nThreads = *nthreads; - for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1; - nringsTmp = MAXCHANNELS; - // Loop over transports to connect groups - for (int t=NTRANSPORTS-1; t>=0; t--) { - for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1; - - int nidx = 0; - for (int i=0; i<nranks; i++) { - // Extract only ranks in the same local area as rank - // We need to extract them in the topological order, hence we iterate over indexes, not ranks - int r = globalIdxToRank[i]; - int sameLocal = 1; - for (int tr = NTRANSPORTS-1; tr > t; tr--) if (coords[r*NTRANSPORTS+tr] != coords[rank*NTRANSPORTS+tr]) sameLocal = 0; - if (!sameLocal) continue; - - groups[nidx] = coords[r*NTRANSPORTS+t]; - subgroups[nidx] = t ? coords[r*NTRANSPORTS+t-1] : nidx; - rankToIdx[r] = nidx; - idxToRank[nidx] = r; - nidx++; - } - - int ngroups = groups[nidx-1] + 1; // Coords should be ordered - - ncclTvalue_t* subvalues; - int *subprev, *subnext; - NCCLCHECK(ncclCalloc(&subvalues, nidx*nidx)); - NCCLCHECK(ncclCalloc(&subprev, nidx*nringsTmp)); - NCCLCHECK(ncclCalloc(&subnext, nidx*nringsTmp)); - if (ngroups > 1) { - /* Extract subvalues */ - for (int i=0; i<nidx; i++) { - for (int j=0; j<nidx; j++) { - if (transports[idxToRank[i]*nranks+idxToRank[j]] == t) - subvalues[i*nidx+j] = values[idxToRank[i]*nranks+idxToRank[j]]; - else - subvalues[i*nidx+j] = 0; - } - } - /* Extract subprev/subnext */ - for (int i=0; i<nidx*nringsTmp; i++) { - subprev[i] = subnext[i] = -1; - } - for (int r=0; r<nringsTmp; r++) { - int start = -1, end = -1; - for (int i=0; i<nranks; i++) { - if (rankToIdx[i] == -1) continue; - if (prevTmp[r*nranks+i] != -1) start = i; - if (nextTmp[r*nranks+i] != -1) end = i; - } - if (start != -1 && end != -1) { - subprev[r*nidx+rankToIdx[start]] = rankToIdx[end]; - subnext[r*nidx+rankToIdx[end]] = rankToIdx[start]; - } - } - /* Get rings */ - NCCLCHECK(ncclTransports[t].getRings(nidx, groups, subgroups, subvalues, &nringsTmp, subprev, subnext, minScore, &nThreads)); - /* Merge subprev/subnext into prev/next */ - for (int r=0; r<nringsTmp; r++) { - for (int i=0; i<nidx; i++) { - if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]]; - if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]]; - if (t == NTRANSPORTS-1) { - // Save node-level masters for trees - treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1; - treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1; - } - } - } - //for (int r=0; r<nringsTmp; r++) { - //printf("[%d] [%d] [%d] [%d] Prev ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", prevTmp[r*nranks+i]); printf("\n"); - //printf("[%d] [%d] [%d] [%d] Next ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", nextTmp[r*nranks+i]); printf("\n"); - //} - } - free(subvalues); - free(subprev); - free(subnext); - if (nringsTmp == 0) break; - } - minScore--; - if (nringsTmp > *nrings) { - *nrings = nringsTmp; - for (int i=0; i<nranks*(*nrings); i++) { - prev[i] = prevTmp[i]; - next[i] = nextTmp[i]; - } - } - } while (nringsTmp == 0 && minScore); - - free(coords); - free(globalRankToIdx); - free(globalIdxToRank); - free(prevTmp); - free(nextTmp); - free(idxToRank); - free(rankToIdx); - free(groups); - free(subgroups); - - *nthreads = nThreads; - - /* Duplicate the rings in case of multinode+NVLink */ - int nnodes = 0; - for (int r=0; r<nranks; r++) nnodes += treeIn[r]; - int nvlink; - NCCLCHECK(ncclNvlinkGpu(&nvlink)); - if (nnodes > 1 && nvlink) { - *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut); - } - - if (*nrings == 0) { - WARN("Could not create rings, falling back on simple ring"); - *nrings = 1; - prev[rank] = (rank-1+nranks) % nranks; - next[rank] = (rank+1)%nranks; - } - - int maxNrings = ncclParamMaxNrings(); - int minNrings = ncclParamMinNrings(); - if (maxNrings > 0 && minNrings > maxNrings) { - if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS"); - minNrings = 0; - } - if (minNrings > MAXCHANNELS) { - if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS); - minNrings = MAXCHANNELS; - } - if (maxNrings > 0 && maxNrings <= *nrings) { - if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings); - *nrings = maxNrings; - } else { - int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1; - if (minNrings < defaultMinNrings) minNrings = defaultMinNrings; - if (minNrings > 0 && minNrings > *nrings) { - if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings); - *nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut); - } - } - - NCCLCHECK(getEnvThreads(nthreads)); - return ncclSuccess; -} diff --git a/src/misc/utils.cc b/src/misc/utils.cc new file mode 100644 index 0000000..5158529 --- /dev/null +++ b/src/misc/utils.cc @@ -0,0 +1,194 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "utils.h" +#include "core.h" + +#include "nvmlwrap.h" + +// Get current Compute Capability +int ncclCudaCompCap() { + int cudaDev; + if (cudaGetDevice(&cudaDev) != cudaSuccess) return 0; + int ccMajor, ccMinor; + if (cudaDeviceGetAttribute(&ccMajor, cudaDevAttrComputeCapabilityMajor, cudaDev) != cudaSuccess) return 0; + if (cudaDeviceGetAttribute(&ccMinor, cudaDevAttrComputeCapabilityMinor, cudaDev) != cudaSuccess) return 0; + return ccMajor*10+ccMinor; +} + +ncclResult_t int64ToBusId(int64_t id, char* busId) { + sprintf(busId, "%04lx:%02lx:%02lx.%01lx", (id) >> 20, (id & 0xff000) >> 12, (id & 0xff0) >> 4, (id & 0xf)); + return ncclSuccess; +} + +ncclResult_t busIdToInt64(char* busId, int64_t* id) { + const int size = strlen(busId); + char* hexStr; + NCCLCHECK(ncclCalloc(&hexStr, size)); + int hexOffset = 0; + for (int i=0; i<size; i++) { + char c = busId[i]; + if (c == '.' || c == ':') continue; + if ((c >= '0' && c <= '9') || + (c >= 'A' && c <= 'F') || + (c >= 'a' && c <= 'f')) { + hexStr[hexOffset++] = busId[i]; + } else break; + } + hexStr[hexOffset] = '\0'; + *id = strtol(hexStr, NULL, 16); + free(hexStr); + return ncclSuccess; +} + +// Convert a logical cudaDev index to the NVML device minor number +ncclResult_t getBusId(int cudaDev, int64_t *busId) { + // On most systems, the PCI bus ID comes back as in the 0000:00:00.0 + // format. Still need to allocate proper space in case PCI domain goes + // higher. + char busIdStr[] = "00000000:00:00.0"; + CUDACHECK(cudaDeviceGetPCIBusId(busIdStr, sizeof(busIdStr), cudaDev)); + NCCLCHECK(busIdToInt64(busIdStr, busId)); + return ncclSuccess; +} + +ncclResult_t getHostName(char* hostname, int maxlen, const char delim) { + if (gethostname(hostname, maxlen) != 0) { + strncpy(hostname, "unknown", maxlen); + return ncclSystemError; + } + int i = 0; + while ((hostname[i] != delim) && (hostname[i] != '\0') && (i < maxlen-1)) i++; + hostname[i] = '\0'; + return ncclSuccess; +} + +uint64_t getHash(const char* string, int n) { + // Based on DJB2, result = result * 33 + char + uint64_t result = 5381; + for (int c = 0; c < n; c++) { + result = ((result << 5) + result) + string[c]; + } + return result; +} + +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + * This string can be overridden by using the NCCL_HOSTID env var. + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +uint64_t getHostHash(void) { + char hostHash[1024]; + char *hostId; + + // Fall back is the full hostname if something fails + (void) getHostName(hostHash, sizeof(hostHash), '\0'); + int offset = strlen(hostHash); + + if ((hostId = getenv("NCCL_HOSTID")) != NULL) { + strncpy(hostHash, hostId, sizeof(hostHash)); + } else { + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + } + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + TRACE(NCCL_INIT,"unique hostname '%s'", hostHash); + + return getHash(hostHash, strlen(hostHash)); +} + +/* Generate a hash of the unique identifying string for this process + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $$ $(readlink /proc/self/ns/pid) + */ +uint64_t getPidHash(void) { + char pname[1024]; + // Start off with our pid ($$) + sprintf(pname, "%ld", (long) getpid()); + int plen = strlen(pname); + int len = readlink("/proc/self/ns/pid", pname+plen, sizeof(pname)-1-plen); + if (len < 0) len = 0; + + pname[plen+len]='\0'; + TRACE(NCCL_INIT,"unique PID '%s'", pname); + + return getHash(pname, strlen(pname)); +} + +int parseStringList(const char* string, struct netIf* ifList, int maxList) { + if (!string) return 0; + + const char* ptr = string; + + int ifNum = 0; + int ifC = 0; + char c; + do { + c = *ptr; + if (c == ':') { + if (ifC > 0) { + ifList[ifNum].prefix[ifC] = '\0'; + ifList[ifNum].port = atoi(ptr+1); + ifNum++; ifC = 0; + } + while (c != ',' && c != '\0') c = *(++ptr); + } else if (c == ',' || c == '\0') { + if (ifC > 0) { + ifList[ifNum].prefix[ifC] = '\0'; + ifList[ifNum].port = -1; + ifNum++; ifC = 0; + } + } else { + ifList[ifNum].prefix[ifC] = c; + ifC++; + } + ptr++; + } while (ifNum < maxList && c); + return ifNum; +} + +static bool matchIf(const char* string, const char* ref, bool matchExact) { + // Make sure to include '\0' in the exact case + int matchLen = matchExact ? strlen(string) + 1 : strlen(ref); + return strncmp(string, ref, matchLen) == 0; +} + +static bool matchPort(const int port1, const int port2) { + if (port1 == -1) return true; + if (port2 == -1) return true; + if (port1 == port2) return true; + return false; +} + + +bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) { + // Make an exception for the case where no user list is defined + if (listSize == 0) return true; + + for (int i=0; i<listSize; i++) { + if (matchIf(string, ifList[i].prefix, matchExact) + && matchPort(port, ifList[i].port)) { + return true; + } + } + return false; +} diff --git a/src/misc/utils.cu b/src/misc/utils.cu deleted file mode 100644 index c618e71..0000000 --- a/src/misc/utils.cu +++ /dev/null @@ -1,203 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "utils.h" -#include "debug.h" -#include "nccl_net.h" -#include <unistd.h> -#include <string.h> -#include <stdarg.h> - -#include "nvmlwrap.h" -#include "core.h" - -// Convert a logical cudaDev index to the NVML device minor number -ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) { - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - nvmlDevice_t nvmlDevice; - unsigned int dev; - *nvmlDev = -1; - CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev)); - NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice)); - NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev)); - - *nvmlDev = dev; - - return ncclSuccess; -} - -ncclResult_t getHostName(char* hostname, int maxlen) { - if (gethostname(hostname, maxlen) != 0) { - strncpy(hostname, "unknown", maxlen); - return ncclSystemError; - } - int i = 0; - while ((hostname[i] != '.') && (hostname[i] != '\0') && (i < maxlen-1)) i++; - hostname[i] = '\0'; - return ncclSuccess; -} - -/* Common logging function used by the INFO, WARN and TRACE macros - * Also exported to the dynamically loadable Net transport modules so - * they can share the debugging mechanisms and output files - */ -void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) { - if (ncclDebugLevel <= NCCL_LOG_NONE) return; - - char hostname[1024]; - getHostName(hostname, 1024); - int cudaDev; - cudaGetDevice(&cudaDev); - - char buffer[1024]; - size_t len = 0; - pthread_mutex_lock(&ncclDebugOutputLock); - if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN) - len = snprintf(buffer, sizeof(buffer), - "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line); - else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask)) - len = snprintf(buffer, sizeof(buffer), - "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); -#ifdef ENABLE_TRACE - else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) { - auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; - double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000; - len = snprintf(buffer, sizeof(buffer), - "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line); - } -#endif - if (len) { - va_list vargs; - va_start(vargs, fmt); - (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); - va_end(vargs); - fprintf(ncclDebugFile,"%s\n", buffer); - fflush(ncclDebugFile); - } - pthread_mutex_unlock(&ncclDebugOutputLock); - - // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort() - if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) { - fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", - hostname, getpid(), gettid(), cudaDev, filefunc, line); - abort(); - } -} - -uint64_t getHash(const char* string) { - // Based on DJB2, result = result * 33 + char - uint64_t result = 5381; - for (int c = 0; string[c] != '\0'; c++) { - result = ((result << 5) + result) + string[c]; - } - return result; -} - -/* Generate a hash of the unique identifying string for this host - * that will be unique for both bare-metal and container instances - * Equivalent of a hash of; - * - * $(hostname) $(readlink /proc/self/ns/uts) $(readlink /proc/self/ns/mnt) - */ -uint64_t getHostHash(void) { - char uname[1024]; - // Start off with the hostname - (void) getHostName(uname, sizeof(uname)); - int offset = strlen(uname); - int len; - // $(readlink /proc/self/ns/uts) - len = readlink("/proc/self/ns/uts", uname+offset, sizeof(uname)-1-offset); - if (len < 0) len = 0; - offset += len; - // $(readlink /proc/self/ns/mnt) - len = readlink("/proc/self/ns/mnt", uname+offset, sizeof(uname)-1-offset); - if (len < 0) len = 0; - offset += len; - // Trailing '\0' - uname[offset]='\0'; - TRACE(NCCL_INIT,"unique hostname '%s'", uname); - - return getHash(uname); -} - -/* Generate a hash of the unique identifying string for this process - * that will be unique for both bare-metal and container instances - * Equivalent of a hash of; - * - * $$ $(readlink /proc/self/ns/pid) - */ -uint64_t getPidHash(void) { - char pname[1024]; - // Start off with our pid ($$) - sprintf(pname, "%ld", (long) getpid()); - int plen = strlen(pname); - int len = readlink("/proc/self/ns/pid", pname+plen, sizeof(pname)-1-plen); - if (len < 0) len = 0; - - pname[plen+len]='\0'; - TRACE(NCCL_INIT,"unique PID '%s'", pname); - - return getHash(pname); -} - -int parseStringList(const char* string, struct netIf* ifList, int maxList) { - if (!string) return 0; - - const char* ptr = string; - // Ignore "^" prefix, will be detected outside of this function - if (ptr[0] == '^') ptr++; - - int ifNum = 0; - int ifC = 0; - char c; - do { - c = *ptr; - if (c == ':') { - if (ifC > 0) { - ifList[ifNum].prefix[ifC] = '\0'; - ifList[ifNum].port = atoi(ptr+1); - ifNum++; ifC = 0; - } - while (c != ',' && c != '\0') c = *(++ptr); - } else if (c == ',' || c == '\0') { - if (ifC > 0) { - ifList[ifNum].prefix[ifC] = '\0'; - ifList[ifNum].port = -1; - ifNum++; ifC = 0; - } - } else { - ifList[ifNum].prefix[ifC] = c; - ifC++; - } - ptr++; - } while (ifNum < maxList && c); - return ifNum; -} - -static bool matchPrefix(const char* string, const char* prefix) { - return (strncmp(string, prefix, strlen(prefix)) == 0); -} - -static bool matchPort(const int port1, const int port2) { - if (port1 == -1) return true; - if (port2 == -1) return true; - if (port1 == port2) return true; - return false; -} - - -bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize) { - // Make an exception for the case where no user list is defined - if (listSize == 0) return true; - - for (int i=0; i<listSize; i++) { - if (matchPrefix(string, ifList[i].prefix) - && matchPort(port, ifList[i].port)) { - return true; - } - } - return false; -} diff --git a/src/nccl.h.in b/src/nccl.h.in index 985274e..f07e0a4 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -41,7 +41,7 @@ typedef enum { ncclSuccess = 0, * This integer is coded with the MAJOR, MINOR and PATCH level of the * NCCL library */ -ncclResult_t ncclGetVersion(int *version); +ncclResult_t ncclGetVersion(int *version); ncclResult_t pncclGetVersion(int *version); /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be @@ -244,7 +244,8 @@ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou * Start a group call. All subsequent calls to NCCL may not block due to * inter-CPU synchronization. */ -ncclResult_t ncclGroupStart(); +ncclResult_t ncclGroupStart(); +ncclResult_t pncclGroupStart(); /* * Group End @@ -252,7 +253,8 @@ ncclResult_t ncclGroupStart(); * End a group call. Wait for all calls since ncclGroupStart to complete * before returning. */ -ncclResult_t ncclGroupEnd(); +ncclResult_t ncclGroupEnd(); +ncclResult_t pncclGroupEnd(); #ifdef __cplusplus } // end extern "C" diff --git a/src/nccl.pc.in b/src/nccl.pc.in new file mode 100755 index 0000000..0d98494 --- /dev/null +++ b/src/nccl.pc.in @@ -0,0 +1,10 @@ +prefix=${nccl:Prefix} +exec_prefix=${prefix} +libdir=${exec_prefix}/lib +includedir=${prefix}/include + +Name: nccl +Description: Optimized primitives for collective multi-GPU communication +Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch} +Libs: -L${libdir} -lnccl +Cflags: -I${includedir} diff --git a/src/transport.cu b/src/transport.cc index 1436a5b..4059849 100644 --- a/src/transport.cu +++ b/src/transport.cc @@ -4,7 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "comm.h" +#include "info.h" extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; @@ -119,13 +120,13 @@ ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int r } if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) { // Tree up - struct ncclTree* tree = &args->channel->tree; + struct ncclTree* tree = &args->channel->treeUp; for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxyRecv>(tree->down[i], args)); NCCLCHECK(SaveProxy<proxySend>(tree->up, args)); } if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) { // Tree down - struct ncclTree* tree = &args->channel->tree; + struct ncclTree* tree = &args->channel->treeDn; for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args)); NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args)); } @@ -157,7 +158,9 @@ void* persistentThread(void *comm_) { } } while (op == NULL); op->idle = 0; - if (op->state != ncclProxyOpNone) ret = op->progress(op); + // opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started + // yet and might be cancelled before they even start. Hold on on those. + if (op->state != ncclProxyOpNone && op->opCount < comm->lastOpCount) ret = op->progress(op); if (ret != ncclSuccess) { comm->fatalError = ret; INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); diff --git a/src/transport/net.cu b/src/transport/net.cc index 06a6e23..87fc9ce 100644 --- a/src/transport/net.cu +++ b/src/transport/net.cc @@ -4,39 +4,9 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "transport.h" -#include "nvmlwrap.h" +#include "comm.h" #include "net.h" -#include "param.h" -#include "topo.h" -#include <cuda_runtime.h> -#include <assert.h> - -#define NET_MAX_IFS 16 -#define NET_MAX_GPUS 32 - -// Cache GPU-NIC distances to avoid re-computing them -#define NET_TVALUE_UNKNOWN 0ULL -static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN }; -static int ncclNetNDev; - -// We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit) -#define NET_BITS_PER_IF 3 -#define NET_BITS_PER_IF_MASK ((1<<NET_BITS_PER_IF)-1) -static_assert(sizeof(ncclTvalue_t)*8 >= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t"); -static ncclTvalue_t getTvalue(short* distances, int ndev) { - ncclTvalue_t tvalue = 0; - for (int d=0; d<ndev; d++) { - int score = 1 + PATH_SOC - distances[d]; - // Keep 3 bits of score info per dev - tvalue |= ((score & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d)); - } - return tvalue; -} -static int getScore(ncclTvalue_t tvalue, int dev) { - return (tvalue >> (dev*NET_BITS_PER_IF)) & NET_BITS_PER_IF_MASK; -} +#include "graph.h" struct netConnectInfo { ncclNetHandle_t netHandle; @@ -53,6 +23,7 @@ struct netSendResources { int buffSize; void* mhandle; void* llMhandle; + void* ll128Mhandle; struct ncclRecvMem* devRecvMem; uint64_t step; uint64_t llLastCleaning; @@ -70,224 +41,61 @@ struct netRecvResources { int buffSize; void* mhandle; void* llMhandle; + void* ll128Mhandle; struct ncclRecvMem* devRecvMem; uint64_t step; uint64_t llLastCleaning; }; -static ncclResult_t netDistance(int cudaDev, int dev, short* distance) { - char* cudaPath = NULL; - char* nicPath = NULL; - ncclResult_t err; - NCCLCHECK(getCudaPath(cudaDev, &cudaPath)); - err = ncclNetPciPath(dev, &nicPath); - *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath); - if (nicPath) free(nicPath); - if (cudaPath) free(cudaPath); - return ncclSuccess; -} - -static ncclResult_t netDevices(int* ndev, short** distances) { - NCCLCHECK(ncclNetDevices(ndev)); - if (*ndev == 0) { - WARN("Error : Network returned 0 device"); - return ncclSystemError; - } - if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS; - - *distances = (short*)malloc(*ndev*sizeof(short)); - if (*distances == NULL) return ncclSystemError; - - // Find distance with current GPU - int cudaDev, nvmlDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev)) - char line[1024]; - sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName()); - for (int d=0; d<*ndev; d++) { - NCCLCHECK(netDistance(cudaDev, d, *distances+d)); - sprintf(line+strlen(line), " %s", pathDists[(*distances)[d]]); - } - INFO(NCCL_INIT|NCCL_NET, "%s", line); - return ncclSuccess; -} - -/* Determine if we can communicate with the peer */ -ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - ret[0] = ncclNetTvalues[cudaDev]; - if (ret[0] == NET_TVALUE_UNKNOWN) { - if (cudaDev >= NET_MAX_GPUS) { - WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS); - return ncclInternalError; - } - int nDev; - short* distances; - NCCLCHECK(netDevices(&nDev, &distances)); - ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev); - ncclNetNDev = nDev; - free(distances); - } - return ncclSuccess; -} - -static inline int groupBestStart(int nranks, int* groups, int group, ncclTvalue_t* values, int card, int minScore) { - int bestRank = -1; - int bestScore = 0; - for (int rank=0; rank<nranks; rank++) { - if (groups[rank] != group) continue; - for (int i=0; i<nranks; i++) { - ncclTvalue_t netValue = values[rank*nranks+i]; - if (netValue != 0) { - ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK; - if (score >= minScore && score > bestScore) { - bestScore = score; - bestRank = rank; - } - // All other values should be the same, stop here for this rank - break; - } - } - } - return bestRank; -} -static inline int groupBestEnd(int nranks, int* groups, int group, int* subgroups, int startSubGroup, int startRank, ncclTvalue_t* values, int card, int minScore) { - // For the last rank, we don't need the absolute best score, just to be within minScore. - for (int rank=nranks-1; rank>=0; rank--) { - if (groups[rank] != group) continue; - if (startSubGroup != -1 && startSubGroup == subgroups[rank]) continue; - if (startRank == rank) continue; - for (int i=0; i<nranks; i++) { - ncclTvalue_t netValue = values[rank*nranks+i]; - if (netValue != 0) { - ncclTvalue_t score = (netValue>>(NET_BITS_PER_IF*card)) & NET_BITS_PER_IF_MASK; - if (score >= minScore) { - return rank; - } - // All other values should be the same, stop here for this rank - break; - } - } - } - return -1; -} - - -ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) { - int nGroups = groups[nranks-1] + 1; - int cardUsed[NET_MAX_IFS*nGroups]; - for (int c=0; c<NET_MAX_IFS*nGroups; c++) cardUsed[c] = 0; - - for (int ring = 0; ring<*nringsRet; ring++) { - int starts[nGroups]; - int ends[nGroups]; - for (int group = 0; group<nGroups; group++) { - int nranksInGroup = 0; - int nsubGroups = 0; - for (int rank=0; rank<nranks; rank++) if (groups[rank] == group) { - nranksInGroup++; - nsubGroups = std::max(subgroups[rank], nsubGroups); - } - starts[group] = ends[group] = -1; - // Receive on the rank closest to the NIC - for (int card=0; card<NET_MAX_IFS; card++) { - if (cardUsed[group*NET_MAX_IFS+card] == 1) continue; - int start = groupBestStart(nranks, groups, group, values, card, minScore); - // Send from any rank, but best on a different subgroup and close to the NIC also. - int end = (nranksInGroup == 1) ? start - : groupBestEnd(nranks, groups, group, subgroups, nsubGroups ? subgroups[start] : -1, start, values, card, minScore); - //printf("Ring %d, Minscore %d, Card %d, group %d, start = %d, end = %d\n", ring, minScore, card, group, start, end); - if (start != -1 && end != -1) { - cardUsed[group*NET_MAX_IFS+card] = 1; - starts[group] = start; - ends[group] = end; - break; - } - } - if (starts[group] == -1 || ends[group] == -1) { - *nringsRet = ring; - return ncclSuccess; - } - } - // Link groups together - for (int group = 0; group<nGroups; group++) { - int nextGroup = (group+1)%nGroups; - next[ring*nranks+ends[group]] = starts[nextGroup]; - prev[ring*nranks+starts[nextGroup]] = ends[group]; - } - } +/* Determine if two peers can communicate with NET */ +ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { + *ret = 1; return ncclSuccess; } -int getDev(int cudaDev, int ringId) { - ncclTvalue_t tvalues = ncclNetTvalues[cudaDev]; - - int dev = 0; - int maxScore = 0; - for (int d=0; d<ncclNetNDev; d++) if (getScore(tvalues,d) > maxScore) maxScore = getScore(tvalues,d); - int skip = ringId+1; - while (skip) { - for (int d=0; d<ncclNetNDev; d++) { - if (getScore(tvalues, d) == maxScore) { - skip--; - if (skip == 0) { dev = d; goto end; } - } - } - } -end: - return dev; -} - NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB); -static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) { +static ncclResult_t netGetGdrSupport(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr) { *useGdr = 0; - int cudaDev, nvmlDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev)) - if (read) { // For reads (sends) only enable under certain conditions int gdrReadParam = ncclParamNetGdrRead(); if (gdrReadParam == 0) return ncclSuccess; if (gdrReadParam < 0) { int nvlink; - NCCLCHECK(ncclNvlinkGpu(&nvlink)); + NCCLCHECK(ncclTopoHasNvlink(topo, busId, &nvlink)); if (!nvlink) return ncclSuccess; } } // Check if we are close enough that it makes sense to enable GDR int netGdrLevel = ncclParamNetGdrLevel(); - short distance; - NCCLCHECK(netDistance(cudaDev, dev, &distance)); + int distance; + NCCLCHECK(ncclTopoNetDistance(topo, busId, netDev, &distance)); if (distance >= netGdrLevel) { - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), busId, netDev, distance, netGdrLevel); return ncclSuccess; } // Finally, check if the NIC supports it int flags; - NCCLCHECK(ncclNetPtrSupport(dev, &flags)); + NCCLCHECK(ncclNetPtrSupport(netDev, &flags)); if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess; *useGdr = 1; - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d < %d), read %d", ncclNetName(), busId, netDev, distance, netGdrLevel, read); return ncclSuccess; } /* Determine if we will use this transport for this peer and return connect * information for this peer */ -ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { +ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { struct netSendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - resources->netDev = getDev(cudaDev, channelId); - NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr)); + NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev)); + NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr)); int sendSize = sizeof(struct ncclSendMem); NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); @@ -299,20 +107,18 @@ ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize)); resources->buffSize = buffSize; - INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev, + INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev, resources->useGdr ? "/GDRDMA" : ""); return ncclSuccess; } -ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { +ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { struct netRecvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - resources->netDev = getDev(cudaDev, channelId); - NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr)); + NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev)); + NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr)); int sendSize = sizeof(struct ncclSendMem); NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); @@ -324,7 +130,7 @@ ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize)); resources->buffSize = buffSize; - INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev, + INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev, resources->useGdr ? "/GDRDMA" : ""); struct netConnectInfo* info = (struct netConnectInfo*) connectInfo; NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm)); @@ -339,6 +145,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem; send->conn.buff = recvMem->buff; send->conn.llBuff = resources->devHostRecvMem->llBuff; + send->conn.ll128Buff = recvMem->ll128Buff; // Head/Tail/Opcount/Fifos are always on host send->conn.tail = &resources->devHostRecvMem->tail; @@ -356,6 +163,8 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle)); NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff, NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle)); + NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle)); return ncclSuccess; } @@ -369,6 +178,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->devHostRecvMem; recv->conn.buff = recvMem->buff; recv->conn.llBuff = recvMem->llBuff; + recv->conn.ll128Buff = recvMem->ll128Buff; // Head/Tail/Opcount are always on host recv->conn.tail = &resources->devHostRecvMem->tail; @@ -384,6 +194,8 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle)); NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE, resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle)); + NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->ll128Buff, NCCL_LL128_BUFF_SIZE, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->ll128Mhandle)); return ncclSuccess; } @@ -393,6 +205,7 @@ ncclResult_t netSendFree(void* transportResources) { NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle)); NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle)); + NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->ll128Mhandle)); NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); if (resources->useGdr) CUDACHECK(cudaFree(resources->devRecvMem)); @@ -406,6 +219,7 @@ ncclResult_t netRecvFree(void* transportResources) { NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle)); NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle)); + NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->ll128Mhandle)); NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); if (resources->useGdr) CUDACHECK(cudaFree(resources->devRecvMem)); @@ -432,11 +246,44 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { if (args->head < args->end) { if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) { volatile int* sizesFifo = resources->hostRecvMem->sizesFifo; - if (args->llMode) { + volatile uint64_t* recvTail = &resources->hostRecvMem->tail; + if (args->protocol == NCCL_PROTO_LL128) { + int stepSize = NCCL_LL128_BUFF_SIZE/NCCL_STEPS; + if (args->tail < *recvTail) { + int buffSlot = args->tail%NCCL_STEPS; + if (sizesFifo[buffSlot] != -1) { + struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; + char* localBuff = (char*)localMem->ll128Buff; + int ready = resources->useGdr; + if (!ready) { + // When data is in sysmem, we need to wait until all flags are correct since the GPU only + // called threadfence() + uint64_t flag = args->tail + 1; + int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS); + volatile uint64_t* lines = (volatile uint64_t*)(localBuff+buffSlot*stepSize); + ready = 1; + for (int i=0; i<nFifoLines; i++) { + if (lines[i*NCCL_LL128_LINEELEMS+NCCL_LL128_DATAELEMS] != flag) { ready = 0; break; } + } + } + if (ready) { + // Send through network + NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+buffSlot*stepSize, sizesFifo[buffSlot], resources->ll128Mhandle, args->requests+buffSlot)); + if (args->requests[buffSlot] != NULL) { + sizesFifo[buffSlot] = -1; + // Make sure size is reset to zero before we update the head. + __sync_synchronize(); + args->tail += args->sliceSteps; + args->idle = 0; + } + } + } + } + } else if (args->protocol == NCCL_PROTO_LL) { int buffSlot = args->tail%NCCL_STEPS; int size = sizesFifo[buffSlot]; if (size != -1) { - uint32_t flag = args->tail + 1; + uint32_t flag = NCCL_LL_FLAG(args->tail + 1); int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); size = nFifoLines * sizeof(union ncclLLFifoLine); union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES; @@ -457,18 +304,20 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { } } } - } else if (args->tail < resources->hostRecvMem->tail) { - struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; + } else if (args->tail < *recvTail) { int stepSize = args->channel->buffSize/NCCL_STEPS; + struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; // Send through network int buffSlot = args->tail%NCCL_STEPS; - NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot)); - if (args->requests[buffSlot] != NULL) { - sizesFifo[buffSlot] = -1; - // Make sure size is reset to zero before we update the head. - __sync_synchronize(); - args->tail += args->sliceSteps; - args->idle = 0; + if (sizesFifo[buffSlot] != -1) { + NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot)); + if (args->requests[buffSlot] != NULL) { + sizesFifo[buffSlot] = -1; + // Make sure size is reset to zero before we update the head. + __sync_synchronize(); + args->tail += args->sliceSteps; + args->idle = 0; + } } } } @@ -486,18 +335,8 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { if (args->head == args->end) { resources->step = args->end; args->idle = 0; - args->state = ncclProxyOpDone; - } - } - if (args->state == ncclProxyOpDone) { - union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff; - if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step; - resources->step += NCCL_STEPS; - resources->hostSendMem->head = resources->step; - resources->llLastCleaning = resources->step; + args->state = ncclProxyOpNone; } - args->state = ncclProxyOpNone; } return ncclSuccess; } @@ -517,12 +356,13 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { } if (args->state == ncclProxyOpProgress) { args->idle = 1; - int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS; + int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_SIZE : args->protocol == NCCL_PROTO_LL128 ? NCCL_LL128_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS; if (args->head < args->end) { struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; - char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff; - void* mhandle = args->llMode ? resources->llMhandle : resources->mhandle; - if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) { + char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)localMem->llBuff : args->protocol == NCCL_PROTO_LL128 ? (char*)localMem->ll128Buff : localMem->buff; + void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : args->protocol == NCCL_PROTO_LL128 ? resources->ll128Mhandle : resources->mhandle; + volatile uint64_t* sendHead = &resources->hostSendMem->head; + if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) { int buffSlot = args->tail%NCCL_STEPS; int sliceSize = stepSize * args->sliceSteps; NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot)); @@ -537,7 +377,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size)); if (done) { args->head += args->sliceSteps; - if (args->llMode == 0) { + if (args->protocol == NCCL_PROTO_SIMPLE) { if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle); resources->hostRecvMem->tail = args->head; } @@ -548,16 +388,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { if (args->head == args->end) { resources->step = args->end; args->idle = 0; - args->state = ncclProxyOpDone; - } - } - if (args->state == ncclProxyOpDone) { - if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - resources->step += NCCL_STEPS; - while (resources->hostSendMem->head < resources->step); - resources->llLastCleaning = resources->step; + args->state = ncclProxyOpNone; } - args->state = ncclProxyOpNone; } return ncclSuccess; } @@ -565,7 +397,6 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { struct ncclTransport netTransport = { "NET", netCanConnect, - netGetRings, { netSendSetup, netSendConnect, netSendFree, netSendProxy }, { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy } }; diff --git a/src/transport/net_ib.cu b/src/transport/net_ib.cc index f7c574b..0d5307c 100644 --- a/src/transport/net_ib.cu +++ b/src/transport/net_ib.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,7 +8,7 @@ #include "core.h" #include "socket.h" #include "net.h" -#include "topo.h" +#include "graph.h" #include "utils.h" #include "param.h" @@ -107,18 +107,22 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { char* userIbEnv = getenv("NCCL_IB_HCA"); struct netIf userIfs[MAX_IB_DEVS]; bool searchNot = userIbEnv && userIbEnv[0] == '^'; + if (searchNot) userIbEnv++; + bool searchExact = userIbEnv && userIbEnv[0] == '='; + if (searchExact) userIbEnv++; int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS); if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError; - for (int d=0; d<nIbDevs; d++) { + for (int d=0; d<nIbDevs && ncclNIbDevs<MAX_IB_DEVS; d++) { struct ibv_context * context; if (ncclSuccess != wrap_ibv_open_device(&context, devices[d]) || context == NULL) { WARN("NET/IB : Unable to open device %s", devices[d]->name); continue; } - int found = 0; + int nPorts = 0; struct ibv_device_attr devAttr; + memset(&devAttr, 0, sizeof(devAttr)); if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) { WARN("NET/IB : Unable to query device %s", devices[d]->name); if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } @@ -135,7 +139,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; // check against user specified HCAs/ports - if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) { + if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs, searchExact) ^ searchNot)) { continue; } TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port, @@ -146,10 +150,10 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { ncclIbDevs[ncclNIbDevs].context = context; strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); ncclNIbDevs++; - found++; + nPorts++; pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); } - if (found == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } + if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } } if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; }; } @@ -197,32 +201,14 @@ ncclResult_t ncclIbGdrSupport(int ibDev) { moduleLoaded = (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) ? 0 : 1; } if (moduleLoaded == 0) return ncclSystemError; - ncclResult_t ret = ncclSystemError; - void* ptr; - if (cudaMalloc(&ptr, sizeof(int)) == cudaSuccess) { - struct ibv_mr* mr; - struct ibv_pd* pd; - if (wrap_ibv_alloc_pd(&pd, ncclIbDevs[ibDev].context) == ncclSuccess) { - if ((mr = wrap_direct_ibv_reg_mr(pd, ptr, sizeof(int), IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)) != NULL) { - ret = ncclSuccess; - wrap_ibv_dereg_mr(mr); - } - wrap_ibv_dealloc_pd(pd); - } - cudaFree(ptr); - } - return ret; + return ncclSuccess; } ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) { *supportedTypes = NCCL_PTR_HOST; - int cudaDev, nvmlDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev)) - if (ncclIbGdrSupport(dev) != ncclSuccess) { - INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d '%s' (no module or not supported by GPU)", cudaDev, nvmlDev, dev, ncclIbDevs[dev].devName); + INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName); return ncclSuccess; } *supportedTypes |= NCCL_PTR_CUDA; diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc new file mode 100644 index 0000000..1b1fc4f --- /dev/null +++ b/src/transport/net_socket.cc @@ -0,0 +1,503 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "core.h" +#include "socket.h" +#include "net.h" +#include "param.h" + +#include <assert.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <poll.h> +#include <limits.h> +#include <fcntl.h> + +/* Init functions */ +static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS]; +static union socketAddress ncclNetIfAddrs[MAX_IFS]; +static int ncclNetIfs = -1; +pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER; + +ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) { + if (ncclNetIfs == -1) { + pthread_mutex_lock(&ncclSocketLock); + if (ncclNetIfs == -1) { + ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS); + if (ncclNetIfs <= 0) { + WARN("NET/Socket : no interface found"); + return ncclInternalError; + } else { + char line[1024]; + char addrline[1024]; + line[0] = '\0'; + for (int i=0; i<ncclNetIfs; i++) { + snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE, + socketToString(&ncclNetIfAddrs[i].sa, addrline)); + } + line[1023] = '\0'; + INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line); + } + } + pthread_mutex_unlock(&ncclSocketLock); + } + return ncclSuccess; +} + +ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) { + *supportedTypes = NCCL_PTR_HOST; + return ncclSuccess; +} + +ncclResult_t ncclSocketDevices(int* ndev) { + *ndev = ncclNetIfs; + return ncclSuccess; +} + +ncclResult_t ncclSocketPciPath(int dev, char** path) { + char devicepath[PATH_MAX]; + snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE); + *path = realpath(devicepath, NULL); + if (*path == NULL) { + INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath); + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) { + if (dev >= ncclNetIfs) return ncclInternalError; + memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr)); + return ncclSuccess; +} + +/* Communication functions */ + +#define MAX_SOCKETS 64 +#define MAX_THREADS 16 +#define MAX_REQUESTS 128 +#define MAX_QUEUE_LEN MAX_REQUESTS +#define MIN_CHUNKSIZE (64*1024) + +NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2); +NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2); + +struct ncclSocketHandle { + union socketAddress connectAddr; + int nSocks; + int nThreads; +}; + +struct ncclSocketTask { + int op; + void* data; + int size; + int fd; + int offset; + int used; + ncclResult_t result; +}; + +struct ncclSocketRequest { + int op; + void* data; + int size; + int ctrlFd; + int offset; + int used; + struct ncclSocketComm* comm; + struct ncclSocketTask* tasks[MAX_SOCKETS]; + int nSubs; +}; + +struct ncclSocketTaskQueue { + int next; + struct ncclSocketTask* tasks; +}; + +enum threadState {start, stop}; + +struct ncclSocketThreadResources { + struct ncclSocketTaskQueue threadTaskQueue; + enum threadState state; + struct ncclSocketComm* comm; + pthread_mutex_t threadLock; + pthread_cond_t threadCond; +}; + +struct ncclSocketListenComm { + int fd; + int nSocks; + int nThreads; +}; + +struct ncclSocketComm { + int ctrlFd; + int fds[MAX_SOCKETS]; + int nSocks; + int nThreads; + int nextFd; + struct ncclSocketRequest requests[MAX_REQUESTS]; + pthread_t helperThread[MAX_THREADS]; + struct ncclSocketThreadResources threadResources[MAX_THREADS]; +}; + +void* persistentSocketThread(void *args_) { + struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_; + struct ncclSocketComm* comm = resource->comm; + volatile enum threadState* state = &resource->state; + struct ncclSocketTaskQueue* myQueue = &resource->threadTaskQueue; + int nSocksPerThread = comm->nSocks / comm->nThreads; + while (1) { + int idle = 1; + int mark = myQueue->next; // mark newest task seen + for (int i=0; i<MAX_QUEUE_LEN; i+=nSocksPerThread) { + int repeat; + do { + repeat = 0; + for (int j=0; j<nSocksPerThread; j++) { + struct ncclSocketTask* r = myQueue->tasks+i+j; + if (r != NULL && r->used == 1 && r->offset < r->size) { + r->result = socketProgress(r->op, r->fd, r->data, r->size, &r->offset); + if (r->result != ncclSuccess) { + WARN("NET/Socket : socket progress error"); + return NULL; + } + idle = 0; + if (r->offset < r->size) repeat = 1; + } + } + } while (repeat); + } + if (idle) { + pthread_mutex_lock(&resource->threadLock); + while (mark == myQueue->next && *state != stop) { // no new tasks, wait + pthread_cond_wait(&resource->threadCond, &resource->threadLock); + } + pthread_mutex_unlock(&resource->threadLock); + } + if (*state == stop) return NULL; + } +} + +ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) { + int nSocksPerThread = ncclParamSocketNsocksPerThread(); + int nThreads = ncclParamSocketNthreads(); + if (nThreads > MAX_THREADS) { + WARN("NET/Socket : NCCL_SOCKET_NTHREADS is greater than the maximum allowed, setting to %d", MAX_THREADS); + nThreads = MAX_THREADS; + } + if (nThreads == -2 || nSocksPerThread == -2) { + // Auto-detection + int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads + char vendorPath[PATH_MAX]; + snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE); + char* rPath = realpath(vendorPath, NULL); + int fd = open(rPath, O_RDONLY); + free(rPath); + if (fd == -1) { + // Could not find device vendor. This is handled silently so + // we don't want to print an INFO error. + TRACE(NCCL_NET, "Open of %s failed : %s\n", vendorPath, strerror(errno)); + goto end; + } + char vendor[7]; + strncpy(vendor, "0x0000", 7); + int len; + SYSCHECKVAL(read(fd, vendor, 6), "read", len); + SYSCHECK(close(fd), "close"); + if (strcmp(vendor, "0x1d0f") == 0) { // AWS + autoNt = 2; + autoNs = 8; + } else if (strcmp(vendor, "0x1ae0") == 0) { // GCP + autoNt = 4; + autoNs = 1; + } +end: + if (nThreads == -2) nThreads = autoNt; + if (nSocksPerThread == -2) nSocksPerThread = autoNs; + } + int nSocks = nSocksPerThread * nThreads; + if (nSocks > MAX_SOCKETS) { + nSocksPerThread = MAX_SOCKETS/nThreads; + WARN("NET/Socket : the total number of sockets is greater than the maximum allowed, setting NCCL_NSOCKS_PERTHREAD to %d", nSocksPerThread); + nSocks = nSocksPerThread * nThreads; + } + *ns = nSocks; + *nt = nThreads; + if (nSocks > 0) INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread); + return ncclSuccess; +} + +ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) { + NCCLCHECK(ncclCalloc(comm, 1)); + (*comm)->fd = -1; + return ncclSuccess; +} + +ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) { + NCCLCHECK(ncclCalloc(comm, 1)); + (*comm)->ctrlFd = -1; + for (int i=0; i < MAX_SOCKETS; i++) { + (*comm)->fds[i] = -1; + } + (*comm)->nextFd = 0; + return ncclSuccess; +} + +ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) { + if (dev < 0) { // data transfer socket is based on specified dev + return ncclInternalError; + } + struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; + static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large"); + struct ncclSocketListenComm* comm; + NCCLCHECK(ncclSocketNewListenComm(&comm)); + NCCLCHECK(GetSocketAddr(dev, &handle->connectAddr)); + NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr)); + NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads)); + handle->nSocks = comm->nSocks; + handle->nThreads = comm->nThreads; + *listenComm = comm; + return ncclSuccess; +} + +ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) { + if (dev < 0) { // data transfer socket is based on specified dev + return ncclInternalError; + } + struct ncclSocketComm* comm; + NCCLCHECK(ncclSocketNewComm(&comm)); + struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; + comm->nSocks = handle->nSocks; + comm->nThreads = handle->nThreads; + for (int i=0; i<comm->nSocks+1; i++) { + int tmpFd, offset=0; + NCCLCHECK(connectAddress(&tmpFd, &handle->connectAddr)); + NCCLCHECK(socketWait(NCCL_SOCKET_SEND, tmpFd, &i, sizeof(int), &offset)); + if (i == comm->nSocks) comm->ctrlFd = tmpFd; + else comm->fds[i] = tmpFd; + } + *sendComm = comm; + return ncclSuccess; +} + +ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) { + struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm; + struct ncclSocketComm* rComm; + NCCLCHECK(ncclSocketNewComm(&rComm)); + rComm->nSocks = lComm->nSocks; + rComm->nThreads = lComm->nThreads; + for (int i=0; i<rComm->nSocks+1; i++) { + int tmpFd, sendSockIdx, offset=0; + struct sockaddr_in sockaddr; + socklen_t socklen = sizeof(struct sockaddr_in); + SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", tmpFd); + NCCLCHECK(socketWait(NCCL_SOCKET_RECV, tmpFd, &sendSockIdx, sizeof(int), &offset)); + if (sendSockIdx == rComm->nSocks) rComm->ctrlFd = tmpFd; + else rComm->fds[sendSockIdx] = tmpFd; + } + *recvComm = rComm; + return ncclSuccess; +} + +ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketRequest** req) { + for (int i=0; i<MAX_REQUESTS; i++) { + struct ncclSocketRequest* r = comm->requests+i; + if (r->used == 0) { + r->op = op; + r->data = data; + r->size = size; + r->ctrlFd = comm->ctrlFd; + r->used = 1; + r->comm = comm; + r->nSubs = 0; + *req = r; + return ncclSuccess; + } + } + WARN("NET/Socket : unable to allocate requests"); + return ncclInternalError; +} + +ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) { + int tid = comm->nextFd % comm->nThreads; + struct ncclSocketThreadResources* res = comm->threadResources+tid; + struct ncclSocketTaskQueue* queue = &res->threadTaskQueue; + // create helper threads and prepare per-thread task queue + if (queue->tasks == NULL) { + NCCLCHECK(ncclCalloc(&queue->tasks, MAX_QUEUE_LEN)); + queue->next = 0; + res->comm = comm; + pthread_mutex_init(&res->threadLock, NULL); + pthread_cond_init(&res->threadCond, NULL); + pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res); + } + struct ncclSocketTask* r = queue->tasks+queue->next; + if (r->used == 0) { + r->op = op; + r->data = data; + r->size = size; + r->fd = comm->fds[comm->nextFd]; + r->offset = 0; + r->result = ncclSuccess; + comm->nextFd = (comm->nextFd + 1) % comm->nSocks; + r->used = 1; + *req = r; + pthread_mutex_lock(&res->threadLock); + queue->next = (queue->next+1)%MAX_QUEUE_LEN; + res->state = start; + pthread_cond_signal(&res->threadCond); + pthread_mutex_unlock(&res->threadLock); + return ncclSuccess; + } + WARN("NET/Socket : unable to allocate subtasks"); + return ncclInternalError; +} + +ncclResult_t ncclSocketTest(void* request, int* done, int* size) { + *done = 0; + struct ncclSocketRequest *r = (struct ncclSocketRequest*)request; + if (r == NULL) { + WARN("NET/Socket : test called with NULL request"); + return ncclInternalError; + } + if (r->used == 1) { /* try to send/recv size */ + int data = r->size; + int offset = 0; + NCCLCHECK(socketProgress(r->op, r->ctrlFd, &data, sizeof(int), &offset)); + + if (offset == 0) return ncclSuccess; /* Not ready -- retry later */ + + // Not sure we could ever receive less than 4 bytes, but just in case ... + if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->ctrlFd, &data, sizeof(int), &offset)); + + // Check size is less or equal to the size provided by the user + if (r->op == NCCL_SOCKET_RECV && data > r->size) { + WARN("NET/Socket : message truncated : receiving %d bytes instead of %d", data, r->size); + return ncclInternalError; + } + r->size = data; + r->offset = 0; + r->used = 2; // done exchanging size + // divide into subtasks + int chunkOffset = 0, i = 0; + if (r->comm->nSocks > 0) { + int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks)); + while (chunkOffset < r->size) { + int chunkSize = std::min(taskSize, r->size-chunkOffset); + NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); + chunkOffset += chunkSize; + } + } + r->nSubs = i; + } + if (r->used == 2) { // already exchanged size + if (r->nSubs > 0) { + int nCompleted = 0; + for (int i=0; i<r->nSubs; i++) { + struct ncclSocketTask* sub = r->tasks[i]; + if (sub->result != ncclSuccess) return sub->result; + if (sub->offset == sub->size) nCompleted++; + } + if (nCompleted == r->nSubs) { + if (size) *size = r->size; + *done = 1; + r->used = 0; + for (int i=0; i<r->nSubs; i++) { + struct ncclSocketTask* sub = r->tasks[i]; + sub->used = 0; + } + } + } else { // progress request using main thread + if (r->offset < r->size) { + NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->data, r->size, &r->offset)); + } + if (r->offset == r->size) { + if (size) *size = r->size; + *done = 1; + r->used = 0; + } + } + } + return ncclSuccess; +} + +ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { + return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess; +} +ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } + +ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { + struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm; + NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclSocketRequest**)request)); + return ncclSuccess; +} + +ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { + struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm; + NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data, size, (struct ncclSocketRequest**)request)); + return ncclSuccess; +} + +ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) { + // We don't support CUDA pointers, so we don't need a flush operation + return ncclInternalError; +} + +ncclResult_t ncclSocketCloseListen(void* opaqueComm) { + struct ncclSocketListenComm* comm = (struct ncclSocketListenComm*)opaqueComm; + if (comm) { + if (comm->fd != -1) close(comm->fd); + free(comm); + } + return ncclSuccess; +} + +ncclResult_t ncclSocketClose(void* opaqueComm) { + struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm; + if (comm) { + for (int i=0; i<comm->nThreads; i++) { + struct ncclSocketThreadResources* res = comm->threadResources+i; + if (comm->helperThread[i]) { + pthread_mutex_lock(&res->threadLock); + res->state = stop; + pthread_cond_signal(&res->threadCond); + pthread_mutex_unlock(&res->threadLock); + pthread_join(comm->helperThread[i], NULL); + } + free(res->threadTaskQueue.tasks); + } + if (comm->ctrlFd != -1) close(comm->ctrlFd); + for (int i=0; i<comm->nSocks; i++) { + if (comm->fds[i] != -1) close(comm->fds[i]); + } + free(comm); + } + return ncclSuccess; +} + +ncclNet_t ncclNetSocket = { + "Socket", + ncclSocketInit, + ncclSocketDevices, + ncclSocketPciPath, + ncclSocketPtrSupport, + ncclSocketListen, + ncclSocketConnect, + ncclSocketAccept, + ncclSocketRegMr, + ncclSocketDeregMr, + ncclSocketIsend, + ncclSocketIrecv, + ncclSocketFlush, + ncclSocketTest, + ncclSocketClose, + ncclSocketClose, + ncclSocketCloseListen +}; diff --git a/src/transport/net_socket.cu b/src/transport/net_socket.cu deleted file mode 100644 index 0464b43..0000000 --- a/src/transport/net_socket.cu +++ /dev/null @@ -1,267 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "nccl.h" -#include "core.h" -#include "socket.h" -#include "net.h" - -#include <assert.h> -#include <pthread.h> -#include <stdio.h> -#include <stdlib.h> -#include <poll.h> -#include <limits.h> - -/* Init functions */ -static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS]; -static union socketAddress ncclNetIfAddrs[MAX_IFS]; -static int ncclNetIfs = -1; -pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER; - -ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) { - if (ncclNetIfs == -1) { - pthread_mutex_lock(&ncclSocketLock); - if (ncclNetIfs == -1) { - ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS); - if (ncclNetIfs <= 0) { - WARN("NET/Socket : no interface found"); - return ncclInternalError; - } else { - char line[1024]; - char addrline[1024]; - line[0] = '\0'; - for (int i=0; i<ncclNetIfs; i++) { - snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE, - socketToString(&ncclNetIfAddrs[i].sa, addrline)); - } - line[1023] = '\0'; - INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line); - } - } - pthread_mutex_unlock(&ncclSocketLock); - } - return ncclSuccess; -} - -ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) { - *supportedTypes = NCCL_PTR_HOST; - return ncclSuccess; -} - -ncclResult_t ncclSocketDevices(int* ndev) { - *ndev = ncclNetIfs; - return ncclSuccess; -} - -ncclResult_t ncclSocketPciPath(int dev, char** path) { - char devicepath[PATH_MAX]; - snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE); - *path = realpath(devicepath, NULL); - if (*path == NULL) { - INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath); - return ncclSystemError; - } - return ncclSuccess; -} - -static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) { - if (dev >= ncclNetIfs) return ncclInternalError; - memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr)); - return ncclSuccess; -} - -/* Communication functions */ - -struct ncclSocketHandle { - union socketAddress connectAddr; -}; - -struct ncclSocketRequest { - int op; - void* data; - int size; - int fd; - int offset; - int used; -}; - -struct ncclSocketReqs { - struct ncclSocketRequest* requests; -}; - -struct ncclSocketComm { - int fd; - struct ncclSocketReqs reqs; -}; - -ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) { - NCCLCHECK(ncclCalloc(comm, 1)); - (*comm)->fd = -1; - return ncclSuccess; -} - -ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str) { - struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; - NCCLCHECK(GetSocketAddrFromString(&(handle->connectAddr), str)); - return ncclSuccess; -} - -ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) { - struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; - static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large"); - // if dev >= 0, listen based on dev - if (dev >= 0) { - NCCLCHECK(GetSocketAddr(dev, &(handle->connectAddr))); - } else if (dev == findSubnetIf) { - // handle stores a remote address - // need to find a local addr that is in the same network as the remote addr - union socketAddress localAddr; - char ifName[MAX_IF_NAME_SIZE]; - if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) { - WARN("NET/Socket : No usable listening interface found"); - return ncclSystemError; - } - // pass the local address back - memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr)); - } // Otherwise, handle stores a local address - struct ncclSocketComm* comm; - NCCLCHECK(ncclSocketNewComm(&comm)); - NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr)); - *listenComm = comm; - return ncclSuccess; -} - -ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) { - struct ncclSocketComm* comm; - NCCLCHECK(ncclSocketNewComm(&comm)); - struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; - NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr)); - *sendComm = comm; - return ncclSuccess; -} - -ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) { - struct ncclSocketComm* lComm = (struct ncclSocketComm*)listenComm; - struct ncclSocketComm* rComm; - NCCLCHECK(ncclSocketNewComm(&rComm)); - struct sockaddr_in sockaddr; - socklen_t socklen = sizeof(struct sockaddr_in); - SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd); - *recvComm = rComm; - return ncclSuccess; -} - -#define MAX_REQUESTS 128 - -ncclResult_t ncclSocketGetRequest(struct ncclSocketReqs* reqs, int op, void* data, int size, int fd, struct ncclSocketRequest** req) { - if (reqs->requests == NULL) { - NCCLCHECK(ncclCalloc(&reqs->requests, MAX_REQUESTS)); - } - for (int i=0; i<MAX_REQUESTS; i++) { - struct ncclSocketRequest* r = reqs->requests+i; - if (r->used == 0) { - r->op = op; - r->data = data; - r->size = size; - r->fd = fd; - r->offset = -1; - r->used = 1; - *req = r; - return ncclSuccess; - } - } - WARN("Socket : unable to allocate requests"); - return ncclInternalError; -} - -ncclResult_t ncclSocketTest(void* request, int* done, int* size) { - *done = 0; - struct ncclSocketRequest *r = (struct ncclSocketRequest*)request; - if (r == NULL) { - WARN("NET/Socket : test called with NULL request"); - return ncclInternalError; - } - if (r->offset == -1) { /* try to send/recv size */ - int data = r->size; - int offset = 0; - NCCLCHECK(socketProgress(r->op, r->fd, &data, sizeof(int), &offset)); - - if (offset == 0) return ncclSuccess; /* Not ready -- retry later */ - - // Not sure we could ever receive less than 4 bytes, but just in case ... - if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->fd, &data, sizeof(int), &offset)); - - // Check size is less or equal to the size provided by the user - if (r->op == NCCL_SOCKET_RECV && data > r->size) { - WARN("NET/Socket : message truncated : receiving %d bytes instead of %d", data, r->size); - return ncclInternalError; - } - r->size = data; - r->offset = 0; - } - if (r->offset < r->size) { - NCCLCHECK(socketProgress(r->op, r->fd, r->data, r->size, &r->offset)); - } - if (r->offset == r->size) { - if (size) *size = r->size; - *done = 1; - r->used = 0; - } - return ncclSuccess; -} - -ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { - return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess; -} -ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } - -ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { - struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm; - NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_SEND, data, size, comm->fd, (struct ncclSocketRequest**)request)); - return ncclSuccess; -} - -ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { - struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm; - NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_RECV, data, size, comm->fd, (struct ncclSocketRequest**)request)); - return ncclSuccess; -} - -ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) { - // We don't support CUDA pointers, so we don't need a flush operation - return ncclInternalError; -} - -ncclResult_t ncclSocketClose(void* opaqueComm) { - struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm; - if (comm) { - free(comm->reqs.requests); - close(comm->fd); - free(comm); - } - return ncclSuccess; -} - -ncclNet_t ncclNetSocket = { - "Socket", - ncclSocketInit, - ncclSocketDevices, - ncclSocketPciPath, - ncclSocketPtrSupport, - ncclSocketListen, - ncclSocketConnect, - ncclSocketAccept, - ncclSocketRegMr, - ncclSocketDeregMr, - ncclSocketIsend, - ncclSocketIrecv, - ncclSocketFlush, - ncclSocketTest, - ncclSocketClose, - ncclSocketClose, - ncclSocketClose -}; diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc new file mode 100644 index 0000000..0cc92f3 --- /dev/null +++ b/src/transport/p2p.cc @@ -0,0 +1,312 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "graph.h" +#include "utils.h" + +struct p2pConnectInfo { + int direct; + union { + void* directPtr; + cudaIpcMemHandle_t devIpc; + }; +}; + +struct p2pSendResources { + struct ncclSendMem* devMem; + void* ipcPtr; +}; + +struct p2pRecvResources { + struct ncclRecvMem* devMem; + void* ipcPtr; +}; + +#include <sys/types.h> + +NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2); +NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2); + +/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */ +static int busIdToCudaDev(int64_t busId) { + int ndev; + if (cudaGetDeviceCount(&ndev) != cudaSuccess) + return -1; + for (int i = 0; i < ndev; i++) { + char devBusIdStr[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + if (cudaDeviceGetPCIBusId(devBusIdStr, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess) + return -1; + int64_t devBusId; + NCCLCHECK(busIdToInt64(devBusIdStr, &devBusId)); + if (busId == devBusId) return i; + } + // BusId was not found in our locally visible CUDA devices + return -1; +} + +/* Determine if two peers can communicate through p2p */ +ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { + int cpuCount; + NCCLCHECK(ncclTopoCpuCount(topo, &cpuCount)); + // Do not use P2P across sockets by default (provided CUDA permits it). + // When we are on a single socket, don't even use P2P through the CPU as + // it should be able to sustain two flows to sysmem faster than PCI P2P. + int p2pLevel = cpuCount == 1 ? PATH_PHB : PATH_NODE; + if (ncclParamP2pDisable() == 1) p2pLevel = 0; + if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel(); + + // Disable P2P + *ret = 0; + + if (p2pLevel == 0) return ncclSuccess; + + // Rule out different nodes + if (info1->hostHash != info2->hostHash) return ncclSuccess; + + // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) + int cudaDev1 = busIdToCudaDev(info1->busId); + int cudaDev2 = busIdToCudaDev(info2->busId); + if (cudaDev1 == -1 || cudaDev2 == -1) { + // Peer's CUDA device is not visible in this process +#if CUDART_VERSION >= 10010 + // But in CUDA 10.1 we can still communicate with 'invisible' devices + TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %lx and %lx", info1->busId, info2->busId); + // Check for NVLink/NVswitch including P2P access + int nvlink; + NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink)); + if (nvlink > 0) { + *ret = 1; + return ncclSuccess; + } +#endif + return ncclSuccess; + } + + TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%lx] and [%d=%lx]", cudaDev1, info1->busId, cudaDev2, info2->busId); + + // Do not detect topology if we're on the same GPU. Note this is not really supported. + if (cudaDev1 == cudaDev2) { + *ret = 1; + return ncclSuccess; + } + + // See if CUDA can do P2P + int p2p; + if (cudaDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != cudaSuccess) { + INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)", + cudaDev1, info1->busId, cudaDev2, info2->busId); + return ncclSuccess; + } + if (p2p == 0) return ncclSuccess; + + // Check for NVLink/NVswitch + int nvlink; + NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink)); + if (nvlink > 0) { + *ret = 1; + return ncclSuccess; + } + + // Finally compute the PCI distance and compare with the p2pLevel. + int distance; + NCCLCHECK(ncclTopoGpuDistance(topo, info1->busId, info2->busId, &distance)); + if (distance < p2pLevel) { + *ret = 1; + } + return ncclSuccess; +} + +#define TRACE_DUMP_IPC(DEVIPC) \ + do { \ + unsigned long *devIpc = (unsigned long *) (DEVIPC); \ + TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[0], devIpc[1], devIpc[2], devIpc[3]); \ + TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \ + } while (0) + +/* Send: Create and return connect structures for this peer to connect to me */ +ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, + struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { + + struct p2pSendResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + send->transportResources = resources; + int sendSize = sizeof(struct ncclSendMem); + ALIGN_SIZE(sendSize, CUDA_IPC_MIN); + NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize)); + + struct p2pConnectInfo info; + if (myInfo->pidHash == peerInfo->pidHash) { + info.direct = 1; + info.directPtr = resources->devMem; + if (myInfo->cudaDev == peerInfo->cudaDev) { + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + return ncclInternalError; + } else { + // Enable P2P access + cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0); + if (err == cudaErrorPeerAccessAlreadyEnabled) { + cudaGetLastError(); + } else if (err != cudaSuccess) { + WARN("failed to peer with device %d(=%lx): %d %s", + peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err)); + return ncclInternalError; + } + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer", + channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); + } + } else { + // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) + int peerCudaDev = busIdToCudaDev(peerInfo->busId); + info.direct = 0; + // Map IPC and enable P2P access + cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem); + if (err != cudaSuccess) { + WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s", + myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err)); + return ncclInternalError; + } + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] -> %d[%lx] via P2P/IPC", + channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); + //TRACE_DUMP_IPC(&info.devIpc); + } + static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); + memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo)); + return ncclSuccess; +} + +/* Create and return connect structures for this peer to connect to me */ +ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, + struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) { + + struct p2pRecvResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + recv->transportResources = resources; + int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; + ALIGN_SIZE(recvSize, CUDA_IPC_MIN); + NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize)); + + struct p2pConnectInfo info; + if (myInfo->pidHash == peerInfo->pidHash) { + info.direct = 1; + info.directPtr = resources->devMem; + if (myInfo->cudaDev == peerInfo->cudaDev) { + TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank); + } else { + // Enable P2P access + cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0); + if (err == cudaErrorPeerAccessAlreadyEnabled) { + cudaGetLastError(); + } else if (err != cudaSuccess) { + WARN("failed to peer with device %d(=%lx): %d %s", + peerInfo->cudaDev, peerInfo->busId, err, cudaGetErrorString(err)); + return ncclInternalError; + } + TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/direct pointer", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); + } + } else { + // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) + int peerCudaDev = busIdToCudaDev(peerInfo->busId); + info.direct = 0; + // Map IPC and enable P2P access + cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem); + if (err != cudaSuccess) { + WARN("rank %d failed to get CUDA IPC handle to device %d(=%lx) : %d %s", + myInfo->rank, peerCudaDev, peerInfo->busId, err, cudaGetErrorString(err)); + return ncclInternalError; + } + TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%lx] <- %d[%lx] via P2P/IPC", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); + //TRACE_DUMP_IPC(&info.devIpc); + } + static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); + memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo)); + return ncclSuccess; +} + +/* Connect/Send to this peer */ +static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) { + struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources; + struct ncclRecvMem* remDevMem; + struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; + if (info->direct) { + remDevMem = (struct ncclRecvMem*)(info->directPtr); + send->conn.direct = 1; + } else { + //TRACE_DUMP_IPC(&info->devIpc); + cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess); + remDevMem = (struct ncclRecvMem*)resources->ipcPtr; + if (err != cudaSuccess) { + WARN("failed to open CUDA IPC handle : %d %s", + err, cudaGetErrorString(err)); + return ncclUnhandledCudaError; + } + } + + send->conn.buff = remDevMem->buff; + send->conn.llBuff = remDevMem->llBuff; + send->conn.ll128Buff = remDevMem->ll128Buff; + send->conn.tail = &remDevMem->tail; + send->conn.opCountRem = &remDevMem->opCount; + send->conn.head = &resources->devMem->head; + send->conn.ptrExchange = &resources->devMem->ptrExchange; + send->conn.opCountLoc = &resources->devMem->opCount; + return ncclSuccess; +} + +/* Connect/Recv from this peer */ +ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) { + struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources; + struct ncclSendMem* remDevMem; + struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; + if (info->direct) { + remDevMem = (struct ncclSendMem*)(info->directPtr); + recv->conn.direct = 1; + recv->conn.ptrExchange = &remDevMem->ptrExchange; + } else { + //TRACE_DUMP_IPC(&info->devIpc); + cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess); + remDevMem = (struct ncclSendMem*)resources->ipcPtr; + if (err != cudaSuccess) { + WARN("failed to open CUDA IPC handle : %d %s", + err, cudaGetErrorString(err)); + return ncclUnhandledCudaError; + } + } + + recv->conn.buff = resources->devMem->buff; + recv->conn.llBuff = resources->devMem->llBuff; + recv->conn.ll128Buff = resources->devMem->ll128Buff; + recv->conn.tail = &resources->devMem->tail; + recv->conn.opCountLoc = &resources->devMem->opCount; + recv->conn.head = &remDevMem->head; + recv->conn.opCountRem = &remDevMem->opCount; + return ncclSuccess; +} + +ncclResult_t p2pSendFree(void* resources) { + struct p2pSendResources* sendRes = (struct p2pSendResources*)resources; + if (sendRes->ipcPtr) + CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr)); + CUDACHECK(cudaFree(sendRes->devMem)); + free(sendRes); + return ncclSuccess; +} + +ncclResult_t p2pRecvFree(void* resources) { + struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources; + if (recvRes->ipcPtr) + CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr)); + CUDACHECK(cudaFree(recvRes->devMem)); + free(recvRes); + return ncclSuccess; +} + +struct ncclTransport p2pTransport = { + "P2P", + p2pCanConnect, + { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL }, + { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL } +}; diff --git a/src/transport/p2p.cu b/src/transport/p2p.cu deleted file mode 100644 index 9f3e0b6..0000000 --- a/src/transport/p2p.cu +++ /dev/null @@ -1,620 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "utils.h" -#include "topo.h" -#include "transport.h" -#include "param.h" -#include <unistd.h> -#include <cuda_runtime.h> -#include <ctype.h> -#include "nvlink.h" - -struct p2pConnectInfo { - int direct; - union { - void* directPtr; - cudaIpcMemHandle_t devIpc; - }; -}; - -struct p2pSendResources { - struct ncclSendMem* devMem; - void* ipcPtr; -}; - -struct p2pRecvResources { - struct ncclRecvMem* devMem; - void* ipcPtr; -}; - -#include <sys/types.h> - -NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2); -NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2); - -/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */ -static int busIdToCudaDev(const char* busId) { - int ndev; - if (cudaGetDeviceCount(&ndev) != cudaSuccess) - return -1; - for (int i = 0; i < ndev; i++) { - char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - if (cudaDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess) - return -1; - if (strcmp(busId, devBusId) == 0) { - return i; - } - } - // BusId was not found in our locally visible CUDA devices - return -1; -} - -/* Determine if we can communicate with the peer through p2p */ -ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { - // Do not use P2P across root complexes by default (provided CUDA permits it) - int p2pLevel = PATH_SOC; - if (ncclParamP2pDisable() == 1) p2pLevel = 0; - if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel(); - - *ret = 0; - - if (p2pLevel == 0) return ncclSuccess; - - // Rule out different nodes - if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess; - - // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) - int peerCudaDev = busIdToCudaDev(peerInfo->busId); - if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process - - TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev); - - // Do not detect topology if we're on the same GPU. Note this is not really supported. - if (myInfo->cudaDev == peerCudaDev) { - *ret = 1 + PATH_SOC; - return ncclSuccess; - } - - // See if CUDA can do P2P - int p2p; - if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != cudaSuccess) { - INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)", - myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev); - return ncclSuccess; - } - if (p2p == 0) return ncclSuccess; - - // Check for NVLink/NVswitch - int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId); - if (nvlinkp2p > 0) { - *ret = nvlinkp2p; - return ncclSuccess; - } - - // Finally compute the PCI distance and compare with the p2pLevel. - char* myPath; - char* peerPath; - ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath); - ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath); - if (err1 == ncclSuccess && err2 == ncclSuccess) { - int distance = pciDistance(myPath, peerPath); - if (distance < p2pLevel) { - *ret = 1 + PATH_SOC - distance; - } - } - if (err1 == ncclSuccess) free(myPath); - if (err2 == ncclSuccess) free(peerPath); - return ncclSuccess; -} - -static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) { - int nrings = 0; - ncclTvalue_t* line = matrix+current*n; - inTheRing[current] = 1; - int currentStep = (currentRing+1)*n-remaining; - rings[currentStep-1] = current; - if (remaining == 0) { - int looprank = rings[currentRing*n]; - if (line[looprank] > 0) { - if (currentRing+1 == nRingsMax) { - nrings = 1; - } else { - line[looprank]--; - for (int i=0; i<n; i++) inTheRing[i] = 0; - if (connect) { - // First two slots are already set and we need to respect those constraints - inTheRing[rings[currentStep]] = 1; - nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, rings[currentStep+1], n-2, connect); - } else { - rings[(currentRing+1)*n] = 0; - nrings = 1 + computeRingsRec(matrix, n, rings, currentRing+1, nRingsMax, inTheRing, 0, n-1, connect); - } - line[looprank]++; - for (int i=0; i<n; i++) inTheRing[i] = 1; - } - } - } else { - int ringsSave[nRingsMax*n]; - int maxStep = 0; - for (int i=0; i<n; i++) { - if (inTheRing[i] == 0 && line[i] > 0) { - line[i]--; - int nr = computeRingsRec(matrix, n, rings, currentRing, nRingsMax, inTheRing, i, remaining-1, connect); - if (nr > nrings) { - nrings = nr; - maxStep = (nr+currentRing)*n; - ringsSave[currentStep] = i; - // Save the rest of the rings - for (int r=currentStep+1; r<maxStep; r++) { - ringsSave[r] = rings[r]; - } - if (nrings + currentRing == nRingsMax) { - // We found an optimal solution. Let's stop there. - break; - } - } - line[i]++; - } - } - for (int r=currentStep; r<maxStep; r++) { - rings[r] = ringsSave[r]; - } - } - inTheRing[current] = 0; - return nrings; -} - -static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) { - if (nrings == 0) return 0; - // Copy rings by dup times - if (newNrings > MAXCHANNELS) { - newNrings = MAXCHANNELS; - } - for (int r=nrings; r<newNrings; r++) { - for (int i=0; i<nranks; i++) rings[r*nranks+i] = rings[(r%nrings)*nranks+i]; - } - return newNrings; -} - -int p2pComputeRingsNvLink(ncclTvalue_t* matrix, int nranks, int *rings, int nringsMax, int connect) { - int* inTheRing = (int*)malloc(sizeof(int)*nranks); - if (inTheRing == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*nranks); return 0; } - for (int i=0; i<nranks; i++) inTheRing[i] = 0; - int nrings; - if (connect) { - inTheRing[rings[0]] = 1; - nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, rings[1], nranks-2, connect); - } else { - rings[0] = 0; - nrings = computeRingsRec(matrix, nranks, rings, 0, nringsMax, inTheRing, 0, nranks-1, connect); - } - free(inTheRing); - return nrings; -} - -static inline int findConnect(int nranks, int* ranks) { - for (int i = 0; i<nranks; i++) { - if (ranks[i] != -1) return i; - } - return -1; -} - -int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int oversubscribe, int* nthreads) { - if (nrings == 0) return 0; - if (nrings > MAXCHANNELS) { - WARN("Max rings reached, limiting to %d", MAXCHANNELS); - nrings = MAXCHANNELS; - } - // Find existing constraints / connections - int connect = 0; - for (int r=0; r<nrings; r++) { - int start = findConnect(nranks, prev+r*nranks); - int end = findConnect(nranks, next+r*nranks); - if (start != -1 && end != -1) { - rings[r*nranks] = end; - rings[r*nranks+1] = start; - connect = 1; - } - } - - // Compute rings - ncclTvalue_t* matrix = (ncclTvalue_t*)malloc(sizeof(ncclTvalue_t)*nranks*nranks); - if (matrix == NULL) { WARN("malloc of %ld bytes failed", sizeof(ncclTvalue_t)*nranks*nranks); return 0; } - for (int i=0; i<nranks; i++) for (int j=0; j<nranks; j++) - matrix[i*nranks+j] = oversubscribe ? values[i*nranks+j]/CONNECT_NVLINK*2 : values[i*nranks+j]/CONNECT_NVLINK ; - - int compNrings = p2pComputeRingsNvLink(matrix, nranks, rings, nrings, connect); - - free(matrix); - - if (oversubscribe || connect) return compNrings; - - if (compNrings && compNrings < nrings && nranks <= 4) { - // Try to oversubscribe to get a better result - int *rings2 = (int *)malloc(sizeof(int)*MAXCHANNELS*nranks); - if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXCHANNELS*nranks); return 0; } - for (int i=0; i<MAXCHANNELS*nranks; i++) rings2[i] = -1; - int nThreads = *nthreads; - int compNrings2 = p2pComputeRingsNvLink(values, nranks, rings2, nrings, prev, next, 1, &nThreads); - if (compNrings2 > compNrings*2) { - // Oversubscription worked. - for (int i=0; i<compNrings2*nranks; i++) rings[i] = rings2[i]; - compNrings = compNrings2; - } - free(rings2); - } - - // Duplicate the rings for direct NVLink - compNrings = copyRings(nranks, rings, compNrings, compNrings*2); - - return compNrings; -} - -int p2pComputeRingsSeqConnect(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) { - int nrings = nringsStart; - int connect = 0; - for (int r=0; r<nrings; r++) { - int start = findConnect(nranks, prev+r*nranks); - int end = findConnect(nranks, next+r*nranks); - if (start != -1 && end != -1) { - rings[r*nranks] = end; - rings[r*nranks+1] = start; - int cur = start; - for (int i=2; i<nranks; i++) { - int next = (cur+1) % nranks; - while (next == end || next == start) next = (next+1) % nranks; - if (values[cur*nranks+next] < minScore) { - return 0; - } - rings[r*nranks+i] = next; - cur = next; - } - connect = 1; - } else { - if (connect == 1 && r > 0) { - WARN("Connecting rings but did not find start/end for ring %d. Disabling other rings.", r); - return r; - } else { - return 0; - } - } - } - return nrings; -} - -int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nringsStart, int* prev, int* next, int minScore, int* nthreads) { - for (int r=0; r<nringsStart; r++) { - for (int i=0; i<nranks; i++) { - rings[r*nranks+i] = i; - } - } - return nringsStart; -} - -static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) { - for (int score = PATH_SOC+1; score >= minScore; score--) { - int best = -1; - int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end - for (int n = 0; n < nranks; n++) { - if (inRing[n]) continue; - if (values[rank*nranks+n] == score) { - if (end == -1) return n; - if (values[end*nranks+n] < worst_end_score) { - best = n; - worst_end_score = values[end*nranks+n]; - } - } - } - if (best != -1) return best; - } - return -1; -} - -int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings, int* prev, int* next, int minScore) { - int connect = 0; - for (int r=0; r<nrings; r++) { - int start = findConnect(nranks, prev+r*nranks); - int end = findConnect(nranks, next+r*nranks); - - int inRing[nranks]; - for (int i=0; i<nranks; i++) inRing[i] = 0; - - if (start == -1 && end == -1) { - if (connect == 1 && r > 0) { - WARN("Connecting ring %d : did not find start/end. Disabling other rings.", r); - return r; - } - end = 0; - inRing[end] = 1; - start = findClosestPci(values, inRing, end, -1, nranks, minScore); - if (start == -1) return r; - } else if (start == -1 || end == -1) { - WARN("Connecting ring %d : inconsistent start/end. Disabling other rings.", r); - return r; - } else { - connect = 1; - } - rings[r*nranks] = end; - rings[r*nranks+1] = start; - inRing[start] = inRing[end] = 1; - int cur = start; - for (int i=2; i<nranks; i++) { - int next = findClosestPci(values, inRing, cur, end, nranks, minScore); - if (next == -1) return r; - - inRing[next] = 1; - rings[r*nranks+i] = next; - cur = next; - } - // Check the loop is closing - inRing[end] = 0; - if (findClosestPci(values, inRing, cur, end, nranks, minScore) != end) return r; - - if (connect == 0) return 1; - } - return nrings; -} - -ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) { - if (*nringsRet == 0) return ncclSuccess; - int *rings; - NCCLCHECK(ncclCalloc(&rings, MAXCHANNELS*nranks)); - for (int i=0; i<MAXCHANNELS*nranks; i++) rings[i] = -1; - int nrings = *nringsRet; - - // NVswitch - int nvswitchLinks = 0; - int directLinks = 0; - for (int rank=0; rank<nranks; rank++) { - for (int j=1; j<nranks; j++) { - int i = (rank + j) % nranks; - ncclTvalue_t links = values[rank*nranks+i]/CONNECT_NVSWITCH; - if (j>1 && links != nvswitchLinks) { - WARN("Internal error : NVswitch links mismatch"); - return ncclInternalError; - } - nvswitchLinks = links; - } - } - if (nvswitchLinks) { - // NVSwitch : Connect existing rings - int nringsConnected = p2pComputeRingsSeqConnect(values, nranks, rings, nrings, prev, next, minScore, nthreads); - if (nringsConnected > 0) { - nrings = nringsConnected; - } else { - nrings = std::min(nrings, nvswitchLinks); // NVSwitch: Limit rings to number of NVLinks - // Or create new ones - nrings = p2pComputeRingsSeqNew(values, nranks, rings, nrings, prev, next, minScore, nthreads); - // And duplicate them - nrings = copyRings(nranks, rings, nrings, nrings*2); - } - goto end; - } - - // point-to-point NVLink - for (int rank=0; rank<nranks; rank++) { - int links = 0; - for (int i=0; i<nranks; i++) { - ncclTvalue_t val = values[rank*nranks+i]; - if (val >= CONNECT_NVSWITCH) continue; - links += val/CONNECT_NVLINK; - } - if (rank == 0) directLinks = links; - else directLinks = std::min(directLinks, links); - } - if (directLinks > 0) { - // NVLink : Connect rings or create new ones - nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads); - goto end; - } - - // PCIe or QPI : Connect rings or create new ones - nrings = p2pComputeRingsPci(values, nranks, rings, *nringsRet, prev, next, minScore); - -end: - *nringsRet = nrings; - for (int ring = 0; ring<nrings; ring++) { - for (int index=0; index<nranks; index++) { - int prevIndex = (index - 1 + nranks) % nranks; - int nextIndex = (index + 1) % nranks; - int curRank = rings[ring*nranks+index]; - int prevRank = rings[ring*nranks+prevIndex]; - int nextRank = rings[ring*nranks+nextIndex]; - if (prev[ring*nranks+curRank] == -1) prev[ring*nranks+curRank] = prevRank; - if (next[ring*nranks+curRank] == -1) next[ring*nranks+curRank] = nextRank; - } - } - - free(rings); - return ncclSuccess; -} - -#define TRACE_DUMP_IPC(DEVIPC) \ - do { \ - unsigned long *devIpc = (unsigned long *) (DEVIPC); \ - TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[0], devIpc[1], devIpc[2], devIpc[3]); \ - TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \ - } while (0) - -/* Send: Create and return connect structures for this peer to connect to me */ -ncclResult_t p2pSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, - struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { - - struct p2pSendResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - send->transportResources = resources; - const int sendSize = sizeof(struct ncclSendMem); - NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize)); - - struct p2pConnectInfo info; - if (myInfo->pidHash == peerInfo->pidHash) { - info.direct = 1; - info.directPtr = resources->devMem; - if (myInfo->cudaDev == peerInfo->cudaDev) { - INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank); - } else { - // Enable P2P access - cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0); - if (err == cudaErrorPeerAccessAlreadyEnabled) { - cudaGetLastError(); - } else if (err != cudaSuccess) { - WARN("failed to peer with device %d(=%d): %d %s", - peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err)); - return ncclInternalError; - } - INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer", - channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); - } - } else { - // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) - int peerCudaDev = busIdToCudaDev(peerInfo->busId); - info.direct = 0; - // Map IPC and enable P2P access - cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem); - if (err != cudaSuccess) { - WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s", - myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err)); - return ncclInternalError; - } - INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC", - channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); - //TRACE_DUMP_IPC(&info.devIpc); - } - static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); - memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo)); - return ncclSuccess; -} - -/* Create and return connect structures for this peer to connect to me */ -ncclResult_t p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, - struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) { - - struct p2pRecvResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - recv->transportResources = resources; - const int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; - NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize)); - - struct p2pConnectInfo info; - if (myInfo->pidHash == peerInfo->pidHash) { - info.direct = 1; - info.directPtr = resources->devMem; - if (myInfo->cudaDev == peerInfo->cudaDev) { - TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank); - } else { - // Enable P2P access - cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0); - if (err == cudaErrorPeerAccessAlreadyEnabled) { - cudaGetLastError(); - } else if (err != cudaSuccess) { - WARN("failed to peer with device %d(=%d): %d %s", - peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err)); - return ncclInternalError; - } - TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); - } - } else { - // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) - int peerCudaDev = busIdToCudaDev(peerInfo->busId); - info.direct = 0; - // Map IPC and enable P2P access - cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem); - if (err != cudaSuccess) { - WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s", - myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err)); - return ncclInternalError; - } - TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); - //TRACE_DUMP_IPC(&info.devIpc); - } - static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); - memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo)); - return ncclSuccess; -} - -/* Connect/Send to this peer */ -static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) { - struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources; - struct ncclRecvMem* remDevMem; - struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; - if (info->direct) { - remDevMem = (struct ncclRecvMem*)(info->directPtr); - send->conn.direct = 1; - } else { - //TRACE_DUMP_IPC(&info->devIpc); - cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess); - remDevMem = (struct ncclRecvMem*)resources->ipcPtr; - if (err != cudaSuccess) { - WARN("failed to open CUDA IPC handle : %d %s", - err, cudaGetErrorString(err)); - return ncclUnhandledCudaError; - } - } - - send->conn.buff = remDevMem->buff; - send->conn.llBuff = remDevMem->llBuff; - send->conn.tail = &remDevMem->tail; - send->conn.opCountRem = &remDevMem->opCount; - send->conn.head = &resources->devMem->head; - send->conn.ptrExchange = &resources->devMem->ptrExchange; - send->conn.opCountLoc = &resources->devMem->opCount; - return ncclSuccess; -} - -/* Connect/Recv from this peer */ -ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) { - struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources; - struct ncclSendMem* remDevMem; - struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; - if (info->direct) { - remDevMem = (struct ncclSendMem*)(info->directPtr); - recv->conn.direct = 1; - recv->conn.ptrExchange = &remDevMem->ptrExchange; - } else { - //TRACE_DUMP_IPC(&info->devIpc); - cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess); - remDevMem = (struct ncclSendMem*)resources->ipcPtr; - if (err != cudaSuccess) { - WARN("failed to open CUDA IPC handle : %d %s", - err, cudaGetErrorString(err)); - return ncclUnhandledCudaError; - } - } - - recv->conn.buff = resources->devMem->buff; - recv->conn.llBuff = resources->devMem->llBuff; - recv->conn.tail = &resources->devMem->tail; - recv->conn.opCountLoc = &resources->devMem->opCount; - recv->conn.head = &remDevMem->head; - recv->conn.opCountRem = &remDevMem->opCount; - return ncclSuccess; -} - -ncclResult_t p2pSendFree(void* resources) { - struct p2pSendResources* sendRes = (struct p2pSendResources*)resources; - if (sendRes->ipcPtr) - CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr)); - CUDACHECK(cudaFree(sendRes->devMem)); - return ncclSuccess; -} - -ncclResult_t p2pRecvFree(void* resources) { - struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources; - if (recvRes->ipcPtr) - CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr)); - CUDACHECK(cudaFree(recvRes->devMem)); - return ncclSuccess; -} - -struct ncclTransport p2pTransport = { - "P2P", - p2pCanConnect, - p2pGetRings, - { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL }, - { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL } -}; diff --git a/src/transport/shm.cc b/src/transport/shm.cc new file mode 100644 index 0000000..60f16c8 --- /dev/null +++ b/src/transport/shm.cc @@ -0,0 +1,175 @@ +/************************************************************************* + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "shm.h" + +struct shmConnectInfo { + uint64_t pidHash; + int id; + int sendRank; + int recvRank; + int shmSize; +}; + +struct shmSendResources { + int remShmSize; + struct ncclRecvMem* remHostMem; + struct ncclRecvMem* devRemHostMem; + int shmSize; + struct ncclSendMem* hostMem; + struct ncclSendMem* devHostMem; +}; + +struct shmRecvResources { + int remShmSize; + struct ncclSendMem* remHostMem; + struct ncclSendMem* devRemHostMem; + int shmSize; + struct ncclRecvMem* hostMem; + struct ncclRecvMem* devHostMem; +}; + +NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0); + +/* Determine two peers can communicate with SHM */ +ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { + *ret = 0; + + if (ncclParamShmDisable() == 1) return ncclSuccess; + + // Same host? + TRACE(NCCL_INIT|NCCL_SHM, "peer1 hostHash %lx peer2 hostHash %lx", info1->hostHash, info2->hostHash); + if (info1->hostHash != info2->hostHash) return ncclSuccess; + + // Common /dev/shm (between containers) ? + TRACE(NCCL_INIT|NCCL_SHM, "peer1 shmDev %lx peer2 shmDev %lx", info1->shmDev, info2->shmDev); + if (info1->shmDev != info2->shmDev) return ncclSuccess; + + *ret = 1; + + return ncclSuccess; +} + +#define MAX_SHM_NAME_LEN 1024 + +/* Create and return connect structures for this peer to connect to me */ +ncclResult_t shmSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { + + struct shmSendResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + send->transportResources = resources; + + struct shmConnectInfo info; + info.id = channelId; + info.pidHash = myInfo->pidHash; + info.sendRank = myInfo->rank; + info.recvRank = peerInfo->rank; + + char shmName[MAX_SHM_NAME_LEN]; + sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank); + info.shmSize = resources->shmSize = sizeof(struct ncclSendMem); + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); + NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); + + INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); + static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big"); + memcpy(connectInfo, &info, sizeof(struct shmConnectInfo)); + return ncclSuccess; +} + +ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { + struct shmRecvResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + recv->transportResources = resources; + + struct shmConnectInfo info; + info.id = channelId; + info.pidHash = myInfo->pidHash; + info.sendRank = peerInfo->rank; + info.recvRank = myInfo->rank; + + char shmName[MAX_SHM_NAME_LEN]; + sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank); + info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize; + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); + NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); + + static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big"); + memcpy(connectInfo, &info, sizeof(struct shmConnectInfo)); + return ncclSuccess; +} + +/* Connect to this peer */ +ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) { + // Setup device pointers + struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; + struct shmSendResources* resources = (struct shmSendResources*)send->transportResources; + + char shmName[MAX_SHM_NAME_LEN]; + sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank); + resources->remShmSize = info->shmSize; + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); + NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); + // Remove the file to ensure proper clean-up + NCCLCHECK(shmUnlink(shmName)); + + send->transportResources = resources; + send->conn.buff = resources->devRemHostMem->buff; + send->conn.llBuff = resources->devRemHostMem->llBuff; + send->conn.ll128Buff = resources->devRemHostMem->ll128Buff; + send->conn.tail = &resources->devRemHostMem->tail; + send->conn.opCountRem = &resources->devRemHostMem->opCount; + + send->conn.head = &resources->devHostMem->head; + send->conn.opCountLoc = &resources->devHostMem->opCount; + return ncclSuccess; +} + +ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) { + // Setup device pointers + struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; + struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; + + char shmName[MAX_SHM_NAME_LEN]; + sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank); + resources->remShmSize = info->shmSize; + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); + NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); + NCCLCHECK(shmUnlink(shmName)); + recv->conn.head = &resources->devRemHostMem->head; + recv->conn.opCountRem = &resources->devRemHostMem->opCount; + + recv->conn.buff = resources->devHostMem->buff; + recv->conn.llBuff = resources->devHostMem->llBuff; + recv->conn.ll128Buff = resources->devHostMem->ll128Buff; + recv->conn.tail = &resources->devHostMem->tail; + recv->conn.opCountLoc = &resources->devHostMem->opCount; + return ncclSuccess; +} + +ncclResult_t shmSendFree(void* transportResources) { + struct shmSendResources* resources = (struct shmSendResources*)transportResources; + NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); + NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); + free(resources); + return ncclSuccess; +} + +ncclResult_t shmRecvFree(void* transportResources) { + struct shmRecvResources* resources = (struct shmRecvResources*)transportResources; + NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); + NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); + free(resources); + return ncclSuccess; +} + +struct ncclTransport shmTransport = { + "SHM", + shmCanConnect, + { shmSendSetup, shmSendConnect, shmSendFree, NULL }, + { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL } +}; diff --git a/src/transport/shm.cu b/src/transport/shm.cu deleted file mode 100644 index 56e0242..0000000 --- a/src/transport/shm.cu +++ /dev/null @@ -1,245 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "core.h" -#include "utils.h" -#include "transport.h" -#include "param.h" -#include "shm.h" -#include <unistd.h> -#include <cuda_runtime.h> - -struct shmSendConnectInfo { - uint64_t pidHash; - int id; - int rank; - int shmSize; -}; - -struct shmRecvConnectInfo { - uint64_t pidHash; - int id; - int rank; - int shmSize; -}; - -struct shmSendResources { - int remShmSize; - struct ncclRecvMem* remHostMem; - struct ncclRecvMem* devRemHostMem; - int shmSize; - struct ncclSendMem* hostMem; - struct ncclSendMem* devHostMem; -}; - -struct shmRecvResources { - int remShmSize; - struct ncclSendMem* remHostMem; - struct ncclSendMem* devRemHostMem; - int shmSize; - struct ncclRecvMem* hostMem; - struct ncclRecvMem* devHostMem; -}; - -NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0); - -/* Determine if we can communicate with the peer */ -ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { - *ret = ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) ? 0 : 1; - return ncclSuccess; -} - -static inline int groupFirst(int nranks, int* groups, int group, int rankToAvoid) { - for (int rank = 0; rank<nranks; rank++) { - if ((groups[rank] == group) && (rank != rankToAvoid)) return rank; - } - return -1; -} - -static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid) { - for (int rank = nranks-1; rank>=0; rank--) { - if ((groups[rank] == group) && (rank != rankToAvoid)) return rank; - } - return -1; -} - -ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) { - if (*nringsRet == MAXCHANNELS) *nringsRet = 1; - int nGroups = groups[nranks-1] + 1; - int starts[nGroups]; - int ends[nGroups]; - for (int ring = 0; ring<*nringsRet; ring++) { - int startGroup = -1, endGroup = -1; - for (int group = 0; group<nGroups; group++) { - int start = -1; - int end = -1; - int nranksInGroup = 0; - for (int rank=0; rank<nranks; rank++) { - if (groups[rank] != group) continue; - nranksInGroup++; - if (prev[ring*nranks+rank] != -1) { - if (start != -1) { - WARN("Multiple starts found in group"); - } - start = rank; - startGroup = group; - } - if (next[ring*nranks+rank] != -1) { - if (end != -1) { - WARN("Multiple ends found in group"); - } - end = rank; - endGroup = group; - } - } - if (nranksInGroup == 1) { - start = end = groupFirst(nranks, groups, group, -1); - } else { - if (start == -1) - start = groupFirst(nranks, groups, group, end); - if (end == -1) - end = groupLast(nranks, groups, group, start); - } - if (start == -1 || end == -1) { - *nringsRet = ring; - return ncclSuccess; - } - starts[group] = start; - ends[group] = end; - } - if (endGroup == -1 || startGroup == -1) { - startGroup = 0; - endGroup = nGroups-1; - // Close the loop - next[ring*nranks+ends[endGroup]] = starts[startGroup]; - prev[ring*nranks+starts[startGroup]] = ends[endGroup]; - } - int group = startGroup; - for (int i=0; i<nGroups-2; i++) { - int nextGroup = (group+1)%nGroups; - if (nextGroup == endGroup) nextGroup = (nextGroup+1)%nGroups; - next[ring*nranks+ends[group]] = starts[nextGroup]; - prev[ring*nranks+starts[nextGroup]] = ends[group]; - group = nextGroup; - } - // Connect with the last - next[ring*nranks+ends[group]] = starts[endGroup]; - prev[ring*nranks+starts[endGroup]] = ends[group]; - } - return ncclSuccess; -} - -#define MAX_SHM_NAME_LEN 1024 - -/* Create and return connect structures for this peer to connect to me */ -ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { - - struct shmSendResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - send->transportResources = resources; - - struct shmRecvConnectInfo info; - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank); - info.shmSize = resources->shmSize = sizeof(struct ncclSendMem); - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); - NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); - - INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); - info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; - static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big"); - memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo)); - return ncclSuccess; -} - -ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { - struct shmRecvResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - recv->transportResources = resources; - - struct shmSendConnectInfo info; - - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank); - info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize; - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); - NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); - - info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; - static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big"); - memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo)); - return ncclSuccess; -} - -/* Connect to this peer */ -ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) { - // Setup device pointers - struct shmSendConnectInfo* info = (struct shmSendConnectInfo*)connectInfo; - struct shmSendResources* resources = (struct shmSendResources*)send->transportResources; - - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank); - resources->remShmSize = info->shmSize; - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); - NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); - // Remove the file to ensure proper clean-up - NCCLCHECK(shmUnlink(shmName)); - - send->transportResources = resources; - send->conn.buff = resources->devRemHostMem->buff; - send->conn.llBuff = resources->devRemHostMem->llBuff; - send->conn.tail = &resources->devRemHostMem->tail; - send->conn.opCountRem = &resources->devRemHostMem->opCount; - - send->conn.head = &resources->devHostMem->head; - send->conn.opCountLoc = &resources->devHostMem->opCount; - return ncclSuccess; -} - -ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) { - // Setup device pointers - struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; - struct shmRecvConnectInfo* info = (struct shmRecvConnectInfo*)connectInfo; - - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank); - resources->remShmSize = info->shmSize; - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); - NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); - NCCLCHECK(shmUnlink(shmName)); - recv->conn.head = &resources->devRemHostMem->head; - recv->conn.opCountRem = &resources->devRemHostMem->opCount; - - recv->conn.buff = resources->devHostMem->buff; - recv->conn.llBuff = resources->devHostMem->llBuff; - recv->conn.tail = &resources->devHostMem->tail; - recv->conn.opCountLoc = &resources->devHostMem->opCount; - return ncclSuccess; -} - -ncclResult_t shmSendFree(void* transportResources) { - struct shmSendResources* resources = (struct shmSendResources*)transportResources; - NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); - NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); - free(resources); - return ncclSuccess; -} - -ncclResult_t shmRecvFree(void* transportResources) { - struct shmRecvResources* resources = (struct shmRecvResources*)transportResources; - NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); - NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); - free(resources); - return ncclSuccess; -} - -struct ncclTransport shmTransport = { - "SHM", - shmCanConnect, - shmGetRings, - { shmSendSetup, shmSendConnect, shmSendFree, NULL }, - { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL } -}; |