diff options
70 files changed, 6366 insertions, 3343 deletions
diff --git a/makefiles/common.mk b/makefiles/common.mk index 64f8d2d..1a1c2b6 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -23,7 +23,6 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) #$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR}) - # You should define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \ @@ -39,7 +38,7 @@ CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 # Include Ampere support if we're using CUDA11 or above ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) - NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) $(CUDA11_GENCODE) $(CUDA11_PTX) + NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX) # Include Volta support if we're using CUDA9 or above else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0) NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) diff --git a/makefiles/version.mk b/makefiles/version.mk index 22bddce..e7fe35e 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 11 -NCCL_PATCH := 4 +NCCL_MINOR := 12 +NCCL_PATCH := 7 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index a548840..65c8b28 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # @@ -9,8 +9,8 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h -LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc \ - misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc \ +LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \ + misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc \ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \ collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \ graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc @@ -74,14 +74,14 @@ $(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB) ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) +null := +space := $(null) # +comma := , + $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB) @printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@ mkdir -p $(LIBDIR) - $(eval TMP := $(shell mktemp -d)) - cp $(LIBOBJ) $(TMP) - cd $(TMP) && ar x $(DEVICELIB) && cd - - ar cr $@ $(LIBOBJ) $(TMP)/*.o - rm -Rf $(TMP) + printf "create $@\naddlib $(DEVICELIB)\naddmod $(subst $(space),$(comma),$(strip $(LIBOBJ)))\nsave\nend" | ar -M $(PKGDIR)/nccl.pc : nccl.pc.in mkdir -p $(PKGDIR) diff --git a/src/bootstrap.cc b/src/bootstrap.cc index ae9da9b..db1e70e 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -9,13 +9,13 @@ #include "utils.h" #include "bootstrap.h" #include "net.h" -#include "socket.h" #include <unistd.h> #include <sys/types.h> +#include "proxy.h" /* Init functions */ static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1]; -static union socketAddress bootstrapNetIfAddr; +static union ncclSocketAddress bootstrapNetIfAddr; static int bootstrapNetInitDone = 0; pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; @@ -25,17 +25,17 @@ ncclResult_t bootstrapNetInit() { if (bootstrapNetInitDone == 0) { char* env = getenv("NCCL_COMM_ID"); if (env) { - union socketAddress remoteAddr; - if (GetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) { + union ncclSocketAddress remoteAddr; + if (ncclGetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) { WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>"); return ncclInvalidArgument; } - if (findInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { + if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { WARN("NET/Socket : No usable listening interface found"); return ncclSystemError; } } else { - int nIfs = findInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); + int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); if (nIfs <= 0) { WARN("Bootstrap : no socket interface found"); return ncclInternalError; @@ -43,7 +43,7 @@ ncclResult_t bootstrapNetInit() { } char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2]; sprintf(line, " %s:", bootstrapNetIfName); - socketToString(&bootstrapNetIfAddr, line+strlen(line)); + ncclSocketToString(&bootstrapNetIfAddr, line+strlen(line)); INFO(NCCL_INIT, "Bootstrap : Using%s", line); bootstrapNetInitDone = 1; } @@ -55,35 +55,28 @@ 
ncclResult_t bootstrapNetInit() { /* Socket Interface Selection type */ enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 }; -static ncclResult_t bootstrapNetAccept(int listenFd, int* recvFd, union socketAddress *addr) { - struct sockaddr *saddr = &addr->sa; - socklen_t socklen = sizeof(union socketAddress); - SYSCHECKVAL(accept(listenFd, saddr, &socklen), "accept", *recvFd); - return ncclSuccess; -} - // Additional sync functions -static ncclResult_t bootstrapNetSend(int fd, union socketAddress *addr, void* data, int size) { - NCCLCHECK(socketSend(fd, addr, &size, sizeof(int))); - NCCLCHECK(socketSend(fd, addr, data, size)); +static ncclResult_t bootstrapNetSend(struct ncclSocket* sock, void* data, int size) { + NCCLCHECK(ncclSocketSend(sock, &size, sizeof(int))); + NCCLCHECK(ncclSocketSend(sock, data, size)); return ncclSuccess; } -static ncclResult_t bootstrapNetRecv(int fd, union socketAddress *addr, void* data, int size) { +static ncclResult_t bootstrapNetRecv(struct ncclSocket* sock, void* data, int size) { int recvSize; - NCCLCHECK(socketRecv(fd, addr, &recvSize, sizeof(int))); + NCCLCHECK(ncclSocketRecv(sock, &recvSize, sizeof(int))); if (recvSize > size) { WARN("Message truncated : received %d bytes instead of %d", recvSize, size); return ncclInternalError; } - NCCLCHECK(socketRecv(fd, addr, data, std::min(recvSize, size))); + NCCLCHECK(ncclSocketRecv(sock, data, std::min(recvSize, size))); return ncclSuccess; } struct extInfo { int rank; int nranks; - union socketAddress extAddressListenRoot; - union socketAddress extAddressListen; + union ncclSocketAddress extAddressListenRoot; + union ncclSocketAddress extAddressListen; }; #include <sys/resource.h> @@ -97,24 +90,24 @@ static ncclResult_t setFilesLimit() { } static void *bootstrapRoot(void* args) { - int listenFd = (uint64_t)args; + struct ncclSocket* listenSock = (struct ncclSocket*)args; ncclResult_t res = ncclSuccess; int nranks = 0, c = 0; struct extInfo info; - union socketAddress 
*rankAddresses = NULL; - union socketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange - union socketAddress *zero = NULL; + union ncclSocketAddress *rankAddresses = NULL; + union ncclSocketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange + union ncclSocketAddress *zero = NULL; NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out); setFilesLimit(); TRACE(NCCL_INIT, "BEGIN"); /* Receive addresses from all ranks */ do { - int tmpFd; - union socketAddress addr; - NCCLCHECKGOTO(bootstrapNetAccept(listenFd, &tmpFd, &addr), res, out); - NCCLCHECKGOTO(bootstrapNetRecv(tmpFd, &addr, &info, sizeof(info)), res, out); - close(tmpFd); + struct ncclSocket sock; + sock.abortFlag = NULL; + NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out); + NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out); + close(sock.fd); if (c == 0) { nranks = info.nranks; @@ -127,14 +120,14 @@ static void *bootstrapRoot(void* args) { goto out; } - if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union socketAddress)) != 0) { + if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union ncclSocketAddress)) != 0) { WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks); goto out; } // Save the connection handle for that rank - memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union socketAddress)); - memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union socketAddress)); + memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union ncclSocketAddress)); + memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union ncclSocketAddress)); ++c; TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks); @@ -144,15 +137,18 @@ static void *bootstrapRoot(void* args) { // Send the connect handle for the next rank in the AllGather ring for (int r=0; r<nranks; ++r) { int next = (r+1) % nranks; - int 
tmpSendFd; - NCCLCHECKGOTO(connectAddress(&tmpSendFd, rankAddressesRoot+r), res, out); - NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, rankAddressesRoot+r, rankAddresses+next, sizeof(union socketAddress)), res, out); - close(tmpSendFd); + struct ncclSocket sock; + sock.abortFlag = NULL; + memcpy(&sock.addr, rankAddressesRoot+r, sizeof(union ncclSocketAddress)); + NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out); + NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out); + close(sock.fd); } TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks); out: - close(listenFd); + close(listenSock->fd); + free(listenSock); if (rankAddresses) free(rankAddresses); if (rankAddressesRoot) free(rankAddressesRoot); if (zero) free(zero); @@ -162,28 +158,31 @@ out: } ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) { - union socketAddress* connectAddr = (union socketAddress*) id; - int listenFd; - NCCLCHECK(createListenSocket(&listenFd, connectAddr)); + struct ncclSocket* listenSock; + NCCLCHECK(ncclCalloc(&listenSock, 1)); + memcpy(&listenSock->addr, id, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketListen(listenSock)); + memcpy(id, &listenSock->addr, sizeof(union ncclSocketAddress)); pthread_t thread; - pthread_create(&thread, NULL, bootstrapRoot, (void*)(uint64_t)listenFd); + pthread_create(&thread, NULL, bootstrapRoot, (void*)listenSock); + ncclSetThreadName(thread, "NCCL BootstrapR"); return ncclSuccess; } ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) { - static_assert(sizeof(union socketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId"); + static_assert(sizeof(union ncclSocketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId"); memset(id, 0, sizeof(ncclUniqueId)); - union socketAddress* connectAddr = (union socketAddress*) id; + union ncclSocketAddress* connectAddr = (union ncclSocketAddress*) id; char* env = getenv("NCCL_COMM_ID"); if (env) { INFO(NCCL_ENV, 
"NCCL_COMM_ID set by environment to %s", env); - if (GetSocketAddrFromString(connectAddr, env) != ncclSuccess) { + if (ncclGetSocketAddrFromString(connectAddr, env) != ncclSuccess) { WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>"); return ncclInvalidArgument; } } else { - memcpy(id, &bootstrapNetIfAddr, sizeof(union socketAddress)); + memcpy(id, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress)); NCCLCHECK(bootstrapCreateRoot(id, false)); } @@ -193,157 +192,51 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) { struct unexConn { int peer; int tag; - int fd; - union socketAddress addr; + struct ncclSocket sock; struct unexConn* next; }; -// Remote allocator state -struct remAllocState { - int cudaDev; - int listenFd; - volatile int stop; -}; - -struct extState { - int extListenFd; - int extRingRecvFd; - int extRingSendFd; - union socketAddress extRingRecvAddr, extRingSendAddr; - union socketAddress* peerCommAddresses; - union socketAddress* peerAllocAddresses; +struct bootstrapState { + struct ncclSocket listenSock; + struct ncclSocket ringRecvSocket; + struct ncclSocket ringSendSocket; + union ncclSocketAddress* peerCommAddresses; + union ncclSocketAddress* peerProxyAddresses; struct unexConn* unexpectedConnections; int cudaDev; int rank; int nranks; - - // Intermediate memory allocation service - struct remAllocState* allocState; - pthread_t allocThread; + volatile uint32_t *abortFlag; }; -#define MAX_SEGMENTS 128 - -static ncclResult_t remoteAlloc(void** ptr, int fd, union socketAddress *addr) { - size_t size; - NCCLCHECK(socketRecv(fd, addr, &size, sizeof(size_t))); - cudaIpcMemHandle_t devIpc; - NCCLCHECK(ncclCudaCalloc((char**)ptr, size)); - cudaError_t res = cudaIpcGetMemHandle(&devIpc, *ptr); - if (res != cudaSuccess) { - WARN("[Rem Allocator] cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res)); - cudaFree(*ptr); - CUDACHECK(res); - } - // The CUDA IPC - NCCLCHECK(socketSend(fd, addr, 
&devIpc, sizeof(cudaIpcMemHandle_t))); - // And the direct pointer - NCCLCHECK(socketSend(fd, addr, ptr, sizeof(void*))); - return ncclSuccess; -} - -#include <poll.h> - -// Service thread to allocate memory for other GPUs, used as intermediate step. -void* ncclRemoteMemAllocationService(void* args) { - struct remAllocState* state = (struct remAllocState *) args; - if (cudaSetDevice(state->cudaDev) != cudaSuccess) { - WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev); - } - - // Prepare poll descriptor - void* segments[MAX_SEGMENTS]; - struct pollfd pollfds[MAX_SEGMENTS+1]; - for (int s=0; s<MAX_SEGMENTS; s++) segments[s] = NULL; - for (int s=0; s<MAX_SEGMENTS; s++) { - pollfds[s].fd = -1; - pollfds[s].events = POLLIN; - } - pollfds[MAX_SEGMENTS].fd = state->listenFd; - pollfds[MAX_SEGMENTS].events = POLLIN; - - int nbuffers = 0; - while (state->stop == 0 || (state->stop == 1 && nbuffers > 0)) { - if (int error = poll(pollfds, MAX_SEGMENTS+1, 100/*ms*/) < 0) { - WARN("[Rem Allocator] Poll failed with error %d", error); - return NULL; - } - if (pollfds[MAX_SEGMENTS].revents) { - int s = 0; - union socketAddress addr; - while (segments[s] != NULL && s < MAX_SEGMENTS) s++; - if (bootstrapNetAccept(pollfds[MAX_SEGMENTS].fd, &pollfds[s].fd, &addr) != ncclSuccess) { - pollfds[s].fd = -1; - } else { - if (s == MAX_SEGMENTS || (remoteAlloc(segments+s, pollfds[s].fd, &addr) != ncclSuccess)) { - WARN("[Rem Allocator] Allocation failed (segment %d, fd %d)", s, pollfds[s].fd); - close(pollfds[s].fd); - pollfds[s].fd = -1; - } else { - nbuffers++; - } - } - } - for (int s=0; s<MAX_SEGMENTS; s++) { - if (pollfds[s].revents & (POLLIN|POLLHUP)) { - if (cudaFree(segments[s]) != cudaSuccess) { - WARN("[Rem Allocator] cudaFree %p failed", segments[s]); - } - segments[s] = NULL; - close(pollfds[s].fd); - pollfds[s].fd = -1; - nbuffers--; - } - } - } - for (int s=0; s<MAX_SEGMENTS; s++) { - if (segments[s]) cudaFree(segments[s]); - close(pollfds[s].fd); - } - 
close(state->listenFd); - free(state); - return NULL; -} - -ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, cudaIpcMemHandle_t* ipc, void** ptr) { - struct extState* state = (struct extState*)commState; - int fd; - ncclResult_t res; - *id = -1; - union socketAddress *addr = state->peerAllocAddresses+rank; - NCCLCHECK(connectAddress(&fd, addr)); - NCCLCHECKGOTO(socketSend(fd, addr, &size, sizeof(size_t)), res, end); - NCCLCHECKGOTO(socketRecv(fd, addr, ipc, sizeof(cudaIpcMemHandle_t)), res, end); - NCCLCHECKGOTO(socketRecv(fd, addr, ptr, sizeof(void*)), res, end); - *id = fd; -end: - return res; -} - -ncclResult_t bootstrapRemFree(int id, int rank, void* commState) { - SYSCHECK(close(id), "close"); - return ncclSuccess; -} - -ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) { - struct extState* state; +ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) { + int rank = comm->rank; + int nranks = comm->nRanks; + struct bootstrapState* state; NCCLCHECK(ncclCalloc(&state, 1)); state->rank = rank; state->nranks = nranks; - *commState = state; + state->abortFlag = comm->abortFlag; + comm->bootstrap = state; TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); struct extInfo info = { 0 }; info.rank = rank; info.nranks = nranks; - int tmpSendFd, tmpRecvFd; + struct ncclSocket sock, listenSockRoot; + sock.abortFlag = listenSockRoot.abortFlag = comm->abortFlag; + sock.asyncFlag = listenSockRoot.asyncFlag = 0; + + // Create socket for other ranks to contact me + memcpy(&state->listenSock.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketListen(&state->listenSock)); + memcpy(&info.extAddressListen, &state->listenSock.addr, sizeof(union ncclSocketAddress)); - int extListenFdRoot; - memcpy(&info.extAddressListen, &bootstrapNetIfAddr, sizeof(union socketAddress)); - memcpy(&info.extAddressListenRoot, &bootstrapNetIfAddr, sizeof(union socketAddress)); - 
NCCLCHECK(createListenSocket(&state->extListenFd, &info.extAddressListen)); - NCCLCHECK(createListenSocket(&extListenFdRoot, &info.extAddressListenRoot)); + // Create socket for root to contact me + memcpy(&listenSockRoot.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketListen(&listenSockRoot)); + memcpy(&info.extAddressListenRoot, &listenSockRoot.addr, sizeof(union ncclSocketAddress)); // stagger connection times to avoid an overload of the root if (nranks > 128) { @@ -356,35 +249,36 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS } // send info on my listening socket to root - union socketAddress* rootAddr = (union socketAddress*)id; - NCCLCHECK(connectAddress(&tmpSendFd, rootAddr)); - NCCLCHECK(bootstrapNetSend(tmpSendFd, rootAddr, &info, sizeof(info))); - close(tmpSendFd); + memcpy(&sock.addr, id, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketConnect(&sock)); + NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info))); + close(sock.fd); // get info on my "next" rank in the bootstrap ring from root - union socketAddress addr; - NCCLCHECK(bootstrapNetAccept(extListenFdRoot, &tmpRecvFd, &addr)); - NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &state->extRingSendAddr, sizeof(state->extRingSendAddr))); - close(tmpRecvFd); - close(extListenFdRoot); + NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot)); + NCCLCHECK(bootstrapNetRecv(&sock, &state->ringSendSocket.addr, sizeof(union ncclSocketAddress))); + close(sock.fd); + close(listenSockRoot.fd); - NCCLCHECK(connectAddress(&state->extRingSendFd, &state->extRingSendAddr)); + NCCLCHECK(ncclSocketConnect(&state->ringSendSocket)); // Accept the connect request from the previous rank in the AllGather ring - NCCLCHECK(bootstrapNetAccept(state->extListenFd, &state->extRingRecvFd, &state->extRingRecvAddr)); + NCCLCHECK(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock)); // AllGather all listen handlers 
NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks)); - memcpy(state->peerCommAddresses+rank, &info.extAddressListen, sizeof(union socketAddress)); - NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union socketAddress))); - - // Create the memory allocation service - NCCLCHECK(ncclCalloc(&state->peerAllocAddresses, nranks)); - memcpy(state->peerAllocAddresses+rank, &bootstrapNetIfAddr, sizeof(union socketAddress)); - NCCLCHECK(ncclCalloc(&state->allocState, 1)); - CUDACHECK(cudaGetDevice(&state->allocState->cudaDev)); - NCCLCHECK(createListenSocket(&state->allocState->listenFd, state->peerAllocAddresses+rank)); - pthread_create(&state->allocThread, NULL, ncclRemoteMemAllocationService, state->allocState); - NCCLCHECK(bootstrapAllGather(state, state->peerAllocAddresses, sizeof(union socketAddress))); + memcpy(state->peerCommAddresses+rank, &state->listenSock.addr, sizeof(union ncclSocketAddress)); + NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress))); + + // Create the service proxy + NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks)); + struct ncclSocket* proxySocket; + NCCLCHECK(ncclCalloc(&proxySocket, 1)); + proxySocket->abortFlag = NULL; // proxy is aborted through a message + memcpy(&proxySocket->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketListen(proxySocket)); + memcpy(state->peerProxyAddresses+rank, &proxySocket->addr, sizeof(union ncclSocketAddress)); + NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress))); + NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses)); TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); @@ -392,7 +286,7 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS } ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { - struct extState* state = (struct extState*)commState; + struct bootstrapState* state = 
(struct bootstrapState*)commState; char* data = (char*)allData; int rank = state->rank; int nranks = state->nranks; @@ -408,9 +302,9 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { size_t sslice = (rank - i + nranks) % nranks; // Send slice to the right - NCCLCHECK(bootstrapNetSend(state->extRingSendFd, &state->extRingSendAddr, data+sslice*size, size)); + NCCLCHECK(bootstrapNetSend(&state->ringSendSocket, data+sslice*size, size)); // Recv slice from the left - NCCLCHECK(bootstrapNetRecv(state->extRingRecvFd, &state->extRingRecvAddr, data+rslice*size, size)); + NCCLCHECK(bootstrapNetRecv(&state->ringRecvSocket, data+rslice*size, size)); } TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); @@ -418,14 +312,15 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { } ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) { - struct extState* state = (struct extState*)commState; - int tmpSendFd; - union socketAddress *addr = state->peerCommAddresses+peer; - NCCLCHECK(connectAddress(&tmpSendFd, addr)); - NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, &state->rank, sizeof(int))); - NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, &tag, sizeof(int))); - NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, data, size)); - close(tmpSendFd); + struct bootstrapState* state = (struct bootstrapState*)commState; + struct ncclSocket sock; + sock.abortFlag = state->abortFlag; + memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketConnect(&sock)); + NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int))); + NCCLCHECK(bootstrapNetSend(&sock, &tag, sizeof(int))); + NCCLCHECK(bootstrapNetSend(&sock, data, size)); + close(sock.fd); return ncclSuccess; } @@ -466,14 +361,13 @@ ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, return ncclSuccess; } -ncclResult_t unexpectedEnqueue(struct extState* state, int peer, 
int tag, int fd, union socketAddress *addr) { +ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) { // New unex struct unexConn* unex; NCCLCHECK(ncclCalloc(&unex, 1)); unex->peer = peer; unex->tag = tag; - unex->fd = fd; - unex->addr = *addr; + memcpy(&unex->sock, sock, sizeof(struct ncclSocket)); // Enqueue struct unexConn* list = state->unexpectedConnections; @@ -486,7 +380,7 @@ ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int tag, int fd return ncclSuccess; } -int unexpectedDequeue(struct extState* state, int peer, int tag, union socketAddress *addr) { +ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) { struct unexConn* elem = state->unexpectedConnections; struct unexConn* prev = NULL; while (elem) { @@ -496,79 +390,72 @@ int unexpectedDequeue(struct extState* state, int peer, int tag, union socketAdd } else { prev->next = elem->next; } - int fd = elem->fd; - *addr = elem->addr; + memcpy(sock, &elem->sock, sizeof(struct ncclSocket)); free(elem); - return fd; + return ncclSuccess; } prev = elem; elem = elem->next; } - return -1; + sock->fd = -1; + return ncclSuccess; } // We can't know who we'll receive from, so we need to receive everything at once ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) { - struct extState* state = (struct extState*)commState; + struct bootstrapState* state = (struct bootstrapState*)commState; - int tmpRecvFd; - union socketAddress addr; + struct ncclSocket sock; + sock.abortFlag = state->abortFlag; // Search unexpected connections first - if ((tmpRecvFd = unexpectedDequeue(state, peer, tag, &addr)) != -1) { - NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, ((char*)data), size)); - close(tmpRecvFd); + NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock)); + if (sock.fd != -1) { + NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size)); + close(sock.fd); return ncclSuccess; } 
// Then look for new connections while (1) { - union socketAddress addr; - NCCLCHECK(bootstrapNetAccept(state->extListenFd, &tmpRecvFd, &addr)); + NCCLCHECK(ncclSocketAccept(&sock, &state->listenSock)); int newPeer, newTag; - NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &newPeer, sizeof(int))); - NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &newTag, sizeof(int))); + NCCLCHECK(bootstrapNetRecv(&sock, &newPeer, sizeof(int))); + NCCLCHECK(bootstrapNetRecv(&sock, &newTag, sizeof(int))); if (newPeer == peer && newTag == tag) { - NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, ((char*)data), size)); - close(tmpRecvFd); + NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size)); + close(sock.fd); return ncclSuccess; } // Unexpected connection. Save for later. - NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, tmpRecvFd, &addr)); + NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, &sock)); } } ncclResult_t bootstrapClose(void* commState) { - struct extState* state = (struct extState*)commState; + struct bootstrapState* state = (struct bootstrapState*)commState; if (state->unexpectedConnections != NULL) { WARN("Unexpected connections are not empty"); return ncclInternalError; } - close(state->extListenFd); - close(state->extRingSendFd); - close(state->extRingRecvFd); - - state->allocState->stop = 1; - - // Join the allocThread so we catch resource leaks as being hung here - // pthread_join(state->allocThread, nullptr); + close(state->listenSock.fd); + close(state->ringSendSocket.fd); + close(state->ringRecvSocket.fd); free(state->peerCommAddresses); - free(state->peerAllocAddresses); free(state); return ncclSuccess; } ncclResult_t bootstrapAbort(void* commState) { - struct extState* state = (struct extState*)commState; + struct bootstrapState* state = (struct bootstrapState*)commState; if (commState == NULL) return ncclSuccess; - if (state->extListenFd) close(state->extListenFd); - if (state->extRingSendFd) close(state->extRingSendFd); - if (state->extRingRecvFd) 
close(state->extRingRecvFd); - if (state->allocState) state->allocState->stop = 2; + if (state->listenSock.fd) close(state->listenSock.fd); + if (state->ringSendSocket.fd) close(state->ringSendSocket.fd); + if (state->ringRecvSocket.fd) close(state->ringRecvSocket.fd); free(state->peerCommAddresses); - free(state->peerAllocAddresses); + free(state->peerProxyAddresses); free(state); return ncclSuccess; } diff --git a/src/channel.cc b/src/channel.cc index a07e38a..87cec65 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -64,13 +64,13 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { for (int r=0; r<nRanks+1; r++) { struct ncclPeer* peer = channel->peers+r; for (int b=0; b<NCCL_MAX_CONNS; b++) { - if (peer->send[b].transportResources) NCCLCHECK(peer->send[b].transportComm->free(peer->send[b].transportResources)); + if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b)); } } for (int r=0; r<nRanks+1; r++) { struct ncclPeer* peer = channel->peers+r; for (int b=0; b<NCCL_MAX_CONNS; b++) { - if (peer->recv[b].transportResources) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv[b].transportResources)); + if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b)); } } diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index 83b0da9..c86384c 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -12,9 +12,9 @@ namespace { template<typename T, typename RedOp, typename Proto> __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; const int *ringRanks = ring->devUserRanks; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1)); @@ -22,12 +22,12 @@ namespace { const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); const int nranks = ncclShmem.comm.nRanks; const ssize_t loopSize = nChannels*int(chunkSize); - const ssize_t size = args->coll.count; + const ssize_t size = args->count; T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; - Primitives<T, RedOp, FanSymmetric<1>, 1, Proto> - prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->coll.redOpArg); + Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims + (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; @@ -36,7 +36,7 @@ namespace { realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); } else if (Proto::Id == NCCL_PROTO_LL) - realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize; + realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; else if (Proto::Id == NCCL_PROTO_LL128) realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128); realChunkSize = int(realChunkSize); diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index c3171bf..41ef255 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -12,15 +12,15 @@ namespace { template<typename T, typename RedOp, typename Proto> __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; int ringIx = ring->index; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? 
ALLREDUCE_CHUNKSTEPS : 1)); const int nranks = ncclShmem.comm.nRanks; const ssize_t loopSize = nChannels*nranks*chunkSize; - const ssize_t size = args->coll.count; + const ssize_t size = args->count; int minChunkSize; if (Proto::Id == NCCL_PROTO_LL) @@ -30,8 +30,8 @@ namespace { minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2; } - Primitives<T, RedOp, FanSymmetric<1>, 1, Proto> prims - (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg); + Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims + (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; @@ -97,25 +97,25 @@ namespace { template<typename T, typename RedOp, typename Proto> __device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; ncclTree *tree = &ncclShmem.channel.tree; ssize_t chunkSize = int( - Proto::Id == NCCL_PROTO_SIMPLE ? args->coll.lastChunkSize + Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize /* LL & LL128 */ : Proto::calcBytePerStep()/sizeof(T)); const ssize_t minChunkSize = int( Proto::Id == NCCL_PROTO_SIMPLE ? 
(nthreads-2*WARP_SIZE)*8*(sizeof(uint64_t)/sizeof(T)) /* LL & LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T))); const ssize_t loopSize = int(nChannels*chunkSize); - const ssize_t size = args->coll.count; + const ssize_t size = args->count; if (loopSize > size) chunkSize = divUp((int)size, int(nChannels*minChunkSize))*int(minChunkSize); { // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) - Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto> prims - (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg); + Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto, 0> prims + (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg); if (tree->up == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -140,8 +140,8 @@ namespace { } { // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) - Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto> prims - (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg); + Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto, 0> prims + (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); if (tree->up == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -169,19 +169,19 @@ namespace { template<typename T, typename RedOp, typename Proto> __device__ __forceinline__ void runTreeSplit(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = 
args->nChannels; ncclTree *tree = &ncclShmem.channel.tree; ssize_t chunkSize = int( - Proto::Id != NCCL_PROTO_LL ? args->coll.lastChunkSize + Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize : Proto::calcBytePerStep()/sizeof(T)); const ssize_t minChunkSize = int( Proto::Id == NCCL_PROTO_SIMPLE ? (nthreads - 2*WARP_SIZE)*8*(sizeof(uint64_t)/sizeof(T)) : Proto::Id == NCCL_PROTO_LL ? nthreads*(Proto::calcBytePerGrain()/sizeof(T)) /* LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T))/8); const ssize_t loopSize = int(nChannels*chunkSize); - const ssize_t size = args->coll.count; + const ssize_t size = args->count; int nthreadsSplit; if (Proto::Id == NCCL_PROTO_SIMPLE) { @@ -198,8 +198,8 @@ namespace { if (tree->up == -1) { // Reduce and broadcast. Max number of recv is 3, max number of send is 3 - Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto> - prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg); + Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto, 0> + prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); int nelem = min(chunkSize, size-offset); @@ -215,8 +215,8 @@ namespace { * into DirectRecv and DirectSend capabilities, this ctor would have both=0, * but the ctor above for tree roots would be DirectRecv=0 DirectSend=1. 
*/ - Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/1, Proto> - prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg, 0*Proto::MaxGroupWidth); + Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/1, Proto, 0> + prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -234,8 +234,8 @@ namespace { } else { // Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local) - Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto> - prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg, 1*Proto::MaxGroupWidth); + Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto, 0> + prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth); if (tree->down[0] == -1) { for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*int(chunkSize); @@ -278,11 +278,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO __device__ __forceinline__ void run(ncclWorkElem *args) { static constexpr int COLLNET_COPY_THREADS = 96; const int tid = threadIdx.x; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int bid = args->bid; + const int nChannels = args->nChannels; struct ncclDirect* tree = &ncclShmem.channel.collTree; - const ssize_t chunkSize = int(args->coll.lastChunkSize); - const ssize_t size = args->coll.count; + const ssize_t chunkSize = int(args->lastChunkSize); + const ssize_t size = args->count; const ssize_t loopSize = 
nChannels*tree->nHeads*chunkSize; const int hasUp = (tree->up[0] >= 0) ? 1 : 0; @@ -290,7 +290,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO const int nThreadsScatter = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 3*COLLNET_COPY_THREADS : 0); const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0); const int nThreadsBcast = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 2*COLLNET_COPY_THREADS); - const int nThreadsReduce = args->nThreads - nThreadsScatter - nThreadsGather - nThreadsBcast; + const int nThreadsReduce = args->header.nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast; const int tidStartBcast = nThreadsGather; const int tidStartScatter = tidStartBcast + nThreadsBcast; const int tidStartReduce = tidStartScatter + nThreadsScatter; @@ -300,8 +300,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) { // Scatter int group = (2*Proto::MaxGroupWidth) | (1<<16); - Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto> - prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args); + Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0> + prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize; int nelem = min(tree->nHeads*chunkSize, size-offset); @@ -315,8 +315,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO int group = (3*Proto::MaxGroupWidth) | (1<<16); if (hasDn) { // Reduce, send to network - Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, 
Proto> - prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args); + Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0> + prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -328,8 +328,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO } } else { // Directly send to network - Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto> - prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->coll.redOpArg, group); + Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0> + prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -339,8 +339,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO } else if (tid < tidStartBcast && hasUp) { // Gather int group = (0*Proto::MaxGroupWidth) | (0<<16); - Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto> - prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args); + Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0> + prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize; int nelem = 
min(tree->nHeads*chunkSize, size-offset); @@ -350,8 +350,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO int group = (1*Proto::MaxGroupWidth) | (0<<16); if (hasDn) { // Recv from network, broadcast - Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto> - prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args); + Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0> + prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); @@ -359,8 +359,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO } } else { // Recv from network (no post thread needed) - Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto> - prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->coll.redOpArg, group); + Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0> + prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index 61c60b9..ba4ef56 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. 
All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -12,22 +12,22 @@ namespace { template<typename T, typename RedOp, typename Proto> __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1)); const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); const ssize_t loopSize = nChannels*chunkSize; - const ssize_t size = args->coll.count; + const ssize_t size = args->count; const int rank = ring->devUserRanks[0]; const int nextRank = ring->devUserRanks[1]; - const int root = args->coll.root; + const int root = args->root; T *inputBuf = (T*)args->sendbuff; T *outputBuf = (T*)args->recvbuff; - Primitives<T, RedOp, FanSymmetric<1>, 0, Proto> - prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->coll.redOpArg); + Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> + prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; @@ -36,7 +36,7 @@ namespace { realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); } else if (Proto::Id == NCCL_PROTO_LL) - realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize; + realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; else if (Proto::Id == NCCL_PROTO_LL128) realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128); realChunkSize = int(realChunkSize); diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index ff410d7..40a2303 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -9,6 +9,7 @@ #include "collectives.h" #include "devcomm.h" +#include "op128.h" #if __CUDA_ARCH__ >= 800 #define COLL_UNROLL 8 @@ -23,11 +24,31 @@ __device__ inline bool barrierReduceAny(int bit) { asm ("{" ".reg .pred barr_pred;" "setp.eq.u32 barr_pred, %1, 1;" - "bar.red.popc.u32 %0, 0, barr_pred;" + "bar.red.popc.u32 %0, 2, barr_pred;" "}" : "=r"(popc) : "r"(bit)); return popc != 0; } +// Copy src to dst and fill extra size with zeroes +template<typename Tdst, typename Tsrc> +__device__ void copyToShmem(Tdst *dst, Tsrc const *src, int tid, int nthreads) { + static_assert(sizeof(Tdst)%(2*sizeof(uint64_t)) == 0 && sizeof(Tsrc)%(2*sizeof(uint64_t)) == 0, + "copyToShmem needs sizes which are multiple of 16B"); + static_assert(sizeof(Tdst) >= sizeof(Tsrc), "Tdst size is too small"); + static_assert(sizeof(Tdst) <= WARP_SIZE*2*sizeof(uint64_t), "copyToShmem limited to 512B to make sure it can always be done in one cycle"); + uint64_t *d = reinterpret_cast<uint64_t*>(dst); + uint64_t const *s = reinterpret_cast<uint64_t const*>(src); + uint64_t *shmemPtr = shmemCvtPtr(d); + int offset = 2*tid; + uint64_t v0, v1; + if (offset >= sizeof(Tsrc)/sizeof(uint64_t)) { + v0 = v1 = 0ULL; + } else { + v0 = s[offset] ; v1 = s[offset+1]; + } + if 
(offset < sizeof(Tdst)/sizeof(uint64_t)) storeShmem128(shmemPtr+offset, v0, v1); +} + template<typename T> __device__ int copyToShmem(T *dst, T const *src, int turn=0) { static_assert(sizeof(uint64_t) <= alignof(T), "Uhoh"); @@ -67,41 +88,16 @@ struct RunWorkElement { } }; -#if CUDART_VERSION >= 11030 -__device__ constexpr int ncclWorkElemFactors[NCCL_NUM_ALGORITHMS] = -#else -static __device__ __constant__ int ncclWorkElemFactors[NCCL_NUM_ALGORITHMS] = -#endif -{/*Tree*/1, /*Ring and P2P*/1, /*CollNet*/NCCL_REG_ELEM_FACTOR}; - template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto> struct RunWork { // This __forceinline__ is necessary. The compiler was inserting a function call // here from the LL ncclKernel. __device__ __forceinline__ void run(ncclWork *w) { - int tid = threadIdx.x; - /* Some invariants that must hold: - * 1. All elems[] have same funcIndex. - * 2. All elems[] have same nThreads. - * 3. The thread-to-group relation (as in prims group numbers) is the same - * for all elems[]. - * - * If (1) isn't true then we might be in the wrong function since dispatch - * on ncclFuncs[w->funcIndex] is how we got here. - * - * If (2) or (3) aren't true, then threads from different work elements - * could race for barrier resources (barrier numbers 0...15) which is fatal. - * - * IMPORTANT!!! To ensure (3), implementations of - * `RunWorkElement<Fn,T,RedOp,Algo,Proto>::run()` may only use the following - * when deciding how to map threads to groups: - * Fn, T, RedOp, Algo, Proto, nThreads - * - * This last one is difficult to enforce so I hope everyone reads this. - */ - if (tid < w->elems[0].nThreads) { - #pragma unroll 1 - for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].active != 0; e+=ncclWorkElemFactors[Algo]) + int wid = threadIdx.x / WARP_SIZE; + int inc = w->header.type == ncclWorkTypeRegColl ? 
sizeof(ncclWorkElemReg) / sizeof(ncclWorkElem) : 1; + #pragma unroll 1 + for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e += inc) { + if (wid < w->header.nWarps) RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(&w->elems[e]); } } @@ -124,30 +120,51 @@ struct ncclShmemData { struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; }; uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1]; - ncclDevComm comm; - ncclChannel channel; - ncclWork work; + struct ncclDevComm comm; + struct ncclChannel channel; + uint64_t pad; + struct ncclWork work; }; +static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned"); + +static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) { + if (we->header.type != ncclWorkTypeUnused && we->redOpArgIsPtr) { + /* redOpArg is a pointer to the scalar value, so we'll dereference it + * here so that redOpArg holds the bits of the scalar going forward. + * The tricky thing is we don't know its type T since that's encoded in + * the funcIndex. Because it would be difficult to get sizeof(T) from + * funcIndex, we'll cheat and just dereference the largest possible size + * given the alignment of the pointer. We might be reading in more bytes + * than we need but that's harmless. 
+ */ + if (we->redOpArg%2 != 0) + we->redOpArg = *reinterpret_cast<uint8_t*>(we->redOpArg); + else if (we->redOpArg%4 != 0) + we->redOpArg = *reinterpret_cast<uint16_t*>(we->redOpArg); + else if (we->redOpArg%8 != 0) + we->redOpArg = *reinterpret_cast<uint32_t*>(we->redOpArg); + else + we->redOpArg = *reinterpret_cast<uint64_t*>(we->redOpArg); + } +} extern __shared__ ncclShmemData ncclShmem; template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex> -__device__ void ncclKernel(ncclWorkElem first) { +__device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) { int tid = threadIdx.x; + int nthreads = blockDim.x; int bid = blockIdx.x; - int turn = copyToShmem(&ncclShmem.comm, first.comm); + int turn = copyToShmem(&ncclShmem.comm, comm); // get address of channel without incurring indirect load from ncclDevCom::channels - ncclChannel *channel = &((ncclDevCommAndChannels*)first.comm)->channels[bid]; + ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid]; turn = copyToShmem(&ncclShmem.channel, channel, turn); // To optimize for latency, (only) the first operation is passed as argument. 
- if (bid == 0 && first.active != 0) { - turn = copyToShmem(&ncclShmem.work.elems[0], &first, turn); - if (1 <= tid && tid < NCCL_MAX_WORK_ELEMENTS && tid % ncclWorkElemFactors[Algo] == 0) { - ncclShmem.work.elems[tid].active = 0; - ncclShmem.work.elems[tid].redOpArgIsPtr = 0; - } + if (bid == 0 && first.header.type != ncclWorkTypeUnused) { + // Copy first elem to work and zero out the rest + copyToShmem(&ncclShmem.work, &first, tid, nthreads); } __syncthreads(); // publish ncclShmem @@ -155,17 +172,17 @@ __device__ void ncclKernel(ncclWorkElem first) { ncclWork *workFifoDev = ncclShmem.channel.workFifoDev; int workFifoIx = ncclShmem.channel.index; - if (bid == 0 && first.active != 0) + if (bid == 0 && first.header.type != ncclWorkTypeUnused) goto SkipLoadWork; while (true) { - copyToShmem(&ncclShmem.work, &workFifoDev[workFifoIx]); // turn no longer helps + copyToShmem(&ncclShmem.work, &workFifoDev[workFifoIx], tid, nthreads); { // Check whether the last operation was aborted and make sure all threads exit - int aborted = tid == 0 ? *ncclShmem.comm.abortFlag : 0; + int aborted = tid == 0 ? *comm->abortFlag : 0; if (barrierReduceAny(aborted)) // publish ncclShmem.work break; if (tid == 0) - workFifoHost[workFifoIx].elems[0].active = 0; + workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused; } SkipLoadWork: @@ -173,36 +190,20 @@ __device__ void ncclKernel(ncclWorkElem first) { if (tid == 0) channel->index = workFifoIx; // write back to real channel, not shmem shadow - if (tid < NCCL_MAX_WORK_ELEMENTS && tid % ncclWorkElemFactors[Algo] == 0) { - ncclWorkElem *we = &ncclShmem.work.elems[tid]; - if (we->redOpArgIsPtr && we->active != 0) { - /* redOpArg is a pointer to the scalar value, so we'll dereference it - * here so that redOpArg holds the bits of the scalar going forward. - * The tricky thing is we don't know its type T since that's encoded in - * the funcIndex. 
Because it would be difficult to get sizeof(T) from - * funcIndex, we'll cheat and just dereference the largest possible size - * given the alignment of the pointer. We might be reading in more bytes - * than we need but that's harmless. - */ - if (we->coll.redOpArg%2 != 0) - we->coll.redOpArg = *reinterpret_cast<uint8_t*>(we->coll.redOpArg); - else if (we->coll.redOpArg%4 != 0) - we->coll.redOpArg = *reinterpret_cast<uint16_t*>(we->coll.redOpArg); - else if (we->coll.redOpArg%8 != 0) - we->coll.redOpArg = *reinterpret_cast<uint32_t*>(we->coll.redOpArg); - else - we->coll.redOpArg = *reinterpret_cast<uint64_t*>(we->coll.redOpArg); - } + __syncwarp(); + if (ncclShmem.work.header.type == ncclWorkTypeColl) { + if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]); + } else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) { + if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem); } __syncthreads(); - if (ncclShmem.work.elems[0].funcIndex == FnIndex) + if (ncclShmem.work.header.funcIndex == FnIndex) RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work); else - ncclFuncs[ncclShmem.work.elems[0].funcIndex](); + ncclFuncs[ncclShmem.work.header.funcIndex](); - if (ncclShmem.work.elems[0].active == 2) - break; + if (ncclShmem.work.header.isLast) break; __syncthreads(); } } @@ -210,8 +211,8 @@ __device__ void ncclKernel(ncclWorkElem first) { // Only generate kernels for SUM #if NCCL_OP == 0 #define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \ -__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(ncclWorkElem first) { \ - ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex>(first); \ +__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem first) { \ + ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex>(comm, first); \ } #else 
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fInded) diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index dcf1f66..c21d373 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -16,10 +16,11 @@ // Define min for ssize_t static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; } -template <typename T> -inline __device__ void loadPtr(void** ptr, T* &v) { - asm volatile("ld.volatile.global.u64 %0, [%1];" - : "=l"(v) : "l"(ptr)); +inline __device__ int loadInt(int* ptr) { + int v; + asm volatile("ld.volatile.global.u32 %0, [%1];" + : "=r"(v) : "l"(ptr)); + return v; } typedef uint64_t PackType; diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu index f451582..b7dc3e9 100644 --- a/src/collectives/device/onerank_reduce.cu +++ b/src/collectives/device/onerank_reduce.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -16,11 +16,11 @@ namespace { int tid = threadIdx.x; int tn = blockDim.x; #pragma unroll 1 - for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].active != 0; e++) { + for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e++) { ncclWorkElem *we = &w->elems[e]; - intptr_t eltN = we->coll.count; - int bid = we->coll.bid; - int bn = we->coll.nChannels; + intptr_t eltN = we->count; + int bid = we->bid; + int bn = we->nChannels; T const *src = (T const*)we->sendbuff; T *dst = (T*)we->recvbuff; @@ -36,7 +36,7 @@ namespace { src += i0; dst += i0; ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 1> - (tid, tn, &(we->coll.redOpArg), true, 1, &src, 1, &dst, i1-i0); + (tid, tn, &(we->redOpArg), true, 1, &src, 1, &dst, i1-i0); } } } diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index 8f63447..ccc0d22 100644 --- a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -109,7 +109,7 @@ struct FanSymmetric { }; // The primitives class. Specialized per protocol in the other headers. -template<typename T, typename RedOp, typename Fan, int Direct, typename Proto> +template<typename T, typename RedOp, typename Fan, int Direct, typename Proto, int P2p> class Primitives; // Used by LL & LL128 to implement direct members in the naive way. 
diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h index 8fa84e5..afed3df 100644 --- a/src/collectives/device/prims_ll.h +++ b/src/collectives/device/prims_ll.h @@ -1,12 +1,12 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -template<typename T, typename RedOp, typename Fan, int Direct> -class Primitives<T, RedOp, Fan, Direct, ProtoLL>: - public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL>> { +template<typename T, typename RedOp, typename Fan, int Direct, int P2p> +class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>: + public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>> { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; @@ -41,7 +41,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL>: inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } inline __device__ void barrier() { - asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(1+group)); + asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); } uint32_t abort = 0; diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h index 3c049d1..8090385 100644 --- a/src/collectives/device/prims_ll128.h +++ b/src/collectives/device/prims_ll128.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -8,9 +8,9 @@ #define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1) -template<typename T, typename RedOp, typename Fan, int Direct> -class Primitives<T, RedOp, Fan, Direct, ProtoLL128>: - public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128>> { +template<typename T, typename RedOp, typename Fan, int Direct, int P2p> +class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>: + public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>> { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; @@ -49,7 +49,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128>: inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; } inline __device__ void barrier() { - asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(1+group)); + asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group)); } uint32_t abort = 0; diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h index c30ff40..fd61dc4 100644 --- a/src/collectives/device/prims_simple.h +++ b/src/collectives/device/prims_simple.h @@ -1,13 +1,13 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ template<typename T, typename RedOp, typename Fan, int Direct, - int SlicePerChunk, int StepPerSlice, int Unroll> + int SlicePerChunk, int StepPerSlice, int Unroll, int P2p> class Primitives< - T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll> + T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll>, P2p > { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; @@ -18,7 +18,7 @@ class Primitives< RolePostSend = 0x10, RolePostRecv = 0x20, Aborted = 0x40, - PtrsFifoEnabled = 0x80, + OffsFifoEnabled = 0x80, SizesFifoEnabled = 0x100, DirectWrite = 0x200, DirectRead = 0x400, @@ -32,10 +32,10 @@ class Primitives< int flags; int group; uint64_t step; + int *connOffsFifoPtr; // (flags & OffsFifoEnabled) union { - void **connPtrsFifoPtr; // (flags & PtrsFifoEnabled) T *userBuff; // (flags & (RoleInput|RoleOutput)) - T *connEltsFifo; // !(flags & (PtrsFifoEnabled|RoleInput|RoleOutput)) + T *connEltsFifo; // !(flags & (RoleInput|RoleOutput)) }; union { int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled) @@ -49,14 +49,14 @@ class Primitives< if (nthreads == WARP_SIZE) __syncwarp(); else - asm volatile("bar.sync %0, %1;" :: "r"(group+1), "r"(nthreads)); + asm volatile("bar.sync %0, %1;" :: "r"(15-group), "r"(nthreads)); flags |= ThreadsSynced; } inline __device__ void subBarrier() { if (nworkers == nthreads) barrier(); else - asm volatile("bar.sync %0, %1;" :: "r"(group+2), "r"(nworkers)); + asm volatile("bar.sync %0, %1;" :: "r"(8-group), "r"(nworkers)); } inline __device__ bool checkAbort(int &spins) { @@ -89,8 +89,8 @@ class Primitives< void **ptrs = isSendNotRecv ? 
(ncclShmem.groups[group].dsts + Dst) : (ncclShmem.groups[group].srcs + Src); - if (flags & PtrsFifoEnabled) - loadPtr(connPtrsFifoPtr + step%NCCL_STEPS, ptrs[index]); + if (flags & OffsFifoEnabled) + ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T); else if (isSendNotRecv && DirectSend) { if (flags & DirectWrite) { ptrs[index] = directBuff + remoteIx + offset; @@ -232,6 +232,8 @@ class Primitives< } // Scatter/Gather generic op + // skip: my own rank order in the buffer chunks + // shift: peer offset to avoid all ranks sending to or receiving from same peer template <int DirectRecv1, int DirectSend1, int Recv, int Send> __device__ __forceinline__ void ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp) { @@ -254,14 +256,17 @@ class Primitives< waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize); subBarrier(); #pragma unroll + // Loop over peers for (int j=0; j<fan.nsend(); j++) { int i = (j+shift)%fan.nsend(); int peerOffset = i*peerElem; + // Skip the data I am responsible of reducing myself if (skip >= 0 && i >= skip) peerOffset += peerElem; const T* src0 = (T*)ncclShmem.groups[group].srcs[0] + peerOffset; int realPeerSize = min(realSize, totalElem-peerOffset); if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) { ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem.redOpArgs, false, 1, &src0, 1, (T**)ncclShmem.groups[group].dsts+i, realPeerSize); + // Mark for threadfence at the end if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] += realPeerSize; } } @@ -289,6 +294,7 @@ class Primitives< } } barrier(); + // If we indeed send something, threadfence if (Send && (flags & RolePostSend) && ncclShmem.groups[group].totalSendSize[slice] > 0 && index == 0) __threadfence_system(); __syncwarp(); @@ -310,18 +316,18 @@ class Primitives< ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since 
that's who needs it in setDataPtrs() connStepPtr = conn->tail; connStepCache = *connStepPtr; - flags |= (conn->ptrsFifo != nullptr) ? PtrsFifoEnabled : 0; + flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0; if (Direct) { // User buffers have been registered if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { - if (connIndex == 1) { + if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0; } } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { - if (connIndex == 1) { + if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { // direct read not allowed in non-register case @@ -330,10 +336,9 @@ class Primitives< } } } - if (flags & PtrsFifoEnabled) - connPtrsFifoPtr = conn->ptrsFifo; - else - connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; + if (flags & OffsFifoEnabled) + connOffsFifoPtr = conn->offsFifo; + connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; } } } @@ -350,11 +355,10 @@ class Primitives< ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs() connStepPtr = conn->head; connStepCache = *connStepPtr; - flags |= (conn->ptrsFifo != nullptr) ? PtrsFifoEnabled : 0; - if (flags & PtrsFifoEnabled) - connPtrsFifoPtr = conn->ptrsFifo; - else - connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; + flags |= (conn->offsFifo != nullptr) ? 
OffsFifoEnabled : 0; + if (flags & OffsFifoEnabled) + connOffsFifoPtr = conn->offsFifo; + connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; if (conn->sizesFifo != nullptr) { flags |= SizesFifoEnabled; @@ -362,14 +366,14 @@ class Primitives< } else if (Direct) { // User buffers have been registered if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) { - if (connIndex == 1) { + if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite : (e->direct & NCCL_DIRECT_READ) ? DirectRead : 0; } } else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) { - if (connIndex == 1) { + if (connIndex == 1 && P2p == 0) { flags |= DirectRead; // scatter-reduce use direct pull } else { // direct read not allowed in non-register case @@ -427,7 +431,7 @@ class Primitives< loadRecvConn(&ncclShmem.channel.devPeers[peer], connIndex, e); loadSendConn(&ncclShmem.channel.devPeers[peer], connIndex, e); - setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkRegElem*)e); + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e); } __device__ ~Primitives() { @@ -444,7 +448,7 @@ class Primitives< barrier(); } - __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkRegElem* e) { + __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) { if (flags & RoleInput) { userBuff = (T*)inputBuf; ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index fbc5be9..8dc867b 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. 
All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -12,21 +12,21 @@ namespace { template<typename T, typename RedOp, typename Proto> __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1)); const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))); const int nranks = ncclShmem.comm.nRanks; const ssize_t loopSize = nChannels*chunkSize; - const ssize_t size = args->coll.count; + const ssize_t size = args->count; const int rank = ncclShmem.comm.rank; const int prevRank = ring->devUserRanks[nranks-1]; - const int root = args->coll.root; + const int root = args->root; - Primitives<T, RedOp, FanSymmetric<1>, 0, Proto> - prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg); + Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> + prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int { int realChunkSize; @@ -35,7 +35,7 @@ namespace { realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); } else if (Proto::Id == NCCL_PROTO_LL) - realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize; + realChunkSize = size-gridOffset < loopSize ? 
args->lastChunkSize : chunkSize; else if (Proto::Id == NCCL_PROTO_LL128) realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize); return realChunkSize; diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index 0334448..3f38b1a 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -12,9 +12,9 @@ namespace { template<typename T, typename RedOp, typename Proto> __device__ __forceinline__ void runRing(ncclWorkElem *args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; + const int nthreads = args->header.nWarps*WARP_SIZE; + const int bid = args->bid; + const int nChannels = args->nChannels; ncclRing *ring = &ncclShmem.channel.ring; int const *ringRanks = ring->devUserRanks; const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? 
REDUCESCATTER_CHUNKSTEPS : 1)); @@ -22,10 +22,10 @@ namespace { const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2); const int nranks = ncclShmem.comm.nRanks; const ssize_t loopSize = nChannels*chunkSize; - const ssize_t size = args->coll.count; + const ssize_t size = args->count; - Primitives<T, RedOp, FanSymmetric<1>, 0, Proto> - prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg); + Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> + prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t realChunkSize; @@ -34,7 +34,7 @@ namespace { realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T)); } else if (Proto::Id == NCCL_PROTO_LL) - realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize; + realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize; else if (Proto::Id == NCCL_PROTO_LL128) realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize); realChunkSize = int(realChunkSize); diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h index 76f49c0..be0dbc5 100644 --- a/src/collectives/device/sendrecv.h +++ b/src/collectives/device/sendrecv.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -10,73 +10,67 @@ template<typename T, typename RedOp> struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> { - __device__ __forceinline__ void run(ncclWork *work) { - int tid = threadIdx.x; - int group = 0; - const int rank = ncclShmem.comm.rank; - const int nRanks = ncclShmem.comm.nRanks; - using Proto = ProtoSimple<1, 1>; - - for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) { - ncclWorkElem *args = &work->elems[s]; - int nThreadsSegment = args->p2p.nThreads; - if (args->active == 0 || nThreadsSegment == 0) break; - - int nThreadsSplit = (nThreadsSegment - (nThreadsSegment > 128 ? WARP_SIZE : 0))/2; - int groupRecv = group; - group += Proto::calcGroupWidth(/*send=*/false, nThreadsSplit); - int groupSend = group; - group += Proto::calcGroupWidth(/*send=*/true, nThreadsSegment - nThreadsSplit); + __device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { + if (args->peer == ncclShmem.comm.rank) { + struct ncclWorkElemP2p* recvArgs = args-1; + if (args->buff != recvArgs->buff) { + ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count); + } + } else { + using Proto = ProtoSimple<1, 1>; + ssize_t const count = args->count; + int const chunkSize = args->chunkSize/sizeof(T); + int const peer = args->peer; + Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims + (tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group); + ssize_t offset = 0; + do { + int nelem = min(chunkSize, count-offset); + prims.directSend(offset, offset, nelem); + offset += nelem; + } while(offset < count); + } + } - if (tid < nThreadsSegment) { - // Compute pointers - T const* sendbuff = (const T*)args->sendbuff; - T* recvbuff = (T*)args->recvbuff; - ssize_t const 
sendCount = args->p2p.sendCount; - ssize_t const recvCount = args->p2p.recvCount; - int const delta = args->p2p.delta; + __device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) { + if (args->peer != ncclShmem.comm.rank) { + using Proto = ProtoSimple<1, 1>; + ssize_t const count = args->count; + int const chunkSize = args->chunkSize/sizeof(T); + int const peer = args->peer; + Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims + (tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group); + ssize_t offset = 0; + do { + int nelem = min(chunkSize, count-offset); + prims.directRecv(offset, nelem); + offset += nelem; + } while(offset < count); + } + } - if (delta == 0) { - if (sendbuff != recvbuff) { - ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nThreadsSegment, nullptr, false, 1, &sendbuff, 1, &recvbuff, sendCount); - } - } - else { - if ((tid < nThreadsSplit) && recvCount >= 0) { - int const peer = (rank - delta + nRanks)%nRanks; - int const t0 = 0; - int const nt = nThreadsSplit; - int const chunkSize = args->p2p.recvChunkSize/sizeof(T); - Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto> prims - (tid-t0, nt, &peer, nullptr, nullptr, recvbuff, /*redOpArg(ignored)=*/0, groupRecv); - ssize_t offset = 0; - do { - int nelem = roundUp(chunkSize, nt*(sizeof(uint64_t)/sizeof(T))); - nelem = min(chunkSize, recvCount-offset); - prims.directRecv(offset, nelem); - offset += nelem; - } while(offset < recvCount); - } + __device__ __forceinline__ void run(ncclWork *work) { + struct ncclWorkElemP2p* args = work->p2pElems; + int ngroups = args->ngroups; + int tid = threadIdx.x; + int wid = tid / WARP_SIZE; + // This has to work even for groups of 2.5 warps (which is 8 groups, and means 3 + // warps for send, 2 warps for recv). 
+ // warpStarts were rounded thanks to int division, but for group number we need to round the other way around + // So we mirror wid then mirror again the group. + #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE) + int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS; + args += group; + if (args->header.type == ncclWorkTypeUnused) return; - if ((tid >= nThreadsSplit) && sendCount >= 0) { - int const peer = (rank + delta)%nRanks; - int const t0 = nThreadsSplit; - int const nt = nThreadsSegment - nThreadsSplit; - int const chunkSize = args->p2p.sendChunkSize/sizeof(T); - Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto> prims - (tid-t0, nt, nullptr, &peer, sendbuff, nullptr, /*redOpArg(ignored)=*/0, groupSend); - ssize_t offset = 0; - do { - int nelem = roundUp(chunkSize, nt*(sizeof(uint64_t)/sizeof(T))); - nelem = min(chunkSize, sendCount-offset); - prims.directSend(offset, offset, nelem); - offset += nelem; - } while(offset < sendCount); - } - } - break; - } - tid -= nThreadsSegment; + tid -= args->warpStart * WARP_SIZE; + int nthreads = args->nWarps * WARP_SIZE; + group |= 1<<16; // Used to select connIndex 1 + if (tid >= nthreads || args->peer == -1) return; + if ((group%2) == 0) { + runRecv(tid, nthreads, group, args); + } else { + runSend(tid, nthreads, group, args); } } }; diff --git a/src/collectives/sendrecv.cc b/src/collectives/sendrecv.cc index 65222a5..0e9ca4f 100644 --- a/src/collectives/sendrecv.cc +++ b/src/collectives/sendrecv.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -13,8 +13,8 @@ NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataTyp ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { NVTX3_FUNC_RANGE_IN(nccl_domain); - struct ncclInfo info = { ncclFuncSendRecv, "Send", - sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */ + struct ncclInfo info = { ncclFuncSend, "Send", + NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 1, 1 }; ncclResult_t ret; NCCLCHECK(ncclGroupStart()); @@ -28,7 +28,7 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { NVTX3_FUNC_RANGE_IN(nccl_domain); - struct ncclInfo info = { ncclFuncSendRecv, "Recv", + struct ncclInfo info = { ncclFuncRecv, "Recv", NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 1, 1 }; ncclResult_t ret; diff --git a/src/debug.cc b/src/debug.cc index 795c401..9060abb 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -167,3 +167,19 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file } pthread_mutex_unlock(&ncclDebugLock); } + +NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); + +void ncclSetThreadName(pthread_t thread, const char *fmt, ...) 
{ + // pthread_setname_np is nonstandard GNU extension + // needs the following feature test macro +#ifdef _GNU_SOURCE + if (ncclParamSetThreadName() != 1) return; + char threadName[NCCL_THREAD_NAMELEN]; + va_list vargs; + va_start(vargs, fmt); + vsnprintf(threadName, NCCL_THREAD_NAMELEN, fmt, vargs); + va_end(vargs); + pthread_setname_np(thread, threadName); +#endif +} diff --git a/src/enhcompat.cc b/src/enhcompat.cc new file mode 100644 index 0000000..97f5a3f --- /dev/null +++ b/src/enhcompat.cc @@ -0,0 +1,28 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +/* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */ + +enum cudaError_t { cudaErrorStubLibrary = 34 }; + +extern "C" { + +cudaError_t cudaStreamGetCaptureInfo_v2(...) __attribute__((visibility("hidden"))) __attribute((weak)); +cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorStubLibrary; } + +cudaError_t cudaUserObjectCreate(...) __attribute__((visibility("hidden"))) __attribute((weak)); +cudaError_t cudaUserObjectCreate(...) { return cudaErrorStubLibrary; } + +cudaError_t cudaGraphRetainUserObject(...) __attribute__((visibility("hidden"))) __attribute((weak)); +cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorStubLibrary; } + +cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak)); +cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; } + +cudaError_t cudaGetDriverEntryPoint(...) __attribute__((visibility("hidden"))) __attribute((weak)); +cudaError_t cudaGetDriverEntryPoint(...) 
{ return cudaErrorStubLibrary; } + +} diff --git a/src/enqueue.cc b/src/enqueue.cc index 4deac18..d28191b 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -156,21 +156,23 @@ static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** wor } int opIndex = channel->workFifoTail%NCCL_MAX_OPS; struct ncclWork* w = channel->workFifo+opIndex; - struct ncclWorkElem* e = w->elems; - volatile uint8_t* activePtr = (volatile uint8_t*)&e->active; - while (activePtr[0] != 0) sched_yield(); + volatile uint8_t* typePtr = (volatile uint8_t*)&w->header.type; + while (typePtr[0] != ncclWorkTypeUnused) sched_yield(); memset(w, 0, sizeof(struct ncclWork)); // Initialize with work elem if provided - if (base) memcpy(e, base, sizeof(struct ncclWorkElem)); - e->active = 1; + if (base) memcpy(w->elems, base, sizeof(struct ncclWorkElem)); channel->workFifoTail++; channel->workCount++; if (work) *work = w; return ncclSuccess; } +// Finalize channel work FIFO states before launch +// Called during dynamic enqueue static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph) { ncclComm_t comm = eqInfo->comm; + // Do not use comm->myParams in this function unless in non-graph mode + // In graph mode, enqueue is async to capture, myParams can have been changed struct cudaLaunchParams* params = comm->myParams; // Only launch blocks where we have work to do. @@ -185,26 +187,24 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph eqInfo->maxChannels = params->gridDim.x; } - // Set active = 2 for the last operation and add a no-op on empty channels (p2p case). 
+ // Set isLast = 1 for the last operation and add a no-op on empty channels (p2p case). for (int c=0; c<eqInfo->maxChannels; c++) { struct ncclChannel* channel = comm->channels+c; if (channel->workCount == 0) { struct ncclWork* w; NCCLCHECK(getNextOp(channel, &w, NULL)); - struct ncclWorkElem* e = w->elems; - e->comm = comm->devComm; - e->funcIndex = FUNC_INDEX_P2P; - e->p2p.nThreads = 0; + w->header.funcIndex = FUNC_INDEX_P2P; + w->header.type = ncclWorkTypeP2p; + w->header.nWarps = 0; } - channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active = 2; + channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].header.isLast = 1; if (c == 0) { // As we inline the first coll directly, we can free it immediately. // Except P2P or aggregation or registration cases struct ncclWork* work = channel->workFifo+((channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS); - struct ncclWorkElem* elem = work->elems; - if (elem->funcIndex != FUNC_INDEX_P2P && eqInfo->elemList->count() == 1 && elem->regUsed == 0) - elem->active = 0; + if (work->header.type == ncclWorkTypeColl && eqInfo->elemList->count() == 1) + work->header.type = ncclWorkTypeUnused; } if (channel->gdrMemDesc) { @@ -264,6 +264,8 @@ ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { return ncclSuccess; } +// Check dependency wrt outside streams or previous launches +// Launch kernel in GROUP mode ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) { struct cudaLaunchParams* params = comm->myParams; if (params->gridDim.x == 0) return ncclSuccess; @@ -299,6 +301,7 @@ ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) { return ncclSuccess; } +// Launch kernel in PARALLEL mode ncclResult_t ncclLaunchKernel(ncclComm_t comm) { struct cudaLaunchParams *params = comm->myParams; if (params->gridDim.x == 0) return ncclSuccess; @@ -321,6 +324,7 @@ ncclResult_t ncclLaunchKernel(ncclComm_t comm) { return ncclSuccess; } +// Launch network proxy static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* 
eqInfo) { // Start the network proxies as soon as the kernel has been launched. We can't // perform any CUDA call between the two or having a cudaFree between the CUDA @@ -340,6 +344,7 @@ static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) { return ncclSuccess; } +// Record done event for current launch ncclResult_t ncclRecordEvents(ncclComm_t comm) { struct cudaLaunchParams *params = comm->myParams; @@ -358,6 +363,7 @@ ncclResult_t ncclRecordEvents(ncclComm_t comm) { return ncclSuccess; } +// Reset parameter space for launch ncclResult_t ncclLaunchReset(ncclComm_t comm) { comm->userStreamSet = false; @@ -371,6 +377,8 @@ ncclResult_t ncclLaunchReset(ncclComm_t comm) { NCCLCHECK(ncclResetQueueInfo(comm->enqueueInfo)); } + // After capturing an op in graph mode or launching the op in non-graph mode + // we can reset myParams for use in next op struct cudaLaunchParams *params = comm->myParams; params->gridDim.x = params->blockDim.x = 0; params->func = NULL; @@ -388,6 +396,7 @@ ncclResult_t ncclLaunchReset(ncclComm_t comm) { static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) { if (info->comm->collNetSupport > 0) { + // Translate ncclAvg and PreMulSum ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op; NCCLCHECK(collNetReduceSupport(info->datatype, netOp, collNetTypeSupport)); } else { @@ -396,6 +405,7 @@ static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNet return ncclSuccess; } +// numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. Used to adjust latency. 
static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps) { struct ncclComm* comm = info->comm; if (comm->nRanks == 1) { @@ -432,6 +442,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i int nt = comm->maxThreads[info->algorithm][info->protocol]; int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol]; if (info->algorithm == NCCL_ALGO_COLLNET) { + // CollNet channel tuning int ncSwitch = 16; bool flag = true; while (ncSwitch >= 1 && flag) { @@ -442,6 +453,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i ncSwitch /= 2; } } else { + // Ring/Tree channel tuning while (info->nBytes < nc*nt*threadThreshold) { if (nc >= 2) nc--; else if ((nt % 128) == 0) nt/=2; @@ -450,6 +462,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i } if (info->protocol == NCCL_PROTO_SIMPLE) { nt += WARP_SIZE; // Extra warp for sync + // More threads or sync warps needed due to split thread model if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE; if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE; } @@ -497,11 +510,10 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { return ncclSuccess; } -static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyArgs* proxyArgs /* output */) { - work->comm = info->comm->devComm; - +static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) { int collNetTypeSupport = 0; - // Check whether algo and proto have been preset + // Check whether algo and proto have been preset (as in aggregation case) + // If so, skip the calculation if (info->nChannels > 0 && info->nThreads > 0) goto comp_next; NCCLCHECK(getCollNetSupport(info, &collNetTypeSupport)); NCCLCHECK(getAlgoInfo(info, collNetTypeSupport, 1)); @@ -511,22 +523,23 @@ comp_next: 
NCCLCHECK(getPatternInfo(info)); NCCLCHECK(getLoopInfo(info)); + work->header.type = ncclWorkTypeColl; work->sendbuff = info->sendbuff; work->recvbuff = info->recvbuff; - work->coll.root = info->root; - work->coll.count = info->count; - work->coll.nChannels = info->nChannels; - work->nThreads = info->nThreads; - work->coll.redOpArg = info->opFull.scalarArg; + work->root = info->root; + work->count = info->count; + work->nChannels = info->nChannels; + work->header.nWarps = info->nThreads / WARP_SIZE; + work->redOpArg = info->opFull.scalarArg; work->redOpArgIsPtr = info->opFull.scalarArgIsPtr; if (info->comm->nRanks == 1) { // one-rank reduce index - work->funcIndex = 1 + int(info->datatype); + work->header.funcIndex = 1 + int(info->datatype); return ncclSuccess; } - work->funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol); + work->header.funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol); int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS; int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? 
info->chunkSteps : 1; @@ -542,22 +555,22 @@ comp_next: while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2; } // Use lastChunkSize as chunkSize - work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); + work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) { // Optimize chunkSize / nSteps while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*64 && chunkSize > 131072) chunkSize /= 2; while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 65536) chunkSize /= 2; while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 32768) chunkSize /= 2; // Use lastChunkSize as chunkSize - work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); + work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype); // Set direct direction for broadcast-gather (read or write) work->direct = (info->nBytes / info->nChannels <= 1024*1024) ? 
NCCL_DIRECT_WRITE : NCCL_DIRECT_READ; } else if (info->protocol == NCCL_PROTO_LL) { const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine); const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; - work->coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop); - ALIGN_SIZE(work->coll.lastChunkSize, info->nThreads*sizeof(uint64_t)); - work->coll.lastChunkSize /= ncclTypeSize(info->datatype); + work->lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop); + ALIGN_SIZE(work->lastChunkSize, info->nThreads*sizeof(uint64_t)); + work->lastChunkSize /= ncclTypeSize(info->datatype); } else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) { int nNodes = info->comm->nNodes; float ppn = info->comm->nRanks / (float)nNodes; @@ -565,7 +578,7 @@ comp_next: while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2; while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2; // Use lastChunkSize as chunkSize - work->coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype)); + work->lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype)); } // Compute nSteps for proxies @@ -574,25 +587,25 @@ comp_next: if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS; //if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol); int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize))); - proxyArgs->subs[0].nsteps = info->nstepsPerLoop * nLoops * chunkSteps; - 
proxyArgs->sliceSteps = sliceSteps; - proxyArgs->chunkSteps = chunkSteps; - proxyArgs->chunkSize = chunkSize; - proxyArgs->protocol = info->protocol; - proxyArgs->dtype = info->datatype; - proxyArgs->redOp = info->algorithm != NCCL_ALGO_COLLNET ? ncclNumOps : // Only set redOp when using CollNet + proxyOp->nsteps = info->nstepsPerLoop * nLoops * chunkSteps; + proxyOp->sliceSteps = sliceSteps; + proxyOp->chunkSteps = chunkSteps; + proxyOp->chunkSize = chunkSize; + proxyOp->protocol = info->protocol; + proxyOp->dtype = info->datatype; + proxyOp->redOp = info->algorithm != NCCL_ALGO_COLLNET ? ncclNumOps : // Only set redOp when using CollNet info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum info->op; - proxyArgs->pattern = info->pattern; - proxyArgs->root = info->root; + proxyOp->pattern = info->pattern; + proxyOp->root = info->root; // This is used by P2P to reduce the receive buffer size. We don't use it in collectives // because some protocols need to transmit more than the total size, plus they sometimes // round up - proxyArgs->subs[0].recvbytes = stepSize*proxyArgs->sliceSteps; + proxyOp->nbytes = stepSize*proxyOp->sliceSteps; TRACE(NCCL_COLL,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d chunksize %d comm %p", - proxyArgs->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads, - nLoops, proxyArgs->subs[0].nsteps, chunkSize, info->comm); + proxyOp->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads, + nLoops, proxyOp->nsteps, chunkSize, info->comm); return ncclSuccess; } @@ -607,6 +620,7 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) { return ncclSuccess; } +// Handle structure for user buffer registration (IPC) exchange struct ncclBuffRegHandle { cudaIpcMemHandle_t sendBuffIpc; 
cudaIpcMemHandle_t recvBuffIpc; @@ -621,37 +635,48 @@ static ncclResult_t ncclRegBuffAndExchange(struct ncclInfo* info, struct ncclBuf if (comm->localRanks == 1) return ncclSuccess; if (comm->pfnCuMemGetAddressRange == NULL) return ncclSuccess; // CUDA toolkit or driver version too old - struct ncclBuffRegHandle regHandles[NCCL_MAX_INTRA_RANKS]; + ncclResult_t ret = ncclSuccess; + struct ncclBuffRegHandle regHandles[NCCL_MAX_LOCAL_RANKS]; // Get IPC handles // Note: the handle only corresponds to the base address of the allocation - CUDACHECK(cudaIpcGetMemHandle(®Handles[comm->intraNodeRank].sendBuffIpc, (void*)info->sendbuff)); - CUDACHECK(cudaIpcGetMemHandle(®Handles[comm->intraNodeRank].recvBuffIpc, (void*)info->recvbuff)); + CUDACHECKGOTO(cudaIpcGetMemHandle(®Handles[comm->localRank].sendBuffIpc, (void*)info->sendbuff), ret, reg_fallback); + CUDACHECKGOTO(cudaIpcGetMemHandle(®Handles[comm->localRank].recvBuffIpc, (void*)info->recvbuff), ret, reg_fallback); // Get offset of user buffer within allocation void* baseAddr; size_t size; + // Get base address CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->sendbuff)); - regHandles[comm->intraNodeRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr; + regHandles[comm->localRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr; CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->recvbuff)); - regHandles[comm->intraNodeRank].recvBuffOffset = (char*)info->recvbuff - (char*)baseAddr; - TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->intraNodeRank].recvBuffOffset); + regHandles[comm->localRank].recvBuffOffset = (char*)info->recvbuff - (char*)baseAddr; + TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->localRank].recvBuffOffset); // Exchange handles within node - NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, regHandles, 
sizeof(struct ncclBuffRegHandle))); + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regHandles, sizeof(struct ncclBuffRegHandle))); // Open handles at local process for (int i=0; i<comm->localRanks; i++) { - if (i == comm->intraNodeRank) { + // Skip myself + if (i == comm->localRank) { regInfo->sendbuffsBase[i] = regInfo->recvbuffsBase[i] = NULL; continue; } + // Get base address of mapping CUDACHECK(cudaIpcOpenMemHandle(regInfo->sendbuffsBase+i, regHandles[i].sendBuffIpc, cudaIpcMemLazyEnablePeerAccess)); CUDACHECK(cudaIpcOpenMemHandle(regInfo->recvbuffsBase+i, regHandles[i].recvBuffIpc, cudaIpcMemLazyEnablePeerAccess)); - // Get real address of buffer + // Get real buffer address by adding offset in the mapping regInfo->sendbuffs[i] = (char*)regInfo->sendbuffsBase[i] + regHandles[i].sendBuffOffset; regInfo->recvbuffs[i] = (char*)regInfo->recvbuffsBase[i] + regHandles[i].recvBuffOffset; } + // Marks the operation as being buffer registered regInfo->nBuffs = comm->localRanks; TRACE(NCCL_COLL, "Rank %d exchanged %d buffers", comm->rank, regInfo->nBuffs); return ncclSuccess; + +reg_fallback: + // If we cannot register specific buffer types, we just bypass this stage, and continue without failing + (void)ret; + WARN("Unable to register user buffers"); + return ncclSuccess; } // Compute enqueue element, save it in list @@ -670,9 +695,8 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) { // Compute cuda kernel arg and proxy arg templates struct ncclQueueElem* eqElem; NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem)); - struct ncclWorkElem* work = &eqElem->work; - eqElem->proxyArgs.nsubs = 1; - NCCLCHECK(computeColl(info, work, &eqElem->proxyArgs)); + struct ncclWork* work = &eqElem->work; + NCCLCHECK(computeColl(info, work->elems, &eqElem->proxyOp)); // Determine grid size struct cudaLaunchParams* params = comm->myParams; @@ -681,14 +705,6 @@ static ncclResult_t 
ncclSetupCollKernel(struct ncclInfo* info) { params->blockDim.x = std::max<unsigned>(params->blockDim.x, info->nThreads); comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here - // Inline the first kernel - if (params->func == NULL) { - params->func = ncclKerns[work->funcIndex]; - memcpy(&comm->args, work, sizeof(struct ncclWorkElem)); - comm->args.coll.bid = 0; // Only inline for channel 0 - comm->args.active = 2; // I am so far the last element; may be changed later in aggregation mode - } - // Register and exchange input and output buffers if (comm->usingCudaGraph && // only in CUDA graph mode comm->graphRegister == 1 && // when registration is enabled @@ -696,15 +712,26 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) { comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other comm->intraRanks == 1) { // only in multi-process mode NCCLCHECK(ncclRegBuffAndExchange(info, &eqElem->buffRegInfo)); - // Disable inline argument because we need kernel to copy the entire ncclWork from workFifo - // because the registered addresses are in ncclWork - if (eqElem->buffRegInfo.nBuffs > 0) comm->args.active = 0; comm->enqueueInfo->nRegBuffs += eqElem->buffRegInfo.nBuffs; + work->header.type = ncclWorkTypeRegColl; + } + + // Inline the first kernel + if (params->func == NULL) { + params->func = ncclKerns[work->header.funcIndex]; + if (work->header.type == ncclWorkTypeColl) { + // Copy the first operation to the inline argument. Type may be set later to + // ncclWorkTypeUnused if we have more than one coll element. 
+ memcpy(&comm->args, work->elems, sizeof(struct ncclWorkElem)); + comm->args.bid = 0; // Only inline for channel 0 + comm->args.header.isLast = 1; // I am so far the last element + } } return ncclSuccess; } +// Find the channel with the least enqueued work (counted in bytes) static inline int findShortestChannel(ncclComm_t comm) { size_t minSize = SIZE_MAX; int minC = 0; @@ -718,6 +745,7 @@ static inline int findShortestChannel(ncclComm_t comm) { return minC; } +// Get next channel based on shortest-queue mode or round-robin mode static inline int getNextChannel(ncclComm_t comm, int aggMode) { int nextChannel = 0; if (aggMode && comm->asyncAllocMode == ncclComm::SHORTEST_QUEUE) { @@ -729,6 +757,8 @@ static inline int getNextChannel(ncclComm_t comm, int aggMode) { return nextChannel; } +// Setup aggregated kernels +// Op info has been previously saved in comm->asyncOps ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { if (comm->asyncOpCount == 0) { return ncclSuccess; @@ -739,16 +769,22 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { NCCLCHECK(ncclSetupCollKernel(info)); } else { // Aggregation + // Determine a per-channel chunk size used to divide an operation into multiple channels size_t channelSize; if (comm->channelSize > 0) { + // Set by user channelSize = comm->channelSize; } else if (comm->collNetSupport && comm->asyncOps[0].coll == ncclFuncAllReduce) { + // CollNet specific size (tuned based on experiments) channelSize = 256 * 1024; } else { - channelSize = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks); // scale channel size based on nranks as latency increases + // Latency increases as scale increases + // We would thus want to increase the chunk size to compensate for the lost efficiency + channelSize = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks); } // Reduce the per-channel size if we cannot fully utilize the channels while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize 
/= 2; + // Check whether the ops have same reduce and data types (and hence can be packed in same ncclWork) int channelUsed = 0; int homogeneous = 1; int allCollNetSupport = comm->collNetSupport; @@ -763,6 +799,7 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { if (allCollNetSupport > 0) NCCLCHECK(getCollNetSupport(info, &allCollNetSupport)); } // Compute algo, proto, nthreads for the entire kernel + // Prepare a synthetic op info to calculate the collective algo struct ncclInfo total; total.comm = comm; total.coll = comm->asyncOps[0].coll; @@ -770,16 +807,18 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { total.nChannels = std::min(channelUsed, comm->nChannels); int perChannelOps = DIVUP(channelUsed, total.nChannels); if (homogeneous) NCCLCHECK(getAlgoInfo(&total, allCollNetSupport, perChannelOps)); + // Set for each op for (int c = 0; c < comm->asyncOpCount; c++) { struct ncclInfo* info = comm->asyncOps+c; if (homogeneous) { + // Set fields to skip the individual computeColl in ncclSetupCollKernel info->algorithm = total.algorithm; info->protocol = total.protocol; info->nThreads = total.nThreads; } NCCLCHECK(ncclSetupCollKernel(info)); } - comm->args.active = 0; // disable inline argument + comm->args.header.type = ncclWorkTypeUnused; // disable inline argument } // Reset counters comm->asyncOpCount = 0; @@ -787,6 +826,7 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { return ncclSuccess; } +// Store aggregated operations info static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) { ncclComm_t comm = info->comm; if (comm->asyncOpCount >= NCCL_MAX_OPS) { @@ -805,25 +845,38 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) { struct ncclComm* comm = info->comm; int peer = info->root; ssize_t nBytes = info->count*ncclTypeSize(info->datatype); - if (info->opName[0] == 'S') { // Send + int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; + int peerNode = comm->rankToNode[peer]; + int peerIndex = comm->rankToLocalRank[peer]; + int 
nsteps = comm->maxLocalRanks; + int rankIndex = comm->rankToLocalRank[comm->rank]; + if (info->coll == ncclFuncSend) { if (peer != comm->rank) { - int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks; + int step = (nsteps + peerIndex - rankIndex)%nsteps; + int delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes; + if (comm->nNodes == 1) delta = (comm->nRanks + peer - comm->rank) % comm->nRanks; + // Mark channels that need pre-connect for (int c=0; c<comm->p2pnChannelsPerPeer; c++) { - int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; - if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector + int shuffle = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step; + int channelId = (shuffle+comm->p2pChannels[c]) % comm->p2pnChannels; + if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector comm->connectSend[peer] |= (1<<channelId); comm->connect = 1; } } } - NCCLCHECK(ncclSaveP2pInfo(comm->p2pSends[info->root], (void*)info->sendbuff, nBytes)); + NCCLCHECK(ncclSaveP2pInfo(comm->p2pSends[info->root], info->recvbuff, nBytes)); comm->p2pSendCount++; } else { if (peer != comm->rank) { - int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks; + int step = (nsteps + rankIndex - peerIndex)%nsteps; + int delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes; + if (comm->nNodes == 1) delta = (comm->nRanks - peer + comm->rank) % comm->nRanks; + // Mark channels that need pre-connect for (int c=0; c<comm->p2pnChannelsPerPeer; c++) { - int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; - if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector + int shuffle = comm->nNodes > 1 ? 
delta+(step/p2pGroupSize) : step; + int channelId = (shuffle+comm->p2pChannels[c]) % comm->p2pnChannels; + if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector comm->connectRecv[peer] |= (1<<channelId); comm->connect = 1; } @@ -835,134 +888,155 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) { return ncclSuccess; } -enum { RingTree_Segment=0, P2P_Segment=1, CollNet_Segment=2 }; -static int getSegment(int type, int delta, struct ncclWork* work) { - // Current ncclWork is full - if (work->elems[NCCL_MAX_WORK_ELEMENTS-1].active != 0) return -1; +static int getSegment(enum ncclWorkElemType type, enum ncclWorkElemSubType subType, int peer, struct ncclWork* work) { + if (work->header.type && (work->header.type != type)) return -1; - if (type == P2P_Segment) { // P2P - // Do not mix P2P and collective ops - if (work->elems[0].funcIndex != FUNC_INDEX_P2P) return -1; - for (int s=0; s<NCCL_MAX_WORK_ELEMENTS && work->elems[s].p2p.delta != delta; s++) { - if (work->elems[s].active == 0) return s; + if (type == ncclWorkTypeP2p) { // P2P + int start = subType == ncclWorkSubTypeRecv ? 
0 : 1; + for (int s=start; s<NCCL_MAX_WORK_ELEMENTS_P2P; s+=2) { + if (work->p2pElems[s].peer == -1) return s; + // Do not aggregate multiple sends to the same peer (or receives from the same peer) + if (work->p2pElems[s].peer == peer) return -1; } - } else if (type == CollNet_Segment) { // CollNet - for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s+=NCCL_REG_ELEM_FACTOR) { - if (work->elems[s].active == 0) return s; + } else if (type == ncclWorkTypeRegColl) { // CollNet + for (int s=0; s<NCCL_MAX_WORK_ELEMENTS_REG; s++) { + if (work->regElems[s].elem.header.type == ncclWorkTypeUnused) return s; } - } else { // Ring or Tree + } else if (type == ncclWorkTypeColl) { // Ring or Tree for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) { - if (work->elems[s].active == 0) return s; + if (work->elems[s].header.type == ncclWorkTypeUnused) return s; } } return -1; } -static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElem* elem /* output */) { - elem->comm = info->comm->devComm; - elem->funcIndex = FUNC_INDEX_P2P; - elem->nThreads = NCCL_MAX_NTHREADS; - elem->sendbuff = info->sendbuff; - elem->recvbuff = info->recvbuff; - elem->p2p.sendCount = info->sendbytes; - elem->p2p.recvCount = info->recvbytes; - elem->p2p.sendChunkSize = info->sendChunkSize; - elem->p2p.recvChunkSize = info->recvChunkSize; - elem->p2p.delta = info->delta; +// Compute kernel arguments for P2P ops +static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElemP2p* elem /* output */) { + elem->header.type = ncclWorkTypeP2p; + elem->header.funcIndex = FUNC_INDEX_P2P; + elem->header.nWarps = NCCL_MAX_NTHREADS/WARP_SIZE; + elem->buff = info->recvbuff; + elem->subType = info->coll == ncclFuncSend ? 
ncclWorkSubTypeSend : ncclWorkSubTypeRecv; + elem->count = info->count; + elem->chunkSize = info->chunkSize; + elem->peer = info->root; return ncclSuccess; } -static ncclResult_t enqueueSegOp(int type, struct ncclWorkElem* elem /* input */, struct ncclWork* work, int s, +// Equeue work elements into segment of ncclWork +// Supporting both collectives (aggregated or not) and P2P +static ncclResult_t enqueueSegOp(enum ncclWorkElemType type, struct ncclWork* elem /* input */, struct ncclWork* work, int s, struct ncclBuffRegInfo* regInfo, struct ncclChannel* channel, struct ncclComm* comm) { - // Copy element into corresponding segment of ncclWork - memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem)); - work->elems[s].active = 1; - - // Determine nThreads at dynamic time - if (type == P2P_Segment) { - const int nsegments = s+1; - int nThreads = 512; - while (nsegments*nThreads > 512) nThreads /= 2; - if (nThreads >= 128) nThreads += WARP_SIZE; - for (int i=0; i<nsegments; i++) work->elems[i].p2p.nThreads = nThreads; + + if (type == ncclWorkTypeP2p) { + memcpy(work->p2pElems+s, elem, sizeof(struct ncclWorkElemP2p)); + int nelems = 0; + for (int i=0; i<NCCL_MAX_WORK_ELEMENTS_P2P; i++) { + if (work->p2pElems[i].header.type) nelems = i+1; + } + + int ngroups = 1; + while (ngroups < nelems) ngroups *= 2; + int nWarps = 1; + while (nWarps*ngroups <= elem->header.nWarps/2) nWarps *= 2; + + for (int i=0; i<ngroups; i++) { + work->p2pElems[i].ngroups = ngroups; + work->p2pElems[i].warpStart = + i*(NCCL_MAX_NTHREADS/WARP_SIZE)/ngroups; + int extraWarp = nWarps >= 2 ? 
i%2 : 0; + work->p2pElems[i].nWarps = nWarps + extraWarp; + } + return ncclSuccess; } + memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem)); + + if (regInfo->nBuffs == 0) return ncclSuccess; + // Copy registered buffer addresses into ncclWork - if (regInfo->nBuffs > 0) { - struct ncclWorkRegElem* regElem = (struct ncclWorkRegElem*)(work->elems+s); - // For CollNet - for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) { - int peer = channel->collTree.down[i]; - if (peer == -1) break; - int j = comm->rankToIntraNodeRank[peer]; - if (j < 0) { - WARN("Invalid intra-node rank %d for peer %d", j, peer); - return ncclInternalError; - } - regElem->dnInputs[i] = regInfo->sendbuffs[j]; - regElem->dnOutputs[i] = regInfo->recvbuffs[j]; + struct ncclWorkElemReg* regElem = (struct ncclWorkElemReg*)(work->elems+s); + // For CollNet + for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) { + int peer = channel->collTree.down[i]; + if (peer == -1) break; + // Get intra-node slot + int j = comm->rankToLocalRank[peer]; + if (j < 0) { + WARN("Invalid intra-node rank %d for peer %d", j, peer); + return ncclInternalError; } - for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) { - int peer = channel->collTree.up[i]; - if (peer == -1) break; - int j = comm->rankToIntraNodeRank[peer]; - if (j < 0) { - WARN("Invalid intra-node rank %d for peer %d", j, peer); - return ncclInternalError; - } - regElem->upOutputs[i] = regInfo->recvbuffs[j]; + // Input buffer of leaf peer + regElem->dnInputs[i] = regInfo->sendbuffs[j]; + // Output buffer of leaf peer + regElem->dnOutputs[i] = regInfo->recvbuffs[j]; + } + for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) { + int peer = channel->collTree.up[i]; + if (peer == -1) break; + int j = comm->rankToLocalRank[peer]; + if (j < 0) { + WARN("Invalid intra-node rank %d for peer %d", j, peer); + return ncclInternalError; } - work->elems[s].regUsed = 1; + // Output buffer of root peer + regElem->upOutputs[i] = regInfo->recvbuffs[j]; } + work->elems[s].regUsed = 1; return ncclSuccess; } 
+// Enqueue P2P op ncclResult_t ncclEnqueueP2pKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem) { - struct ncclWorkElem* workElem = &eqElem->work; - struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs; + struct ncclWorkElemP2p* workElem = eqElem->work.p2pElems; + struct ncclProxyOp* proxyOp = &eqElem->proxyOp; // Try to reuse last p2p operation if not full yet - struct ncclChannel* channel = proxyArgs->subs[0].channel; + struct ncclChannel* channel = comm->channels+proxyOp->channelId; int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS; struct ncclWork* w = channel->workFifo+opIndex; int segment = -1; if (channel->workCount) { // Try to pack more segments into a single operation - segment = getSegment(P2P_Segment, workElem->p2p.delta, w); + segment = getSegment(ncclWorkTypeP2p, workElem->subType, workElem->peer, w); } if (segment == -1) { NCCLCHECK(getNextOp(channel, &w, NULL)); - segment = 0; + segment = workElem->subType == ncclWorkSubTypeRecv ? 0 : 1; + // Initialize work as P2P, set peer=-1 to designate the p2p elem is not used. + w->header.type = ncclWorkTypeP2p; + for (int i=0; i<NCCL_MAX_WORK_ELEMENTS_P2P; i++) w->p2pElems[i].peer = -1; } + //printf("%s to %d -> Channel %d OpCount %ld Segment %d\n", workElem->subType == ncclWorkSubTypeRecv ? 
"Recv" : "Send", proxyOp->root, channel->id, channel->workFifoTail-1, segment); // store work element into FIFO - NCCLCHECK(ncclProxySaveP2p(comm, proxyArgs)); - NCCLCHECK(enqueueSegOp(P2P_Segment, workElem, w, segment, &eqElem->buffRegInfo, channel, comm)); + NCCLCHECK(ncclProxySaveP2p(comm, proxyOp)); + NCCLCHECK(enqueueSegOp(ncclWorkTypeP2p, &eqElem->work, w, segment, &eqElem->buffRegInfo, channel, comm)); return ncclSuccess; } +// Setup P2P op ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) { ncclComm* comm = info->comm; // Compute cuda kernel arg and proxy arg templates struct ncclQueueElem* eqElem; NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem)); // The proxy code will set and tune the send/recv chunk size, make sure to run it first. - NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyArgs)); - NCCLCHECK(computeP2pWorkElem(info, &eqElem->work)); - + NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyOp)); + NCCLCHECK(computeP2pWorkElem(info, eqElem->work.p2pElems)); + // Compute grid size int channelId = info->channelId; struct cudaLaunchParams* params = comm->myParams; params->gridDim.x = std::max<unsigned>(params->gridDim.x, channelId+1); - params->blockDim.x = std::max<unsigned>(params->blockDim.x, eqElem->work.nThreads); + params->blockDim.x = std::max<unsigned>(params->blockDim.x, eqElem->work.header.nWarps*WARP_SIZE); comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here // Record the first kernel to launch // Just for CUDA kernel to know this is a P2P operation // The CUDA kernel does not use the inlined first work element as fastpath argument if (params->func == NULL) { - params->func = ncclKerns[eqElem->work.funcIndex]; - comm->args.comm = eqElem->work.comm; - comm->args.active = 0; + params->func = ncclKerns[eqElem->work.header.funcIndex]; + comm->args.header.type = ncclWorkTypeUnused; } return ncclSuccess; } @@ -970,24 +1044,24 @@ ncclResult_t 
ncclSetupP2pKernel(struct ncclInfo* info) { // Dynamic enqueue function for collective kernels // Supports both aggregated and non-aggregated modes ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem, int aggMode) { - struct ncclWorkElem* work = &eqElem->work; - struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs; + struct ncclWork* work = &eqElem->work; + struct ncclWorkElem* elem = work->elems; + struct ncclProxyOp* proxyOp = &eqElem->proxyOp; - int nChannels = work->coll.nChannels; - size_t channelSize = work->coll.count*ncclTypeSize(proxyArgs->dtype)/work->coll.nChannels; - int segmentType = proxyArgs->redOp == ncclNumOps ? RingTree_Segment : CollNet_Segment; // redOp is only set when using CollNet + int nChannels = elem->nChannels; + size_t channelSize = elem->count*ncclTypeSize(proxyOp->dtype)/elem->nChannels; + enum ncclWorkElemType workElemType = proxyOp->redOp == ncclNumOps ? ncclWorkTypeColl : ncclWorkTypeRegColl; // redOp is only set when using CollNet for (int bid=0; bid<nChannels; bid++) { int channelId = getNextChannel(comm, aggMode); struct ncclChannel* channel = comm->channels+channelId; // Proxy - proxyArgs->subs[0].channel = channel; - proxyArgs->opCount = comm->collOpCount; - proxyArgs->commOpCount = comm->opCount; - if (proxyArgs->subs[0].nsteps) NCCLCHECK(ncclProxySaveColl(proxyArgs, comm->nRanks)); + proxyOp->channelId = channelId; + proxyOp->opCount = comm->collOpCount; + if (proxyOp->nsteps) NCCLCHECK(ncclProxySaveColl(comm, proxyOp, comm->nRanks)); - work->coll.bid = bid % nChannels; + elem->bid = bid % nChannels; struct ncclWork* w = NULL; int segment = -1; if (aggMode && channel->workCount) { @@ -996,9 +1070,9 @@ ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem* w = channel->workFifo+opIndex; // All elems in work must have same (funcIndex,nThreads), // see "src/collectives/device/common.h" - if (w->elems[0].funcIndex == work->funcIndex && - w->elems[0].nThreads == 
work->nThreads) { - segment = getSegment(segmentType, 0, w); + if (w->header.funcIndex == work->header.funcIndex && + w->header.nWarps == work->header.nWarps) { + segment = getSegment(workElemType, ncclWorkSubTypeUnused, 0, w); } } if (segment == -1) { @@ -1007,16 +1081,20 @@ ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem* } // store work element into FIFO - NCCLCHECK(enqueueSegOp(segmentType, work, w, segment, &eqElem->buffRegInfo, channel, comm)); + NCCLCHECK(enqueueSegOp(workElemType, work, w, segment, &eqElem->buffRegInfo, channel, comm)); channel->totalSize += channelSize; } comm->collOpCount++; return ncclSuccess; } +// Host setup node for CUDA Graph +// Performs the enqueue job template<int USING_CUDA_GRAPH> void CUDART_CB ncclEnqueueHostSetup(void* arg) { + NVTX3_FUNC_RANGE_IN(nccl_domain); ncclResult_t ret; + // All work for current launch has been captured in Queue Info struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)arg; ncclComm_t comm = eqInfo->comm; int aggMode = eqInfo->elemList->count() > 1 ? 
1 : 0; @@ -1024,7 +1102,7 @@ void CUDART_CB ncclEnqueueHostSetup(void* arg) { // Iterate through the element list struct ncclQueueElem* eqElem = eqInfo->elemList->begin(); while (eqElem != NULL) { - if (eqElem->work.funcIndex == FUNC_INDEX_P2P) { + if (eqElem->work.header.funcIndex == FUNC_INDEX_P2P) { NCCLCHECKGOTO(ncclEnqueueP2pKernel(comm, eqElem), ret, cb_end); } else { NCCLCHECKGOTO(ncclEnqueueCollKernel(comm, eqElem, aggMode), ret, cb_end); @@ -1045,6 +1123,8 @@ cb_end: template void CUDART_CB ncclEnqueueHostSetup<0>(void*); template void CUDART_CB ncclEnqueueHostSetup<1>(void*); +// CUDA Graph helper thread +// for de-registering user buffers void* graphHelperFunc(void *args) { struct ncclGraphHelperResources* res = (struct ncclGraphHelperResources*)args; if (res == NULL) { @@ -1058,8 +1138,10 @@ void* graphHelperFunc(void *args) { volatile enum helperThreadState* state = &res->threadState; volatile int* ipcTail = &res->ipcTail; while (1) { + // Last IPC entry enqueue so far int ipcTailMark = *ipcTail; int ipcCount = 0; + // Close IPC till the last entry while (res->ipcHead != ipcTailMark) { if (res->ipcBases[res->ipcHead] != NULL) CUDACHECKIGNORE(cudaIpcCloseMemHandle(res->ipcBases[res->ipcHead])); @@ -1069,6 +1151,7 @@ void* graphHelperFunc(void *args) { } TRACE(NCCL_COLL, "CUDA Graph helper thread closed %d IPC handles", ipcCount); pthread_mutex_lock(&res->threadLock); + // Check for exit signal while (res->ipcHead == *ipcTail && *state != ThreadStop) { pthread_cond_wait(&res->threadCond, &res->threadLock); } @@ -1080,20 +1163,21 @@ void* graphHelperFunc(void *args) { } } +// Check if we are in CUDA Graph capture mode ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) { comm->usingCudaGraph = 0; + // Feature requires CUDA 11.3/R465 or above #if CUDART_VERSION >= 11030 cudaStreamCaptureStatus captureStatus; unsigned long long cudaGraphId; + ncclResult_t ret = ncclSuccess; if (comm->driverVersion < 11030) { - 
CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus)); - if (captureStatus != cudaStreamCaptureStatusNone) { - WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support"); - return ncclInvalidUsage; - } - return ncclSuccess; + // Runtime driver version older than compiler version + // Enhanced compat fallback + goto enh_compat_end; } - CUDACHECK(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL)); + // Get CUDA Graph handle + CUDACHECKGOTO(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end); if (captureStatus == cudaStreamCaptureStatusActive) { if (cudaGraphId != comm->lastCudaGraphId) { INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId); @@ -1109,15 +1193,31 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) { // Only create this thread when buffer registration is enabled if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) { pthread_mutex_init(&comm->graphHelperResources->threadLock, NULL); + // Init signaling method between Graph destroy function and helper thread pthread_cond_init(&comm->graphHelperResources->threadCond, NULL); + // Set state comm->graphHelperResources->threadState = ThreadStart; + // Create thread pthread_create(&comm->graphHelperThread, NULL, graphHelperFunc, comm->graphHelperResources); + // Name thread + ncclSetThreadName(comm->graphHelperThread, "NCCL GrHelper%2d", comm->cudaDev); } } + return ncclSuccess; + +enh_compat_end: // Enhanced compat fallback + (void)ret; + CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus)); + if (captureStatus != cudaStreamCaptureStatusNone) { + WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support"); + return ncclInvalidUsage; + } + // If we are not in capture mode, we can 
ignore the driver being lower #endif return ncclSuccess; } +// Create host setup node in CUDA Graph ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph) { #if CUDART_VERSION >= 11030 struct ncclQueueInfo* eqInfo = comm->enqueueInfo; @@ -1125,14 +1225,17 @@ ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph) { // which CUDA graph would manage lifetime of cudaUserObject_t object; CUDACHECK(cudaUserObjectCreate(&object, eqInfo, ncclDestroyQueueInfo, 1/*initialRefcount*/, cudaUserObjectNoDestructorSync)); + // Hand over ownership to CUDA Graph CUDACHECK(cudaGraphRetainUserObject(graph, object, 1, cudaGraphUserObjectMove)); cudaHostFn_t fn = ncclEnqueueHostSetup<1>; // Add a CPU node to the graph cudaGraphNode_t setupNode; + // Function + parameter space for that function (i.e. enqueue info) cudaHostNodeParams setupNodeParams = {fn, eqInfo}; int numDependencies = comm->lastSetupNode == NULL ? 0 : 1; CUDACHECK(cudaGraphAddHostNode(&setupNode, graph, &comm->lastSetupNode, numDependencies, &setupNodeParams)); + // Create dependency from last setup node in the same graph CUDACHECK(cudaStreamUpdateCaptureDependencies(comm->userStream, &setupNode, 1, cudaStreamAddCaptureDependencies)); comm->lastSetupNode = setupNode; return ncclSuccess; @@ -1237,7 +1340,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); - if (info->coll == ncclFuncSendRecv) { //p2p stored separately + if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { //p2p stored separately NCCLCHECKGOTO(ncclSaveP2p(info), ret, end); } else { NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end); diff --git a/src/graph/connect.cc b/src/graph/connect.cc index a26611e..da9a360 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -1,5 +1,5 @@ 
/************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,6 +8,7 @@ #include "graph.h" #include "trees.h" #include "rings.h" +#include "topo.h" /******************************************************************/ /********************* Internode connection ***********************/ @@ -17,7 +18,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoRanks* topoRanks) { int rank = comm->rank; - int localRanks = comm->localRanks; + int localRanks = comm->topo->nodes[GPU].count; int nChannels = comm->nChannels; for (int c=0; c<nChannels; c++) { diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 64c54df..2bd52b0 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -171,20 +171,21 @@ static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* ret return ncclSuccess; } -static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int i1, int t2, int i2) { - struct ncclTopoNode* cpuNode = system->nodes[CPU].nodes+c; +static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) { + struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix; struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1; int l=0; // Node 1 -> CPU - for (int i=0; i<srcNode->paths[CPU][c].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[CPU][c].list[i]; + for (int i=0; i<srcNode->paths[tx][ix].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i]; // CPU -> Node 2 for (int i=0; i<cpuNode->paths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i]; // Update path characteristics srcNode->paths[t2][i2].count = l; - srcNode->paths[t2][i2].type = std::max(srcNode->paths[CPU][c].type, cpuNode->paths[t2][i2].type); - srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width); + srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); + if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN; + srcNode->paths[t2][i2].width = std::min(srcNode->paths[tx][ix].width, cpuNode->paths[t2][i2].width); return ncclSuccess; } @@ -241,6 +242,8 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE return ncclSuccess; } +NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0); + int ncclTopoUserP2pLevel = -1; ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) { *p2p = 0; @@ -256,13 +259,14 @@ ncclResult_t 
ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_ return ncclSuccess; } - + int intermediateIndex = -1; // Set intermediate GPU rank, if routing through an intermediate GPU. struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2; if (path->count == 2) { struct ncclTopoNode* intermediateNode = path->list[0]->remNode; - if (intermediateNode->type == GPU && intermediateRank) { - *intermediateRank = intermediateNode->gpu.rank; + if (intermediateNode->type == GPU) { + intermediateIndex = intermediateNode - system->nodes[GPU].nodes; + if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank; } } @@ -292,6 +296,38 @@ compare: // Compute the PCI distance and compare with the p2pLevel. if (path->type <= p2pLevel) *p2p = 1; + if (*p2p == 1) { + // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to + // validate against NVML at all since they are pretending to be on other hw. + if (g1 != g2 && ncclParamIgnoreDisabledP2p() != 2) { + int indexes[3] = {-1,-1,-1}; + int verticeN = 0; + NCCLCHECK(ncclNvmlEnsureInitialized()); + + indexes[verticeN++] = system->nodes[GPU].nodes[g1].gpu.dev; + if (intermediateIndex != -1) indexes[verticeN++] = system->nodes[GPU].nodes[intermediateIndex].gpu.dev; + indexes[verticeN++] = system->nodes[GPU].nodes[g2].gpu.dev; + + for (int i=1; i < verticeN; i++) { + nvmlGpuP2PStatus_t status; + status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusRead; + bool good = status == NVML_P2P_STATUS_OK; + status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusWrite; + good &= status == NVML_P2P_STATUS_OK; + if (!good) { + if (ncclParamIgnoreDisabledP2p()) { + *p2p = 0; + } else if (path->type <= PATH_NVB) { + WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. 
If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]); + return ncclUnhandledCudaError; + } else if (path->type < PATH_SYS) { + INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]); + } + } + } + } + } + if (path->type == PATH_NVL) { struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2; // Enable P2P Read for Ampere/NVLink only @@ -342,6 +378,14 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL")); if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel; int distance = gpu->paths[NET][n].type; + if (distance == PATH_PXN) { + // In case of PXN, use the intermediate GPU distance instead + int proxyRank, g; + NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank)); + NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g)); + struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g; + distance = proxyGpu->paths[NET][n].type; + } if (distance > netGdrLevel) { INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel); return ncclSuccess; @@ -352,6 +396,77 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int return ncclSuccess; } +ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank) { + // Get GPU and NET + int n, g; + NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + struct ncclTopoLinkList* path = gpu->paths[NET]+n; + if (path->type == PATH_PXN) { + struct ncclTopoNode* node; + int type = NVS; + for (int i=0; i<path->count && type == NVS; i++) { + node = path->list[i]->remNode; + type = node->type; + } + if 
(type != GPU) { + WARN("Could not find intermediate GPU between GPU rank %d and NIC %d\n", rank, netDev); + return ncclInternalError; + } + *intermediateRank = node->gpu.rank; + } else { + *intermediateRank = rank; + } + return ncclSuccess; +} + +NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0); + +// Net v4 plugins don't have non-blocking connect/accept. We can't therefore use +// remote proxies without risking deadlocks +int ncclPxnDisable() { + static int pxnDisable = -1; + if (pxnDisable == -1) { + if (ncclNetVersion() == 4) { + INFO(NCCL_INIT, "PXN Disabled as plugin is v4"); + pxnDisable = 1; + } else { + pxnDisable = ncclParamPxnDisable(); + } + } + return pxnDisable; +} + +ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks) { + struct ncclTopoSystem* system = comm->topo; + *nranks = 0; + *intermediateRanks = NULL; + if (system->nodes[NET].count == 0) return ncclSuccess; + + int nr = 0; + int* ranks = NULL; + for (int rank=0; rank<comm->nRanks; rank++) { + int netDev, proxyRank; + NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank)); + if (proxyRank == comm->rank) continue; + int useGdr; + NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr)); + if (useGdr == 0) continue; + int found = 0; + for (int r=0; r<nr; r++) { + if (ranks[r] == proxyRank) found = 1; + } + if (!found) { + NCCLCHECK(ncclRealloc(&ranks, nr, nr+1)); + ranks[nr++] = proxyRank; + } + } + *nranks = nr; + *intermediateRanks = ranks; + return ncclSuccess; +} + ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) { // Precompute paths between GPUs/NICs. 
@@ -376,7 +491,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer // Divert all traffic through the CPU int cpu; NCCLCHECK(getLocalCpu(system, g, &cpu)); - NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g)); + NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g)); } } @@ -403,6 +518,29 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer NCCLCHECK(ncclTopoSetPaths(netNode, system)); for (int g=0; g<system->nodes[GPU].count; g++) { + // Check whether we can access the NIC through another NVLink-connected GPU (PXN) + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) { + for (int p=0; p<system->nodes[GPU].count; p++) { + if (p == g) continue; + struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p; + + // To ensure proper balancing, use only a local GPU which advertised that NIC as its preferred one. + int netDev; + NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev)); + // Make sure we can allocate memory on that GPU. + if (netDev != netNode->id) continue; + + // PXN = PCI + NVLink. + if (netNode->paths[GPU][p].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue; + + // We can use that GPU as relay to communicate with that NIC. + // Only enabling it in the GPU->NIC direction for now to favor + // receiving locally and sending remotely (consistent with net.cc) + NCCLCHECK(addInterStep(system, GPU, p, GPU, g, NET, n)); + break; + } + } // Update path when we dont want to / can't use GPU Direct RDMA. 
int gdr; NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr)); @@ -410,8 +548,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU int localCpu; NCCLCHECK(getLocalCpu(system, g, &localCpu)); - NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g)); - NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n)); + NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g)); + NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n)); } } } @@ -454,7 +592,6 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* NCCLCHECK(ncclTopoRemoveNode(system, GPU, g)); } - comm->localRanks = system->nodes[GPU].count; if (system->nodes[GPU].count == comm->nRanks) { for (int n=system->nodes[NET].count-1; n>=0; n--) NCCLCHECK(ncclTopoRemoveNode(system, NET, n)); @@ -469,6 +606,8 @@ void ncclTopoFree(struct ncclTopoSystem* system) { free(system); } +NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 2); + static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) { int peer; struct ncclTopoLinkList* path = NULL; @@ -488,7 +627,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /* } } else { // Remote rank, use network - *nChannels = 1; + *nChannels = ncclParamNChannelsPerNetPeer(); } return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 8894bd1..d70b6a7 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -254,10 +254,10 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time); // Try to keep all searchs within one second -#define NCCL_SEARCH_GLOBAL_TIMEOUT (3ULL<<19) -#define NCCL_SEARCH_TIMEOUT (1<<18) -#define NCCL_SEARCH_TIMEOUT_TREE (1<<17) -#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<10) +#define NCCL_SEARCH_GLOBAL_TIMEOUT (1ULL<<18) +#define NCCL_SEARCH_TIMEOUT (1<<14) +#define NCCL_SEARCH_TIMEOUT_TREE (1<<14) +#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<8) #define FORCED_ORDER_PCI 1 #define FORCED_ORDER_REPLAY 2 @@ -305,6 +305,57 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoG return ncclSuccess; } +// Build a list of the best NETs to try. +// +// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu +// index when trying to get back to the NIC. +// +// The list is built the following way: +// 1. Select NETs starting with those close to GPU(s), based on paths[n].type. +// 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list +// based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which +// might have been choosen by GPU 0 (case with multiple independent communicators per node) +// 3. Then add the NETs to the final list if they were not already added by another closer GPU. 
+ +ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) { + int netCount = 0; + int localNetCount; + int* localNets; + NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count)); + + for (int t=0; t <= typeInter; t++) { + for (int g=0; g<system->nodes[GPU].count; g++) { + if (gpu != -1 && gpu != g) continue; + localNetCount = 0; + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + struct ncclTopoLinkList* paths = gpu->paths[NET]; + for (int n=0; n<system->nodes[NET].count; n++) { + if (paths[n].type == t) localNets[localNetCount++] = n; + } + if (localNetCount == 0) continue; + // Shuffle by gpu NVML device number so that GPUs on the same PCI switch + // with multiple NICs don't use the same one as first choice. + for (int r=0; r<system->nodes[GPU].nodes[g].gpu.dev % localNetCount; r++) { + int net0 = localNets[0]; + for (int i=0; i<localNetCount-1; i++) localNets[i] = localNets[i+1]; + localNets[localNetCount-1] = net0; + } + // Append NICs to list + for (int i=0; i<localNetCount; i++) { + int n = localNets[i]; + int found = 0; + while (nets[found] != n && found<netCount) found++; + if (found == netCount) nets[netCount++] = n; + } + } + } + + *netCountRet = netCount; + free(localNets); + + return ncclSuccess; +} + ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) { if ((*time) <= 0) return ncclSuccess; (*time)--; @@ -333,7 +384,12 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo int startNetIndex; NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex)); struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; - for (int n=0; n<system->nodes[NET].count; n++) { + int netcount; + int* nets; + NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); + 
NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netcount)); + for (int i=0; i<netcount; i++) { + int n = nets[i]; struct ncclTopoNode* net = system->nodes[NET].nodes+n; if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue; @@ -359,6 +415,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo graph->speedInter = speedInterSave; } } + free(nets); } } else if (step < system->nodes[GPU].count-1) { // Go to next GPU @@ -393,65 +450,12 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo return ncclSuccess; } -// Select only NICs with the maximum bandwidth w.r.t. GPUs, and sort them by distance. -ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int* nets, int* netcountRet) { - float* maxwidths; - int* minhops; - int netcount = 0; - NCCLCHECK(ncclCalloc(&minhops, system->nodes[NET].count)); - NCCLCHECK(ncclCalloc(&maxwidths, system->nodes[NET].count)); - for (int n=0; n<system->nodes[NET].count; n++) { - maxwidths[n] = 0.0; - minhops[n] = 255; - struct ncclTopoNode* net = system->nodes[NET].nodes+n; - struct ncclTopoLinkList* paths = net->paths[GPU]; - for (int g=0; g<system->nodes[GPU].count; g++) { - if (paths[g].width > maxwidths[n] || (paths[g].width == maxwidths[n] && paths[g].count < minhops[n])) { - maxwidths[n] = paths[g].width; - minhops[n] = paths[g].count; - } - } - if (netcount && maxwidths[nets[0]] > maxwidths[n]) continue; // Do not keep NICs with lower BW - if (netcount && maxwidths[nets[0]] < maxwidths[n]) netcount = 0; // Remove all NICs with lower BW - int index; - for (index = 0; index < netcount; index++) { - if (minhops[n] < minhops[nets[index]]) break; - } - // Insert net at index - // Shift all nets with higher nhops - for (int i = netcount; i>index; i--) nets[i] = nets[i-1]; - // Insert this 
net at index - nets[index] = n; - netcount++; - } - - *netcountRet = netcount; - - // Then shuffle NICs with the same nhops based on the GPU device number, so that when we have - // 2 NICs and 2 GPUs and create communicators with only one GPU, we will use both NICs. - for (int start = 0; start < netcount;) { - int end = start+1; - while (end < netcount && minhops[nets[end]] == minhops[nets[start]]) end++; - // Shuffle - for (int r=0; r<system->nodes[GPU].nodes[0].gpu.dev % (end-start); r++) { - int netStart = nets[start]; - for (int i=start; i<end-1; i++) nets[i] = nets[i+1]; - nets[end-1] = netStart; - } - start = end; - } - - free(minhops); - free(maxwidths); - return ncclSuccess; -} - ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { const int speed = graph->speedInter; int* nets; NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); int netcount; - NCCLCHECK(ncclTopoSelectNets(system, nets, &netcount)); + NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount)); for (int i=0; i<netcount; i++) { int n = nets[i]; struct ncclTopoNode* net = system->nodes[NET].nodes+n; @@ -461,6 +465,8 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo if (net->net.maxChannels == 0) continue; graph->inter[graph->nChannels*2] = net->id; + graph->latencyInter = net->net.latency; + for (int i=0; i<system->nodes[NET].count; i++) { if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) && (system->nodes[NET].nodes[i].net.port == net->net.port)) { @@ -587,7 +593,18 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra /* User defined graph from XML file */ /************************************/ -struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "NVB", PATH_NVB}, { "LOC", PATH_LOC }, { 
NULL, 0 } }; +struct kvDict kvDictLinkType[] = { + { "LOC", PATH_LOC }, + { "NVL", PATH_NVL }, + { "NVB", PATH_NVB }, + { "PIX", PATH_PIX }, + { "PXB", PATH_PXB }, + { "PXN", PATH_PXN }, + { "PHB", PATH_PHB }, + { "SYS", PATH_SYS }, + { NULL, 0 } +}; + ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; int* inter = graph->inter+2*c; @@ -627,6 +644,7 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->speedIntra)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->speedInter)); + if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0; const char* str; NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str)); NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType)); @@ -685,6 +703,7 @@ ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTop NCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels)); NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->speedIntra)); NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->speedInter)); + NCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter)); const char* str; NCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType)); NCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str)); @@ -712,10 +731,14 @@ float speedArrayInter[] = { 48.0, 30.0, 24.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0, #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float)) #define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float)) +NCCL_PARAM(CrossNic, "CROSS_NIC", 2); + ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; + graph->crossNic = ncclParamCrossNic(); int crossNic = 
(system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0; graph->speedIntra = graph->speedInter = 0; + graph->latencyInter = 0; if (graph->crossNic == 2) graph->crossNic = 0; graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; graph->typeInter = PATH_PIX; @@ -802,19 +825,13 @@ search: goto search; } tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; - if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXB)) { + + if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { tmpGraph.typeInter += 1; goto search; } tmpGraph.typeInter = PATH_PIX; - // Try a simpler tree - if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) { - tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE; - goto search; - } - tmpGraph.pattern = graph->pattern; - if (crossNic && tmpGraph.crossNic == 0) { // Try again with crossNic if permitted tmpGraph.crossNic = crossNic; @@ -822,6 +839,13 @@ search: } tmpGraph.crossNic = 0; + // Try a simpler tree + if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) { + tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE; + goto search; + } + tmpGraph.pattern = graph->pattern; + // Decrease speed until we find a solution if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->speedInter > .49))) { tmpGraph.speedInter = tmpGraph.speedIntra = speedArray[++speedIndex]; @@ -915,17 +939,66 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru return ncclSuccess; } -ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* dev) { +// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation +NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2); + +#include "comm.h" +ncclResult_t 
ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) { if (graph) { // Honor the net device in the graph int channel = channelId%graph->nChannels; - int ngpus = system->nodes[GPU].count; + int ngpus = comm->topo->nodes[GPU].count; int index = graph->intra[channel*ngpus] == rank ? 0 : 1; *dev = graph->inter[channel*2+index]; + NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank)); + } else if (peerRank == -1) { + return ncclInternalError; } else { - int64_t id; - NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, rr)); - *dev = id; + // Start with our local NIC and local Rank + NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev)); + *proxyRank = rank; + + int pxnLevel = ncclPxnDisable() == 1 ? 0 : ncclParamP2pPxnLevel(); + // See whether we can use the remote rank preferred device. + if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) { + int netDev = comm->peerInfo[peerRank].netDev; + int n; + // Check that device exists on our node + if (ncclParamCrossNic() == 0) { + if (ncclTopoIdToIndex(comm->topo, NET, netDev, &n) != ncclSuccess) { + WARN("Rank %d requires NIC %d but that NIC is not available for rank %d", peerRank, netDev, rank); + return ncclInvalidUsage; + } + *dev = netDev; + } + if (pxnLevel == 1) { + int g, n; + NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g)); + NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n)); + struct ncclTopoNode* gpu = comm->topo->nodes[GPU].nodes+g; + if (gpu->paths[NET][n].type <= PATH_PXN) { + *dev = netDev; + NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank)); + } + } else if (pxnLevel == 2) { + // Check whether we can access it through our node-local GPU for that NIC. 
+ for (int r=0; r<comm->localRanks; r++) { + int peerRank = comm->localRankToRank[r]; + if (comm->peerInfo[peerRank].netDev == netDev) { + int g1, g2, n; + NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1)); + NCCLCHECK(ncclTopoRankToIndex(comm->topo, peerRank, &g2)); + NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n)); + struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2; + if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) { + *proxyRank = peerRank; + *dev = netDev; + return ncclSuccess; + } + } + } + } + } } return ncclSuccess; } diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 1d34286..83f125f 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -20,8 +20,8 @@ #define BUSID_REDUCED_SIZE (sizeof("0000:00")) const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; -const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PHB", "SYS" }; +const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ @@ -121,6 +121,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo n->net.asic = 0ULL; n->net.port = NCCL_TOPO_UNDEF; n->net.width = 0.0; + n->net.latency = 0.0; } *node = n; return ncclSuccess; @@ -332,13 +333,14 @@ ncclResult_t 
ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s ncclDebugNoWarn = NCCL_GRAPH; int mbps; - if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0; + NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0)); if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1 net->net.width = mbps / 8000.0; - if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0; - if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0; - if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS; - if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0; + if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0; + NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0)); + NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0)); + NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS)); + NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0)); ncclDebugNoWarn = 0; NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width)); @@ -578,6 +580,16 @@ static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attr } return ncclSuccess; } +static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) { + int index; + NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); + if (index == -1) { + index = node->nAttrs++; + strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + snprintf(node->attrs[index].value, MAX_STR_LEN, "%f", value); + } + return ncclSuccess; +} ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { @@ -614,7 +626,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy // Auto-detect NICs if needed. 
net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. int netDevCount = 0; - if (ncclCollNet) { + if (collNetSupport()) { NCCLCHECK(collNetDevices(&netDevCount)); for (int n=0; n<netDevCount; n++) { ncclNetProperties_t props; @@ -643,6 +655,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy NCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed)); NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port)); + NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency)); NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid)); NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms)); NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0)); @@ -662,7 +675,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy return ncclSuccess; } -ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr) { +ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id) { int g; NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); int minType = PATH_SYS; @@ -679,6 +692,13 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_ } if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id; } + if (count == 0) { + *id = -1; + free(nets); + return ncclSuccess; + } + + int rr = system->nodes[GPU].nodes[g].gpu.dev; *id = nets[rr%count]; free(nets); return ncclSuccess; @@ -778,3 +798,14 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int* if (ccMax) *ccMax = max; return ncclSuccess; } + +ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) { + for (int g=0; g<system->nodes[GPU].count; g++) { + if (system->nodes[GPU].nodes[g].gpu.rank == rank) { + *localRank = g; + return ncclSuccess; + } + } + WARN("Could not find 
local GPU with rank %d\n", rank); + return ncclInternalError; +} diff --git a/src/graph/topo.h b/src/graph/topo.h index 304b496..ada1732 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -43,9 +43,10 @@ extern const char* topoNodeTypeStr[]; // Skipping 2 for PATH_NVB #define LINK_PCI 3 // Skipping 4 for PATH_PXB -// Skipping 5 for PATH_PHB -#define LINK_SYS 6 -#define LINK_NET 7 +// Skipping 5 for PATH_PXN +// Skipping 6 for PATH_PHB +#define LINK_SYS 7 +#define LINK_NET 8 extern const char* topoLinkTypeStr[]; #define PATH_LOC 0 @@ -53,8 +54,10 @@ extern const char* topoLinkTypeStr[]; #define PATH_NVB 2 #define PATH_PIX 3 #define PATH_PXB 4 -#define PATH_PHB 5 -#define PATH_SYS 6 +#define PATH_PXN 5 +#define PATH_PHB 6 +#define PATH_SYS 7 +#define PATH_DIS 7 extern const char* topoPathTypeStr[]; struct ncclTopoNode; @@ -93,6 +96,7 @@ struct ncclTopoNode { uint64_t asic; int port; float width; + float latency; int gdrSupport; int collSupport; int maxChannels; @@ -132,8 +136,7 @@ ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id) ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width); ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system); - -ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr); +ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank); ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** 
topoSystem); ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels); diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index e30a927..b07ca38 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -66,7 +66,7 @@ static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = /* PCI */ { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 1.0, 1.9, 8.0 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } } + { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 28 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } } }; // LL128 max BW per channel @@ -80,8 +80,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads); comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS); - comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = - getNthreads("NCCL_NTHREADS", ncclParamNthreads(), NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS); + comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = 
comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS); comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] = @@ -112,7 +111,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) : coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 : nRanks; - int nInterSteps = coll == ncclFuncAllReduce ? 2*(nNodes-1) : + int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2*nNodes :0) : coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 : nNodes; @@ -138,7 +137,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom comm->latencies[coll][a][p] = baseLat[a][p]; float intraLat = hwLat[intraHw[a]][a][p]; - float interLat = hwLat[NCCL_HW_NET][a][p]; + float interLat = graphs[a]->latencyInter ? graphs[a]->latencyInter : hwLat[NCCL_HW_NET][a][p]; + if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8; if (a == NCCL_ALGO_RING) { float lat = hwLat[hw[a]][a][p]; diff --git a/src/graph/xml.cc b/src/graph/xml.cc index 8f50301..838a7f5 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -602,7 +602,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); if (busId == NULL || cudaDeviceGetByPCIBusId(&dev, busId) != cudaSuccess) dev = -1; } else { - NCCLCHECK(wrapNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev)); + NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev)); } NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev)); } @@ -617,7 +617,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm CUDACHECK(cudaGetDeviceProperties(&devProp, dev)); cudaMajor = devProp.major; cudaMinor = devProp.minor; } else { - NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor)); + NCCLCHECK(ncclNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor)); } NCCLCHECK(xmlSetAttrInt(gpuNode, "sm", cudaMajor*10+cudaMinor)); } @@ -638,15 +638,15 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm for (int l=0; l<maxNvLinks; ++l) { // Check whether we can use this NVLink for P2P unsigned canP2P; - if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue; + if ((ncclNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue; // Make sure the Nvlink is up. The previous call should have trained the link. 
nvmlEnableState_t isActive; - if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue; + if ((ncclNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue; // Try to figure out what's on the other side of the NVLink nvmlPciInfo_t remoteProc; - if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue; + if (ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue; // Make a lower case copy of the bus ID for calling ncclDeviceType // PCI system path is in lower case @@ -701,13 +701,7 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl NCCLCHECK(xmlSetAttrIfUnset(node, "class", "0x03")); NCCLCHECK(ncclTopoGetXmlFromSys(node, xml)); nvmlDevice_t nvmlDev = NULL; - static int nvmlInit = 0; - if (nvmlInit == 0) { - nvmlInit = (wrapNvmlSymbols() != ncclSuccess || wrapNvmlInit() != ncclSuccess) ? 2 : 1; - } - if (nvmlInit == 1) { - if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL; - } + if (ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL; NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode)); return ncclSuccess; } diff --git a/src/graph/xml.h b/src/graph/xml.h index 0c16b95..73f777d 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -94,6 +94,14 @@ static ncclResult_t xmlGetAttrInt(struct ncclXmlNode* node, const char* attrName return ncclSuccess; } +static ncclResult_t xmlGetAttrIntDefault(struct ncclXmlNode* node, const char* attrName, int* value, int defaultValue) { + const char* str; + NCCLCHECK(xmlGetAttr(node, attrName, &str)); + *value = str ? strtol(str, NULL, 0) : defaultValue; + return ncclSuccess; +} + + static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) { const char* str; NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); diff --git a/src/group.cc b/src/group.cc index 217e76d..0e8f19e 100644 --- a/src/group.cc +++ b/src/group.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -52,21 +52,6 @@ struct ncclAsyncArgs { thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS]; -#define NCCLCHECKTHREAD(a) do { \ - if ((args->ret = (a)) != ncclSuccess) { \ - INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ - return args; \ - } \ -} while(0) - -#define CUDACHECKTHREAD(a) do { \ - if ((a) != cudaSuccess) { \ - INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ - args->ret = ncclUnhandledCudaError; \ - return args; \ - } \ -} while(0) - void* ncclAsyncThreadMain(void* args_) { struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_; NCCLCHECKTHREAD(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev)); @@ -116,15 +101,19 @@ ncclResult_t ncclGroupStart() { return ncclSuccess; } -static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) { - struct ncclInfo info = { ncclFuncSendRecv, "SendRecv", - sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */ +static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff) { + struct ncclInfo info = { ncclFuncSend, "Send", + NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */ + 1, 1 }; + info.channelId = channelId; + NCCLCHECK(ncclSetupP2pKernel(&info)); + return ncclSuccess; +} +static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff) { + struct ncclInfo info = { ncclFuncRecv, "Recv", + NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */ 1, 1 }; - info.delta = delta; info.channelId = channelId; - info.sendbytes = sendbytes; - info.recvbytes = 
recvbytes; - if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage; NCCLCHECK(ncclSetupP2pKernel(&info)); return ncclSuccess; } @@ -134,7 +123,7 @@ void* ncclAsyncThreadPreconnect(void* args_) { struct ncclComm* comm = args->coll.comm; CUDACHECKTHREAD(cudaSetDevice(comm->cudaDev)); if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, 0)); + NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, 1)); return args; } @@ -216,8 +205,10 @@ ncclResult_t ncclGroupEnd() { struct ncclAsyncArgs* args = ncclGroupArgs+i; if (args->funcType == ASYNC_FUNC_COLL) { struct ncclComm* comm = args->coll.comm; - int rank = comm->rank; - int nRanks = comm->nRanks; + int node = comm->node; + int nNodes = comm->nNodes; + int localRank = comm->localRank; + int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; // Compute how much to split operations // Natural step size matching buffer steps. @@ -233,50 +224,70 @@ ncclResult_t ncclGroupEnd() { while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) { // schedule delta 0, +1, -1, +2, -2, ... // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even. - for (int d=0; d<=nRanks/4; d++) { - int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, (nRanks-(nRanks/2-d))%nRanks }; + for (int d=0; d<=nNodes/4; d++) { + int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes }; int index = 0; int delta = deltas[index]; sched_delta: - uint32_t from = (rank+nRanks-delta)%nRanks; - uint32_t to = (rank+delta)%nRanks; - struct ncclP2Pinfo* recv = comm->p2pRecvs[from] ? comm->p2pRecvs[from]->getNext() : NULL; - struct ncclP2Pinfo* send = comm->p2pSends[to] ? 
comm->p2pSends[to]->getNext() : NULL; - if (recv != NULL || send != NULL) { - ssize_t totRecvBytes = -1, totSendBytes = -1; - if (recv != NULL) totRecvBytes = recv->nbytes; - if (send != NULL) totSendBytes = send->nbytes; - ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize); - ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize); - - ssize_t sendOffset = 0; - ssize_t recvOffset = 0; - int sendRemaining = 1, recvRemaining = 1; - int chunk = 0; - do { - int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels; - ssize_t recvbytes = totRecvBytes-recvOffset; - ssize_t sendbytes = totSendBytes-sendOffset; - if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; } - if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; } - // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested - // (total size == 0), otherwise set size to -1 so that the kernel skips the operation. - if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1; - if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1; - if (sendbytes >= 0 || recvbytes >= 0) { - NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId, - recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL, - sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL), ret, group_cleanup); + uint32_t recvNode = (node+nNodes-delta)%nNodes; + uint32_t sendNode = (node+delta)%nNodes; + int steps = comm->maxLocalRanks; + for (int s=0; s<steps; s++) { + int recvIndex = (localRank-s+steps)%steps; + int recvPeer = recvIndex<comm->nodeRanks[recvNode].localRanks ? comm->nodeRanks[recvNode].localRankToRank[recvIndex] : -1; + int sendIndex = (localRank+s)%steps; + int sendPeer = sendIndex<comm->nodeRanks[sendNode].localRanks ? 
comm->nodeRanks[sendNode].localRankToRank[sendIndex] : -1; + struct ncclP2Pinfo* recv = recvPeer != -1 && comm->p2pRecvs[recvPeer] ? comm->p2pRecvs[recvPeer]->getNext() : NULL; + struct ncclP2Pinfo* send = sendPeer != -1 && comm->p2pSends[sendPeer] ? comm->p2pSends[sendPeer]->getNext() : NULL; + if (recv != NULL || send != NULL) { + ssize_t totRecvBytes = -1, totSendBytes = -1; + if (recv != NULL) totRecvBytes = recv->nbytes; + if (send != NULL) totSendBytes = send->nbytes; + if (recv) comm->p2pRecvCount--; + if (send) comm->p2pSendCount--; + if (recvPeer == comm->rank) { // Check self send/recv + if (sendPeer != comm->rank) { WARN("Sendrecv schedule not aligned for self"); ret = ncclInternalError; goto group_cleanup; } + if (send && recv == NULL) { WARN("Trying to send to self without a matching recv"); ret = ncclInvalidUsage; goto group_cleanup; } + if (send == NULL && recv) { WARN("Trying to recv to self without a matching send"); ret = ncclInvalidUsage; goto group_cleanup; } } - recvOffset += recvChunkSize; - sendOffset += sendChunkSize; - chunk++; - } while (sendRemaining || recvRemaining); - if (recv) comm->p2pRecvCount--; - if (send) comm->p2pSendCount--; + void* recvBuff = recv ? recv->buff : NULL; + void* sendBuff = send ? send->buff : NULL; + // After we recycle p2pSend/Recv, we're no longer allowed to dereference send or recv, only use them as boolean NULL/not NULL. 
+ if (recv && comm->p2pRecvs[recvPeer]->peakNext() == NULL) comm->p2pRecvs[recvPeer]->recycle(); + if (send && comm->p2pSends[sendPeer]->peakNext() == NULL) comm->p2pSends[sendPeer]->recycle(); + + ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize); + ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize); + + ssize_t sendOffset = 0; + ssize_t recvOffset = 0; + int sendRemaining = 1, recvRemaining = 1; + int chunk = 0; + do { + // Shuffle channels with s intra-node, and delta inter-node. Inter-node, make sure + // to use multiple channels to guarantee progress on all ranks from the same node. + int shuffle = comm->nNodes > 1 ? delta+(s/p2pGroupSize) : s; + int channelId = (shuffle+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels; + ssize_t recvbytes = totRecvBytes-recvOffset; + ssize_t sendbytes = totSendBytes-sendOffset; + if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; } + if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; } + // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested + // (total size == 0), otherwise set size to -1. 
+ if (sendbytes <= 0 && totSendBytes != 0) send = NULL; + if (recvbytes <= 0 && totRecvBytes != 0) recv = NULL; + if (recv) { + NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, channelId, recvbytes, ((char*)recvBuff)+recvOffset), ret, group_cleanup); + } + if (send) { + NCCLCHECKGOTO(scheduleSend(comm, sendPeer, channelId, sendbytes, ((char*)sendBuff)+sendOffset), ret, group_cleanup); + } + recvOffset += recvChunkSize; + sendOffset += sendChunkSize; + chunk++; + } while (sendRemaining || recvRemaining); + } } - if (recv == NULL && comm->p2pRecvs[from]) comm->p2pRecvs[from]->recycle(); - if (send == NULL && comm->p2pSends[to]) comm->p2pSends[to]->recycle(); index++; if (index == 1 && deltas[1] == deltas[0]) index++; if (index == 2 && deltas[2] == deltas[0]) index++; @@ -382,16 +393,6 @@ group_cleanup: } comm->p2pSendCount = comm->p2pRecvCount = 0; } - /* Free all proxy ops in state->nextOps */ - struct ncclProxyState* state = &comm->proxyState; - pthread_mutex_lock(&state->poolMutex); - for (struct ncclProxyArgs *op = state->nextOps; op; op = op->next) { - op->next = state->pool; - state->pool = op; - } - pthread_mutex_unlock(&state->poolMutex); - state->nextOps = NULL; - ncclLaunchReset(comm); } } diff --git a/src/include/alloc.h b/src/include/alloc.h index 0791592..14bccf9 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -30,16 +30,37 @@ static inline ncclResult_t ncclCudaHostFree(void* ptr) { } template <typename T> -static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { +static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { void* p = malloc(nelem*sizeof(T)); if (p == NULL) { WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); return ncclSystemError; } + //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p); memset(p, 0, nelem*sizeof(T)); *ptr = (T*)p; return ncclSuccess; } +#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +template <typename T> +static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { + if (nelem < oldNelem) return ncclInternalError; + if (nelem == oldNelem) return ncclSuccess; + + T* oldp = *ptr; + T* p = (T*)malloc(nelem*sizeof(T)); + if (p == NULL) { + WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); + return ncclSystemError; + } + memcpy(p, oldp, oldNelem*sizeof(T)); + free(oldp); + memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T)); + *ptr = (T*)p; + INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr); + return ncclSuccess; +} template <typename T> static ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 77ac12b..a787c0b 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -8,18 +8,17 @@ #define NCCL_BOOTSTRAP_H_ #include "nccl.h" +#include "comm.h" ncclResult_t bootstrapNetInit(); ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv); ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out); -ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState); +ncclResult_t bootstrapInit(ncclUniqueId* id, struct ncclComm* comm); ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); -ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, cudaIpcMemHandle_t* ipc, void** ptr); -ncclResult_t bootstrapRemFree(int id, int rank, void* commState); ncclResult_t bootstrapClose(void* commState); ncclResult_t bootstrapAbort(void* commState); #endif diff --git a/src/include/checks.h b/src/include/checks.h index 131c079..9624608 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -60,6 +60,49 @@ } \ } while(true) +#define SYSCHECKGOTO(statement, res, label) do { \ + if ((statement) == -1) { \ + /* Print the back trace*/ \ + res = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ +} while (0); + +#define NEQCHECK(statement, value) do { \ + if ((statement) != value) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \ + return ncclSystemError; \ + } \ +} while (0); + +#define NEQCHECKGOTO(statement, value, res, label) do { \ + if ((statement) != value) { \ + /* Print the back trace*/ \ + res = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ +} while (0); + +#define EQCHECK(statement, value) do { \ + if ((statement) == value) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \ + return ncclSystemError; \ + } \ +} while (0); + +#define EQCHECKGOTO(statement, value, res, label) do { \ + if ((statement) == value) { \ + /* Print the back trace*/ \ + res = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ +} while (0); + // Propagate errors up #define NCCLCHECK(call) do { \ ncclResult_t res = call; \ @@ -79,4 +122,39 @@ } \ } while (0); +#define NCCLWAIT(call, cond, abortFlagPtr) do { \ + volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + ncclResult_t res = call; \ + if (res != ncclSuccess) { \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + return ncclInternalError; \ + } \ + if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ +} while (!(cond)); + +#define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \ + volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + res = call; \ + if (res != ncclSuccess) { \ + if (ncclDebugNoWarn == 0) 
INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ + if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \ +} while (!(cond)); + +#define NCCLCHECKTHREAD(a) do { \ + if ((args->ret = (a)) != ncclSuccess) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ + return args; \ + } \ +} while(0) + +#define CUDACHECKTHREAD(a) do { \ + if ((a) != cudaSuccess) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ + args->ret = ncclUnhandledCudaError; \ + return args; \ + } \ +} while(0) + #endif diff --git a/src/include/coll_net.h b/src/include/coll_net.h index 0d17b76..c2d831e 100644 --- a/src/include/coll_net.h +++ b/src/include/coll_net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -29,6 +29,6 @@ static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; } static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; } -static int collNetSupport() { return ncclCollNet != NULL ? 1 : 0; } +static int collNetSupport() { return ncclCollNet != nullptr ? 1 : 0; } #endif diff --git a/src/include/collectives.h b/src/include/collectives.h index 5fde721..d65c6ae 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -36,7 +36,7 @@ struct ncclDevRedOpFull { /* Declare all collective operations */ #define DECL5(func, algo, proto, devredop, type) \ extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ - extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(ncclWorkElem c); \ + extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem c); \ #define CONCAT(a,b) a##b #define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f) diff --git a/src/include/comm.h b/src/include/comm.h index bcbc695..4b55dc6 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -31,8 +31,6 @@ struct cudaLaunchParams { #define NCCL_LL128_THREAD_THRESHOLD 8 #define NCCL_SIMPLE_THREAD_THRESHOLD 64 -#define NCCL_MAX_INTRA_RANKS 32 - struct ncclSendMem { union { struct { @@ -41,10 +39,10 @@ struct ncclSendMem { void* ptrExchange; uint64_t redOpArgExchange[2]; char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)]; + int offsFifo[NCCL_STEPS]; }; char pad3[MEM_ALIGN]; }; - char buff[1]; // Actually larger than that }; struct ncclRecvMem { @@ -53,18 +51,18 @@ struct ncclRecvMem { uint64_t tail; char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; int sizesFifo[NCCL_STEPS]; - void* ptrsFifo[NCCL_STEPS]; + int offsFifo[NCCL_STEPS]; + int flush; // For GDRCopy-based flush }; char pad4[MEM_ALIGN]; }; - char buff[1]; // Actually larger than that }; typedef cudaError_t(*pfn_cuMemGetAddressRange_t)(void**, size_t*, void*); enum helperThreadState 
{ThreadStart, ThreadStop}; -#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_INTRA_RANKS*NCCL_MAX_OPS) +#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS) struct ncclGraphHelperResources { ncclComm* comm; @@ -82,6 +80,11 @@ struct ncclUserRedOp { ncclDevRedOpFull opFull; }; +struct ncclNodeRanks { + int localRanks; + int* localRankToRank; +}; + struct ncclComm { struct ncclChannel channels[MAXCHANNELS]; @@ -102,12 +105,14 @@ struct ncclComm { int node; int nNodes; - - // Intra-node rank info - int intraNodeGlobalRanks[NCCL_MAX_INTRA_RANKS]; + int localRank; int localRanks; - int intraNodeRank; - int8_t* rankToIntraNodeRank; + int maxLocalRanks; + int* rankToNode; + int* rankToLocalRank; + int* localRankToRank; + // localRanks and localRanktoRank for all nodes + struct ncclNodeRanks* nodeRanks; enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode; cudaStream_t userStream; @@ -161,14 +166,13 @@ struct ncclComm { // Storage for deferred intra-process launch struct cudaLaunchParams * intraParams; struct cudaLaunchParams *myParams; + pthread_t* intraThreads; int* intraCudaDevs; int* intraCGMode; // Whether we can use CUDA9 CGMD or not int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not struct ncclWorkElem args; - void* argsptr; + void* argsptrs[2]; - // Global proxy thread - pthread_t proxyThread; struct ncclProxyState proxyState; // Whether this communicator uses collNet diff --git a/src/include/debug.h b/src/include/debug.h index 6ce90ee..7af38fd 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -16,6 +16,9 @@ #include <string.h> #include <pthread.h> +// Conform to pthread and NVTX standard +#define NCCL_THREAD_NAMELEN 16 + extern int ncclDebugLevel; extern uint64_t ncclDebugMask; extern pthread_mutex_t ncclDebugOutputLock; @@ -37,4 +40,6 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch; #define TRACE(...) #endif +void ncclSetThreadName(pthread_t thread, const char *fmt, ...); + #endif diff --git a/src/include/devcomm.h b/src/include/devcomm.h index 676ffda..8ff9d4b 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,8 +11,8 @@ #include "align.h" #include <stdint.h> -#define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now -typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclNumFuncs} ncclFunc_t; +#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now +typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; #define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet @@ -90,16 +90,22 @@ struct ncclConnInfo { uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case int *sizesFifo; // Sizes fifo from GPU to proxy - void* *ptrsFifo; // Buffer fifo from proxy to GPU + int *offsFifo; // Buffer fifo from proxy to GPU uint64_t step; // Keep where we are uint64_t llLastCleaning; }; +struct 
ncclProxyConnector { + int rank; + int localRank; + struct ncclProxyConnection* connection; + struct ncclComm* comm; +}; + struct ncclConnector { int connected; - struct ncclProxyArgs *proxyAppend; - struct ncclProxyArgs **proxyAppendPtr; + struct ncclProxyConnector proxyConn; struct ncclTransportComm* transportComm; void* transportResources; struct ncclConnInfo conn; @@ -147,63 +153,89 @@ struct ncclPeer { struct ncclDevComm; -#define NCCL_MAX_WORK_ELEMENTS 8 -#define NCCL_MAX_GROUPS (NCCL_MAX_WORK_ELEMENTS*2) - /* ncclWork is to be a power of two, currently 8x64 bytes, */ /* to make sure reads to host from the CUDA kernel are aligned. */ /* Make sure to adjust padding at the end of ncclWorkElem. */ -struct ncclWorkElem { - // Header - struct ncclDevComm* comm; - uint16_t nThreads; +#define NCCL_WORK_SIZE 512 + +enum ncclWorkElemType : uint8_t { + ncclWorkTypeUnused=0, + ncclWorkTypeColl=1, + ncclWorkTypeP2p=2, + ncclWorkTypeRegColl=3 +}; +enum ncclWorkElemSubType : uint8_t { + ncclWorkSubTypeUnused =0, + ncclWorkSubTypeSend, + ncclWorkSubTypeRecv +}; + +struct ncclWorkElemHeader { uint16_t funcIndex; + enum ncclWorkElemType type; + unsigned nWarps:5; + unsigned isLast:1; +}; + +struct ncclWorkElem { + struct ncclWorkElemHeader header; uint8_t regUsed; uint8_t direct; - uint8_t active, redOpArgIsPtr; + uint8_t redOpArgIsPtr; const void * sendbuff; void * recvbuff; - // Op-specific fields. 
- union { - struct { - size_t count; - size_t lastChunkSize; - uint32_t root; - uint8_t bid; - uint8_t nChannels; - uint64_t redOpArg; - } coll; - struct { - size_t sendCount; - size_t recvCount; - int sendChunkSize; - int recvChunkSize; - int32_t delta; - uint16_t nThreads; - } p2p; - uint64_t align[4]; - }; + size_t count; + size_t lastChunkSize; + uint32_t root; + uint8_t bid; + uint8_t nChannels; + uint64_t redOpArg; + uint64_t pad; +}; +static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElem) == 0, "ncclWorkElem size must be a multiple of ncclWork size"); + +struct ncclWorkElemP2p { + struct ncclWorkElemHeader header; + int32_t peer; + void* buff; + size_t count; + int chunkSize; + uint8_t ngroups; + uint8_t warpStart; + uint8_t nWarps; + enum ncclWorkElemSubType subType; }; -static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size"); +static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemP2p) == 0, "ncclWorkElemP2p size must be a multiple of ncclWork size"); -struct ncclWorkRegElem { +struct ncclWorkElemReg { struct ncclWorkElem elem; void* dnInputs[NCCL_MAX_DIRECT_ARITY+1]; void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1]; void* upOutputs[NCCL_MAX_DIRECT_ARITY+1]; }; -#define NCCL_REG_ELEM_FACTOR 4 -static_assert(sizeof(struct ncclWorkRegElem) == (NCCL_REG_ELEM_FACTOR*sizeof(struct ncclWorkElem)), "ncclWorkRegElem size must be pow2 times ncclWorkElem size"); +static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemReg) == 0, "ncclWork size must be a multiple of ncclWorkElemReg size"); +static_assert(sizeof(struct ncclWorkElemReg) % sizeof(struct ncclWorkElem) == 0, "ncclWorkElemReg size must be a multiple of ncclWorkElem size"); + +#define NCCL_MAX_WORK_ELEMENTS (NCCL_WORK_SIZE/sizeof(struct ncclWorkElem)) +#define NCCL_MAX_WORK_ELEMENTS_P2P (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemP2p)) +#define NCCL_MAX_WORK_ELEMENTS_REG (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemReg)) +// Number of named barriers supported by CUDA 
+#define NCCL_MAX_GROUPS 16 struct ncclWork { union { + char pad[NCCL_WORK_SIZE]; + struct ncclWorkElemHeader header; struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS]; - struct ncclWorkRegElem regElems[NCCL_MAX_WORK_ELEMENTS/NCCL_REG_ELEM_FACTOR]; + struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P]; + struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG]; }; }; +static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "ncclWork size needs to be well aligned"); + struct ncclChannel { union { struct { diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 962896e..02a9adb 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -31,17 +31,17 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph); ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph); struct ncclBuffRegInfo { - void* sendbuffsBase[NCCL_MAX_INTRA_RANKS]; - void* recvbuffsBase[NCCL_MAX_INTRA_RANKS]; - void* sendbuffs[NCCL_MAX_INTRA_RANKS]; - void* recvbuffs[NCCL_MAX_INTRA_RANKS]; + void* sendbuffsBase[NCCL_MAX_LOCAL_RANKS]; + void* recvbuffsBase[NCCL_MAX_LOCAL_RANKS]; + void* sendbuffs[NCCL_MAX_LOCAL_RANKS]; + void* recvbuffs[NCCL_MAX_LOCAL_RANKS]; int nBuffs; }; // Enqueue information (for kernel and proxy) for each operation struct ncclQueueElem { - struct ncclWorkElem work; - struct ncclProxyArgs proxyArgs; + struct ncclWork work; + struct ncclProxyOp proxyOp; struct ncclBuffRegInfo buffRegInfo; }; @@ -87,7 +87,7 @@ static void ncclDestroyQueueInfo(void* ptr) { // but currently the destroy function of CUDA objects does not allow CUDA API calls while (eqElem != NULL) { for (int i=0; 
i<eqElem->buffRegInfo.nBuffs; i++) { - if (i == eqInfo->comm->intraNodeRank) continue; + if (i == eqInfo->comm->localRank) continue; CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i])); CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i])); } diff --git a/src/include/graph.h b/src/include/graph.h index 4b7a836..898b903 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -30,9 +30,12 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); // Query topology -ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net); +ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); +int ncclPxnDisable(); +ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); +ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank); // Find CPU affinity ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); @@ -48,6 +51,7 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #define NCCL_TOPO_CPU_TYPE_YONGFENG 1 ncclResult_t 
ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); +ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id); #define NCCL_TOPO_MAX_NODES 256 @@ -70,6 +74,7 @@ struct ncclTopoGraph { int nChannels; float speedIntra; float speedInter; + float latencyInter; int typeIntra; int typeInter; int sameChannels; diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h index 4ec1ac6..63555ba 100644 --- a/src/include/ibvwrap.h +++ b/src/include/ibvwrap.h @@ -4,7 +4,7 @@ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -328,7 +328,8 @@ enum ibv_access_flags { IBV_ACCESS_REMOTE_WRITE = (1<<1), IBV_ACCESS_REMOTE_READ = (1<<2), IBV_ACCESS_REMOTE_ATOMIC = (1<<3), - IBV_ACCESS_MW_BIND = (1<<4) + IBV_ACCESS_MW_BIND = (1<<4), + IBV_ACCESS_RELAXED_ORDERING = (1<<20), }; struct ibv_pd { @@ -1065,6 +1066,7 @@ ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context) ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd); ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access); struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); +ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr); ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context); ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel); diff --git 
a/src/include/info.h b/src/include/info.h index 2e99e9c..3461cc7 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,7 +11,7 @@ #include "devcomm.h" #include "collectives.h" -typedef enum { +typedef enum : uint8_t { ncclPatternRing, ncclPatternRingTwice, ncclPatternPipelineFrom, @@ -19,7 +19,9 @@ typedef enum { ncclPatternTreeUp, ncclPatternTreeDown, ncclPatternTreeUpDown, - ncclPatternCollTreeUpDown + ncclPatternCollTreeUpDown, + ncclPatternSend, + ncclPatternRecv } ncclPattern_t; // Used to pass NCCL call information between functions @@ -32,7 +34,7 @@ struct ncclInfo { size_t count; ncclDataType_t datatype; ncclRedOp_t op; - int root; + int root; // peer for p2p operations ncclComm_t comm; cudaStream_t stream; // Algorithm details @@ -48,11 +50,7 @@ struct ncclInfo { size_t nBytes; int nstepsPerLoop; int nchunksPerLoop; - ssize_t sendbytes; - ssize_t recvbytes; - int recvChunkSize; - int sendChunkSize; - uint32_t delta; + int chunkSize; int channelId; }; diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index 389c1ea..ce61672 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -10,7 +10,7 @@ #include "nccl.h" #include <stdint.h> -#define NCCL_NET_HANDLE_MAXSIZE 64 +#define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 @@ -31,10 +31,114 @@ typedef struct { int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA int speed; // Port speed in Mbps. int port; // Port number. + float latency; // Network latency int maxComms; // Maximum number of comms we can create -}ncclNetProperties_v4_t; + int maxRecvs; // Maximum number of grouped receives. +}ncclNetProperties_v5_t; -typedef ncclNetProperties_v4_t ncclNetProperties_t; +typedef ncclNetProperties_v5_t ncclNetProperties_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. 
+ // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v5_t; + +typedef ncclNet_v5_t ncclNet_t; + +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v5 + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. 
+ ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v5_t; + +typedef ncclCollNet_v5_t ncclCollNet_t; + +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v5 + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA + int speed; // Port speed in Mbps. + int port; // Port number. + int maxComms; // Maximum number of comms we can create +} ncclNetProperties_v4_t; typedef struct { // Name of the network (mainly for logs) @@ -75,10 +179,6 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v4_t; -typedef ncclNet_v4_t ncclNet_t; - -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4 - typedef struct { // Name of the collective network (mainly for logs) const char* name; @@ -117,8 +217,4 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v4_t; -typedef ncclCollNet_v4_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4 - #endif // end include guard diff --git a/src/include/net.h b/src/include/net.h index ef553e2..0cc5067 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -9,10 +9,14 @@ #include "nccl.h" #include "nccl_net.h" +#include "checks.h" extern ncclNet_t* ncclNet; typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; +ncclResult_t ncclNetInit(); +int ncclNetVersion(); + // Translation to external API static const char* ncclNetName() { return ncclNet->name; } static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; } @@ -22,56 +26,16 @@ static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCC static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; } -static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; } -static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; } -static ncclResult_t ncclNetIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, data, size, mhandle, request)); return ncclSuccess; } -static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; } +static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return 
ncclSuccess; } +static ncclResult_t ncclNetIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; } +static ncclResult_t ncclNetIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; } +static ncclResult_t ncclNetTest(void* request, int* done, int* sizes) { NCCLCHECK(ncclNet->test(request, done, sizes)); return ncclSuccess; } static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; } // Test whether the current GPU support GPU Direct RDMA. -#define GPU_BUF_SIZE (2*1024*1024) -static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) { - int netDevs; - NCCLCHECK(ncclNetDevices(&netDevs)); - *gdrSupport = 0; - for (int dev=0; dev<netDevs; dev++) { - // Find a net device which is GDR-capable - ncclNetProperties_t props; - NCCLCHECK(ncclNet->getProperties(dev, &props)); - if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; - - // Allocate memory on the GPU and try to register it on the NIC. 
- void *lComm = NULL, *sComm = NULL, *rComm = NULL; - ncclNetHandle_t handle; - void* gpuPtr = NULL; - void* mHandle = NULL; - ncclResult_t ret; - ncclDebugNoWarn = NCCL_NET; - NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1); - NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2); - NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3); - CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4); - if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { - NCCLCHECK(ncclNetDeregMr(sComm, mHandle)); - NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); - NCCLCHECK(ncclNetDeregMr(rComm, mHandle)); - *gdrSupport = 1; - } - ncclDebugNoWarn = 0; - CUDACHECK(cudaFree(gpuPtr)); -cleanup4: - NCCLCHECK(ncclNetCloseRecv(rComm)); -cleanup3: - NCCLCHECK(ncclNetCloseSend(sComm)); -cleanup2: - NCCLCHECK(ncclNetCloseListen(lComm)); -cleanup1: - break; - } - return ncclSuccess; -} +ncclResult_t ncclGpuGdrSupport(int* gdrSupport); extern ncclNet_t ncclNetIb; extern ncclNet_t ncclNetSocket; diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index 21ee82e..29731dd 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -9,59 +9,13 @@ #include "nccl.h" -// The NVML library doesn't appear to be thread safe -#include <pthread.h> -extern pthread_mutex_t nvmlLock; -#define NVMLLOCK() pthread_mutex_lock(&nvmlLock) -#define NVMLUNLOCK() pthread_mutex_unlock(&nvmlLock) - -#define NVMLLOCKCALL(cmd, ret) do { \ - NVMLLOCK(); \ - ret = cmd; \ - NVMLUNLOCK(); \ -} while(false) - -#define NVMLCHECK(cmd) do { \ - nvmlReturn_t e; \ - NVMLLOCKCALL(cmd, e); \ - if( e != NVML_SUCCESS ) { \ - WARN("NVML failure '%s'", nvmlErrorString(e)); \ - return ncclSystemError; \ - } \ -} while(false) - -//#define NVML_DIRECT 1 -#ifdef NVML_DIRECT -#include "nvml.h" +//#define NCCL_NVML_DIRECT 1 +#ifndef NCCL_NVML_DIRECT +#define NCCL_NVML_DIRECT 0 +#endif -static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; } -static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; } -static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; } -static ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { - NVMLCHECK(nvmlDeviceGetHandleByPciBusId(pciBusId, device)); - return ncclSuccess; -} -static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { - NVMLCHECK(nvmlDeviceGetIndex(device, index)); - return ncclSuccess; -} -static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { - NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive)); - return ncclSuccess; -} -static ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) { - NVMLCHECK(nvmlDeviceGetNvLinkRemotePciInfo(device, link, pci)); - return ncclSuccess; -} -static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, unsigned 
int *capResult) { - NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult)); - return ncclSuccess; -} -static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { - NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor)); - return ncclSuccess; -} +#if NCCL_NVML_DIRECT +#include "nvml.h" #else // Dynamically handle dependencies on NVML @@ -129,21 +83,56 @@ typedef struct nvmlPciInfo_st unsigned int reserved2; unsigned int reserved3; } nvmlPciInfo_t; -/* End of nvml.h */ - -ncclResult_t wrapNvmlSymbols(void); -ncclResult_t wrapNvmlInit(void); -ncclResult_t wrapNvmlShutdown(void); -ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); -ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); -ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); -ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); -ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); -ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, unsigned int *capResult); -ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor); - -#endif // NVML_DIRECT +/* P2P Capability Index Status*/ +typedef enum nvmlGpuP2PStatus_enum +{ + NVML_P2P_STATUS_OK = 0, + NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, + NVML_P2P_STATUS_GPU_NOT_SUPPORTED, + NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, + NVML_P2P_STATUS_DISABLED_BY_REGKEY, + NVML_P2P_STATUS_NOT_SUPPORTED, + NVML_P2P_STATUS_UNKNOWN +} nvmlGpuP2PStatus_t; + +/* P2P Capability Index*/ +typedef enum nvmlGpuP2PCapsIndex_enum +{ + NVML_P2P_CAPS_INDEX_READ = 0, + NVML_P2P_CAPS_INDEX_WRITE, + NVML_P2P_CAPS_INDEX_NVLINK, + NVML_P2P_CAPS_INDEX_ATOMICS, + NVML_P2P_CAPS_INDEX_PROP, + 
NVML_P2P_CAPS_INDEX_UNKNOWN +} nvmlGpuP2PCapsIndex_t; +/* End of nvml.h */ +#endif // NCCL_NVML_DIRECT + +constexpr int ncclNvmlMaxDevices = 32; +struct ncclNvmlDeviceInfo { + nvmlDevice_t handle; + int computeCapabilityMajor, computeCapabilityMinor; +}; +struct ncclNvmlDevicePairInfo { + nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite; +}; +extern int ncclNvmlDeviceCount; +extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; +extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; + +// All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. +// Outsiders need only call it if they want to inspect the ncclNvml global +// tables above. +ncclResult_t ncclNvmlEnsureInitialized(); + +ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); +ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); +ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); +ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); +ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); +ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); +ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor); +ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus); #endif // End include guard diff --git a/src/include/param.h b/src/include/param.h index 49c4606..7f749fb 100644 --- a/src/include/param.h +++ b/src/include/param.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -38,6 +38,7 @@ static void setEnvFile(const char* fileName) { strncpy(envValue, line+s, 1023); envValue[1023]='\0'; setenv(envVar, envValue, 0); + //printf("%s : %s->%s\n", fileName, envVar, envValue); } if (line) free(line); fclose(file); diff --git a/src/include/profiler.h b/src/include/profiler.h new file mode 100644 index 0000000..103af99 --- /dev/null +++ b/src/include/profiler.h @@ -0,0 +1,37 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROFILER_H_ +#define NCCL_PROFILER_H_ + +#include "proxy.h" + +enum ncclProxyProfileState { + ncclProxyProfileBegin = 0, + + ncclProxyProfileSendGPUWait = 1, + ncclProxyProfileSendWait = 2, + + ncclProxyProfileRecvWait = 1, + ncclProxyProfileRecvFlushWait = 2, + ncclProxyProfileRecvGPUWait = 3, + + ncclProxyProfileEnd = 4, + + ncclProxyProfileSleep = 8, + ncclProxyProfileWakeup = 9, + + ncclProxyProfileIdle = 16, + ncclProxyProfileActive = 17, + + ncclProxyProfileAppend = 24, + ncclProxyProfileAppendEnd = 25 +}; + +ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state); +void ncclProfilingDump(); + +#endif diff --git a/src/include/proxy.h b/src/include/proxy.h index 58a58b2..c7ca0aa 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -7,27 +7,47 @@ #ifndef NCCL_PROXY_H_ #define NCCL_PROXY_H_ +#include "devcomm.h" +#include "info.h" +#include "socket.h" #include <pthread.h> enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; struct ncclProxyArgs; -typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*); +typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclComm*, struct ncclProxyArgs*); #define NCCL_PROXY_MAX_SUBS MAXCHANNELS static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements"); +struct ncclProxyOp { + struct ncclProxyConnection* connection; + int channelId; + int nsteps; + ssize_t nbytes; + int root; + int next; + + uint64_t opCount; + int sliceSteps; + int chunkSteps; + int chunkSize; + ncclDataType_t dtype; + ncclRedOp_t redOp; + ncclPattern_t pattern; // uint8_t + uint8_t protocol; + uint16_t pad; +}; +static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch"); + struct ncclProxySubArgs { - struct ncclChannel* channel; - struct ncclConnector* connector; + struct ncclProxyConnection* connection; + int channelId; int nsteps; - ssize_t sendbytes; - ssize_t recvbytes; - int sendChunkSize; - int recvChunkSize; - int delta; + ssize_t nbytes; + int peer; - // Internal state + int groupSize; // Number of consecutive sub operations sharing the same recvComm uint64_t base; uint64_t posted; uint64_t received; @@ -36,23 +56,22 @@ struct ncclProxySubArgs { uint64_t done; uint64_t end; void* requests[NCCL_STEPS]; + void* profilingEvents[NCCL_STEPS]; }; struct ncclProxyArgs { - proxyProgressFunc_t progress; struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS]; + proxyProgressFunc_t progress; int nsubs; int done; + uint64_t opCount; int sliceSteps; int chunkSteps; int chunkSize; - uint64_t opCount; - uint64_t commOpCount; - int protocol; ncclDataType_t 
dtype; ncclRedOp_t redOp; ncclPattern_t pattern; - int root; + uint8_t protocol; int state; char* sharedBuff[NCCL_STEPS]; int sharedSize[NCCL_STEPS]; @@ -60,39 +79,104 @@ struct ncclProxyArgs { int idle; // Element linking - pthread_mutex_t mutex; struct ncclProxyArgs* next; struct ncclProxyArgs* nextPeer; struct ncclProxyArgs** proxyAppendPtr; }; +#define NCCL_MAX_NETDEVS 128 + +// ProxyOps are used to communicate between main thread and service thread +// Make sure we have enough to store two full rounds of operations on all channels. +// Otherwise we'd be unable to post half of them to free new elements. +#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P) +#define NCCL_MAX_LOCAL_RANKS 64 +struct ncclProxyOpsPool { + struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS]; + volatile int nextOps; + volatile int nextOpsEnd; + volatile int freeOps[NCCL_MAX_LOCAL_RANKS]; + pthread_mutex_t mutex; + pthread_cond_t cond; +}; + +struct ncclProxyOps { + ncclProxyOpsPool* pool; + int count; + int freeOp; + int nextOps; + int nextOpsEnd; +}; + +struct ncclProxySharedP2p { + int refcount; + int size; + char* cudaBuff; + char* hostBuff; + cudaIpcMemHandle_t ipc; + struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv +}; -struct ncclProxySharedBuffers { +struct ncclProxySharedCollNet { int size; char* cudaBuff; char* hostBuff; - struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv - // Collnet sharing is technically per device, but for now MAXDEVICES == MAXCHANNELS. 
- struct ncclProxyArgs* proxyAppendCollNet[2*MAXCHANNELS]; - void* collNetResources; + struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS]; + void* resources; +}; + +struct ncclProxyPeer { + struct ncclProxySharedP2p send; + struct ncclProxySharedP2p recv; +}; + +struct ncclSharedNetComms { + void* sendComm[MAXCHANNELS]; + void* recvComm[MAXCHANNELS]; + int sendRefCount[MAXCHANNELS]; + int recvRefCount[MAXCHANNELS]; }; struct ncclProxyPool; -struct ncclProxyState { - pthread_cond_t cond; - pthread_mutex_t opsMutex; - pthread_mutex_t poolMutex; - bool stop; - struct ncclProxySharedBuffers sharedBuffs; - struct ncclProxyArgs* ops; // Running operations, used by proxy thread - struct ncclProxyArgs* postedOps; // Posted operations, shared between proxy and main thread, locked with opsMutex - struct ncclProxyArgs* postedOpsEnd; - struct ncclProxyArgs* nextOps; // Pending operations, used by main thread (could still be cancelled) - struct ncclProxyArgs* nextOpsEnd; - struct ncclProxyArgs* pool; // Free operations for main thread - struct ncclProxyArgs* poolFreed; // Freed operations by the progress thread - struct ncclProxyArgs* poolReturned; // Shared between main and progress thread, lock with poolMutex +struct ncclProxyProgressState { + // Used by main threads to send work to progress thread + struct ncclProxyOpsPool* opsPool; + char opsPoolShmSuffix[6]; + pthread_t thread; + bool stop; + struct ncclProxyPeer** localPeers; + struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS]; + struct ncclProxySharedCollNet collNet; + struct ncclProxyArgs* active; + struct ncclProxyArgs* pool; struct ncclProxyPool* pools; + int nextOps; +}; + +struct ncclProxyState { + // Service thread + pthread_t thread; + struct ncclSocket* listenSock; + int stop; + + // Used by main thread + union ncclSocketAddress* peerAddresses; + struct ncclSocket* peerSocks; + struct ncclProxyOps* proxyOps; + void** sharedDevMems; + + // Progress thread + struct ncclProxyProgressState progressState; +}; + 
+struct ncclProxyConnection { + int send, transport, shared; + int localRank; + struct ncclSocket* sock; + struct ncclTransportComm* tcomm; + struct ncclProxyArgs *proxyAppend; + struct ncclProxyArgs **proxyAppendPtr; + void* transportResources; }; typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); @@ -103,26 +187,25 @@ enum proxyMode { proxyTo = 2 }; -ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks); -ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args); -ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args); +ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* proxyOp, int nranks); +ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp); +ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* proxyOp); ncclResult_t ncclProxyStart(struct ncclComm* comm); +ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses); ncclResult_t ncclProxyCreate(struct ncclComm* comm); -ncclResult_t ncclProxyDestroy(struct ncclComm* comm); - -ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr); -ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr); -ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr); -ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm); - -#include <unistd.h> - -// Spin wait until func evaluates to true -template<typename FUNC> -inline void transportProxyWait(const FUNC& func) { - while (!func()) { - sched_yield(); - } -} +ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn); +enum ncclProxyMsgType { + ncclProxyMsgInit = 1, + ncclProxyMsgSharedInit = 2, + ncclProxyMsgSetup = 3, 
+ ncclProxyMsgConnect = 4, + ncclProxyMsgStart = 5, + ncclProxyMsgClose = 6, + ncclProxyMsgAbort = 7, + ncclProxyMsgStop = 8 +}; +ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize); +ncclResult_t ncclProxyDestroy(struct ncclComm* comm); +ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm); #endif diff --git a/src/include/shm.h b/src/include/shm.h index 7334f16..08dc849 100644 --- a/src/include/shm.h +++ b/src/include/shm.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,65 +7,9 @@ #ifndef NCCL_SHM_H_ #define NCCL_SHM_H_ -#include <sys/types.h> -#include <sys/mman.h> -#include <sys/stat.h> -#include <fcntl.h> - -// Change functions behavior to match other SYS functions -static int shm_allocate(int fd, const int shmsize) { - int err = posix_fallocate(fd, 0, shmsize); - if (err) { errno = err; return -1; } - return 0; -} -static int shm_map(int fd, const int shmsize, void** ptr) { - *ptr = mmap(NULL, shmsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - return (*ptr == MAP_FAILED) ? 
-1 : 0; -} - -static ncclResult_t shmSetup(const char* shmname, const int shmsize, int* fd, void** ptr, int create) { - SYSCHECKVAL(shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "shm_open", *fd); - if (create) SYSCHECK(shm_allocate(*fd, shmsize), "posix_fallocate"); - SYSCHECK(shm_map(*fd, shmsize, ptr), "mmap"); - close(*fd); - *fd = -1; - if (create) memset(*ptr, 0, shmsize); - return ncclSuccess; -} - -static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPtr, void** devShmPtr, int create) { - int fd = -1; - void* ptr = MAP_FAILED; - ncclResult_t res = ncclSuccess; - - NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError); - CUDACHECKGOTO(cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped), res, cudaError); - CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError); - - *shmPtr = ptr; - return ncclSuccess; -sysError: - WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmname, shmsize); -cudaError: - if (fd != -1) close(fd); - if (create) shm_unlink(shmname); - if (ptr != MAP_FAILED) munmap(ptr, shmsize); - *shmPtr = NULL; - return res; -} - -static ncclResult_t shmUnlink(const char* shmname) { - if (shmname != NULL) SYSCHECK(shm_unlink(shmname), "shm_unlink"); - return ncclSuccess; -} - -static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) { - CUDACHECK(cudaHostUnregister(shmPtr)); - if (munmap(shmPtr, shmsize) != 0) { - WARN("munmap of shared memory failed"); - return ncclSystemError; - } - return ncclSuccess; -} +#include "nccl.h" +ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create); +ncclResult_t ncclShmUnlink(const char* shmname); +ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize); #endif diff --git a/src/include/socket.h b/src/include/socket.h index 6ca5f7d..53fda4d 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -1,5 
+1,5 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,14 +7,13 @@ #ifndef NCCL_SOCKET_H_ #define NCCL_SOCKET_H_ +#include "nccl.h" #include <sys/socket.h> #include <arpa/inet.h> #include <netinet/tcp.h> -#include <unistd.h> #include <netdb.h> -#include <ifaddrs.h> -#include <net/if.h> -#include "utils.h" +#include <fcntl.h> +#include <poll.h> #define MAX_IFS 16 #define MAX_IF_NAME_SIZE 16 @@ -24,438 +23,48 @@ #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) /* Common socket address storage structure for IPv4/IPv6 */ -union socketAddress { +union ncclSocketAddress { struct sockaddr sa; struct sockaddr_in sin; struct sockaddr_in6 sin6; }; -/* Format a string representation of a (union socketAddress *) socket address using getnameinfo() - * - * Output: "IPv4/IPv6 address<port>" - */ -static inline const char *socketToString(union socketAddress *addr, char *buf) { - if (buf == NULL || addr == NULL) return NULL; - struct sockaddr *saddr = &addr->sa; - if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; } - char host[NI_MAXHOST], service[NI_MAXSERV]; - (void) getnameinfo(saddr, sizeof(union socketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV); - sprintf(buf, "%s<%s>", host, service); - return buf; -} - -static inline uint16_t socketToPort(union socketAddress *addr) { - struct sockaddr *saddr = &addr->sa; - return ntohs(saddr->sa_family == AF_INET ? 
addr->sin.sin_port : addr->sin6.sin6_port); -} - -/* Allow the user to force the IPv4/IPv6 interface selection */ -static inline int envSocketFamily(void) { - int family = -1; // Family selection is not forced, will use first one found - char* env = getenv("NCCL_SOCKET_FAMILY"); - if (env == NULL) - return family; - - INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env); - - if (strcmp(env, "AF_INET") == 0) - family = AF_INET; // IPv4 - else if (strcmp(env, "AF_INET6") == 0) - family = AF_INET6; // IPv6 - return family; -} - -static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) { -#ifdef ENABLE_TRACE - char line[SOCKET_NAME_MAXLEN+1]; -#endif - struct netIf userIfs[MAX_IFS]; - bool searchNot = prefixList && prefixList[0] == '^'; - if (searchNot) prefixList++; - bool searchExact = prefixList && prefixList[0] == '='; - if (searchExact) prefixList++; - int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); - - int found = 0; - struct ifaddrs *interfaces, *interface; - getifaddrs(&interfaces); - for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) { - if (interface->ifa_addr == NULL) continue; - - /* We only support IPv4 & IPv6 */ - int family = interface->ifa_addr->sa_family; - if (family != AF_INET && family != AF_INET6) - continue; - - TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, socketToString((union socketAddress *)interface->ifa_addr, line)); - - /* Allow the caller to force the socket family type */ - if (sock_family != -1 && family != sock_family) - continue; - - /* We also need to skip IPv6 loopback interfaces */ - if (family == AF_INET6) { - struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr); - if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue; - } - - // check against user specified interfaces - if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ 
searchNot)) { - continue; - } - - // Check that this interface has not already been saved - // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link - bool duplicate = false; - for (int i = 0; i < found; i++) { - if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; } - } - - if (!duplicate) { - // Store the interface name - strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize); - // Store the IP address - int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6); - memcpy(addrs+found, interface->ifa_addr, salen); - found++; - } - } - - freeifaddrs(interfaces); - return found; -} - -static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) { - /* Check family first */ - int family = local_if.ifa_addr->sa_family; - if (family != remote->sa.sa_family) { - return false; - } - - if (family == AF_INET) { - struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr); - struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask); - struct sockaddr_in& remote_addr = remote->sin; - struct in_addr local_subnet, remote_subnet; - local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr; - remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr; - return (local_subnet.s_addr ^ remote_subnet.s_addr) ? 
false : true; - } else if (family == AF_INET6) { - struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr); - struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask); - struct sockaddr_in6& remote_addr = remote->sin6; - struct in6_addr& local_in6 = local_addr->sin6_addr; - struct in6_addr& mask_in6 = mask->sin6_addr; - struct in6_addr& remote_in6 = remote_addr.sin6_addr; - bool same = true; - int len = 16; //IPv6 address is 16 unsigned char - for (int c = 0; c < len; c++) { //Network byte order is big-endian - char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c]; - char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c]; - if (c1 ^ c2) { - same = false; - break; - } - } - // At last, we need to compare scope id - // Two Link-type addresses can have the same subnet address even though they are not in the same scope - // For Global type, this field is 0, so a comparison wouldn't matter - same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id); - return same; - } else { - WARN("Net : Unsupported address family type"); - return false; - } -} - -static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) { -#ifdef ENABLE_TRACE - char line[SOCKET_NAME_MAXLEN+1]; -#endif - char line_a[SOCKET_NAME_MAXLEN+1]; - int found = 0; - struct ifaddrs *interfaces, *interface; - getifaddrs(&interfaces); - for (interface = interfaces; interface && !found; interface = interface->ifa_next) { - if (interface->ifa_addr == NULL) continue; - - /* We only support IPv4 & IPv6 */ - int family = interface->ifa_addr->sa_family; - if (family != AF_INET && family != AF_INET6) - continue; - - // check against user specified interfaces - if (!matchSubnet(*interface, remoteAddr)) { - continue; - } - - // Store the local IP address - int salen = (family == AF_INET) ? 
sizeof(sockaddr_in) : sizeof(sockaddr_in6); - memcpy(localAddrs+found, interface->ifa_addr, salen); - - // Store the interface name - strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); - - TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(localAddrs+found, line), socketToString(remoteAddr, line_a)); - found++; - if (found == maxIfs) break; - } - - if (found == 0) { - WARN("Net : No interface found in the same subnet as remote address %s", socketToString(remoteAddr, line_a)); - } - freeifaddrs(interfaces); - return found; -} - -static ncclResult_t GetSocketAddrFromString(union socketAddress* ua, const char* ip_port_pair) { - if (!(ip_port_pair && strlen(ip_port_pair) > 1)) { - WARN("Net : string is null"); - return ncclInvalidArgument; - } - - bool ipv6 = ip_port_pair[0] == '['; - /* Construct the sockaddress structure */ - if (!ipv6) { - struct netIf ni; - // parse <ip_or_hostname>:<port> string, expect one pair - if (parseStringList(ip_port_pair, &ni, 1) != 1) { - WARN("Net : No valid <IPv4_or_hostname>:<port> pair found"); - return ncclInvalidArgument; - } - - struct addrinfo hints, *p; - int rv; - memset(&hints, 0, sizeof(hints)); - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - - if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) { - WARN("Net : error encountered when getting address info : %s", gai_strerror(rv)); - return ncclInvalidArgument; - } - - // use the first - if (p->ai_family == AF_INET) { - struct sockaddr_in& sin = ua->sin; - memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in)); - sin.sin_family = AF_INET; // IPv4 - //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address - sin.sin_port = htons(ni.port); // port - } else if (p->ai_family == AF_INET6) { - struct sockaddr_in6& sin6 = ua->sin6; - memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6)); - sin6.sin6_family = AF_INET6; // IPv6 - sin6.sin6_port = 
htons(ni.port); // port - sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete - sin6.sin6_scope_id = 0; // should be global scope, set to 0 - } else { - WARN("Net : unsupported IP family"); - return ncclInvalidArgument; - } - - freeaddrinfo(p); // all done with this structure - - } else { - int i, j = -1, len = strlen(ip_port_pair); - for (i = 1; i < len; i++) { - if (ip_port_pair[i] == '%') j = i; - if (ip_port_pair[i] == ']') break; - } - if (i == len) { - WARN("Net : No valid [IPv6]:port pair found"); - return ncclInvalidArgument; - } - bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope - - char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ]; - memset(ip_str, '\0', sizeof(ip_str)); - memset(port_str, '\0', sizeof(port_str)); - memset(if_name, '\0', sizeof(if_name)); - strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1); - strncpy(port_str, ip_port_pair+i+2, len-i-1); - int port = atoi(port_str); - if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name - - struct sockaddr_in6& sin6 = ua->sin6; - sin6.sin6_family = AF_INET6; // IPv6 - inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address - sin6.sin6_port = htons(port); // port - sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete - sin6.sin6_scope_id = global_scope ? 
0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope - } - return ncclSuccess; -} - -static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) { - static int shownIfName = 0; - int nIfs = 0; - // Allow user to force the INET socket family selection - int sock_family = envSocketFamily(); - // User specified interface - char* env = getenv("NCCL_SOCKET_IFNAME"); - if (env && strlen(env) > 1) { - INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env); - // Specified by user : find or fail - if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env); - nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); - } else { - // Try to automatically pick the right one - // Start with IB - nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); - // else see if we can get some hint from COMM ID - if (nIfs == 0) { - char* commId = getenv("NCCL_COMM_ID"); - if (commId && strlen(commId) > 1) { - INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId); - // Try to find interface that is in the same subnet as the IP in comm id - union socketAddress idAddr; - GetSocketAddrFromString(&idAddr, commId); - nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs); - } - } - // Then look for anything else (but not docker or lo) - if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); - // Finally look for docker, then lo. - if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); - if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); - } - return nIfs; -} - -static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) { - /* IPv4/IPv6 support */ - int family = localAddr->sa.sa_family; - int salen = (family == AF_INET) ? 
sizeof(sockaddr_in) : sizeof(sockaddr_in6); - - /* Create socket and bind it to a port */ - int sockfd = socket(family, SOCK_STREAM, 0); - if (sockfd == -1) { - WARN("Net : Socket creation failed : %s", strerror(errno)); - return ncclSystemError; - } - - if (socketToPort(localAddr)) { - // Port is forced by env. Make sure we get the port. - int opt = 1; -#if defined(SO_REUSEPORT) - SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt"); -#else - SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt"); -#endif - } - - // localAddr port should be 0 (Any port) - SYSCHECK(bind(sockfd, &localAddr->sa, salen), "bind"); - - /* Get the assigned Port */ - socklen_t size = salen; - SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname"); - -#ifdef ENABLE_TRACE - char line[SOCKET_NAME_MAXLEN+1]; - TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(localAddr, line)); -#endif - - /* Put the socket in listen mode - * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn - */ - SYSCHECK(listen(sockfd, 16384), "listen"); - *fd = sockfd; - return ncclSuccess; -} - -static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) { - char line[SOCKET_NAME_MAXLEN+1]; - /* IPv4/IPv6 support */ - int family = remoteAddr->sa.sa_family; - if (family != AF_INET && family != AF_INET6) { - WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)", - socketToString(remoteAddr, line), family, AF_INET, AF_INET6); - return ncclInternalError; - } - int salen = (family == AF_INET) ? 
sizeof(sockaddr_in) : sizeof(sockaddr_in6); - - /* Connect to a hostname / port */ - *fd = socket(family, SOCK_STREAM, 0); - if (*fd == -1) { - WARN("Net : Socket creation failed : %s", strerror(errno)); - return ncclSystemError; - } - - const int one = 1; - SYSCHECK(setsockopt(*fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); - - /* const int bufsize = 128*1024; - SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt"); - SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/ - - TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(remoteAddr, line)); +enum ncclSocketState { + ncclSocketConnecting = 0, + ncclSocketConnected = 1, + ncclSocketError = 2, + ncclSocketStateNum = 3 +} ; + +struct ncclSocket { + int fd; + union ncclSocketAddress addr; + volatile uint32_t* abortFlag; + int asyncFlag; + enum ncclSocketState state; +}; - int ret; - int timedout_retries = 0; - int refused_retries = 0; -retry: - SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret); - if (ret == 0) return ncclSuccess; - if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) { - if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || - (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { - if (refused_retries % 1000 == 0) INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); - usleep(SLEEP_INT); - goto retry; - } - } - WARN("Net : Connect to %s failed : %s", socketToString(remoteAddr, line), strerror(errno)); - return ncclSystemError; -} +const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf); +ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); +int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); +int ncclFindInterfaces(char* ifNames, union 
ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); +// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call +ncclResult_t ncclSocketListen(struct ncclSocket* sock); +// Connect to sock->addr. sock->fd is set after a successful call. +ncclResult_t ncclSocketConnect(struct ncclSocket* sock); +// Return socket connection state. +ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state); +// Accept an incoming connection from listenSocket->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. +ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket); #define NCCL_SOCKET_SEND 0 #define NCCL_SOCKET_RECV 1 -static ncclResult_t socketProgressOpt(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset, int block) { - int bytes = 0; - char* data = (char*)ptr; - char line[SOCKET_NAME_MAXLEN+1]; - do { - if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); - if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 
0 : MSG_DONTWAIT); - if (op == NCCL_SOCKET_RECV && bytes == 0) { - WARN("Net : Connection closed by remote peer %s", socketToString(addr, line)); - return ncclSystemError; - } - if (bytes == -1) { - if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { - WARN("Net : Call to recv from %s failed : %s", socketToString(addr, line), strerror(errno)); - return ncclSystemError; - } else { - bytes = 0; - } - } - (*offset) += bytes; - } while (bytes > 0 && (*offset) < size); - return ncclSuccess; -} - -static ncclResult_t socketProgress(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset) { - return socketProgressOpt(op, fd, addr, ptr, size, offset, 0); -} - -static ncclResult_t socketWait(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset) { - while (*offset < size) - NCCLCHECK(socketProgressOpt(op, fd, addr, ptr, size, offset, 1)); - return ncclSuccess; -} - -static ncclResult_t socketSend(int fd, union socketAddress *addr, void* ptr, int size) { - int offset = 0; - NCCLCHECK(socketWait(NCCL_SOCKET_SEND, fd, addr, ptr, size, &offset)); - return ncclSuccess; -} - -static ncclResult_t socketRecv(int fd, union socketAddress *addr, void* ptr, int size) { - int offset = 0; - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, fd, addr, ptr, size, &offset)); - return ncclSuccess; -} +ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); +ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); +ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); +ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); +ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed); +/* initialize a socket. 
*/ +ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); #endif diff --git a/src/include/timer.h b/src/include/timer.h new file mode 100644 index 0000000..284fec6 --- /dev/null +++ b/src/include/timer.h @@ -0,0 +1,60 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TIMER_H_ +#define NCCL_TIMER_H_ +#if ENABLE_TIMER +#include <unistd.h> +#include <sys/time.h> +#include <x86intrin.h> +static double freq = -1; +static void calibrate() { + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t timeCycles = __rdtsc(); + double time = - tv.tv_sec*1E6 - tv.tv_usec; + uint64_t total = 0ULL; + for (int i=0; i<10000; i++) total += __rdtsc(); + gettimeofday(&tv, NULL); + timeCycles = __rdtsc() - timeCycles; + time += tv.tv_sec*1E6 + tv.tv_usec; + freq = timeCycles/time; +} +static inline double gettime() { + if (freq == -1) calibrate(); + return __rdtsc()/freq; +} +static uint64_t counts[8]; +static double times[8]; +static double startTimes[8]; +#define TIME_START(index) do { \ + counts[index]++; \ + startTimes[index] = gettime(); \ +} while (0); + +#define TIME_STOP(index) do { \ + times[index] += gettime() - startTimes[index]; \ +} while (0); + +#define TIME_CANCEL(index) do { \ + counts[index]--; \ +} while (0); + +#define TIME_PRINT(name) do { \ + printf("%s stats", name); \ + for (int i=0; i<8; i++) { \ + if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ + counts[i] = 0; \ + } \ + printf("\n"); \ +} while (0); +#else +#define TIME_START(index) while(0); +#define TIME_STOP(index) while(0); +#define TIME_CANCEL(index) while(0); +#define TIME_PRINT(name) +#endif +#endif diff --git a/src/include/transport.h 
b/src/include/transport.h index e64dfbf..043a415 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,12 +11,14 @@ #include "graph.h" #include "nvmlwrap.h" #include "core.h" -#include "proxy.h" -#define NTRANSPORTS 3 +#define NTRANSPORTS 4 #define TRANSPORT_P2P 0 #define TRANSPORT_SHM 1 #define TRANSPORT_NET 2 +#define TRANSPORT_COLLNET 3 + +#include "proxy.h" extern struct ncclTransport ncclTransports[]; @@ -28,11 +30,14 @@ struct ncclComm; struct ncclPeerInfo { int rank; int cudaDev; + int netDev; int gdrSupport; uint64_t hostHash; uint64_t pidHash; dev_t shmDev; int64_t busId; + struct ncclComm* comm; + int cudaCompCap; }; #define CONNECT_SIZE 128 @@ -43,8 +48,12 @@ struct ncclConnect { struct ncclTransportComm { ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex); ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*); - ncclResult_t (*free)(void*); - ncclResult_t (*proxy)(struct ncclProxyArgs*); + ncclResult_t (*free)(struct ncclConnector*); + ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels); + ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); + ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); + ncclResult_t 
(*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm); + ncclResult_t (*proxyProgress)(struct ncclComm* comm, struct ncclProxyArgs*); }; struct ncclTransport { diff --git a/src/include/utils.h b/src/include/utils.h index 739a774..f08ff37 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,6 +8,7 @@ #define NCCL_UTILS_H_ #include "nccl.h" +#include "checks.h" #include <stdint.h> int ncclCudaCompCap(); @@ -94,6 +95,11 @@ class ncclRecyclableList { return rv; } + T* peakNext() { + if (cursor == NULL || cursor == tail) return NULL; + return &cursor->data; + } + // Recycle the list without freeing the space void recycle() { tail = cursor = head; diff --git a/src/init.cc b/src/init.cc index 1684cc9..4da8dfd 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -46,90 +46,6 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); -ncclNet_t* ncclNet = NULL; -ncclCollNet_t* ncclCollNet = NULL; - -// Returns ncclInternalError if anything fails, causing that network to be ignored. 
-ncclResult_t initNet(ncclNet_t* net) { - int ndev; - if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError; - if (net->devices(&ndev) != ncclSuccess) return ncclInternalError; - if (ndev <= 0) return ncclSystemError; - return ncclSuccess; -} - -ncclResult_t initCollNet(ncclCollNet_t* collnet) { - int ndev; - if (collnet->init(ncclDebugLog) != ncclSuccess) return ncclInternalError; - if (collnet->devices(&ndev) != ncclSuccess) return ncclInternalError; - if (ndev <= 0) return ncclSystemError; - return ncclSuccess; -} - -ncclResult_t initNetPlugin(ncclNet_t** net, ncclCollNet_t** collnet) { - char ncclNetPluginName[128]; - const char* envPluginName = getenv("NCCL_NET_PLUGIN"); - if (envPluginName && strlen(envPluginName)) { - snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName); - INFO(NCCL_INIT, "Plugin name set by env to %s\n", ncclNetPluginName); - } else { - sprintf(ncclNetPluginName, "libnccl-net.so"); - } - void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL); - if (netPluginLib == NULL) { - // dlopen does not guarantee to set errno, but dlerror only gives us a - // string, so checking errno doesn't hurt to try to provide a better - // error message - if (errno == ENOENT) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName); - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); - } - return ncclSuccess; - } - *net = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL)); - if (*net == NULL) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol."); - if (netPluginLib != NULL) dlclose(netPluginLib); - return ncclSuccess; - } - // Check for CollNet - *collnet = (ncclCollNet_t*) dlsym(netPluginLib, STR(NCCL_COLLNET_PLUGIN_SYMBOL)); - if (*collnet == NULL) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_COLLNET_PLUGIN_SYMBOL) " symbol."); - } - 
return ncclSuccess; -} - -ncclResult_t initNet() { - // Always initialize bootstrap network - NCCLCHECK(bootstrapNetInit()); - - // Initialize main communication network - ncclNet_t* nets[3] = { NULL, &ncclNetIb, &ncclNetSocket }; - ncclCollNet_t* collNets[3] = { NULL, NULL, NULL }; - NCCLCHECK(initNetPlugin(nets+0, collNets+0)); - char* netName = getenv("NCCL_NET"); - - for (int i=0; i<3; i++) { - if (nets[i] == NULL) continue; - if (netName && strcmp(netName, nets[i]->name) != 0) continue; - // net plugin is already initialized - if (initNet(nets[i]) != ncclSuccess) continue; - ncclNet = nets[i]; - if (collNets[i] && initCollNet(collNets[i]) == ncclSuccess) { - ncclCollNet = collNets[i]; - } - break; - } - - if (ncclNet == NULL) { - WARN("Error: network %s not found.", netName ? netName : ""); - return ncclInvalidUsage; - } - return ncclSuccess; -} - // GDRCOPY support: Off by default NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0); @@ -155,7 +71,7 @@ static ncclResult_t ncclInit() { initEnv(); initGdrCopy(); maxLocalSizeBytes = ncclKernMaxLocalSize(); - NCCLCHECK(initNet()); + NCCLCHECK(ncclNetInit()); INFO(NCCL_INIT, "Using network %s", ncclNetName()); initialized = true; } @@ -194,6 +110,9 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + // First stop all threads before we free anything. + NCCLCHECK(ncclProxyDestroy(comm)); + delete[] comm->userRedOps; free(comm->connectSend); @@ -208,6 +127,10 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->peerInfo); ncclTopoFree(comm->topo); + for (int n=0; n<comm->nNodes; n++) free(comm->nodeRanks[n].localRankToRank); + free(comm->nodeRanks); + free(comm->rankToNode); + free(comm->rankToLocalRank); if (comm->bootstrap) NCCLCHECK(bootstrapClose(comm->bootstrap)); @@ -231,8 +154,16 @@ static ncclResult_t commFree(ncclComm_t comm) { int isLast; NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); if (isLast) { + // Wait for all service threads to be done. 
We could not + // do it earlier because it could have blocked and prevented + // other ranks in the process to call ncclCommDestroy + for (int i=0; i<comm->intraRanks; i++) { + void* ret; + if (comm->intraThreads[i]) pthread_join(comm->intraThreads[i], &ret); + } free(comm->intraBarrier); free(comm->intraParams); + free(comm->intraThreads); free(comm->intraCudaDevs); free(comm->intraCGMode); free(comm->intraCC); @@ -291,7 +222,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { comm->hostDevComm.abortFlag = comm->abortFlag; *comm->abortFlag = 0; - comm->argsptr = &comm->args; + comm->argsptrs[0] = &comm->devComm; + comm->argsptrs[1] = &comm->args; comm->collNetSupport = 0; NCCLCHECK(ncclCalloc(&comm->asyncOps, NCCL_MAX_OPS)); @@ -329,10 +261,6 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks)); NCCLCHECK(ncclCalloc(&comm->p2pRecvs, comm->nRanks)); - // Create a map between global rank and intra-node rank - NCCLCHECK(ncclCalloc(&comm->rankToIntraNodeRank, comm->nRanks)); - memset(comm->rankToIntraNodeRank, -1, comm->nRanks*sizeof(comm->rankToIntraNodeRank[0])); - // Mark channels as non initialized. 
for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1; @@ -389,6 +317,8 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->busId = comm->busId; NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport)); + info->comm = comm; + info->cudaCompCap = ncclCudaCompCap(); return ncclSuccess; } @@ -418,7 +348,7 @@ void* waitForNonNullPtr(void* p) { ncclResult_t initParams(struct ncclComm* comm) { struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank; - params->args = &comm->argsptr; + params->args = comm->argsptrs; params->stream = NULL; params->sharedMem = 0; params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1; @@ -440,6 +370,7 @@ ncclResult_t ncclCommSetIntraProc(struct ncclComm* comm, int rank, int ranks, st bar[0] = bar[1] = 0; comm->intraBarrier = bar; NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks)); + NCCLCHECK(ncclCalloc(&comm->intraThreads, comm->intraRanks)); NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks)); int* CGMode; NCCLCHECK(ncclCalloc(&CGMode, 1)); @@ -452,11 +383,13 @@ ncclResult_t ncclCommSetIntraProc(struct ncclComm* comm, int rank, int ranks, st } else { comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier); comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams); + comm->intraThreads = (pthread_t*)waitForNonNullPtr(&comm0->intraThreads); comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs); comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode); comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC); } comm->intraCudaDevs[comm->intraRank] = comm->cudaDev; + comm->intraThreads[comm->intraRank] = comm->proxyState.thread; NCCLCHECK(initParams(comm)); int cgMdLaunch = 0; @@ -508,7 +441,6 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { return ncclSuccess; } -NCCL_PARAM(CrossNic, "CROSS_NIC", 2); NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0); 
NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2); NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1); @@ -522,75 +454,19 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int nranks = comm->nRanks; uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES); TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); - NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap)); + NCCLCHECK(bootstrapInit(commId, comm)); // AllGather1 - begin - struct { - struct ncclPeerInfo peerInfo; - struct ncclComm* comm; - int cudaCompCap; - } *allGather1Data; - - NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); - allGather1Data[rank].comm = comm; - allGather1Data[rank].cudaCompCap = ncclCudaCompCap(); - struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo; - NCCLCHECK(fillInfo(comm, myInfo, commHash)); - NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data))); - NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root - for (int i = 0; i < nranks; i++) { - memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo)); - if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) { - WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId); - return ncclInvalidUsage; - } - } + NCCLCHECK(fillInfo(comm, comm->peerInfo+rank, commHash)); + NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo))); - // Compute intra ranks and minimum CUDA Compute capabilities of intra-node GPUs and all GPUs - int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0; - int intraNodeRank0 = -1, intraNodeRank = -1, intraNodeRanks = 0; - int myCompCap = allGather1Data[rank].cudaCompCap; - int minCompCap = myCompCap, maxCompCap = myCompCap; for (int i = 0; i < nranks; i++) { - if 
(allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) { - // Rank is on same node - if (intraNodeRanks == 0) intraNodeRank0 = i; - if (i == rank) intraNodeRank = intraNodeRanks; - comm->intraNodeGlobalRanks[intraNodeRanks] = i; - comm->rankToIntraNodeRank[i] = intraNodeRanks; - intraNodeRanks++; - if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) { - // Rank is in same process - if (intraProcRanks == 0) intraProcRank0 = i; - if (i == rank) intraProcRank = intraProcRanks; - intraProcRanks++; - } + if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) { + WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId); + return ncclInvalidUsage; } - minCompCap = std::min(allGather1Data[i].cudaCompCap, minCompCap); - maxCompCap = std::max(allGather1Data[i].cudaCompCap, maxCompCap); - } - TRACE(NCCL_INIT,"hostHash[%d] %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d", - rank, allGather1Data[rank].peerInfo.hostHash, intraNodeRank, intraNodeRanks, intraNodeRank0); - TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", - rank, allGather1Data[rank].peerInfo.pidHash, intraProcRank, intraProcRanks, intraProcRank0); - if (intraProcRank == -1 || intraProcRank0 == -1 || allGather1Data[intraProcRank0].comm == NULL) { - WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", - rank, allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash, - intraProcRank, intraProcRanks, intraProcRank0); - return ncclInternalError; - } - if (intraNodeRank == -1 || intraNodeRank0 == -1 || intraNodeRanks == 0) { - WARN("Failed to determine intra node ranks rank %d hostHash %lx pidHash %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d", - rank, 
allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash, - intraNodeRank, intraNodeRanks, intraNodeRank0); - return ncclInternalError; } - struct ncclComm* intraProcRank0Comm = allGather1Data[intraProcRank0].comm; - uint64_t intraNodeRank0pidHash = allGather1Data[intraNodeRank0].peerInfo.pidHash; - comm->intraNodeRank = intraNodeRank; - - free(allGather1Data); // AllGather1 - end @@ -607,11 +483,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Print final topology NCCLCHECK(ncclTopoPrint(comm->topo)); + // Set Affinity to a CPU local the our GPU, so that all memory we allocate + // on the host is local. + NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity)); + cpu_set_t affinitySave; + if (CPU_COUNT(&comm->cpuAffinity)) { + sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); + sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + } + ncclResult_t ret; + + // Launch proxy service thread + NCCLCHECK(ncclProxyCreate(comm)); + // Get rings and trees struct ncclTopoGraph ringGraph; ringGraph.id = 0; ringGraph.pattern = NCCL_TOPO_PATTERN_RING; - ringGraph.crossNic = ncclParamCrossNic(); ringGraph.collNet = 0; ringGraph.minChannels = 1; ringGraph.maxChannels = MAXCHANNELS/2; @@ -621,7 +509,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm struct ncclTopoGraph treeGraph; treeGraph.id = 1; treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE; - treeGraph.crossNic = ncclParamCrossNic(); treeGraph.collNet = 0; treeGraph.minChannels = 1; treeGraph.maxChannels = ringGraph.nChannels; @@ -632,7 +519,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm collNetGraph.id = 2; collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; collNetGraph.collNet = 1; - collNetGraph.crossNic = ncclParamCrossNic(); collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels; NCCLCHECK(ncclTopoCompute(comm->topo, 
&collNetGraph)); NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph)); @@ -644,10 +530,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Determine local CollNet support before all-gather if (ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1; - if (intraNodeRanks > 8) { - if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node"); - comm->collNetSupport = 0; - } // AllGather3 - begin struct ncclGraphInfo { @@ -661,6 +543,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm }; struct { + int netDev; int collNetSupport; struct ncclGraphInfo tree; struct ncclGraphInfo ring; @@ -669,6 +552,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm } *allGather3Data; NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); + NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev)); allGather3Data[rank].tree.pattern = treeGraph.pattern; allGather3Data[rank].tree.nChannels = treeGraph.nChannels; allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; @@ -701,45 +585,77 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int *nodesFirstRank, *nodesTreePatterns; NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks)); NCCLCHECK(ncclCalloc(&nodesTreePatterns, nranks)); - for (int i=0; i<nranks; i++) { - int node = -1; - int firstRank = allGather3Data[i].topoRanks.ringRecv[0]; - for (int n=0; n<comm->nNodes; n++) { - if (nodesFirstRank[n] == firstRank) node = n; - } - if (node == -1) { - node = comm->nNodes++; + NCCLCHECK(ncclCalloc(&comm->rankToNode, comm->nRanks)); + for (int r=0; r<nranks; r++) { + int node; + int firstRank = allGather3Data[r].topoRanks.ringRecv[0]; + for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++); + if (node == comm->nNodes) { + comm->nNodes++; nodesFirstRank[node] = firstRank; // Record 
tree pattern of each node as they can be different depending on sm arch - nodesTreePatterns[node] = allGather3Data[i].tree.pattern; + nodesTreePatterns[node] = allGather3Data[r].tree.pattern; } - if (i == comm->rank) comm->node = node; + comm->rankToNode[r] = node; + } + // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node + NCCLCHECK(ncclCalloc(&comm->nodeRanks, comm->nNodes)); + NCCLCHECK(ncclCalloc(&comm->rankToLocalRank, comm->nRanks)); + for (int r=0; r<comm->nRanks; r++) { + int node = comm->rankToNode[r]; + comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks; + comm->nodeRanks[node].localRanks++; + } + // Allocate ranks arrays for each node + for (int n=0; n<comm->nNodes; n++) { + NCCLCHECK(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks)); + comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks); + comm->nodeRanks[n].localRanks = 0; + } + // And fill the ranks arrays + for (int r=0; r<comm->nRanks; r++) { + int node = comm->rankToNode[r]; + comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r; + } + comm->node = comm->rankToNode[rank]; + comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank; + comm->localRank = comm->rankToLocalRank[rank]; + comm->localRanks = comm->nodeRanks[comm->node].localRanks; + + TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d", + rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]); + if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) { + WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d", + rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash, + comm->localRank, comm->localRanks, comm->localRankToRank[0]); + return ncclInternalError; } int nChannelsOrig = comm->nChannels; struct ncclTopoRanks** allTopoRanks; 
NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks)); for (int i=0; i<nranks; i++) { + comm->peerInfo[i].netDev = allGather3Data[i].netDev; allTopoRanks[i] = &allGather3Data[i].topoRanks; // Make sure we align all ranks so that the tuning is consistent across ranks treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels); treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra); treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter); - treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); - treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter); + treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); + treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter); ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels); ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra); ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter); - ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); - ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter); + ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); + ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter); collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels); collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels); collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, 
collNetGraph.speedIntra); collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter); - collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); - collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); + collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); + collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport); } @@ -750,12 +666,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel)); } - // Determine CollNet support after all-gather now that we know nNodes - int collNetNodeThreshold = ncclParamCollNetNodeThreshold(); - if (comm->nNodes < collNetNodeThreshold) { - if (comm->collNetSupport == 1) + // Determine CollNet support after all-gather now that we know nNodes and each node localRanks + if (comm->collNetSupport == 1) { + int collNetNodeThreshold = ncclParamCollNetNodeThreshold(); + if (comm->nNodes < collNetNodeThreshold) { INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold); - comm->collNetSupport = 0; + comm->collNetSupport = 0; + } + for (int n=0; n<comm->nNodes; n++) { + if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) { + WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1); + comm->collNetSupport = 0; + break; + } + } } int *rings; @@ -782,16 +706,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm line[1023] = '\0'; INFO(NCCL_INIT, "Trees%s", line); - // Set Affinity to a CPU local the our GPU, so that all 
memory we allocate - // on the host is local. - NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity)); - cpu_set_t affinitySave; - if (CPU_COUNT(&comm->cpuAffinity)) { - sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); - sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - } - ncclResult_t ret; - NCCLCHECK(computeBuffSizes(comm)); // Connect with prev/next for each ring @@ -818,7 +732,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Check if we can setup CollNet if (comm->collNetSupport > 0) { int collNetSetupFail = 0; - int highestTypes[NCCL_MAX_INTRA_RANKS] = {TRANSPORT_P2P}; + int highestTypes[NCCL_MAX_LOCAL_RANKS] = {TRANSPORT_P2P}; // Find all head ranks int nHeads = collNetGraph.nChannels; int *heads; @@ -858,8 +772,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // Exchange highest intra-node transport type among ranks // because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer - comm->intraHighestTransportType = highestTypes[comm->intraNodeRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1; - NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, highestTypes, sizeof(int))); + comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? 
highestTransportType0 : highestTransportType1; + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int))); for (int i=0; i<comm->localRanks; i++) { if (highestTypes[i] > comm->intraHighestTransportType) comm->intraHighestTransportType = highestTypes[i]; @@ -877,7 +791,15 @@ collnet_cleanup: TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); // Compute time models for algorithm and protocol combinations - NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph)); + do { + int myCompCap = comm->peerInfo[rank].cudaCompCap; + int minCompCap = myCompCap, maxCompCap = myCompCap; + for (int i = 0; i < nranks; i++) { + minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap); + maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap); + } + NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph)); + } while(0); // Compute nChannels per peer for p2p NCCLCHECK(ncclTopoComputeP2pChannels(comm)); @@ -892,28 +814,68 @@ collnet_cleanup: int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks; for (int c=0; c<comm->p2pnChannelsPerPeer; c++) { int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; - if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector comm->connectRecv[peer] |= (1<<channelId); } } delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks; for (int c=0; c<comm->p2pnChannelsPerPeer; c++) { int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; - if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector comm->connectSend[peer] |= 
(1<<channelId); } } } - NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 0)); + NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1)); free(nvbPeers); } - NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, intraProcRank0Comm)); + // Connect to local net proxy + struct ncclProxyConnector proxyConn; + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, comm->rank, &proxyConn.localRank)); + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn)); + NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0)); + + // Then to remote ones when using PXN + if (ncclPxnDisable() == 0) { + int nranks; + int* pxnPeers; + NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks)); + for (int r=0; r<nranks; r++) { + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn)); + NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0)); + } + free(pxnPeers); + } + + do { + // Compute intra-process ranks + int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0; + for (int i = 0; i < nranks; i++) { + if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) + && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) { + // Rank is in same process + if (intraProcRanks == 0) intraProcRank0 = i; + if (i == rank) intraProcRank = intraProcRanks; + intraProcRanks++; + } + } + TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", + rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0); + if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) { + WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", + rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash, + intraProcRank, intraProcRanks, intraProcRank0); + return ncclInternalError; + } + 
NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, comm->peerInfo[intraProcRank0].comm)); + } while(0); /* Local intra-node barrier */ - NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->intraNodeGlobalRanks, intraNodeRank, intraNodeRanks, (int)intraNodeRank0pidHash)); + NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0])); - if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm)); + // Unlink proxy shm to make sure it will be properly cleaned up. + NCCLCHECK(ncclProxyShmUnlink(comm)); // We should have allocated all buffers, collective fifos, ... we can // restore the affinity. @@ -937,6 +899,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zi", maxLocalSizeBytes); CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes)); } + *newcomm = NULL; NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); @@ -1028,6 +991,12 @@ static ncclResult_t ncclGraphHelperDestroy(ncclComm* comm) { } static ncclResult_t commDestroy(ncclComm_t comm) { + // Try and prevent a double free of the comm struct (user error) + if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) { + WARN("comm %p has already been destroyed", comm); + return ncclInvalidArgument; + } + int savedDevice; CUDACHECK(cudaGetDevice(&savedDevice)); int commDevice = comm->cudaDev; @@ -1039,19 +1008,18 @@ static ncclResult_t commDestroy(ncclComm_t comm) { TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, *comm->abortFlag, comm->fatalError); CUDACHECK(cudaStreamSynchronize(comm->groupStream)); - NCCLCHECK(ncclProxyDestroy(comm)); + ncclDestroyQueueInfo(comm->enqueueInfo); #if CUDART_VERSION >= 11030 
NCCLCHECK(ncclGraphHelperDestroy(comm)); #endif INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed); + NCCLCHECK(commFree(comm)); if (savedDevice != commDevice) CUDACHECK(cudaSetDevice(savedDevice)); - TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, comm->rank); - return ncclSuccess; } @@ -1061,15 +1029,13 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; - TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId); - - // Try and prevent a double free of the comm struct (user error) - if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) { - WARN("comm %p has already been destroyed", comm); - return ncclInvalidArgument; - } + int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; + int64_t busId = comm->busId; + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId); - return commDestroy(comm); + NCCLCHECK(commDestroy(comm)); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Destroy COMPLETE", comm, rank, nranks, cudaDev, busId); + return ncclSuccess; } NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); @@ -1078,10 +1044,16 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev; + int64_t busId = comm->busId; + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId); + // Ask anything that might still be running on the device to quit *comm->abortFlag = 1; - return commDestroy(comm); + NCCLCHECK(commDestroy(comm)); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Abort COMPLETE", comm, rank, nranks, cudaDev, busId); + return ncclSuccess; } NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); diff --git 
a/src/misc/argcheck.cc b/src/misc/argcheck.cc index fe4e760..1c5ba3c 100644 --- a/src/misc/argcheck.cc +++ b/src/misc/argcheck.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -63,12 +63,8 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) { } if (info->comm->checkPointers) { - if (info->coll == ncclFuncSendRecv) { - if (strcmp(info->opName, "Send") == 0) { - NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", "Send")); - } else { - NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", "Recv")); - } + if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv) && info->count > 0) { + NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName)); } else { // Check CUDA device pointers if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) { diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index 439712e..e1aabac 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -29,6 +29,7 @@ int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int at struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); +struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); @@ -65,7 +66,7 @@ ncclResult_t wrap_ibv_symbols(void) { } } -#define LOAD_SYM(handle, symbol, funcptr) do { \ +#define LOAD_SYM(handle, symbol, funcptr) do { \ cast = (void**)&funcptr; \ tmp = dlvsym(handle, symbol, IBVERBS_VERSION); \ if (tmp == NULL) { \ @@ -75,6 +76,12 @@ ncclResult_t wrap_ibv_symbols(void) { *cast = tmp; \ } while (0) +// Attempt to load a specific symbol version - fail silently +#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \ + cast = (void**)&funcptr; \ + *cast = dlvsym(handle, symbol, version); \ + } while (0) + LOAD_SYM(ibvhandle, "ibv_get_device_list", ibv_internal_get_device_list); LOAD_SYM(ibvhandle, "ibv_free_device_list", ibv_internal_free_device_list); LOAD_SYM(ibvhandle, "ibv_get_device_name", ibv_internal_get_device_name); @@ -89,6 +96,8 @@ ncclResult_t wrap_ibv_symbols(void) { LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibv_internal_alloc_pd); LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibv_internal_dealloc_pd); LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr); + // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8 + LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8"); LOAD_SYM(ibvhandle, 
"ibv_dereg_mr", ibv_internal_dereg_mr); LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq); LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq); @@ -116,6 +125,7 @@ teardown: ibv_internal_alloc_pd = NULL; ibv_internal_dealloc_pd = NULL; ibv_internal_reg_mr = NULL; + ibv_internal_reg_mr_iova2 = NULL; ibv_internal_dereg_mr = NULL; ibv_internal_create_cq = NULL; ibv_internal_destroy_cq = NULL; @@ -260,6 +270,14 @@ struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t len return ibv_internal_reg_mr(pd, addr, length, access); } +ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) { + if (ibv_internal_reg_mr_iova2 == NULL) { + return ncclInternalError; + } + if (ret == NULL) { return ncclSuccess; } // Assume dummy call + IBV_PTR_CHECK(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2"); +} + ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ IBV_INT_CHECK_RET_ERRNO(ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr"); } diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc index e83392d..5db7c6b 100644 --- a/src/misc/nvmlwrap.cc +++ b/src/misc/nvmlwrap.cc @@ -1,219 +1,262 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "nvmlwrap.h" +#include "checks.h" +#include "debug.h" -#ifndef NVML_DIRECT -#include <dlfcn.h> -#include "core.h" - -static enum { nvmlUninitialized, nvmlInitializing, nvmlInitialized, nvmlError } nvmlState = nvmlUninitialized; - -static nvmlReturn_t (*nvmlInternalInit)(void); -static nvmlReturn_t (*nvmlInternalShutdown)(void); -static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device); -static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index); -static const char* (*nvmlInternalErrorString)(nvmlReturn_t r); -static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); -static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); -static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, unsigned int *capResult); -static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor); - -// Used to make the NVML library calls thread safe -pthread_mutex_t nvmlLock = PTHREAD_MUTEX_INITIALIZER; - -ncclResult_t wrapNvmlSymbols(void) { - if (nvmlState == nvmlInitialized) - return ncclSuccess; - if (nvmlState == nvmlError) - return ncclSystemError; - - if (__sync_bool_compare_and_swap(&nvmlState, nvmlUninitialized, nvmlInitializing) == false) { - // Another thread raced in front of us. Wait for it to be done. - while (nvmlState == nvmlInitializing) pthread_yield(); - return (nvmlState == nvmlInitialized) ? 
ncclSuccess : ncclSystemError; - } +#include <initializer_list> +#include <memory> +#include <mutex> - static void* nvmlhandle = NULL; - void* tmp; - void** cast; +int ncclNvmlDeviceCount = 0; +ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; +ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; - nvmlhandle=dlopen("libnvidia-ml.so.1", RTLD_NOW); - if (!nvmlhandle) { - WARN("Failed to open libnvidia-ml.so.1"); - goto teardown; - } +#if NCCL_NVML_DIRECT + #define NCCL_NVML_FN(name, rettype, arglist) constexpr rettype(*pfn_##name)arglist = name; +#else + #include <dlfcn.h> + #define NCCL_NVML_FN(name, rettype, arglist) rettype(*pfn_##name)arglist = nullptr; +#endif -#define LOAD_SYM(handle, symbol, funcptr) do { \ - cast = (void**)&funcptr; \ - tmp = dlsym(handle, symbol); \ - if (tmp == NULL) { \ - WARN("dlsym failed on %s - %s", symbol, dlerror());\ - goto teardown; \ - } \ - *cast = tmp; \ - } while (0) - -#define LOAD_SYM_OPTIONAL(handle, symbol, funcptr) do {\ - cast = (void**)&funcptr; \ - tmp = dlsym(handle, symbol); \ - if (tmp == NULL) { \ - INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \ - } \ - *cast = tmp; \ - } while (0) - - LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit); - LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown); - LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId); - LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex); - LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString); - LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState); - LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo); - LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability); - LOAD_SYM(nvmlhandle, "nvmlDeviceGetCudaComputeCapability", nvmlInternalDeviceGetCudaComputeCapability); - - nvmlState = 
nvmlInitialized; - return ncclSuccess; +namespace { + NCCL_NVML_FN(nvmlInit, nvmlReturn_t, ()) + NCCL_NVML_FN(nvmlInit_v2, nvmlReturn_t, ()) + NCCL_NVML_FN(nvmlShutdown, nvmlReturn_t, ()) + NCCL_NVML_FN(nvmlDeviceGetCount, nvmlReturn_t, (unsigned int*)) + NCCL_NVML_FN(nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*)) + NCCL_NVML_FN(nvmlDeviceGetHandleByPciBusId, nvmlReturn_t, (const char* pciBusId, nvmlDevice_t* device)) + NCCL_NVML_FN(nvmlDeviceGetHandleByIndex, nvmlReturn_t, (unsigned int index, nvmlDevice_t *device)) + NCCL_NVML_FN(nvmlDeviceGetIndex, nvmlReturn_t, (nvmlDevice_t device, unsigned* index)) + NCCL_NVML_FN(nvmlErrorString, char const*, (nvmlReturn_t r)) + NCCL_NVML_FN(nvmlDeviceGetNvLinkState, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive)) + NCCL_NVML_FN(nvmlDeviceGetNvLinkRemotePciInfo, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci)) + NCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult)) + NCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor)) + NCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus)) -teardown: - nvmlInternalInit = NULL; - nvmlInternalShutdown = NULL; - nvmlInternalDeviceGetHandleByPciBusId = NULL; - nvmlInternalDeviceGetIndex = NULL; - nvmlInternalDeviceGetNvLinkState = NULL; - nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL; - nvmlInternalDeviceGetNvLinkCapability = NULL; - - if (nvmlhandle != NULL) dlclose(nvmlhandle); - nvmlState = nvmlError; - return ncclSystemError; + std::mutex lock; // NVML has had some thread safety bugs + bool initialized = false; + thread_local bool threadInitialized = false; + ncclResult_t initResult; } +ncclResult_t ncclNvmlEnsureInitialized() { + // Optimization to avoid 
repeatedly grabbing the lock when we only want to + // read from the global tables. + if (threadInitialized) return initResult; + threadInitialized = true; + + std::lock_guard<std::mutex> locked(lock); -ncclResult_t wrapNvmlInit(void) { - if (nvmlInternalInit == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; + if (initialized) return initResult; + initialized = true; + + #if !NCCL_NVML_DIRECT + if (pfn_nvmlInit == nullptr) { + void *libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW); + if (libhandle == nullptr) { + WARN("Failed to open libnvidia-ml.so.1"); + initResult = ncclSystemError; + return initResult; + } + + struct Symbol { void **ppfn; char const *name; }; + std::initializer_list<Symbol> symbols = { + {(void**)&pfn_nvmlInit, "nvmlInit"}, + {(void**)&pfn_nvmlInit_v2, "nvmlInit_v2"}, + {(void**)&pfn_nvmlShutdown, "nvmlShutdown"}, + {(void**)&pfn_nvmlDeviceGetCount, "nvmlDeviceGetCount"}, + {(void**)&pfn_nvmlDeviceGetCount_v2, "nvmlDeviceGetCount_v2"}, + {(void**)&pfn_nvmlDeviceGetHandleByPciBusId, "nvmlDeviceGetHandleByPciBusId"}, + {(void**)&pfn_nvmlDeviceGetHandleByIndex, "nvmlDeviceGetHandleByIndex"}, + {(void**)&pfn_nvmlDeviceGetIndex, "nvmlDeviceGetIndex"}, + {(void**)&pfn_nvmlErrorString, "nvmlErrorString"}, + {(void**)&pfn_nvmlDeviceGetNvLinkState, "nvmlDeviceGetNvLinkState"}, + {(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"}, + {(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"}, + {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"}, + {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"} + }; + for(Symbol sym: symbols) { + *sym.ppfn = dlsym(libhandle, sym.name); + } } - nvmlReturn_t ret = nvmlInternalInit(); - if (ret != NVML_SUCCESS) { - WARN("nvmlInit() failed: %s", - nvmlInternalErrorString(ret)); - return ncclSystemError; + #endif + + #if NCCL_NVML_DIRECT + bool have_v2 = true; + #else + bool have_v2 = 
pfn_nvmlInit_v2 != nullptr; // if this compare is done in the NCCL_NVML_DIRECT=1 case then GCC warns about it never being null + #endif + nvmlReturn_t res1 = (have_v2 ? pfn_nvmlInit_v2 : pfn_nvmlInit)(); + if (res1 != NVML_SUCCESS) { + WARN("nvmlInit%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1)); + initResult = ncclSystemError; + return initResult; } - return ncclSuccess; -} -ncclResult_t wrapNvmlShutdown(void) { - if (nvmlInternalShutdown == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; + unsigned int ndev; + res1 = (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount)(&ndev); + if (res1 != NVML_SUCCESS) { + WARN("nvmlDeviceGetCount%s() failed: %s", have_v2 ? "_v2" :"", pfn_nvmlErrorString(res1)); + initResult = ncclSystemError; + return initResult; } - nvmlReturn_t ret = nvmlInternalShutdown(); - if (ret != NVML_SUCCESS) { - WARN("nvmlShutdown() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; + + ncclNvmlDeviceCount = int(ndev); + if (ncclNvmlMaxDevices < ncclNvmlDeviceCount) { + WARN("nvmlDeviceGetCount() reported more devices (%d) than the internal maximum (ncclNvmlMaxDevices=%d)", ncclNvmlDeviceCount, ncclNvmlMaxDevices); + initResult = ncclInternalError; + return initResult; } - return ncclSuccess; -} -ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { - if (nvmlInternalDeviceGetHandleByPciBusId == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; + for(int a=0; a < ncclNvmlDeviceCount; a++) { + res1 = pfn_nvmlDeviceGetHandleByIndex(a, &ncclNvmlDevices[a].handle); + if (res1 != NVML_SUCCESS) { + WARN("nvmlDeviceGetHandleByIndex(%d) failed: %s", int(a), pfn_nvmlErrorString(res1)); + initResult = ncclSystemError; + return initResult; + } + + res1 = pfn_nvmlDeviceGetCudaComputeCapability(ncclNvmlDevices[a].handle, &ncclNvmlDevices[a].computeCapabilityMajor, &ncclNvmlDevices[a].computeCapabilityMinor); + if (res1 
!= NVML_SUCCESS) { + WARN("nvmlDeviceGetCudaComputeCapability(%d) failed: %s", int(a), pfn_nvmlErrorString(res1)); + initResult = ncclSystemError; + return initResult; + } } - nvmlReturn_t ret; - NVMLLOCKCALL(nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device), ret); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; + + for(int a=0; a < ncclNvmlDeviceCount; a++) { + for(int b=0; b < ncclNvmlDeviceCount; b++) { + nvmlDevice_t da = ncclNvmlDevices[a].handle; + nvmlDevice_t db = ncclNvmlDevices[b].handle; + + res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_READ, &ncclNvmlDevicePairs[a][b].p2pStatusRead); + if (res1 != NVML_SUCCESS) { + WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1)); + initResult = ncclSystemError; + return initResult; + } + + res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_WRITE, &ncclNvmlDevicePairs[a][b].p2pStatusWrite); + if (res1 != NVML_SUCCESS) { + WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_WRITE) failed: %s", a, b, pfn_nvmlErrorString(res1)); + initResult = ncclSystemError; + return initResult; + } + } } + + initResult = ncclSuccess; + return initResult; +} + +#define NVMLCHECK(name, ...) do { \ + nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \ + if (e44241808 != NVML_SUCCESS) { \ + WARN(#name "() failed: %s", pfn_nvmlErrorString(e44241808)); \ + return ncclSystemError; \ + } \ +} while(0) + +#define NVMLTRY(name, ...) 
do { \ + if (!NCCL_NVML_DIRECT && pfn_##name == nullptr) \ + return ncclInternalError; /* missing symbol is not a warned error */ \ + nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \ + if (e44241808 != NVML_SUCCESS) { \ + if (e44241808 != NVML_ERROR_NOT_SUPPORTED) \ + INFO(NCCL_INIT, #name "() failed: %s", pfn_nvmlErrorString(e44241808)); \ + return ncclSystemError; \ + } \ +} while(0) + +ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + std::lock_guard<std::mutex> locked(lock); + NVMLCHECK(nvmlDeviceGetHandleByPciBusId, pciBusId, device); return ncclSuccess; } -ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { - if (nvmlInternalDeviceGetIndex == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; - } - nvmlReturn_t ret; - NVMLLOCKCALL(nvmlInternalDeviceGetIndex(device, index), ret); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceGetIndex() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } +ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + *device = ncclNvmlDevices[index].handle; return ncclSuccess; } -ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { - if (nvmlInternalDeviceGetNvLinkState == NULL) { - /* Do not warn, this symbol is optional. 
*/ - return ncclInternalError; - } - nvmlReturn_t ret; - NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkState(device, link, isActive), ret); - if (ret != NVML_SUCCESS) { - if (ret != NVML_ERROR_NOT_SUPPORTED) - INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; +ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + for (int d=0; d < ncclNvmlDeviceCount; d++) { + if (ncclNvmlDevices[d].handle == device) { + *index = d; + return ncclSuccess; + } } + return ncclInvalidArgument; +} + +ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + std::lock_guard<std::mutex> locked(lock); + NVMLTRY(nvmlDeviceGetNvLinkState, device, link, isActive); return ncclSuccess; } -ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) { - if (nvmlInternalDeviceGetNvLinkRemotePciInfo == NULL) { - /* Do not warn, this symbol is optional. */ - return ncclInternalError; - } - nvmlReturn_t ret; - NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci), ret); - if (ret != NVML_SUCCESS) { - if (ret != NVML_ERROR_NOT_SUPPORTED) - INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } +ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + std::lock_guard<std::mutex> locked(lock); + NVMLTRY(nvmlDeviceGetNvLinkRemotePciInfo, device, link, pci); return ncclSuccess; } -ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, unsigned int *capResult) { - if (nvmlInternalDeviceGetNvLinkCapability == NULL) { - /* Do not warn, this symbol is optional. 
*/ - return ncclInternalError; - } - nvmlReturn_t ret; - NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult), ret); - if (ret != NVML_SUCCESS) { - if (ret != NVML_ERROR_NOT_SUPPORTED) - INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } +ncclResult_t ncclNvmlDeviceGetNvLinkCapability( + nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, + unsigned int *capResult + ) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + std::lock_guard<std::mutex> locked(lock); + NVMLTRY(nvmlDeviceGetNvLinkCapability, device, link, capability, capResult); return ncclSuccess; } -ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { - if (nvmlInternalDeviceGetNvLinkCapability == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; +ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + + for(int d=0; d < ncclNvmlDeviceCount; d++) { + if(device == ncclNvmlDevices[d].handle) { + *major = ncclNvmlDevices[d].computeCapabilityMajor; + *minor = ncclNvmlDevices[d].computeCapabilityMinor; + return ncclSuccess; + } } - nvmlReturn_t ret; - NVMLLOCKCALL(nvmlInternalDeviceGetCudaComputeCapability(device, major, minor), ret); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceGetCudaComputeCapability() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; + return ncclInvalidArgument; +} + +ncclResult_t ncclNvmlDeviceGetP2PStatus( + nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, + nvmlGpuP2PStatus_t* p2pStatus + ) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + + if (p2pIndex == NVML_P2P_CAPS_INDEX_READ || p2pIndex == NVML_P2P_CAPS_INDEX_WRITE) { + int a = -1, b = -1; + for(int d=0; d < ncclNvmlDeviceCount; d++) { + if(device1 == ncclNvmlDevices[d].handle) a = d; + if(device2 == 
ncclNvmlDevices[d].handle) b = d; + } + if (a == -1 || b == -1) return ncclInvalidArgument; + if (p2pIndex == NVML_P2P_CAPS_INDEX_READ) + *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusRead; + else + *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusWrite; + } + else { + std::lock_guard<std::mutex> locked(lock); + NVMLCHECK(nvmlDeviceGetP2PStatus, device1, device2, p2pIndex, p2pStatus); } return ncclSuccess; } -#endif diff --git a/src/misc/profiler.cc b/src/misc/profiler.cc new file mode 100644 index 0000000..145b18f --- /dev/null +++ b/src/misc/profiler.cc @@ -0,0 +1,115 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "profiler.h" + +//#define PROFILE_PROXY 1 +#ifdef PROFILE_PROXY +#include "timer.h" +#include "alloc.h" + +static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" }; +static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" }; +static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" }; +struct ncclProxyProfileEvent { + double timestamp[6]; + uint64_t opCount; + int peer; + int step; + uint16_t channel; + uint8_t type; // send / recv + uint8_t opIndex; +}; + +struct ncclProxyProfileEvent* profilingEvents = NULL; +int profilingIndex = 0; +double profilingStart = 0; +#define MAX_EVENTS 200000 + +ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { + if (profilingEvents == NULL) { + NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS)); + profilingStart = gettime(); + } + struct ncclProxyProfileEvent* event = NULL; + if (state%8 == 0) { + if (profilingIndex == MAX_EVENTS) return ncclSuccess; + args->subs[sub].profilingEvents[step%NCCL_STEPS] = event = 
profilingEvents+profilingIndex++; + if (state == ncclProxyProfileBegin) { + // Proxy operation information + event->opCount = args->opCount; + event->channel = args->subs[sub].channelId; + event->peer = args->subs[sub].peer; + event->type = args->pattern; + event->step = step; + event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256; + } else event->peer = -state; + } else { + event = (struct ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS]; + if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL; + if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount; + } + // Timestamp + event->timestamp[state%8] = gettime()-profilingStart; + return ncclSuccess; +} + +void ncclProfilingDump() { + static int dumpDone = 0; + if (dumpDone) return; + dumpDone = 1; + const char* str = getenv("NCCL_PROXY_PROFILE"); + if (!str) { free(profilingEvents); return; } + FILE* f = fopen(str, "w"); + fprintf(f, "[\n"); + + for (int i=0; i<profilingIndex; i++) { + struct ncclProxyProfileEvent* e = profilingEvents+i; + const int sendrecv = e->peer >= 0; + const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? "Send" : "Recv") : + profilingEventStr[-(e->peer/8)]; + + + if (sendrecv) { + int state = ncclProxyProfileBegin; + const char** stateStr = e->type == ncclPatternSend ? 
profilingStateSendStr : profilingStateRecvStr; + fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n", + typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex); + + while (state<ncclProxyProfileEnd) { + if (e->timestamp[state]) { + const char* name = stateStr[state]; + fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", + name, i, e->channel, e->timestamp[state]); + state++; + while (e->timestamp[state] == 0) state++; + fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", + name, i, e->channel, e->timestamp[state]); + } + } + + fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n", + typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]); + } else { + if (e->peer == -ncclProxyProfileAppend) { + fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, \"args\": { \"added\": %ld } },\n", + typeStr, i, e->timestamp[0], e->opCount); + } else { + fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", + typeStr, i, e->timestamp[0]); + } + fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n", + typeStr, i, e->timestamp[1]); + } + } + fprintf(f, "{} ]\n"); + fclose(f); + free(profilingEvents); +} +#else +ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; } +void ncclProfilingDump() {} +#endif diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc new file mode 100644 index 0000000..d6bc353 --- /dev/null +++ b/src/misc/shmutils.cc @@ -0,0 +1,90 @@ 
+/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "shm.h" +#include "checks.h" +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +// Change functions behavior to match other SYS functions +static int shm_allocate(int fd, const int shmSize) { + int err = posix_fallocate(fd, 0, shmSize); + if (err) { errno = err; return -1; } + return 0; +} +static int shm_map(int fd, const int shmSize, void** ptr) { + *ptr = mmap(NULL, shmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + return (*ptr == MAP_FAILED) ? -1 : 0; +} + +static ncclResult_t ncclShmSetup(char* shmPath, const int shmSize, int* fd, void** ptr, int create) { + if (create) { + if (shmPath[0] == '\0') { + sprintf(shmPath, "/dev/shm/nccl-XXXXXX"); + *fd = mkstemp(shmPath); + } else { + SYSCHECKVAL(open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", *fd); + } + if (ftruncate(*fd, shmSize) != 0) { + WARN("Error: failed to extend %s to %d bytes", shmPath, shmSize); + return ncclSystemError; + } + } else { + SYSCHECKVAL(open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", *fd); + } + *ptr = (char*)mmap(NULL, shmSize, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0); + if (*ptr == MAP_FAILED) { + WARN("Could not map %s\n", shmPath); + return ncclSystemError; + } + close(*fd); + *fd = -1; + if (create) memset(*ptr, 0, shmSize); + return ncclSuccess; +} + +ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create) { + int fd = -1; + void* ptr = MAP_FAILED; + ncclResult_t res = ncclSuccess; + + NCCLCHECKGOTO(ncclShmSetup(shmPath, shmSize, &fd, &ptr, create), res, sysError); + if (devShmPtr) { + CUDACHECKGOTO(cudaHostRegister(ptr, 
shmSize, cudaHostRegisterMapped), res, cudaError); + CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError); + } + + *shmPtr = ptr; + return ncclSuccess; +sysError: + WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmPath, shmSize); +cudaError: + if (fd != -1) close(fd); + if (create) shm_unlink(shmPath); + if (ptr != MAP_FAILED) munmap(ptr, shmSize); + *shmPtr = NULL; + return res; +} + +ncclResult_t ncclShmUnlink(const char* shmPath) { + if (shmPath != NULL) SYSCHECK(unlink(shmPath), "unlink"); + return ncclSuccess; +} + +ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize) { + if (devShmPtr) CUDACHECK(cudaHostUnregister(shmPtr)); + if (munmap(shmPtr, shmSize) != 0) { + WARN("munmap of shared memory failed"); + return ncclSystemError; + } + return ncclSuccess; +} diff --git a/src/misc/socket.cc b/src/misc/socket.cc new file mode 100644 index 0000000..4e3295f --- /dev/null +++ b/src/misc/socket.cc @@ -0,0 +1,552 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "socket.h" +#include "utils.h" +#include <stdlib.h> + +#include <unistd.h> +#include <ifaddrs.h> +#include <net/if.h> + +/* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo() + * + * Output: "IPv4/IPv6 address<port>" + */ +const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf) { + if (buf == NULL || addr == NULL) return NULL; + struct sockaddr *saddr = &addr->sa; + if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; } + char host[NI_MAXHOST], service[NI_MAXSERV]; + (void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV); + sprintf(buf, "%s<%s>", host, service); + return buf; +} + +static uint16_t socketToPort(union ncclSocketAddress *addr) { + struct sockaddr *saddr = &addr->sa; + return ntohs(saddr->sa_family == AF_INET ? 
addr->sin.sin_port : addr->sin6.sin6_port); +} + +/* Allow the user to force the IPv4/IPv6 interface selection */ +static int envSocketFamily(void) { + int family = -1; // Family selection is not forced, will use first one found + char* env = getenv("NCCL_SOCKET_FAMILY"); + if (env == NULL) + return family; + + INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env); + + if (strcmp(env, "AF_INET") == 0) + family = AF_INET; // IPv4 + else if (strcmp(env, "AF_INET6") == 0) + family = AF_INET6; // IPv6 + return family; +} + +static int findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) { +#ifdef ENABLE_TRACE + char line[SOCKET_NAME_MAXLEN+1]; +#endif + struct netIf userIfs[MAX_IFS]; + bool searchNot = prefixList && prefixList[0] == '^'; + if (searchNot) prefixList++; + bool searchExact = prefixList && prefixList[0] == '='; + if (searchExact) prefixList++; + int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); + + int found = 0; + struct ifaddrs *interfaces, *interface; + getifaddrs(&interfaces); + for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) { + if (interface->ifa_addr == NULL) continue; + + /* We only support IPv4 & IPv6 */ + int family = interface->ifa_addr->sa_family; + if (family != AF_INET && family != AF_INET6) + continue; + + TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, ncclSocketToString((union ncclSocketAddress *) interface->ifa_addr, line)); + + /* Allow the caller to force the socket family type */ + if (sock_family != -1 && family != sock_family) + continue; + + /* We also need to skip IPv6 loopback interfaces */ + if (family == AF_INET6) { + struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr); + if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue; + } + + // check against user specified interfaces + if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, 
searchExact) ^ searchNot)) { + continue; + } + + // Check that this interface has not already been saved + // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link + bool duplicate = false; + for (int i = 0; i < found; i++) { + if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; } + } + + if (!duplicate) { + // Store the interface name + strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize); + // Store the IP address + int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + memcpy(addrs+found, interface->ifa_addr, salen); + found++; + } + } + + freeifaddrs(interfaces); + return found; +} + +static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) { + /* Check family first */ + int family = local_if.ifa_addr->sa_family; + if (family != remote->sa.sa_family) { + return false; + } + + if (family == AF_INET) { + struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr); + struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask); + struct sockaddr_in& remote_addr = remote->sin; + struct in_addr local_subnet, remote_subnet; + local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr; + remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr; + return (local_subnet.s_addr ^ remote_subnet.s_addr) ? 
false : true; + } else if (family == AF_INET6) { + struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr); + struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask); + struct sockaddr_in6& remote_addr = remote->sin6; + struct in6_addr& local_in6 = local_addr->sin6_addr; + struct in6_addr& mask_in6 = mask->sin6_addr; + struct in6_addr& remote_in6 = remote_addr.sin6_addr; + bool same = true; + int len = 16; //IPv6 address is 16 unsigned char + for (int c = 0; c < len; c++) { //Network byte order is big-endian + char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c]; + char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c]; + if (c1 ^ c2) { + same = false; + break; + } + } + // At last, we need to compare scope id + // Two Link-type addresses can have the same subnet address even though they are not in the same scope + // For Global type, this field is 0, so a comparison wouldn't matter + same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id); + return same; + } else { + WARN("Net : Unsupported address family type"); + return false; + } +} + +int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) { +#ifdef ENABLE_TRACE + char line[SOCKET_NAME_MAXLEN+1]; +#endif + char line_a[SOCKET_NAME_MAXLEN+1]; + int found = 0; + struct ifaddrs *interfaces, *interface; + getifaddrs(&interfaces); + for (interface = interfaces; interface && !found; interface = interface->ifa_next) { + if (interface->ifa_addr == NULL) continue; + + /* We only support IPv4 & IPv6 */ + int family = interface->ifa_addr->sa_family; + if (family != AF_INET && family != AF_INET6) + continue; + + // check against user specified interfaces + if (!matchSubnet(*interface, remoteAddr)) { + continue; + } + + // Store the local IP address + int salen = (family == AF_INET) ? 
sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + memcpy(localAddrs+found, interface->ifa_addr, salen); + + // Store the interface name + strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); + + TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, ncclSocketToString(localAddrs+found, line), ncclSocketToString(remoteAddr, line_a)); + found++; + if (found == maxIfs) break; + } + + if (found == 0) { + WARN("Net : No interface found in the same subnet as remote address %s", ncclSocketToString(remoteAddr, line_a)); + } + freeifaddrs(interfaces); + return found; +} + +ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) { + if (!(ip_port_pair && strlen(ip_port_pair) > 1)) { + WARN("Net : string is null"); + return ncclInvalidArgument; + } + + bool ipv6 = ip_port_pair[0] == '['; + /* Construct the sockaddress structure */ + if (!ipv6) { + struct netIf ni; + // parse <ip_or_hostname>:<port> string, expect one pair + if (parseStringList(ip_port_pair, &ni, 1) != 1) { + WARN("Net : No valid <IPv4_or_hostname>:<port> pair found"); + return ncclInvalidArgument; + } + + struct addrinfo hints, *p; + int rv; + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + + if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) { + WARN("Net : error encountered when getting address info : %s", gai_strerror(rv)); + return ncclInvalidArgument; + } + + // use the first + if (p->ai_family == AF_INET) { + struct sockaddr_in& sin = ua->sin; + memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in)); + sin.sin_family = AF_INET; // IPv4 + //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address + sin.sin_port = htons(ni.port); // port + } else if (p->ai_family == AF_INET6) { + struct sockaddr_in6& sin6 = ua->sin6; + memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6)); + sin6.sin6_family = AF_INET6; // 
IPv6 + sin6.sin6_port = htons(ni.port); // port + sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete + sin6.sin6_scope_id = 0; // should be global scope, set to 0 + } else { + WARN("Net : unsupported IP family"); + return ncclInvalidArgument; + } + + freeaddrinfo(p); // all done with this structure + + } else { + int i, j = -1, len = strlen(ip_port_pair); + for (i = 1; i < len; i++) { + if (ip_port_pair[i] == '%') j = i; + if (ip_port_pair[i] == ']') break; + } + if (i == len) { + WARN("Net : No valid [IPv6]:port pair found"); + return ncclInvalidArgument; + } + bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope + + char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ]; + memset(ip_str, '\0', sizeof(ip_str)); + memset(port_str, '\0', sizeof(port_str)); + memset(if_name, '\0', sizeof(if_name)); + strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1); + strncpy(port_str, ip_port_pair+i+2, len-i-1); + int port = atoi(port_str); + if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name + + struct sockaddr_in6& sin6 = ua->sin6; + sin6.sin6_family = AF_INET6; // IPv6 + inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address + sin6.sin6_port = htons(port); // port + sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete + sin6.sin6_scope_id = global_scope ? 
0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope + } + return ncclSuccess; +} + +int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) { + static int shownIfName = 0; + int nIfs = 0; + // Allow user to force the INET socket family selection + int sock_family = envSocketFamily(); + // User specified interface + char* env = getenv("NCCL_SOCKET_IFNAME"); + if (env && strlen(env) > 1) { + INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env); + // Specified by user : find or fail + if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env); + nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + } else { + // Try to automatically pick the right one + // Start with IB + nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + // else see if we can get some hint from COMM ID + if (nIfs == 0) { + char* commId = getenv("NCCL_COMM_ID"); + if (commId && strlen(commId) > 1) { + INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId); + // Try to find interface that is in the same subnet as the IP in comm id + union ncclSocketAddress idAddr; + ncclGetSocketAddrFromString(&idAddr, commId); + nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs); + } + } + // Then look for anything else (but not docker or lo) + if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + // Finally look for docker, then lo. + if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + } + return nIfs; +} + +ncclResult_t ncclSocketListen(struct ncclSocket* sock) { + /* IPv4/IPv6 support */ + int family = sock->addr.sa.sa_family; + int salen = (family == AF_INET) ? 
sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + int flags; + + /* Create socket and bind it to a port */ + int fd = socket(family, SOCK_STREAM, 0); + if (fd == -1) { + WARN("Net : Socket creation failed : %s", strerror(errno)); + return ncclSystemError; + } + + if (socketToPort(&sock->addr)) { + // Port is forced by env. Make sure we get the port. + int opt = 1; +#if defined(SO_REUSEPORT) + SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt"); +#else + SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt"); +#endif + } + + /* make all new sockets non-blocking */ + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + + // addr port should be 0 (Any port) + SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind"); + + /* Get the assigned Port */ + socklen_t size = salen; + SYSCHECK(getsockname(fd, &sock->addr.sa, &size), "getsockname"); + +#ifdef ENABLE_TRACE + char line[SOCKET_NAME_MAXLEN+1]; + TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", ncclSocketToString(&sock->addr, line)); +#endif + + /* Put the socket in listen mode + * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn + */ + SYSCHECK(listen(fd, 16384), "listen"); + sock->fd = fd; + return ncclSuccess; +} + +static ncclResult_t getFdState(int fd, enum ncclSocketState* state) { + struct pollfd pfd; + int timeout = 1, ret; + socklen_t rlen = sizeof(int); + + memset(&pfd, 0, sizeof(struct pollfd)); + pfd.fd = fd; + pfd.events = POLLOUT; + SYSCHECK(ret = poll(&pfd, 1, timeout), "poll"); + if (ret == 0) { + ret = EINPROGRESS; + } else { + /* check socket status */ + EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0); + SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt"); + } + + if (ret == EINPROGRESS) + *state = ncclSocketConnecting; + else if (ret == 0) + *state = ncclSocketConnected; + else + *state = 
ncclSocketError; + return ncclSuccess; +} + +ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state) { + NCCLCHECK(getFdState(sock->fd, state)); + sock->state = *state; + return ncclSuccess; +} + +ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { + char line[SOCKET_NAME_MAXLEN+1]; + /* IPv4/IPv6 support */ + int family = sock->addr.sa.sa_family; + if (family != AF_INET && family != AF_INET6) { + WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)", + ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6); + return ncclInternalError; + } + int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + int flags; + + /* Connect to a hostname / port */ + int fd = socket(family, SOCK_STREAM, 0); + if (fd == -1) { + WARN("Net : Socket creation failed : %s", strerror(errno)); + return ncclSystemError; + } + + const int one = 1; + SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); + + /* support non-blocking socket; by default, the socket is non-blocking */ + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + + /* const int bufsize = 128*1024; + SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt"); + SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/ + + TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line)); + + int ret; + int timedout_retries = 0; + int refused_retries = 0; +retry: + /* async connect; abort when error happens and abortFlag is present. 
*/ + ret = connect(fd, &sock->addr.sa, salen); + + if (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || + (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { + if (refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); + usleep(SLEEP_INT); + goto retry; + } else if (errno == EINPROGRESS && !sock->asyncFlag) { + enum ncclSocketState state; + do { + if (sock->abortFlag) NEQCHECK(*sock->abortFlag, 0); + NCCLCHECK(getFdState(fd, &state)); + } while (state == ncclSocketConnecting); + EQCHECK(state, ncclSocketError); + ret = 0; + } + + if (ret == 0 || (errno == EINPROGRESS && sock->asyncFlag)) { + sock->fd = fd; + return ncclSuccess; + } + + WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); + return ncclSystemError; +} + +ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) { + socklen_t socklen = sizeof(union ncclSocketAddress); + int tmpFd = sock->fd = -1; + + do { + if (listenSocket->abortFlag) NEQCHECK(*listenSocket->abortFlag, 0); + tmpFd = accept(listenSocket->fd, &sock->addr.sa, &socklen); + } while ((errno == EAGAIN || errno == EWOULDBLOCK) && tmpFd == -1 && !listenSocket->asyncFlag); + + if (!listenSocket->asyncFlag) { + EQCHECK(tmpFd, -1); + } else if (tmpFd == -1 && errno != EAGAIN && errno != EWOULDBLOCK) { + return ncclSystemError; + } + + sock->fd = tmpFd; + return ncclSuccess; +} + +ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, volatile uint32_t* abortFlag, int asyncFlag) { + if (sock == NULL) + return ncclSuccess; + + sock->fd = -1; + if (addr) { + memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress)); + } else { + memset(&sock->addr, 0, sizeof(union ncclSocketAddress)); + } + sock->abortFlag = abortFlag; + sock->asyncFlag = asyncFlag; + sock->state = ncclSocketStateNum; + return ncclSuccess; +} + +static ncclResult_t 
ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) { + int bytes = 0; + *closed = 0; + char* data = (char*)ptr; + char line[SOCKET_NAME_MAXLEN+1]; + do { + if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); + if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); + if (op == NCCL_SOCKET_RECV && bytes == 0) { + *closed = 1; + return ncclSuccess; + } + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + WARN("Net : Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); + return ncclSystemError; + } else { + bytes = 0; + } + } + (*offset) += bytes; + if (sock->abortFlag && *sock->abortFlag != 0) { + INFO(NCCL_NET, "Socket progress: abort called"); + return ncclSystemError; + } + } while (bytes > 0 && (*offset) < size); + return ncclSuccess; +} + +ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { + int closed; + NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &closed)); + if (closed) { + char line[SOCKET_NAME_MAXLEN+1]; + WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line)); + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { + while (*offset < size) + NCCLCHECK(ncclSocketProgress(op, sock, ptr, size, offset)); + return ncclSuccess; +} + +ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) { + int offset = 0; + NCCLCHECK(ncclSocketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset)); + return ncclSuccess; +} + +ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) { + int offset = 0; + NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset)); + return ncclSuccess; +} + 
+// Receive or detect connection closed +ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed) { + int offset = 0; + *closed = 0; + while (offset < size) { + NCCLCHECK(ncclSocketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed)); + if (*closed) return ncclSuccess; + } + return ncclSuccess; +} diff --git a/src/net.cc b/src/net.cc new file mode 100644 index 0000000..5f68021 --- /dev/null +++ b/src/net.cc @@ -0,0 +1,261 @@ +#include "net.h" +#include "bootstrap.h" +#include "checks.h" + +#include <string.h> +#include <errno.h> +#include <dlfcn.h> +//#include <sys/types.h> +//#include <sys/stat.h> +//#include <unistd.h> + +ncclNet_t *ncclNet; +ncclCollNet_t *ncclCollNet; + +static ncclNet_v5_t ncclNet_v4_as_v5; +static ncclNet_v4_t *ncclNet_v4; +static ncclCollNet_v5_t ncclCollNet_v4_as_v5; +static ncclCollNet_v4_t *ncclCollNet_v4; + +static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) { + ncclNetProperties_v4_t p4; + ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4); + if (ans != ncclSuccess) return ans; + props->name = p4.name; + props->pciPath = p4.pciPath; + props->guid = p4.guid; + props->ptrSupport = p4.ptrSupport; + props->speed = p4.speed; + props->port = p4.port; + props->maxComms = p4.maxComms; + props->maxRecvs = 1; + props->latency = 0; + return ncclSuccess; +} + +static ncclResult_t ncclNet_v4_as_v5_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { + return ncclNet_v4->isend(sendComm, data, size, mhandle, request); +} + +static ncclResult_t ncclNet_v4_as_v5_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { + if (n == 0) return ncclSuccess; + if (n != 1) return ncclInvalidArgument; + return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request); +} + +static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, 
void** request) { + if (n == 0) return ncclSuccess; + if (n != 1) return ncclInvalidArgument; + return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request); +} + +// We use a wrapper around the v4 init to copy over the struct contents +// post-init since they may not be initialized before hand. +static ncclResult_t ncclNet_v4_as_v5_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclNet_v4->init(logfn)); + ncclNet_v4_as_v5.name = ncclNet_v4->name; + ncclNet_v4_as_v5.devices = ncclNet_v4->devices; + ncclNet_v4_as_v5.getProperties = ncclNet_v4_as_v5_getProperties; + ncclNet_v4_as_v5.listen = ncclNet_v4->listen; + ncclNet_v4_as_v5.connect = ncclNet_v4->connect; + ncclNet_v4_as_v5.accept = ncclNet_v4->accept; + ncclNet_v4_as_v5.regMr = ncclNet_v4->regMr; + ncclNet_v4_as_v5.deregMr = ncclNet_v4->deregMr; + ncclNet_v4_as_v5.isend = ncclNet_v4_as_v5_isend; + ncclNet_v4_as_v5.irecv = ncclNet_v4_as_v5_irecv; + ncclNet_v4_as_v5.iflush = ncclNet_v4_as_v5_iflush; + ncclNet_v4_as_v5.test = ncclNet_v4->test; + ncclNet_v4_as_v5.closeSend = ncclNet_v4->closeSend; + ncclNet_v4_as_v5.closeRecv = ncclNet_v4->closeRecv; + ncclNet_v4_as_v5.closeListen = ncclNet_v4->closeListen; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) { + ncclNetProperties_v4_t p4; + ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4); + if (ans != ncclSuccess) return ans; + props->name = p4.name; + props->pciPath = p4.pciPath; + props->guid = p4.guid; + props->ptrSupport = p4.ptrSupport; + props->speed = p4.speed; + props->port = p4.port; + props->maxComms = p4.maxComms; + props->maxRecvs = 1; + props->latency = 0; + return ncclSuccess; +} + +// We use a wrapper around the v4 init to copy over the struct contents +// post-init since they may not be initialized before hand. 
+static ncclResult_t ncclCollNet_v4_as_v5_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v4->init(logfn)); + ncclCollNet_v4_as_v5.name = ncclCollNet_v4->name; + ncclCollNet_v4_as_v5.devices = ncclCollNet_v4->devices; + ncclCollNet_v4_as_v5.getProperties = ncclCollNet_v4_as_v5_getProperties; + ncclCollNet_v4_as_v5.listen = ncclCollNet_v4->listen; + ncclCollNet_v4_as_v5.connect = ncclCollNet_v4->connect; + ncclCollNet_v4_as_v5.reduceSupport = ncclCollNet_v4->reduceSupport; + ncclCollNet_v4_as_v5.regMr = ncclCollNet_v4->regMr; + ncclCollNet_v4_as_v5.deregMr = ncclCollNet_v4->deregMr; + ncclCollNet_v4_as_v5.iallreduce = ncclCollNet_v4->iallreduce; + ncclCollNet_v4_as_v5.iflush = ncclCollNet_v4->iflush; + ncclCollNet_v4_as_v5.test = ncclCollNet_v4->test; + ncclCollNet_v4_as_v5.closeColl = ncclCollNet_v4->closeColl; + ncclCollNet_v4_as_v5.closeListen = ncclCollNet_v4->closeListen; + return ncclSuccess; +} + +static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) { + char ncclNetPluginName[128]; + const char* envPluginName = getenv("NCCL_NET_PLUGIN"); + if (envPluginName && strlen(envPluginName)) { + snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName); + INFO(NCCL_INIT, "Plugin name set by env to %s\n", ncclNetPluginName); + } else { + sprintf(ncclNetPluginName, "libnccl-net.so"); + } + void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL); + if (netPluginLib == nullptr) { + // dlopen does not guarantee to set errno, but dlerror only gives us a + // string, so checking errno doesn't hurt to try to provide a better + // error message + if (errno == ENOENT) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName); + } else { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror()); + } + return; + } + + *net = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); + if (*net == nullptr) { + INFO(NCCL_INIT|NCCL_NET, 
"NET/Plugin: Failed to find ncclNetPlugin_v5 symbol."); + ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4"); + if (ncclNet_v4 == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v4 symbol."); + if (netPluginLib != nullptr) dlclose(netPluginLib); + return; + } + *net = &ncclNet_v4_as_v5; + ncclNet_v4_as_v5.init = ncclNet_v4_as_v5_init; + } + + // Check for CollNet + *collnet = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); + if (*collnet == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol."); + ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4"); + if (ncclCollNet_v4 == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol."); + } else { + *collnet = &ncclCollNet_v4_as_v5; + ncclCollNet_v4_as_v5.init = ncclCollNet_v4_as_v5_init; + } + } + return; +} + +ncclResult_t ncclNetInit() { + // Always initialize bootstrap network + NCCLCHECK(bootstrapNetInit()); + + // Initialize main communication network + ncclNet_t* nets[3] = { nullptr, &ncclNetIb, &ncclNetSocket }; + ncclCollNet_t* collNets[3] = { nullptr, nullptr, nullptr }; + initPlugin(&nets[0], &collNets[0]); + char* netName = getenv("NCCL_NET"); + bool ok = false; + + for (int i=0; i<3; i++) { + if (nets[i] == nullptr) continue; + if (netName && strcmp(netName, nets[i]->name) != 0) continue; + + // net plugin is already initialized + int ndev; + if (nets[i]->init(ncclDebugLog) != ncclSuccess) continue; + if (nets[i]->devices(&ndev) != ncclSuccess) continue; + if (ndev <= 0) continue; + ncclNet = nets[i]; + ok = true; + + if (collNets[i]) { + do { + if (collNets[i]->init(ncclDebugLog) != ncclSuccess) break; + if (collNets[i]->devices(&ndev) != ncclSuccess) break; + if (ndev <= 0) break; + ncclCollNet = collNets[i]; + } while(0); + } + break; + } + + if (!ok) { + WARN("Error: network %s not found.", netName ? 
netName : ""); + return ncclInvalidUsage; + } + return ncclSuccess; +} + +ncclResult_t ncclGpuGdrSupport(int* gdrSupport) { + constexpr int GPU_BUF_SIZE = 2*1024*1024; +#if CUDART_VERSION >= 11030 + // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute + int driverVersion; + CUDACHECK(cudaDriverGetVersion(&driverVersion)); + if (driverVersion >= 11030) { + int cudaDev, attr = 0; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev)); + *gdrSupport = attr; + return ncclSuccess; + } +#endif + int netDevs; + NCCLCHECK(ncclNetDevices(&netDevs)); + *gdrSupport = 0; + for (int dev=0; dev<netDevs; dev++) { + // Find a net device which is GDR-capable + ncclNetProperties_t props; + NCCLCHECK(ncclNetGetProperties(dev, &props)); + if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; + + // Allocate memory on the GPU and try to register it on the NIC. + void *lComm = NULL, *sComm = NULL, *rComm = NULL; + ncclNetHandle_t handle; + void* gpuPtr = NULL; + void* mHandle = NULL; + ncclResult_t ret; + ncclDebugNoWarn = NCCL_NET; + NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1); + while (sComm == NULL) { + NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2); + } + while (rComm == NULL) { + NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3); + } + CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4); + if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { + NCCLCHECK(ncclNetDeregMr(sComm, mHandle)); + NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); + NCCLCHECK(ncclNetDeregMr(rComm, mHandle)); + *gdrSupport = 1; + } + ncclDebugNoWarn = 0; + CUDACHECK(cudaFree(gpuPtr)); +cleanup4: + NCCLCHECK(ncclNetCloseRecv(rComm)); +cleanup3: + NCCLCHECK(ncclNetCloseSend(sComm)); +cleanup2: + NCCLCHECK(ncclNetCloseListen(lComm)); +cleanup1: + break; + } + return ncclSuccess; 
+} + +int ncclNetVersion() { + return (ncclNet == &ncclNet_v4_as_v5) ? 4 : 5; +} diff --git a/src/proxy.cc b/src/proxy.cc index e5d2eab..7d4f811 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,6 +7,11 @@ #include "comm.h" #include "info.h" #include "collectives.h" +#include "socket.h" +#include "shm.h" +#include "profiler.h" +#define ENABLE_TIMER 0 +#include "timer.h" enum { proxyRecv=0, proxySend=1 }; @@ -14,7 +19,7 @@ static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, in if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true; /* In chains, one rank does not need a proxy. Let's figure out which one it is */ - // Which index in the reorganized rings should we compare root against */ + /* Which index in the reorganized rings should we compare root against */ const int myrank = 0, nextrank = 1, prevrank = nranks-1; int index = pattern == ncclPatternPipelineFrom ? 
/* no recv / no send if root = */ @@ -24,47 +29,30 @@ static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, in return (root != rank); } -#define PROXYARGS_ALLOCATE_SIZE 128 +#define PROXYARGS_ALLOCATE_SIZE NCCL_MAX_OPS struct ncclProxyPool { struct ncclProxyPool *next; struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; }; -static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) { - struct ncclProxyState* state = &comm->proxyState; +static ncclResult_t allocateArgs(struct ncclProxyProgressState* state, struct ncclProxyArgs** argsptr) { struct ncclProxyArgs* elem; if (state->pool == NULL) { - // Check whether there are freed elements - if (state->poolReturned) { - pthread_mutex_lock(&state->poolMutex); - state->pool = state->poolReturned; - state->poolReturned = NULL; - pthread_mutex_unlock(&state->poolMutex); - } else { - // Allocate a new pool of elements. Make sure we allocate the memory close - // to the network thread - struct ncclProxyPool* newPool; - cpu_set_t affinitySave; - if (CPU_COUNT(&comm->cpuAffinity)) { - sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave); - sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - } - NCCLCHECK(ncclCalloc(&newPool, 1)); - if (CPU_COUNT(&comm->cpuAffinity)) { - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); - } + // Allocate a new pool of elements. 
Make sure we allocate the memory close + // to the network thread + struct ncclProxyPool* newPool; + NCCLCHECK(ncclCalloc(&newPool, 1)); - struct ncclProxyArgs* newElems = newPool->elems; - // Chain newly allocated elements - for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) { - if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1; - } - // Add them all to the pool list - state->pool = newElems; - // Save the pool memory block for later resource release - newPool->next = state->pools; - state->pools = newPool; + struct ncclProxyArgs* newElems = newPool->elems; + // Chain newly allocated elements + for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) { + if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1; } + // Add them all to the pool list + state->pool = newElems; + // Save the pool memory block for later resource release + newPool->next = state->pools; + state->pools = newPool; } elem = state->pool; state->pool = state->pool->next; @@ -82,241 +70,393 @@ static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** a #define OP_INDEX(op) ((op) ? (op)-state->pools->elems : -1) #define OP_SEEN 0x100000 -ncclResult_t dumpProxyState(struct ncclProxyState* state) { -#ifdef DEBUG_PROXY - struct ncclProxyArgs* op = state->ops; + +ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState* state, int* poolIndex, int* opIndex) { + struct ncclProxyPool* pool = state->pools; + int p = 0; + while (pool) { + uint64_t o = op-pool->elems; + if (o < PROXYARGS_ALLOCATE_SIZE) { + *opIndex = o; + *poolIndex = p; + return ncclSuccess; + } + pool = pool->next; + p++; + } + WARN("Could not find pool of op %p\n", op); + return ncclInternalError; +} + +ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex) { + printf("[%d-%d|%ld| %s", poolIndex, opIndex, op->opCount, op->pattern == ncclPatternSend ? "Send" : op->pattern == ncclPatternRecv ? 
"Recv" : "Coll"); + for (int s=0; s<op->nsubs; s++) { + struct ncclProxySubArgs* sub = op->subs+s; + if (op->state == ncclProxyOpProgress) { + char status = ' '; + if (op->pattern == ncclPatternRecv) { + if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init + else if (sub->received < sub->posted) status = 'R'; // Receiving + else if (sub->received < sub->transmitted) status = 'R'; // Receiving + else if (sub->transmitted < sub->received) status = 'F'; // Flushing + else if (sub->done < sub->transmitted) status = 'G'; // Waiting on GPU + else status = 'D'; // Done + } else if (op->pattern == ncclPatternSend) { + if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init + else if (sub->transmitted < sub->posted) status = 'G'; // Waiting on GPU + else if (sub->done < sub->transmitted) status = 'S'; // Sending + else status = 'D'; // Done + } + printf(" %d%c/%d", sub->peer, status, sub->channelId); + } else { + printf(" %d/%d", sub->peer, sub->channelId); + } + } + printf("]"); + return ncclSuccess; +} +ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) { + struct ncclProxyArgs* op = state->active; + int poolIndex, opIndex; + printf("ACTIVE OPS\n"); while (op) { - if (op->idle & OP_SEEN) { - WARN("Active list loop at element %ld", OP_INDEX(op)); - } - op->idle |= OP_SEEN; - printf("[%ld(%ld/%d)]", OP_INDEX(op), op->opCount, op->nsubs); - if (op->nextPeer) { - printf("(%ld)", OP_INDEX(op->nextPeer)); - struct ncclProxyArgs* n = op->nextPeer; - n->idle |= OP_SEEN; - while (n->nextPeer) { - n = n->nextPeer; - n->idle |= OP_SEEN; + NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex)); + if (op->state & OP_SEEN) { + WARN("List loop at element %d-%d", poolIndex, opIndex); + } + NCCLCHECK(printProxyOp(op, poolIndex, opIndex)); + op->state |= OP_SEEN; + printf("\n"); + struct ncclProxyArgs* nextOp = op->nextPeer; + while (nextOp) { + NCCLCHECK(getOpIndex(nextOp, state, &poolIndex, 
&opIndex)); + if (nextOp->state & OP_SEEN) { + WARN("List loop at element %d-%d", poolIndex, opIndex); } + printf("| `-> "); + NCCLCHECK(printProxyOp(nextOp, poolIndex, opIndex)); + nextOp->state |= OP_SEEN; + printf("\n"); + if (nextOp->next) { + WARN("Inactive op has next set!\n"); + } + nextOp = nextOp->nextPeer; } + if (op->nextPeer == NULL) printf("|\n"); + op = op->next; + printf("v\n"); + } + printf("[X]\n"); + +# if 0 + printf("FREE OPS\n"); + op = state->pool; + while (op) { + NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex)); + if (op->state & OP_SEEN) { + WARN("List loop at element %d-%d", poolIndex, opIndex); + } + NCCLCHECK(printProxyOp(op, poolIndex, opIndex)); + op->state |= OP_SEEN; printf("->"); op = op->next; } printf("[X]\n"); +#else + op = state->pool; + while (op) { + NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex)); + if (op->state & OP_SEEN) { + WARN("List loop at element %d-%d", poolIndex, opIndex); + } + op->state |= OP_SEEN; + op = op->next; + } +#endif - struct ncclProxyArgs* free = state->pool; - while (free) { - if (free->idle & OP_SEEN) { - WARN("Free list loop at element %ld", OP_INDEX(free)); - } - free->idle |= OP_SEEN; - free = free->next; - } - - struct ncclProxyPool* p = state->pools; - int i = 0; - while (p) { - for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) { - if ((p->elems[e].idle & OP_SEEN) == 0) { - WARN("Element %d of pool %d has been lost", e, i); - struct ncclProxyArgs* free = state->pool; - printf("Free list "); - while (free) { - printf("--> %ld ", OP_INDEX(free)); - free = free->next; - } + struct ncclProxyPool* pool = state->pools; + poolIndex = 0; + while (pool) { + struct ncclProxyArgs* elem = pool->elems; + for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++, elem++) { + if ((elem->state & OP_SEEN) == 0) { + printf("Elem %d-%d is not in any list:\n", poolIndex, e); + NCCLCHECK(printProxyOp(elem, poolIndex, e)); printf("\n"); - return ncclInternalError; + } else { + elem->state -= OP_SEEN; } - p->elems[e].idle 
-= OP_SEEN; } - p = p->next; - i++; + pool = pool->next; + poolIndex++; } -#endif return ncclSuccess; } -static ncclResult_t ProxyAppend(struct ncclProxyState* state, struct ncclProxyArgs* args) { - struct ncclProxyArgs* proxyAppend = *args->proxyAppendPtr; - int shared = args->subs[0].connector->conn.shared; - if (proxyAppend) { - if (shared && proxyAppend->opCount == args->opCount) { - if ((proxyAppend->sliceSteps != args->sliceSteps) || - (proxyAppend->chunkSteps != args->chunkSteps) || - (proxyAppend->protocol != args->protocol) || - (proxyAppend->dtype != args->dtype) || - (proxyAppend->redOp != args->redOp)) { - WARN("Proxy append mismatch"); - return ncclInternalError; - } - if (proxyAppend->nsubs >= NCCL_PROXY_MAX_SUBS) { - WARN("Proxy append out of bound"); - return ncclInternalError; - } - memcpy(proxyAppend->subs+proxyAppend->nsubs, args->subs, sizeof(struct ncclProxySubArgs)); - proxyAppend->nsubs++; - args->next = proxyAppend->next; - // Free args as we merged them - args->next = state->poolFreed; - state->poolFreed = args; - DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as group with %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend)); +static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyArgs* args, int subIndex) { + struct ncclProxySubArgs* sub = args->subs+subIndex; + if (subIndex >= NCCL_PROXY_MAX_SUBS) { + WARN("Proxy append out of bounds"); + return ncclInternalError; + } + + //memset(sub, 0, sizeof(struct ncclProxySubArgs)); + sub->connection = op->connection; + sub->channelId = op->channelId; + sub->nsteps = op->nsteps; + sub->nbytes = op->nbytes; + sub->peer = op->root; + args->nsubs = subIndex+1; + if (subIndex) { + if ((args->sliceSteps != op->sliceSteps) || + (args->chunkSteps != op->chunkSteps) || + (args->protocol != op->protocol) || + (args->dtype != op->dtype) || + (args->redOp != op->redOp)) { + WARN("Proxy append mismatch"); + return ncclInternalError; + } + if 
(args->state != ncclProxyOpReady) { + WARN("Proxy append on running operation"); + return ncclInternalError; + } + return ncclSuccess; + } + //memset(&args->progress, 0, sizeof(struct ncclProxyArgs)-offsetof(struct ncclProxyArgs, progress)); + args->done = 0; + args->opCount = op->opCount; + args->sliceSteps = op->sliceSteps; + args->chunkSteps = op->chunkSteps; + args->chunkSize = op->chunkSize; + args->dtype = op->dtype; + args->redOp = op->redOp; + args->pattern = op->pattern; + args->protocol = op->protocol; + args->state = ncclProxyOpReady; + args->progress = op->connection->tcomm->proxyProgress; + args->proxyAppendPtr = op->connection->proxyAppendPtr; + return ncclSuccess; +} + +static ncclResult_t ProxyAppend(struct ncclProxyProgressState* state, struct ncclProxyOp* op) { + struct ncclProxyConnection* connection = op->connection; + int shared = connection->shared; + struct ncclProxyArgs* args = *connection->proxyAppendPtr; + + if (args) { + if (shared && args->opCount == op->opCount) { + NCCLCHECK(ncclProxyOpToArgs(op, args, args->nsubs)); + DEBUG_PROXY_PRINT("Insert (%d/%5ld/%5ld) as group with %5ld\n", shared, args->opCount, op->opCount, OP_INDEX(args)); } else { - proxyAppend->nextPeer = args; - DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend)); + struct ncclProxyArgs* prevArgs = args; + NCCLCHECK(allocateArgs(state, &args)); + NCCLCHECK(ncclProxyOpToArgs(op, args, 0)); + prevArgs->nextPeer = args; + DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, prevArgs->opCount, args->opCount, OP_INDEX(prevArgs)); *(args->proxyAppendPtr) = args; } } else { // Nothing running for that peer. 
Add to the list - if (state->ops == NULL) { + NCCLCHECK(allocateArgs(state, &args)); + NCCLCHECK(ncclProxyOpToArgs(op, args, 0)); + if (state->active == NULL) { // Create the list DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element\n", OP_INDEX(args), shared, args->opCount); - state->ops = args; + state->active = args; } else { // Append element at the end of the list - struct ncclProxyArgs* last = state->ops; + struct ncclProxyArgs* last = state->active; while (last->next) last = last->next; last->next = args; - DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args),shared, args->opCount); + DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args), shared, args->opCount); } *(args->proxyAppendPtr) = args; } return ncclSuccess; } -static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args, int connIndex) { +ncclResult_t ncclProxyPost(struct ncclProxyOpsPool* pool, int nextOps, int nextOpsEnd) { + pthread_mutex_lock(&pool->mutex); + if (pool->nextOps == -1) { + pool->nextOps = nextOps; + pthread_cond_signal(&pool->cond); + } else { + pool->ops[pool->nextOpsEnd].next = nextOps; + } + pool->nextOpsEnd = nextOpsEnd; + pthread_mutex_unlock(&pool->mutex); + return ncclSuccess; +} + +ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, struct ncclProxyOp* proxyOp) { + struct ncclProxyOps* proxyOps = proxyConn->comm->proxyState.proxyOps; + if (proxyOps == NULL) return ncclInternalError; + proxyOps += proxyConn->localRank; + struct ncclProxyOpsPool* pool = proxyOps->pool; + + TIME_START(0); + int opIndex = proxyOps->freeOp; + struct ncclProxyOp* op; + if (opIndex != -1) { + op = pool->ops+opIndex; + proxyOps->freeOp = op->next; + } else { + int freeOp; + while ((freeOp = pool->freeOps[comm->localRank]) == -1) sched_yield(); + int freeOpNew; + while ((freeOpNew = __sync_val_compare_and_swap(pool->freeOps+comm->localRank, freeOp, -1)) != freeOp) freeOp = freeOpNew; + opIndex 
= freeOp; + op = pool->ops+opIndex; + proxyOps->freeOp = op->next; + } + if (op->next != -1) __builtin_prefetch(pool->ops+op->next); // Prefetch next free op + memcpy(op, proxyOp, sizeof(struct ncclProxyOp)); + op->next = -1; + op->connection = proxyConn->connection; + if (proxyOps->nextOps == -1) { + proxyOps->nextOps = proxyOps->nextOpsEnd = opIndex; + } else { + pool->ops[proxyOps->nextOpsEnd].next = opIndex; + proxyOps->nextOpsEnd = opIndex; + } + if (++proxyOps->count == MAX_OPS_PER_PEER) { + // Post what we have so far to free some ops in the pool + // Do not post last operations as we could have more coming with the same opCount, and posting + // them in different batches would break proxyArgs aggregation with subs. + uint64_t lastOpCount = pool->ops[proxyOps->nextOpsEnd].opCount; + int lastOp = -1; + int toSend = 0; + int ops = 0; + for (int op= proxyOps->nextOps; op != proxyOps->nextOpsEnd; op=pool->ops[op].next) { + ops++; + if (pool->ops[op].opCount != lastOpCount) { + lastOp = op; + toSend = ops; + } + } + if (lastOp == -1) { + WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)\n", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount); + return ncclInternalError; + } + // Cut chain at lastOp + int nextOps = proxyOps->nextOps; + proxyOps->nextOps = pool->ops[lastOp].next; + pool->ops[lastOp].next = -1; + NCCLCHECK(ncclProxyPost(proxyOps->pool, nextOps, lastOp)); + proxyOps->count -= toSend; + } + TIME_STOP(0); + return ncclSuccess; +} + +static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) { if (peer < 0) return ncclSuccess; - struct ncclChannel* channel = args->subs[0].channel; struct ncclPeer* peerComm = channel->peers+peer; struct ncclConnector* connector = type == proxyRecv ? 
peerComm->recv+connIndex : peerComm->send+connIndex; if (connector->transportComm == NULL) { - WARN("Rank %d has no transport for %s peer %d on channel %d", connector->comm->rank, - type == proxyRecv ? "recv" : "send", peer, channel->id); + WARN("Rank %d has no transport for %s peer %d on channel %d/%d", connector->comm->rank, + type == proxyRecv ? "recv" : "send", peer, channel->id, connIndex); return ncclInternalError; } - if (connector->transportComm->proxy == NULL) return ncclSuccess; - - struct ncclProxyState* state = &connector->comm->proxyState; - struct ncclProxyArgs* op; - NCCLCHECK(allocateArgs(connector->comm, &op)); - memcpy(op, args, sizeof(struct ncclProxyArgs)); - op->subs[0].connector = connector; - op->progress = connector->transportComm->proxy; - op->state = ncclProxyOpReady; - op->proxyAppendPtr = connector->proxyAppendPtr; + if (connector->transportComm->proxyProgress == NULL) return ncclSuccess; - if (state->nextOps == NULL) state->nextOps = op; - else state->nextOpsEnd->next = op; - state->nextOpsEnd = op; + NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op)); return ncclSuccess; } -ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks) { - struct ncclChannel* channel = args->subs[0].channel; - int pattern = args->pattern; +ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* op, int nranks) { + struct ncclChannel* channel = comm->channels+op->channelId; + int pattern = op->pattern; if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) { struct ncclRing* ring = &channel->ring; - if (NeedProxy(proxyRecv, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxyRecv, ring->prev, args, 0)); - if (NeedProxy(proxySend, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxySend, ring->next, args, 0)); + if (NeedProxy(proxyRecv, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, 
proxyRecv, ring->prev, op, 0)); + if (NeedProxy(proxySend, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, 0)); } if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) { // Tree up struct ncclTree* tree = &channel->tree; - for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxyRecv, tree->down[i], args, 0)); - NCCLCHECK(SaveProxy(proxySend, tree->up, args, 0)); + for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxyRecv, tree->down[i], op, 0)); + NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0)); } if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) { // Tree down struct ncclTree* tree = &channel->tree; - for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxySend, tree->down[i], args, 0)); - NCCLCHECK(SaveProxy(proxyRecv, tree->up, args, 0)); + for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0)); + NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0)); } if (pattern == ncclPatternCollTreeUpDown) { // CollTree up - NCCLCHECK(SaveProxy(proxySend, channel->collTree.out, args, 1)); // For CollTree up, we are using push + NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1)); // For CollTree up, we are using push // CollTree down - NCCLCHECK(SaveProxy(proxyRecv, channel->collTree.out, args, 0)); + NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0)); } return ncclSuccess; } -ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args) { - memset(args, 0, sizeof(struct ncclProxyArgs)); - int channelId = info->channelId; - args->nsubs = 1; - struct ncclProxySubArgs* sub = args->subs; +NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0); +ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) { + memset(op, 0, sizeof(struct ncclProxyOp)); + int channelId = info->channelId; struct ncclChannel* channel = 
info->comm->channels+channelId; - sub->channel = channel; - args->sliceSteps = 1; - args->chunkSteps = 1; - args->protocol = NCCL_PROTO_SIMPLE; - args->dtype = info->datatype; - sub->delta = info->delta; - sub->recvbytes = info->recvbytes; - sub->sendbytes = info->sendbytes; + op->channelId = channelId; + op->sliceSteps = 1; + op->chunkSteps = 1; + op->protocol = NCCL_PROTO_SIMPLE; + op->dtype = info->datatype; int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR; - info->recvChunkSize = stepSize; - info->sendChunkSize = stepSize; + info->chunkSize = stepSize; + op->root = info->root; + op->nbytes = info->count; + struct ncclPeer* peer = channel->peers + op->root; - if (info->delta > 0 && info->recvbytes >= 0) { - int peerrecv = (info->comm->nRanks+info->comm->rank-info->delta)%info->comm->nRanks; - if (channel->peers[peerrecv].recv[0].transportComm && channel->peers[peerrecv].recv[0].transportComm->proxy) { + if (info->coll == ncclFuncSend) { + op->pattern = ncclPatternSend; + if (op->root != info->comm->rank && peer->send[1].transportComm && peer->send[1].transportComm->proxyProgress) { // Tune chunk size for the network - if (info->recvbytes < stepSize) info->recvChunkSize /= 4; - else if (info->recvbytes < 8*stepSize) info->recvChunkSize /= 2; + if (info->count < stepSize) info->chunkSize /= 4; + else if (info->count < 8*stepSize) info->chunkSize /= 2; } - sub->recvChunkSize = info->recvChunkSize; - } - if (info->delta > 0 && info->sendbytes >= 0) { - int peersend = (info->comm->rank+info->delta)%info->comm->nRanks; - if (channel->peers[peersend].send[0].transportComm && channel->peers[peersend].send[0].transportComm->proxy) { + } else if (info->coll == ncclFuncRecv) { + op->pattern = ncclPatternRecv; + if (op->root != info->comm->rank && peer->recv[1].transportComm && peer->recv[1].transportComm->proxyProgress) { // Tune chunk size for the network - if (info->sendbytes < stepSize) info->sendChunkSize /= 4; - else if 
(info->sendbytes < 8*stepSize) info->sendChunkSize /= 2; + if (info->count < stepSize) info->chunkSize /= 4; + else if (info->count < 8*stepSize) info->chunkSize /= 2; } - sub->sendChunkSize = info->sendChunkSize; + } else { + WARN("P2p operation is neither send or recv"); + return ncclInternalError; } + if (ncclParamChunkSize() != 0) { + info->chunkSize = ncclParamChunkSize(); + } + op->chunkSize = info->chunkSize; return ncclSuccess; } -ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args) { - struct ncclProxySubArgs* sub = args->subs; - struct ncclChannel* channel = sub->channel; - args->opCount = channel->workFifoTail-1; - args->commOpCount = comm->opCount; - const ssize_t recvbytesOrig = sub->recvbytes; - const ssize_t sendbytesOrig = sub->sendbytes; - if (sub->delta > 0 && recvbytesOrig >= ssize_t(0)) { - int peerrecv = (comm->nRanks+comm->rank-sub->delta)%comm->nRanks; - sub->recvbytes = recvbytesOrig; - sub->sendbytes = 0; - sub->nsteps = DIVUP(sub->recvbytes, sub->recvChunkSize); - if (sub->nsteps == 0) sub->nsteps = 1; - NCCLCHECK(SaveProxy(proxyRecv, peerrecv, args, 0)); - } - if (sub->delta > 0 && sendbytesOrig >= ssize_t(0)) { - int peersend = (comm->rank+sub->delta)%comm->nRanks; - sub->sendbytes = sendbytesOrig; - sub->recvbytes = 0; - sub->nsteps = DIVUP(sub->sendbytes, sub->sendChunkSize); - if (sub->nsteps == 0) sub->nsteps = 1; - NCCLCHECK(SaveProxy(proxySend, peersend, args, 0)); +ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* op) { + struct ncclChannel* channel = comm->channels+op->channelId; + op->opCount = channel->workFifoTail-1; + if (op->root == comm->rank) return ncclSuccess; + if (op->pattern == ncclPatternRecv) { + op->nsteps = DIVUP(op->nbytes, op->chunkSize); + if (op->nsteps == 0) op->nsteps = 1; + NCCLCHECK(SaveProxy(channel, proxyRecv, op->root, op, 1)); + } else if (op->pattern == ncclPatternSend) { + op->nsteps = DIVUP(op->nbytes, op->chunkSize); + if (op->nsteps == 0) 
op->nsteps = 1; + NCCLCHECK(SaveProxy(channel, proxySend, op->root, op, 1)); } - // Reset proxy args for potentially multiple cuda graph launches - // It is safe as long as SaveProxy copies contents of args to op - sub->recvbytes = recvbytesOrig; - sub->sendbytes = sendbytesOrig; return ncclSuccess; } -static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) { +static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) { struct ncclProxyArgs* freeOp = *opPtr; - DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(freeOp->next)); struct ncclProxyArgs* next = freeOp->next; + DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(next)); *opPtr = next; if (freeOp->nextPeer) { // replace op by nextPeer @@ -324,7 +464,7 @@ static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs* if (*prevOpPtr) { (*prevOpPtr)->next = nextPeer; } else { - state->ops = nextPeer; + state->active = nextPeer; } nextPeer->next = next; *(prevOpPtr) = nextPeer; @@ -333,25 +473,31 @@ static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs* if (*prevOpPtr) { (*prevOpPtr)->next = next; } else { - state->ops = next; + state->active = next; } } - freeOp->next = state->poolFreed; - state->poolFreed = freeOp; - DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr)); + freeOp->next = state->pool; + state->pool = freeOp; + DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr)); +#ifdef DEBUG_PROXY NCCLCHECK(dumpProxyState(state)); +#endif return ncclSuccess; } -static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyArgs** opsPtr, int* idle, struct ncclComm* comm) { +static ncclResult_t progressOps(struct ncclComm* comm, struct 
ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) { struct ncclProxyArgs* prevOp = NULL; - struct ncclProxyArgs* op = *opsPtr; + struct ncclProxyArgs* op = opStart; while (op) { if (op->state == ncclProxyOpNone) return ncclInternalError; - NCCLCHECK(op->progress(op)); + TIME_START(0); TIME_START(1); + NCCLCHECK(op->progress(comm, op)); + if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); } *idle &= op->idle; if (op->state == ncclProxyOpNone) { + TIME_START(2); NCCLCHECK(removeOp(state, &op, &prevOp)); + TIME_STOP(2); } else { prevOp = op; op = op->next; @@ -360,197 +506,607 @@ static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyAr return ncclSuccess; } -ncclResult_t ncclProxyAppendPosted(struct ncclProxyState* state) { - // Return any freed element first - if (state->poolFreed) { - struct ncclProxyArgs* end = state->poolFreed; - while (end->next) end = end->next; - pthread_mutex_lock(&state->poolMutex); - end->next = state->poolReturned; - state->poolReturned = state->poolFreed; - pthread_mutex_unlock(&state->poolMutex); - state->poolFreed = NULL; - } +static ncclResult_t ncclProxyGetPostedOps(struct ncclComm* comm, int* added) { + struct ncclProxyProgressState* state = &comm->proxyState.progressState; + if (state->opsPool == NULL) return ncclInternalError; + struct ncclProxyOpsPool* pool = state->opsPool; - // Then wait until we have new work to do - pthread_mutex_lock(&state->opsMutex); - while (state->postedOps == NULL) { - if (state->stop) return ncclSuccess; - pthread_cond_wait(&state->cond, &state->opsMutex); - } + struct ncclProxyArgs profArgs; // Only used for profiling purposes + if (state->nextOps != -1) goto process_nextops; - // Sort operations as we append them : collectives and - // receives first, then sends. + // If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock + // to be available. 
Exit, continue progress, and come back later. + if (state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0)) return ncclSuccess; - struct ncclProxyArgs* next, *prev = NULL, *op = state->postedOps; - int commOpCount = op->commOpCount; - while (op && op->commOpCount == commOpCount) { - next = op->next; - if (op->subs[0].sendbytes) { - if (prev) prev->next = next; - else state->postedOps = next; - op->next = NULL; - NCCLCHECK(ProxyAppend(state, op)); - } else prev = op; - op = next; - } - op = state->postedOps; - while (op && op->commOpCount == commOpCount) { - next = op->next; - op->next = NULL; - NCCLCHECK(ProxyAppend(state, op)); - op = next; + if (state->active == NULL) { + pthread_mutex_lock(&pool->mutex); + while (pool->nextOps == -1 && !state->stop) { + struct ncclProxyArgs profArgs; // Only used for profiling purposes + ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep); + pthread_cond_wait(&pool->cond, &pool->mutex); + ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup); + } + if (state->stop) { // We might have been woken up to stop. 
+ pthread_mutex_unlock(&pool->mutex); + return ncclSuccess; + } } - state->postedOps = op; - if (op == NULL) state->postedOpsEnd = NULL; - NCCLCHECK(dumpProxyState(state)); - pthread_mutex_unlock(&state->opsMutex); - if (state->poolFreed) { - struct ncclProxyArgs* end = state->poolFreed; - while (end->next) end = end->next; - pthread_mutex_lock(&state->poolMutex); - end->next = state->poolReturned; - state->poolReturned = state->poolFreed; - pthread_mutex_unlock(&state->poolMutex); - state->poolFreed = NULL; + state->nextOps = pool->nextOps; + pool->nextOps = pool->nextOpsEnd = -1; + pthread_mutex_unlock(&pool->mutex); + if (state->nextOps == -1) return ncclInternalError; + +process_nextops: + ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppend); + TIME_START(2); + int freeOp[NCCL_MAX_LOCAL_RANKS]; + int freeOpEnd[NCCL_MAX_LOCAL_RANKS]; + for (int i=0; i<comm->localRanks; i++) freeOp[i] = -1; + + for (int opIndex = state->nextOps; opIndex != -1;) { + struct ncclProxyOp* peerOp = pool->ops+opIndex; + int peer = opIndex / MAX_OPS_PER_PEER; + if (peerOp->connection == NULL) return ncclInternalError; + if (peerOp->next != -1) __builtin_prefetch(pool->ops+peerOp->next); + NCCLCHECK(ProxyAppend(state, peerOp)); + (*added)++; + int lastOpIndex = opIndex; + opIndex = peerOp->next; + // Return op to peer pool + if (freeOp[peer] == -1) { + freeOpEnd[peer] = lastOpIndex; + } else { + peerOp->next = freeOp[peer]; + } + freeOp[peer] = lastOpIndex; + state->nextOps = opIndex; } + for (int i=0; i<comm->localRanks; i++) { + if (freeOp[i] == -1) continue; + int newFree = freeOp[i]; + int oldFree = pool->freeOps[i]; + pool->ops[freeOpEnd[i]].next = oldFree; + if (oldFree == -1) { + // Nothing for the main thread to consume, we can set it. + pool->freeOps[i] = newFree; + } else { + // The main thread may recycle free ops at any time, replace the freeOps value atomically and check it worked. 
+ int swap = __sync_val_compare_and_swap(pool->freeOps+i, oldFree, newFree); + if (swap != oldFree) { + if (swap != -1) return ncclInternalError; + // Ops were recycled while we were trying to swap, just set the value directly now. + pool->ops[freeOpEnd[i]].next = -1; + pool->freeOps[i] = newFree; + } + } + } + profArgs.opCount = *added; + ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppendEnd); + TIME_STOP(2); return ncclSuccess; } +#include <signal.h> +static ncclProxyProgressState* ncclLastProxyState; +void ncclDumpProxyState(int signal) { + dumpProxyState(ncclLastProxyState); +} -void* persistentThread(void *comm_) { +void* ncclProxyProgress(void *comm_) { struct ncclComm* comm = (struct ncclComm*)comm_; - struct ncclProxyState* state = &comm->proxyState; - char threadName[16]; - sprintf(threadName, "NCCLproxy %5d", comm->rank); + struct ncclProxyProgressState* state = &comm->proxyState.progressState; + state->nextOps = -1; + signal(SIGUSR1, ncclDumpProxyState); + ncclLastProxyState = state; + char threadName[NCCL_THREAD_NAMELEN]; + snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", comm->cudaDev); nvtxNameOsThreadA(syscall(SYS_gettid), threadName); - struct ncclProxyArgs** opsPtr = &state->ops; - while (1) { - if (*comm->abortFlag) { - return NULL; - } - - while (*opsPtr == NULL) { - if (state->stop) { - // No more commands to process and proxy has been requested to stop - return NULL; - } - ncclResult_t ret = ncclProxyAppendPosted(state); - if (ret != ncclSuccess) { - comm->fatalError = ret; - INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); - return NULL; - } - } + int lastIdle = 0; + struct ncclProxyArgs profArgs; // Only used for profiling purposes + while (state->stop == 0 && *comm->abortFlag == 0) { int idle = 1; - ncclResult_t ret = progressOps(state, opsPtr, &idle, comm); + ncclResult_t ret = progressOps(comm, state, state->active, &idle); if (ret != ncclSuccess) { comm->fatalError = ret; INFO(NCCL_ALL,"%s:%d -> 
%d [Proxy Thread]", __FILE__, __LINE__, ret); return NULL; } + if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle); + if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive); if (idle) { - sched_yield(); // No request progressed. Let others run. + int added = 0; + TIME_START(3); + ret = ncclProxyGetPostedOps(comm, &added); + if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); } + if (ret != ncclSuccess) { + comm->fatalError = ret; + INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); + } + if (added == 0) { + sched_yield(); // No request progressed. Let others run. + } } + lastIdle = idle; } + return NULL; } ncclResult_t ncclProxyStart(struct ncclComm* comm) { - struct ncclProxyState* state = &comm->proxyState; - if (state->nextOps == NULL) return ncclSuccess; - pthread_mutex_lock(&state->opsMutex); - if (state->postedOps) state->postedOpsEnd->next = state->nextOps; - else state->postedOps = state->nextOps; - state->postedOpsEnd = state->nextOpsEnd; - state->nextOps = state->nextOpsEnd = NULL; - pthread_cond_signal(&state->cond); - pthread_mutex_unlock(&state->opsMutex); + struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps; + if (proxyOps == NULL) return ncclSuccess; + TIME_START(1); + for (int r=0; r<comm->localRanks; r++) { + struct ncclProxyOps* ops = proxyOps+r; + if (ops->pool == NULL || ops->nextOps == -1) continue; + NCCLCHECK(ncclProxyPost(ops->pool, ops->nextOps, ops->nextOpsEnd)); + ops->nextOps = ops->nextOpsEnd = -1; + ops->count = 0; + } comm->opCount++; + TIME_STOP(1); return ncclSuccess; } -ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr) { - struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs; - if (state->size == 0) { - int p2pnChannels = 1; - while (p2pnChannels < comm->nChannels) p2pnChannels *= 2; - int p2pSize = 
2*p2pnChannels*NCCL_MAX_WORK_ELEMENTS*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR; - int collNetSize = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE]; - state->size = std::max(p2pSize, collNetSize); +ncclResult_t ncclProxyProgressCreate(struct ncclComm* comm) { + struct ncclProxyProgressState* state = &comm->proxyState.progressState; + if (!state->thread) { + pthread_create(&state->thread, NULL, ncclProxyProgress, comm); + ncclSetThreadName(state->thread, "NCCL Progress%2d", comm->cudaDev); } + return ncclSuccess; +} - *size = state->size; +ncclResult_t ncclProxyProgressDestroy(struct ncclComm* comm) { + struct ncclProxyProgressState* state = &comm->proxyState.progressState; - if (cuda && state->cudaBuff == NULL) { - NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size)); - } else if (state->hostBuff == NULL) { - NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size)); + // Request the proxy to stop and then wake it + if (state->opsPool) { + pthread_mutex_lock(&state->opsPool->mutex); + state->stop = true; + pthread_cond_signal(&state->opsPool->cond); + pthread_mutex_unlock(&state->opsPool->mutex); + pthread_join(state->thread, NULL); + } + + // Free off any memory allocated for the proxy arg pools + while (state->pools != NULL) { + struct ncclProxyPool *next = state->pools->next; + free(state->pools); + state->pools = next; } - *ptr = cuda ? state->cudaBuff : state->hostBuff; + + ncclProfilingDump(); + TIME_PRINT("Proxy"); return ncclSuccess; } -ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr) { - struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs; - // Use different pools for separate send/recv. - char* buff = cuda ? 
state->cudaBuff : state->hostBuff; - int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR); - int globalSlot = (((type*comm->p2pnChannels+channel)*NCCL_STEPS)+slot)*NCCL_MAX_WORK_ELEMENTS+index; - *ptr = buff + slotSize * globalSlot; +struct ncclProxyAsyncOp { + int type; + struct ncclProxyConnection* connection; + int reqSize, respSize; + char *reqBuff, *respBuff; +}; + +struct ncclProxyLocalPeer { + struct ncclSocket sock; + int localRank; + struct ncclProxyAsyncOp asyncOps; +}; + +#define NCCL_PROXY_CONN_POOL_SIZE_POW2 7 +#define NCCL_PROXY_CONN_POOL_SIZE (1<<(NCCL_PROXY_CONN_POOL_SIZE_POW2)) +#define NCCL_PROXY_CONN_POOL_MASK ((NCCL_PROXY_CONN_POOL_SIZE)-1) +struct ncclProxyConnectionPool { + struct ncclProxyConnection** pools; + int banks; + int offset; + struct ncclProxyAsyncOp* ops; +}; + +static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) { + if (pool->offset == NCCL_PROXY_CONN_POOL_SIZE) { + NCCLCHECK(ncclRealloc(&pool->pools, pool->banks, pool->banks+1)); + NCCLCHECK(ncclCalloc(pool->pools+pool->banks, NCCL_PROXY_CONN_POOL_SIZE)); + pool->banks++; + pool->offset = 0; + } + *id = ((pool->banks-1) << NCCL_PROXY_CONN_POOL_SIZE_POW2) + pool->offset; + pool->offset++; return ncclSuccess; } -ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr) { - struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs; - // Use different pools for different channels. - char* buff = cuda ? 
state->cudaBuff : state->hostBuff; - int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; - int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel; - *ptr = buff + slotSize * globalSlot; + +static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool, int id, struct ncclProxyConnection** conn) { + int bank = id>>NCCL_PROXY_CONN_POOL_SIZE_POW2; + int offset = id&NCCL_PROXY_CONN_POOL_MASK; + if ((pool->pools == NULL) || (bank > pool->banks) || (pool->pools[bank] == NULL)) return ncclInternalError; + *conn = pool->pools[bank]+offset; return ncclSuccess; } -ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm) { - struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs; - CUDACHECK(cudaFree(state->cudaBuff)); - NCCLCHECK(ncclCudaHostFree(state->hostBuff)); +static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + if (connection->send) { + NCCLCHECK(ncclTransports[connection->transport].send.proxyFree(connection, comm)); + } else { + NCCLCHECK(ncclTransports[connection->transport].recv.proxyFree(connection, comm)); + } return ncclSuccess; } -ncclResult_t ncclProxyCreate(struct ncclComm* comm) { - if (!comm->proxyThread) { - comm->proxyState.cond = PTHREAD_COND_INITIALIZER; - comm->proxyState.opsMutex = PTHREAD_MUTEX_INITIALIZER; - comm->proxyState.poolMutex = PTHREAD_MUTEX_INITIALIZER; - comm->proxyState.ops = NULL; - pthread_create(&comm->proxyThread, NULL, persistentThread, comm); +static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* pool, struct ncclComm* comm) { + for (int b=0; b<pool->banks; b++) { + int max = b == pool->banks-1 ? 
pool->offset : NCCL_PROXY_CONN_POOL_SIZE; + for (int i=0; i<max; i++) { + NCCLCHECK(proxyFree(pool->pools[b]+i, comm)); + } + free(pool->pools[b]); } + free(pool->pools); return ncclSuccess; } -ncclResult_t ncclProxyDestroy(struct ncclComm* comm) { - struct ncclProxyState* state = &comm->proxyState; +#include "transport.h" - // Request the proxy to stop and then wake it - pthread_mutex_lock(&state->opsMutex); - state->stop = true; - pthread_cond_signal(&state->cond); - pthread_mutex_unlock(&state->opsMutex); - if (comm->proxyThread) pthread_join(comm->proxyThread, NULL); +ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn) { + // Keep one connection per mlocal rank + proxyConn->connection = NULL; + proxyConn->rank = rank; + if (comm->proxyState.peerSocks == NULL) { + NCCLCHECK(ncclCalloc(&comm->proxyState.peerSocks, comm->localRanks)); + NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks)); + NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks)); + for (int r=0; r<comm->localRanks; r++) { + comm->proxyState.peerSocks[r].fd = -1; + comm->proxyState.peerSocks[r].abortFlag = comm->abortFlag; + } + } + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank)); + struct ncclSocket* sock = comm->proxyState.peerSocks+proxyConn->localRank; + if (sock->fd == -1) { + memcpy(&sock->addr, comm->proxyState.peerAddresses+rank, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketConnect(sock)); + } + int type = ncclProxyMsgInit; + NCCLCHECK(ncclSocketSend(sock, &type, sizeof(int))); + NCCLCHECK(ncclSocketSend(sock, &transport, sizeof(int))); + NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int))); + NCCLCHECK(ncclSocketSend(sock, &comm->localRank, sizeof(int))); + NCCLCHECK(ncclSocketRecv(sock, &proxyConn->connection, sizeof(void*))); + struct ncclTransportComm* tcomm = send ? 
&ncclTransports[transport].send : &ncclTransports[transport].recv; + // If we need proxy progress, map progress ops + if (tcomm->proxyProgress) { + char poolPath[] = "/dev/shm/nccl-XXXXXX"; + NCCLCHECK(ncclSocketRecv(sock, poolPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1)); + struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps+proxyConn->localRank; + if (proxyOps->pool == NULL) { + NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0)); + proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1; + } + } + INFO(NCCL_NET, "Connection to proxy localRank %d -> connection %p", proxyConn->localRank, proxyConn->connection); + proxyConn->comm = comm; + return ncclSuccess; +} - // Free off any memory allocated for the proxy arg pools - pthread_mutex_lock(&state->poolMutex); - struct ncclProxyState* proxyState = &comm->proxyState; - while (proxyState->pools != NULL) { - struct ncclProxyPool *next = proxyState->pools->next; - free(proxyState->pools); - proxyState->pools = next; +const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop" }; +ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) { + if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError; + struct ncclSocket* sock = proxyConn->comm->proxyState.peerSocks+proxyConn->localRank; + if (sock->fd == -1) return ncclInternalError; + ncclResult_t ret; + + NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error); + NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error); + NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error); + NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error); + if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error); + if (respSize) NCCLCHECKGOTO(ncclSocketRecv(sock, 
respBuff, respSize), ret, error); + return ncclSuccess; +error: + WARN("Proxy Call to rank %d failed (%s)", proxyConn->comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]); + return ret; +} + +static ncclResult_t proxyProgressInit(struct ncclComm* comm) { + struct ncclProxyProgressState* state = &comm->proxyState.progressState; + if (state->opsPool == NULL) { + int size = sizeof(struct ncclProxyOpsPool); + struct ncclProxyOpsPool* pool = NULL; + + char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; + shmPath[0] = '\0'; + NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, 1)); + + // Init pool + pool->nextOps = -1; + + // The service thread may be launched already but localRanks may not be set yet. + while (comm->localRanks == 0) sched_yield(); + + for (int r=0; r<comm->localRanks; r++) { + pool->freeOps[r] = r*MAX_OPS_PER_PEER; + for (int i=0; i<MAX_OPS_PER_PEER-1; i++) pool->ops[r*MAX_OPS_PER_PEER+i].next = r*MAX_OPS_PER_PEER+i+1; + pool->ops[(r+1)*MAX_OPS_PER_PEER-1].next = -1; + } + + // Setup mutex/cond to work inter-process + pthread_mutexattr_t mutexAttr; + pthread_mutexattr_init(&mutexAttr); + pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED); + pthread_mutex_init(&pool->mutex, &mutexAttr); + pthread_condattr_t condAttr; + pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED); + pthread_cond_init(&pool->cond, &condAttr); + state->opsPool = pool; + + memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1); + + // All ops structures are created, we can start the progress thread + NCCLCHECK(ncclProxyProgressCreate(comm)); + } + return ncclSuccess; +} + +static void proxyOpsFree(struct ncclComm* comm) { + struct ncclProxyProgressState* state = &comm->proxyState.progressState; + if (ncclShmClose(state->opsPool, NULL, sizeof(struct ncclProxyOpsPool)) != ncclSuccess) { + WARN("[Service thread] shm close failed"); + } +} + +ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) { + struct 
ncclProxyProgressState* state = &comm->proxyState.progressState; + if (state->opsPool == NULL) return ncclSuccess; + + char shmPath[] = "/dev/shm/nccl-XXXXXX"; + memcpy(shmPath+sizeof("/dev/shm/nccl-")-1, state->opsPoolShmSuffix, sizeof("XXXXXX")-1); + if (ncclShmUnlink(shmPath) != ncclSuccess) { + WARN("[Service thread] shm unlink failed"); + } + return ncclSuccess; +} + +static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) { + struct ncclSocket* sock = &peer->sock; + char buf[SOCKET_NAME_MAXLEN+1]; + buf[SOCKET_NAME_MAXLEN] = '\0'; + int id; + struct ncclProxyConnection* connection; + NCCLCHECK(ncclProxyNewConnection(connectionPool, &id)); + NCCLCHECK(ncclProxyGetConnection(connectionPool, id, &connection)); + connection->sock = sock; + NCCLCHECK(ncclSocketRecv(sock, &connection->transport, sizeof(int))); + NCCLCHECK(ncclSocketRecv(sock, &connection->send, sizeof(int))); + NCCLCHECK(ncclSocketRecv(sock, &peer->localRank, sizeof(int))); + connection->localRank = peer->localRank; + NCCLCHECK(ncclSocketSend(sock, &connection, sizeof(void*))); + connection->tcomm = connection->send ? &ncclTransports[connection->transport].send : &ncclTransports[connection->transport].recv; + // If we need proxy progress, let's allocate ops and start the thread + if (connection->tcomm->proxyProgress) { + NCCLCHECK(proxyProgressInit(comm)); + struct ncclProxyProgressState* state = &comm->proxyState.progressState; + NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1)); + } + buf[SOCKET_NAME_MAXLEN] = '\0'; + INFO(NCCL_NET, "New proxy %s connection %d from %s, transport %d", connection->send ? 
"send":"recv", id, ncclSocketToString(&sock->addr, buf), connection->transport); + return ncclSuccess; +} + +static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) { + struct ncclSocket* sock = &peer->sock; + struct ncclProxyConnection* connection; + NCCLCHECK(ncclSocketRecv(sock, &connection, sizeof(void*))); + int reqSize, respSize; + NCCLCHECK(ncclSocketRecv(sock, &reqSize, sizeof(int))); + NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(int))); + if (reqSize != sizeof(int) || respSize != 0) return ncclInternalError; + int nChannels; + NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int))); + if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels)); + return ncclSuccess; +} + +static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount) { + int done = 1; + if (op->type == ncclProxyMsgSetup) { + NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); + } else if (op->type == ncclProxyMsgConnect) { + NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done)); + } else return ncclInternalError; + if (done) { + if (op->respSize) NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize)); + if (op->reqBuff) free(op->reqBuff); + if (op->respBuff) free(op->respBuff); + op->reqBuff = NULL; + op->respBuff = NULL; + op->type = 0; + (*asyncOpCount)--; + } + return ncclSuccess; +} + +static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm, int* asyncOpCount) { + struct ncclSocket* sock = &peer->sock; + struct ncclProxyAsyncOp* asyncOp = &peer->asyncOps; + asyncOp->type = type; + NCCLCHECK(ncclSocketRecv(sock, 
&asyncOp->connection, sizeof(void*))); + + NCCLCHECK(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int))); + NCCLCHECK(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int))); + if (asyncOp->reqSize) { + NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize)); + NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize)); + } + if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize)); + (*asyncOpCount)++; + NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount)); + return ncclSuccess; +} + +#include <poll.h> + +void* ncclProxyService(void* _args) { + struct ncclComm* comm = (struct ncclComm *) _args; + if (cudaSetDevice(comm->cudaDev) != cudaSuccess) { + WARN("[Proxy Service] Failed to set CUDA device %d", comm->cudaDev); + } + if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + + // Prepare poll descriptor + struct ncclProxyConnectionPool connectionPool; + connectionPool.pools = NULL; + connectionPool.banks = 0; + connectionPool.offset = NCCL_PROXY_CONN_POOL_SIZE; + + struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1]; + struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS]; + for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) { + peers[s].sock.fd = pollfds[s].fd = -1; + peers[s].sock.abortFlag = NULL; + peers[s].sock.asyncFlag = 0; + pollfds[s].events = POLLHUP|POLLIN; + peers[s].asyncOps.type = 0; + } + pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd; + pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN; + + int maxnpeers = 0; + int npeers = 0; + int stop = 0; + int asyncOpCount = 0; + while (stop == 0 || (stop == 1 && npeers > 0)) { + if (int error = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 
0 : -1) < 0) { + WARN("[Proxy Service] Poll failed with error %d", error); + return NULL; + } + if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) { + int s = 0; + while (s < NCCL_MAX_LOCAL_RANKS && peers[s].sock.fd != -1) s++; + if (s == NCCL_MAX_LOCAL_RANKS) { + WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS); + return NULL; + } + if (maxnpeers < s+1) maxnpeers = s+1; + struct ncclSocket* sock = &peers[s].sock; + if (ncclSocketAccept(sock, comm->proxyState.listenSock) != ncclSuccess) { + WARN("[Service thread] Accept failed %s", strerror(errno)); + } else { + pollfds[s].fd = sock->fd; + npeers++; + peers[s].localRank = -1; + } + } + for (int s=0; s<maxnpeers; s++) { + struct ncclProxyLocalPeer* peer = peers+s; + struct ncclSocket* sock = &peer->sock; + struct ncclProxyAsyncOp* op = &peer->asyncOps; + int closeConn = 0; + int type = 0; + ncclResult_t res = ncclSuccess; + if (op->type != 0) { + res = proxyProgressAsync(op, comm, &asyncOpCount); + type = op->type; + if (res != ncclSuccess) op->type = 0; + } else if (pollfds[s].revents & POLLIN) { + int closed; + if (ncclSocketTryRecv(sock, &type, sizeof(int), &closed) != ncclSuccess) { + WARN("[Service thread] Could not receive type from localRank %d", peer->localRank); + closeConn = 1; + } else if (closed) { + INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank); + closeConn = 1; + } else { + if (type == ncclProxyMsgAbort) { + stop = 2; + closeConn = 1; + } else if (type == ncclProxyMsgStop) { + stop = 1; + closeConn = 1; + } else if (type == ncclProxyMsgClose) { + closeConn = 1; + } else if (type == ncclProxyMsgInit) { + res = proxyConnInit(peers+s, &connectionPool, comm); + } else if (type == ncclProxyMsgSharedInit) { + res = proxyConnSharedInit(peers+s, &connectionPool, comm); + } else if (type == ncclProxyMsgSetup || type == ncclProxyMsgConnect) { + res = proxyConnSetupConnect(type, peers+s, &connectionPool, comm, &asyncOpCount); + } else { + 
WARN("[Service thread] Unknown command %d from localRank %d\n", type, peer->localRank); + closeConn = 1; + } + } + } else if (pollfds[s].revents & POLLHUP) { + closeConn = 1; + } + if (res != ncclSuccess) { + WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", comm->rank, ncclProxyMsgTypeStr[type], comm->localRankToRank[peer->localRank], res); + closeConn = 1; + } + if (closeConn) { + close(sock->fd); + sock->fd = pollfds[s].fd = -1; + npeers--; + } + } + } + // Wait for all operations to complete and stop progress thread before freeing any resource + if (ncclProxyProgressDestroy(comm) != ncclSuccess) { + WARN("[Proxy Service] proxyDestroy failed"); } - pthread_mutex_unlock(&state->poolMutex); + for (int s=0; s<maxnpeers; s++) { + if (peers[s].sock.fd != -1) close(peers[s].sock.fd); + } + ncclProxyFreeConnections(&connectionPool, comm); + close(comm->proxyState.listenSock->fd); + free(comm->proxyState.listenSock); + proxyOpsFree(comm); + return NULL; +} + +ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) { + comm->proxyState.listenSock = sock; + comm->proxyState.peerAddresses = peerAddresses; + ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev); + return ncclSuccess; +} - NCCLCHECK(ncclProxySharedBuffersDestroy(comm)); +ncclResult_t ncclProxyCreate(struct ncclComm* comm) { + pthread_create(&comm->proxyState.thread, NULL, ncclProxyService, comm); + return ncclSuccess; +} +ncclResult_t ncclProxyDestroy(struct ncclComm* comm) { + struct ncclProxyState* state = &comm->proxyState; + if (state->peerAddresses) { + struct ncclSocket sock; + sock.abortFlag = NULL; + sock.asyncFlag = 0; + memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress)); + NCCLCHECK(ncclSocketConnect(&sock)); + int type = (*comm->abortFlag) ? 
ncclProxyMsgAbort : ncclProxyMsgStop; + NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int))); + close(sock.fd); + free(state->peerAddresses); + } + if (state->peerSocks) { + for (int i=0; i<comm->localRanks; i++) { + if (state->peerSocks[i].fd != -1) { + if (state->proxyOps[i].pool) { + NCCLCHECK(ncclShmClose(state->proxyOps[i].pool, NULL, sizeof(struct ncclProxyOpsPool))); + } + if (state->sharedDevMems[i]) { + CUDACHECK(cudaIpcCloseMemHandle(state->sharedDevMems[i])); + } + int type = ncclProxyMsgClose; + if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(state->peerSocks+i, &type, sizeof(int))); + close(state->peerSocks[i].fd); + } + } + free(state->peerSocks); + free(state->proxyOps); + free(state->sharedDevMems); + } return ncclSuccess; } diff --git a/src/transport.cc b/src/transport.cc index 2cb5538..7ce5f2e 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -7,15 +7,19 @@ #include "comm.h" #include "info.h" #include "bootstrap.h" +#define ENABLE_TIMER 0 +#include "timer.h" extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; extern struct ncclTransport netTransport; +extern struct ncclTransport collNetTransport; struct ncclTransport ncclTransports[NTRANSPORTS] = { p2pTransport, shmTransport, netTransport, + collNetTransport }; template <int type> @@ -82,12 +86,15 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* struct ncclConnect* recvData = data; int sendChannels = 0, recvChannels = 0; int type; + TIME_START(0); for (int c=0; c<MAXCHANNELS; c++) { if (recvMask & (1<<c)) { NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type)); if (type > highestType) highestType = type; } } + TIME_STOP(0); + TIME_START(1); struct ncclConnect* sendData = recvData+recvChannels; for (int c=0; c<MAXCHANNELS; c++) { if (sendMask & (1<<c)) { @@ -95,7 +102,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* if (type > highestType) highestType = type; } } + TIME_STOP(1); + TIME_START(2); if (sendPeer == recvPeer) { if (recvChannels+sendChannels) { NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels))); @@ -109,7 +118,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels)); if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels)); } + TIME_STOP(2); + TIME_START(3); for (int c=0; c<MAXCHANNELS; c++) { if (sendMask & (1<<c)) { struct ncclConnector* conn = 
comm->channels[c].peers[sendPeer].send + connIndex; @@ -118,6 +129,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream)); } } + TIME_STOP(3); + TIME_START(4); for (int c=0; c<MAXCHANNELS; c++) { if (recvMask & (1<<c)) { struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex; @@ -126,11 +139,13 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream)); } } + TIME_STOP(4); comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0; } CUDACHECK(cudaStreamSynchronize(transportSetupStream)); CUDACHECK(cudaStreamDestroy(transportSetupStream)); if (highestTransportType != NULL) *highestTransportType = highestType; + TIME_PRINT("P2P Setup/Connect"); return ncclSuccess; } @@ -225,9 +240,9 @@ cleanup: ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) { // AllGather collNet setup results - int allGatherFailures[NCCL_MAX_INTRA_RANKS] = {0}; - allGatherFailures[comm->intraNodeRank] = collNetSetupFail; - NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, allGatherFailures, sizeof(int))); + int allGatherFailures[NCCL_MAX_LOCAL_RANKS] = {0}; + allGatherFailures[comm->localRank] = collNetSetupFail; + NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, allGatherFailures, sizeof(int))); for (int i=0; i<comm->localRanks; i++) { if (allGatherFailures[i] != 0) { collNetSetupFail = 1; @@ -235,7 +250,7 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa } } if (collNetSetupFail) { - if 
(comm->intraNodeRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead"); + if (comm->localRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead"); return ncclSystemError; } return ncclSuccess; @@ -248,12 +263,12 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) { struct ncclPeer* peer = channel->peers+comm->nRanks; for (int b=0; b<NCCL_MAX_CONNS; b++) { struct ncclConnector* send = peer->send + b; - if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources)); + if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send)); send->transportResources = NULL; // avoid double free } for (int b=0; b<NCCL_MAX_CONNS; b++) { struct ncclConnector* recv = peer->recv + b; - if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources)); + if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv)); recv->transportResources = NULL; // avoid double free } } diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 4c0e76d..26f875f 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -7,11 +7,15 @@ #include "comm.h" #include "coll_net.h" #include "graph.h" +#include "proxy.h" +#include "gdrwrap.h" -#define COLLNET_GROUP_NSUBS 8 -#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS) +int64_t ncclParamGdrCopySyncEnable(); +int64_t ncclParamGdrCopyFlushEnable(); struct collNetRecvConnectInfo { + int rank; + int nranks; collNetHandle_t collNetHandle; }; @@ -20,128 +24,279 @@ struct collNetSendConnectInfo { void* reqFifo; }; +#define COLLNET_GROUP_NSUBS 8 +#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS) + +#define NCCL_NET_MAP_HOSTMEM 0 +#define NCCL_NET_MAP_DEVMEM 1 +#define NCCL_NET_MAP_SHARED_HOSTMEM 2 +#define NCCL_NET_MAP_SHARED_DEVMEM 3 +#define NCCL_NET_MAP_GDCMEM 4 +#define NCCL_NET_MAP_MEMS 5 + +#define NCCL_NET_MAP_MASK_DEVMEM 0x40000000 +#define NCCL_NET_MAP_MASK_SHARED 0x80000000 +#define NCCL_NET_MAP_MASK_USED 0x20000000 +#define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff + +#define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \ + ((mapStruct)->offsets.offsetName >> 30) + +#define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \ + (((mapStruct)->offsets.offsetName >> 29) == 0) + +#define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \ + (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? 
NULL : \ + (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET)) + +#define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \ + (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0) + +#define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { \ + int bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; \ + if ((shared) == 0) { \ + if (dev) { \ + (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; \ + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += memSize; \ + } else { \ + (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; \ + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += memSize; \ + } \ + } else { \ + (mapStruct)->offsets.offsetName = bank; \ + } \ +} while (0); + +struct connectMapMem{ + char* gpuPtr; + char* cpuPtr; + int size; +}; + +struct connectMap { + int shared; + // First 3 bits of offsets determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem. + struct connectMapMem mems[NCCL_NET_MAP_MEMS]; + // Offsets. 3 MSBs indicate mem bank, 111 indicates NULL. 
+ struct { + uint32_t sendMem; + uint32_t recvMem; + uint32_t buffs[NCCL_NUM_PROTOCOLS]; + } offsets; +}; + struct reqSlot { volatile void* recvBuff; volatile int size; }; -struct collNetSendResources { - struct ncclComm* comm; +struct sendResources { + struct connectMap map; void* collNetComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; + + int rank; + int nranks; int netDev; int useGdr; + uint64_t* gdcSync; + void* gdrDesc; void* sendMhandles[NCCL_NUM_PROTOCOLS]; void* recvMhandles[NCCL_NUM_PROTOCOLS]; - struct ncclRecvMem* devRecvMem; uint64_t step; - uint64_t llLastCleaning; struct reqSlot (*reqFifo)[NCCL_STEPS]; int collNetRank; }; -struct collNetRecvResources { - struct ncclComm* comm; +struct recvResources { + struct connectMap map; void* collNetComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; + + int rank; + int nranks; int netDev; int useGdr; + uint64_t* gdcSync; + uint64_t* gdcFlush; + void* gdrDesc; void* mhandles[NCCL_NUM_PROTOCOLS]; - struct ncclRecvMem* devRecvMem; uint64_t step; - uint64_t llLastCleaning; struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS]; int collNetRank; }; -struct collNetSharedResources { - void* collNetListenComms[MAXCHANNELS]; - void* collNetComms[MAXCHANNELS]; - int collNetCommRefCount[MAXCHANNELS]; -}; - /* Determine if we can communicate with the peer */ -ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 1; return ncclSuccess; } -ncclResult_t collNetSharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) { - struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources; - if (resources == NULL) { - NCCLCHECK(ncclCalloc(&resources, 1)); - 
comm->proxyState.sharedBuffs.collNetResources = resources; +struct setupReq { + int netDev; + int useGdr; +}; + + +/* Setup send connector, and return connect information for others in the coll + * communicator to connect to me */ +static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { + struct setupReq req; + + int proxyRank; + NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); + send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0; + + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank)); + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn)); + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); + + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev, + req.useGdr ? "/GDRDMA" : ""); + return ncclSuccess; +} + +static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { + struct setupReq req; + + int proxyRank; + NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr)); + recv->conn.direct |= req.useGdr ? 
NCCL_DIRECT_NIC : 0; + + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank)); + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn)); + struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; + NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); + + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev, + req.useGdr ? "/GDRDMA" : ""); + return ncclSuccess; +} + +static ncclResult_t collNetDumpMap(struct connectMap* map) { + printf("Dump map\n"); + struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM; + printf("Mem 0: Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); + mem = map->mems+NCCL_NET_MAP_DEVMEM; + printf("Mem 1: Vid mem CPU (%x B) %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); + mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM; + printf("Mem 2: Shared Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); + mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM; + printf("Mem 3: Shared Vid (%x B) mem CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); + printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", + map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0, + NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET, + NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem)); + printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", + map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 
1 : 0, + NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET, + NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem)); + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + printf("Proto %d -> Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p, + map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 1 : 0, + NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET, + NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p])); } - if (resources->collNetComms[netDev] == NULL) - NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev)); + printf("End of dump\n"); return ncclSuccess; } -/* Setup send connector, and return connect information for others in the coll communicator to connect to me */ -ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { - struct collNetSendResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - send->transportResources = resources; - send->conn.shared = 1; - resources->comm = comm; +struct collNetConnectArgs { + int rank; + int nranks; + struct ncclConnect* connectInfos; +}; + +static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) { + // We're on the same process as the proxy. We can pass a pointer to a struct. 
+ struct collNetConnectArgs args = { rank, nranks, connectInfos }; + struct connectMap* map; + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); + + //NCCLCHECK(collNetDumpMap(map)); + + struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); + void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; + send->conn.head = gdcMem ? (uint64_t*)gdcMem : &sendMem->head; + + struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); + send->conn.tail = &recvMem->tail; + send->conn.sizesFifo = recvMem->sizesFifo; + for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1; + send->conn.offsFifo = recvMem->offsFifo; + + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) + send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); + return ncclSuccess; +} - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr)); +static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) { + // We're on the same process as the proxy. We can pass a pointer to a struct. 
+ struct collNetConnectArgs args = { rank, nranks, connectInfos }; + struct connectMap* map; + NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*))); - send->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev+1; + //NCCLCHECK(collNetDumpMap(map)); - NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); + struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); + recv->conn.head = &sendMem->head; - int recvSize = offsetof(struct ncclRecvMem, buff); - // Simple uses shared buffers and we don't support LL128 - recvSize += send->comm->buffSizes[NCCL_PROTO_LL]; + struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); + void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; + recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail; + recv->conn.offsFifo = recvMem->offsFifo; - if (resources->useGdr) { - NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize)); + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); } - NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize)); + return ncclSuccess; +} - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev, - resources->useGdr ? 
"/GDRDMA" : ""); +static ncclResult_t sendFree(struct ncclConnector* send) { return ncclSuccess; } -/* Setup recv connector */ -ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { - struct collNetRecvResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - recv->transportResources = resources; - recv->conn.shared = 1; - resources->comm = comm; +static ncclResult_t recvFree(struct ncclConnector* recv) { + return ncclSuccess; +} - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr)); +static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct setupReq* req = (struct setupReq*)reqBuff; + if (reqSize != sizeof(struct setupReq)) return ncclInternalError; - recv->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev; + struct sendResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + connection->transportResources = resources; + connection->shared = 1; - NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); + resources->netDev = req->netDev; + resources->useGdr = req->useGdr; + return ncclSuccess; +} - int recvSize = offsetof(struct ncclRecvMem, buff); - // Simple uses shared buffers and we don't support LL128 - recvSize += recv->comm->buffSizes[NCCL_PROTO_LL]; +struct sharedResources { + void* collNetListenComms[MAXCHANNELS]; + void* collNetComms[MAXCHANNELS]; + int commRefCount[NCCL_MAX_NETDEVS]; +}; - if (resources->useGdr) { - NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize)); +ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) { + 
struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources; + if (resources == NULL) { + NCCLCHECK(ncclCalloc(&resources, 1)); + comm->proxyState.progressState.collNet.resources = resources; } - NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize)); - - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev, - resources->useGdr ? "/GDRDMA" : ""); - struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; - - NCCLCHECK(collNetSharedListen(comm, resources->netDev, &info->collNetHandle)); + if (resources->collNetComms[netDev] == NULL) + NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev)); return ncclSuccess; } -ncclResult_t collNetSharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) { - struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources; +static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) { + struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources; if (resources->collNetComms[netDev] == NULL) { // Connect to coll comm collNetHandle_t** handlePtrs = NULL; @@ -159,152 +314,234 @@ ncclResult_t collNetSharedConnect(struct ncclComm* comm, int netDev, struct nccl NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev])); } *collNetComm = resources->collNetComms[netDev]; - resources->collNetCommRefCount[netDev]++; + resources->commRefCount[netDev]++; return ncclSuccess; } -ncclResult_t collNetSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) { - // Setup device pointers - struct collNetSendResources* resources 
= (struct collNetSendResources*)send->transportResources; - struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank); - - // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host - send->conn.buffs[NCCL_PROTO_LL] = resources->recvMem->buff; - send->conn.buffs[NCCL_PROTO_LL128] = send->conn.buffs[NCCL_PROTO_SIMPLE] = NULL; - send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; - - // Head/Tail/Opcount/Fifos are always on host - send->conn.tail = &resources->recvMem->tail; - send->conn.sizesFifo = resources->recvMem->sizesFifo; - send->conn.ptrsFifo = resources->recvMem->ptrsFifo; - send->conn.head = &resources->sendMem->head; - resources->sendMem->head = -NCCL_STEPS; // Don't give any credit yet when sharing buffers - for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1; +static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) { + struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources; + resources->commRefCount[netDev]--; + if (resources->commRefCount[netDev] == 0) { + NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev])); + } + for (int n=0; n<NCCL_MAX_NETDEVS; n++) if (resources->commRefCount[n]) return ncclSuccess; + comm->proxyState.progressState.collNet.resources = NULL; + free(resources); + return ncclSuccess; +} + +static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, char** gpuPtr, char** cpuPtr, int* size) { + struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet; + if (state->size == 0) { + state->size = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE]; + } + + *size = state->size; + + if (cuda && state->cudaBuff == NULL) { + NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size)); + } + if (!cuda && state->hostBuff == NULL) { + NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size)); + } + *gpuPtr = *cpuPtr = cuda ? 
state->cudaBuff : state->hostBuff; + return ncclSuccess; +} + +static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int type, int slot, int channel, int* offset) { + // Use different pools for different channels and also separate send/recv. + int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; + int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel; + *offset = slotSize * globalSlot; + return ncclSuccess; +} + +static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm) { + struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet; + if (state->size == 0) return ncclSuccess; + CUDACHECK(cudaFree(state->cudaBuff)); + NCCLCHECK(ncclCudaHostFree(state->hostBuff)); + // This will be called multiple times, with multiple channels and send/recv. Make sure we only do it once. + state->size = 0; + return ncclSuccess; +} + +static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct setupReq* req = (struct setupReq*)reqBuff; + if (reqSize != sizeof (struct setupReq)) return ncclInternalError; + + struct recvResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + connection->transportResources = resources; + connection->shared = 1; + + resources->netDev = req->netDev; + resources->useGdr = req->useGdr; + + collNetHandle_t* netHandle = (collNetHandle_t*) respBuff; + if (respSize != sizeof(collNetHandle_t)) return ncclInternalError; + + NCCLCHECK(sharedListen(comm, req->netDev, netHandle)); + return ncclSuccess; +} + +static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } + struct collNetConnectArgs* args = (struct 
collNetConnectArgs*)reqBuff; + struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank); + + struct sendResources* resources = (struct sendResources*)(connection->transportResources); // Get info from recv side - resources->collNetRank = rank; + resources->collNetRank = args->rank; resources->reqFifo = (struct reqSlot (*)[NCCL_STEPS])(info->reqFifo); for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) resources->recvMhandles[p] = info->mhandles[p]; - NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm)); + NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm)); + connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev; + + struct connectMap* map = &resources->map; + + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); + + NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); + map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; + if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) { + uint64_t *cpuPtr, *gpuPtr; + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc)); + + resources->gdcSync = cpuPtr; + struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; + gdcMem->cpuPtr = (char*)cpuPtr; + gdcMem->gpuPtr = (char*)gpuPtr; + gdcMem->size = sizeof(uint64_t); // sendMem->head + } + + resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); + resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); + // Don't give credits yet in shared mode. 
+ resources->sendMem->head = -NCCL_STEPS; - int size; - char* ptr; // Allocate & Register shared buffers for the Simple protocol - NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, &size, &ptr)); - NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size, + int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; + struct connectMapMem* mapMem = map->mems+bank; + NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + + NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_SIMPLE])); - // Allocate & Register shared buffers for the LL protocol - NCCLCHECK(ncclProxySharedBuffersInit(send->comm, 0, &size, &ptr)); - NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size, - NCCL_PTR_HOST, - &resources->sendMhandles[NCCL_PROTO_LL])); + if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; } + *((struct connectMap**)respBuff) = &resources->map; return ncclSuccess; } -ncclResult_t collNetRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) { - // Setup device pointers - struct collNetRecvResources* resources = (struct collNetRecvResources*)recv->transportResources; - struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank); - resources->collNetRank = rank; +static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld\n", reqSize, sizeof(struct collNetConnectArgs)); return 
ncclInternalError; } + struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; - // Intermediate buffering on GPU for GPU Direct RDMA - struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem; - int offset = 0; - for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { - recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset; - offset += recv->comm->buffSizes[p]; - } - recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; + struct recvResources* resources = (struct recvResources*)(connection->transportResources); + struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank); + resources->collNetRank = args->rank; - // Head/Tail/Opcount are always on host - recv->conn.tail = &resources->recvMem->tail; - recv->conn.ptrsFifo = resources->recvMem->ptrsFifo; - recv->conn.head = &resources->sendMem->head; + NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm)); + connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev+1; - NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm)); + struct connectMap* map = &resources->map; - int size; - char* ptr; + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); + + NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); + map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; + if (ncclGdrCopy) { + uint64_t *cpuPtr, *gpuPtr; + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc)); + + if (ncclParamGdrCopySyncEnable()) { + resources->gdcSync = cpuPtr; + struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; + gdcMem->cpuPtr = (char*)cpuPtr; + 
gdcMem->gpuPtr = (char*)gpuPtr; + gdcMem->size = sizeof(uint64_t); + } + if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1; + } + + resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); + resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); // Allocate & Register shared buffers for the Simple protocol - NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, &size, &ptr)); - NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size, + int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; + struct connectMapMem* mapMem = map->mems+bank; + NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + + NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size, resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_SIMPLE])); - // Allocate & Register shared buffers for the LL protocol - NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, 0, &size, &ptr)); - NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size, - NCCL_PTR_HOST, - &resources->mhandles[NCCL_PROTO_LL])); - // Pass info to send side info->reqFifo = resources->reqFifo; for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) info->mhandles[p] = resources->mhandles[p]; + if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld\n", respSize, sizeof(void*)); return ncclInternalError; } + *((struct connectMap**)respBuff) = &resources->map; return ncclSuccess; } -ncclResult_t collNetSharedFree(struct ncclComm* comm, int netDev) { - struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources; - resources->collNetCommRefCount[netDev]--; - if (resources->collNetCommRefCount[netDev] == 0) { - 
NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev])); +static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + struct sendResources* resources = (struct sendResources*)(connection->transportResources); + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + if (resources->sendMhandles[p]) { + NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[p])); + } } - for (int c=0; c<MAXCHANNELS; c++) if (resources->collNetCommRefCount[c]) return ncclSuccess; - comm->proxyState.sharedBuffs.collNetResources = NULL; - free(resources); + struct connectMapMem* mems = resources->map.mems; + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); + CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); + NCCLCHECK(sharedBuffersDestroy(comm)); + NCCLCHECK(sharedFree(comm, resources->netDev)); + free(connection->transportResources); return ncclSuccess; } -ncclResult_t collNetSendFree(void* sendTransportResources) { - struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources; - NCCLCHECK(ncclCudaHostFree(resources->sendMem)); - NCCLCHECK(ncclCudaHostFree(resources->recvMem)); - if (resources->collNetComm) { - NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_LL])); - NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_SIMPLE])); +static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + struct recvResources* resources = (struct recvResources*)(connection->transportResources); + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + if (resources->mhandles[p]) { + NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[p])); + } } - if (resources->useGdr) CUDACHECK(cudaFree(resources->devRecvMem)); - - NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev)); - 
free(resources); + struct connectMapMem* mems = resources->map.mems; + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); + CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); + NCCLCHECK(sharedBuffersDestroy(comm)); + NCCLCHECK(sharedFree(comm, resources->netDev)); + free(connection->transportResources); return ncclSuccess; } -ncclResult_t collNetRecvFree(void* recvTransportResources) { - struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources; - NCCLCHECK(ncclCudaHostFree(resources->sendMem)); - NCCLCHECK(ncclCudaHostFree(resources->recvMem)); - if (resources->collNetComm) { - NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_LL])); - NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_SIMPLE])); - } - if (resources->useGdr) CUDACHECK(cudaFree(resources->devRecvMem)); - - NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev)); - free(resources); - return ncclSuccess; -} #define LAST_OF_GROUP(s) \ (s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1) -ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { - if (args->protocol == NCCL_PROTO_LL128) { - WARN("CollNet does not support LL128"); +static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { + if (args->protocol != NCCL_PROTO_SIMPLE) { + WARN("CollNet does not support LL/LL128"); return ncclInternalError; } if (args->state == ncclProxyOpReady) { for (int s=0; s<args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources); + struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = 
sub->received = sub->transmitted = sub->done = 0; @@ -319,23 +556,21 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { int perGroupSteps = NCCL_STEPS / nGroups; for (int s=0; s<args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources); + struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); void* sendMhandle = resources->sendMhandles[p]; void* recvMhandle = resources->recvMhandles[p]; - int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS; auto reqFifo = resources->reqFifo; if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) { int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; - if (p == NCCL_PROTO_SIMPLE) { - char* ptr; - int sharedBuffSlot = sub->posted%NCCL_STEPS; - NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, resources->useGdr, 0, sharedBuffSlot, 0, &ptr)); - resources->recvMem->ptrsFifo[buffSlot] = ptr + s*args->chunkSize; - __sync_synchronize(); - } - volatile uint64_t* sendHead = &resources->sendMem->head; + int sharedBuffSlot = sub->posted%NCCL_STEPS; + int offset; + NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset)); + resources->recvMem->offsFifo[buffSlot] = offset + s*args->chunkSize; + __sync_synchronize(); + volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; sub->posted += args->sliceSteps; *sendHead = sub->base + sub->posted - NCCL_STEPS; + if (resources->gdcSync) wc_store_fence(); // Flush out WC write } // Enforce sync between operations of the same group. 
bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->received == sub->received)) || (s && (sub-1)->received > sub->received)); @@ -344,30 +579,15 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { int sharedBuffSlot = sub->received%NCCL_STEPS; volatile int* sizesFifo = resources->recvMem->sizesFifo; volatile uint64_t* recvTail = &resources->recvMem->tail; - if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)) || p == NCCL_PROTO_LL)) { + char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]); + if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)))) { // We have something to receive, let's check whether data is ready. - int size = sizesFifo[buffSlot]; int ready = 1; if (s == 0) { - NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 0, sharedBuffSlot, 0, &args->sharedBuff[sharedBuffSlot])); - args->sharedSize[sharedBuffSlot] = p == NCCL_PROTO_SIMPLE ? args->chunkSize : size/2; - } - if (p == NCCL_PROTO_LL) { - char* localBuff = sub->connector->conn.buffs[p]; - uint32_t flag = NCCL_LL_FLAG(sub->base + sub->received + 1); - int nFifoLines = size / sizeof(union ncclLLFifoLine); - union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize); - // Pack data into the shared buffer - uint32_t* sendBuff = (uint32_t*)(args->sharedBuff[sharedBuffSlot]+args->sharedSize[sharedBuffSlot]*s); - for (int i=0; i<nFifoLines; i++) { - volatile uint32_t *f1 = &lines[i].flag1; - volatile uint32_t *d1 = &lines[i].data1; - volatile uint32_t *f2 = &lines[i].flag2; - volatile uint32_t *d2 = &lines[i].data2; - if (f1[0] != flag || f2[0] != flag) { ready = 0; break; } - sendBuff[2*i] = d1[0]; - sendBuff[2*i+1] = d2[0]; - } + int offset; + NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset)); + args->sharedBuff[sharedBuffSlot] = localBuff + offset; + args->sharedSize[sharedBuffSlot] = args->chunkSize; } if (ready) { 
sizesFifo[buffSlot] = -1; @@ -426,15 +646,15 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { return ncclSuccess; } -ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { - if (args->protocol == NCCL_PROTO_LL128) { - WARN("CollNet does not support LL128"); +static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { + if (args->protocol != NCCL_PROTO_SIMPLE) { + WARN("CollNet does not support LL/LL128"); return ncclInternalError; } if (args->state == ncclProxyOpReady) { for (int s=0; s<args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources); + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0; @@ -449,19 +669,20 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { int perGroupSteps = NCCL_STEPS / nGroups; for (int s=0; s<args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources); + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); void* mhandle = resources->mhandles[p]; - int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS; auto reqFifo = resources->reqFifo; + char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); + // Enforce sync between operations of the same group. 
if (LAST_OF_GROUP(s) && (sub->posted < sub->done + perGroupSteps) && (sub->posted < sub->nsteps)) { int group = s / COLLNET_GROUP_NSUBS; int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; - char* ptr; int sharedBuffSlot = sub->posted%NCCL_STEPS; int startChannel = group*COLLNET_GROUP_NSUBS; - NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 1, sharedBuffSlot, startChannel, &ptr)); - reqFifo[group][buffSlot].recvBuff = ptr; + int offset; + NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset)); + reqFifo[group][buffSlot].recvBuff = localBuff + offset; TRACE(NCCL_NET, "recvProxy [%d/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff); sub->posted += args->sliceSteps; args->idle = 0; @@ -476,11 +697,24 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { int totalSize = args->sharedSize[sharedBuffSlot]*(s-group*COLLNET_GROUP_NSUBS+1); TRACE(NCCL_NET, "recvProxy [%d/%d/%d] received, size %d", sub->received, group, buffSlot, totalSize); sub->received += args->sliceSteps; - if (reqFifo[group][buffSlot].size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) { - int startChannel = group*COLLNET_GROUP_NSUBS; - char* groupRecvAddress; - NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, 1, 1, sharedBuffSlot, startChannel, &groupRecvAddress)); - NCCLCHECK(collNetIflush(resources->collNetComm, groupRecvAddress, totalSize, mhandle, sub->requests+buffSlot)); + sub->requests[buffSlot] = NULL; + if (reqFifo[group][buffSlot].size > 0 && resources->useGdr) { + // GDRCOPY support + if (resources->gdcFlush) { +#if defined (__x86_64__) + // Force a PCI-E read from GPU memory + asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax"); +#else + WARN("NET: GDR Flush only supported on x86_64"); + return ncclInternalError; +#endif + sub->requests[buffSlot] = NULL; + } else { + int startChannel = group*COLLNET_GROUP_NSUBS; + int 
offset; + NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset)); + NCCLCHECK(collNetIflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot)); + } } else { for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps; } @@ -506,27 +740,14 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS; int sharedBuffSlot = sub->transmitted%NCCL_STEPS; int startChannel = group*COLLNET_GROUP_NSUBS; - char* groupRecvAddress; - NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 1, sharedBuffSlot, startChannel, &groupRecvAddress)); - char* ptr = groupRecvAddress + (s%COLLNET_GROUP_NSUBS)*args->sharedSize[sharedBuffSlot]; - if (p == NCCL_PROTO_SIMPLE) { - volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo; - ptrsFifo[buffSlot] = ptr; - __sync_synchronize(); - resources->recvMem->tail = sub->base + sub->flushed; - } - if (p == NCCL_PROTO_LL) { // ll - // re-attach flag - char* localBuff = sub->connector->conn.buffs[p]; - uint32_t flag = NCCL_LL_FLAG(sub->base + sub->transmitted + 1); - union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize); - uint32_t* recvData = (uint32_t*)ptr; - int nFifoLines = DIVUP(args->sharedSize[sharedBuffSlot], 2*sizeof(uint32_t)); - for (int i=0; i<nFifoLines; i++) { - lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i]; - lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1]; - } - } + int offset; + NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset)); + volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo; + offsFifo[buffSlot] = offset; + __sync_synchronize(); + volatile uint64_t* recvTail = resources->gdcSync ? 
resources->gdcSync : &resources->recvMem->tail; + *recvTail = sub->base + sub->flushed; + if (resources->gdcSync) wc_store_fence(); // Flush out WC write sub->transmitted += args->sliceSteps; args->idle = 0; continue; @@ -551,7 +772,7 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { struct ncclTransport collNetTransport = { "COL", - collNetCanConnect, - { collNetSendSetup, collNetSendConnect, collNetSendFree, collNetSendProxy }, - { collNetRecvSetup, collNetRecvConnect, collNetRecvFree, collNetRecvProxy } + canConnect, + { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress }, + { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress } }; diff --git a/src/transport/net.cc b/src/transport/net.cc index 5abc32d..56f0315 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -7,51 +7,125 @@ #include "comm.h" #include "net.h" #include "graph.h" +#include "proxy.h" #include "collectives.h" #include "gdrwrap.h" +#include "shm.h" +#include "profiler.h" -struct netConnectInfo { - ncclNetHandle_t netHandle; +static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); + +#define NCCL_NET_MAP_HOSTMEM 0 +#define NCCL_NET_MAP_DEVMEM 1 +#define NCCL_NET_MAP_SHARED_HOSTMEM 2 +#define NCCL_NET_MAP_SHARED_DEVMEM 3 +#define NCCL_NET_MAP_GDCMEM 4 +#define NCCL_NET_MAP_MEMS 5 + +#define NCCL_NET_MAP_MASK_DEVMEM 0x40000000 +#define NCCL_NET_MAP_MASK_SHARED 0x80000000 +#define NCCL_NET_MAP_MASK_USED 0x20000000 +#define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff + +#define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \ + ((mapStruct)->offsets.offsetName >> 30) + +#define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \ + (((mapStruct)->offsets.offsetName >> 29) == 0) + +#define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \ + (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? 
NULL : \ + (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET)) + +#define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \ + (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0) + +#define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { \ + int bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; \ + if ((shared) == 0) { \ + if (dev) { \ + (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; \ + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += memSize; \ + } else { \ + (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; \ + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += memSize; \ + } \ + } else { \ + (mapStruct)->offsets.offsetName = bank; \ + } \ +} while (0); + +struct connectMapMem{ + char* gpuPtr; + char* cpuPtr; + int size; + union { + char shmPath[PATH_MAX]; + cudaIpcMemHandle_t ipc; + }; }; -#define LOC_HOSTMEM 0 -#define LOC_DEVMEM 1 -#define LOC_COUNT 2 +struct connectMap { + int sameProcess; + int shared; + int cudaDev; + // First 3 bits of offsets determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem. + struct connectMapMem mems[NCCL_NET_MAP_MEMS]; + // Offsets. 3 MSBs indicate mem bank, 111 indicates NULL. 
+ struct { + uint32_t sendMem; + uint32_t recvMem; + uint32_t buffs[NCCL_NUM_PROTOCOLS]; + } offsets; +}; -struct netSendResources { +struct sendResources { + struct connectMap map; void* netSendComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; + + int rank; + int localRank; + int remoteRank; int netDev; int useGdr; + int maxRecvs; + uint64_t* gdcSync; + void* gdrDesc; int shared; - char* buffers[LOC_COUNT]; - int buffSizes[LOC_COUNT]; - void* mhandles[LOC_COUNT]; - void** mhandlesProto[NCCL_NUM_PROTOCOLS]; + int channelId; + int connIndex; + char* buffers[NCCL_NUM_PROTOCOLS]; + int buffSizes[NCCL_NUM_PROTOCOLS]; + void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; uint64_t llLastCleaning; }; -struct netRecvResources { +struct recvResources { + struct connectMap map; void* netListenComm; void* netRecvComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; - // GDRCOPY support - void* gdrMemDesc; - struct ncclRecvMem* devRecvMem; - void* gdrFlushDesc; - int* devFlushMem; - + int rank; + int localRank; + int remoteRank; + int proxyRank; int netDev; int useGdr; + int maxRecvs; + uint64_t* gdcSync; + uint64_t* gdcFlush; + void* gdrDesc; int shared; - char* buffers[LOC_COUNT]; - int buffSizes[LOC_COUNT]; - void* mhandles[LOC_COUNT]; - void** mhandlesProto[NCCL_NUM_PROTOCOLS]; + int channelId; + int connIndex; + char* buffers[NCCL_NUM_PROTOCOLS]; + int buffSizes[NCCL_NUM_PROTOCOLS]; + void* mhandles[NCCL_NUM_PROTOCOLS]; uint64_t step; uint64_t llLastCleaning; }; @@ -59,7 +133,7 @@ struct netRecvResources { NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", -2); /* Determine if two peers can communicate with NET */ -ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { // Same host? 
if (info1->hostHash == info2->hostHash) { // User disabled NET for intra-node? @@ -73,274 +147,670 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop } NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2); +NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1); + +struct setupReq { + int rank; + int localRank; + int remoteRank; + int shared; + int netDev; + int useGdr; + int channelId; + int connIndex; +}; /* Determine if we will use this transport for this peer and return connect * information for this peer */ -ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { - struct netSendResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - send->transportResources = resources; - send->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1; - send->proxyAppendPtr = send->conn.shared ? comm->proxyState.sharedBuffs.proxyAppend+2*channelId+1 : &send->proxyAppend; +static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { + struct setupReq req; - // Send/Receive: Round-robin NICs based on the receiver's CUDA device - int nicRR = comm->peerInfo[peerInfo->rank].cudaDev; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr)); + send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? 
ncclParamNetSharedBuffers() : 1; + req.channelId = channelId; + req.connIndex = connIndex; - NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); - NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1)); + int proxyRank; + NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr)); + send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0; - send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; - send->conn.tail = &resources->recvMem->tail; - send->conn.sizesFifo = resources->recvMem->sizesFifo; - // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree - send->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL; - send->conn.head = &resources->sendMem->head; - resources->sendMem->head = resources->shared ? -NCCL_STEPS : 0; // Don't give any credit yet when sharing buffers - for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1; - - if (resources->shared == 0) { - int protoLoc[NCCL_NUM_PROTOCOLS]; - for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { - protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? 
LOC_DEVMEM : LOC_HOSTMEM; - } - int buffSizes[NCCL_NUM_PROTOCOLS]; - for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { - buffSizes[p] = send->comm->buffSizes[p]; - resources->buffSizes[protoLoc[p]] += buffSizes[p]; - } - - if (resources->buffSizes[LOC_DEVMEM]) { - NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM])); - } - if (resources->buffSizes[LOC_HOSTMEM]) { - NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM])); - } + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn)); + req.rank = myInfo->rank; + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank)); + req.remoteRank = peerInfo->rank; + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); - int offsets[LOC_COUNT]; - offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0; - for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { - resources->mhandlesProto[p] = resources->mhandles+protoLoc[p]; - send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]]; - offsets[protoLoc[p]] += buffSizes[p]; - } + if (proxyRank == myInfo->rank) { + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + } else { + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev, + proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } - - INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev, - resources->useGdr ? "/GDRDMA" : "", resources->shared ? 
"/Shared" : ""); + *((int*)connectInfo) = proxyRank; return ncclSuccess; } // GDRCOPY support: TAIL_ENABLE When enabled locates the RX proxy tail in CUDA memory -NCCL_PARAM(GdrCopyTailEnable, "GDRCOPY_TAIL_ENABLE", 1); +NCCL_PARAM(GdrCopySyncEnable, "GDRCOPY_SYNC_ENABLE", 1); // GDRCOPY support: FLUSH_ENABLE When enabled uses a PCI-E read to flush GDRDMA buffers NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0); -ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { - struct netRecvResources* resources; - NCCLCHECK(ncclCalloc(&resources, 1)); - recv->transportResources = resources; - recv->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1; - recv->proxyAppendPtr = recv->conn.shared ? comm->proxyState.sharedBuffs.proxyAppend+2*channelId : &recv->proxyAppend; - - // Send/Receive: Round-robin NICs based on the receiver's CUDA device - int nicRR = comm->cudaDev; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr)); - - NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); - NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1)); - - // GDRCOPY tail support - if (ncclGdrCopy != NULL && ncclParamGdrCopyTailEnable() == 1) { - struct ncclRecvMem* devCudaPtr; - NCCLCHECK(ncclGdrCudaCalloc(&resources->devRecvMem, &devCudaPtr, 1, &resources->gdrMemDesc)); - // The GDR mapped VA doesn't work on the SMs - recv->conn.tail = &((struct ncclRecvMem*)devCudaPtr)->tail; - } else { - recv->conn.tail = &resources->recvMem->tail; +/* Setup recv connector */ +static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* 
peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { + struct setupReq req; + + recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1; + req.channelId = channelId; + req.connIndex = connIndex; + + // Use myInfo->rank as the receiver uses its own NIC + int proxyRank; + NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr)); + + // We don't support PXN on receive yet + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn)); + + req.rank = myInfo->rank; + NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank)); + req.remoteRank = peerInfo->rank; + NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); + + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.shared ? 
"/Shared" : ""); + return ncclSuccess; +} + +static ncclResult_t netMapShm(struct connectMapMem* mem) { + NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, 0)); + NCCLCHECK(ncclShmUnlink(mem->shmPath)); + return ncclSuccess; +} +static ncclResult_t netCreateShm(struct connectMapMem* mem) { + mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file + NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1)); + return ncclSuccess; +} + +static ncclResult_t netDumpMap(struct connectMap* map) { + printf("Dump map same process %d shared %d\n", map->sameProcess, map->shared); + struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM; + printf("Mem 0: Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr); + mem = map->mems+NCCL_NET_MAP_DEVMEM; + printf("Mem 1: Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); + mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM; + printf("Mem 2: Shared Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr); + mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM; + printf("Mem 3: Shared Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr); + printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", + map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0, + NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET, + NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem)); + printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n", + map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 
1 : 0, + NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET, + NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem)); + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + printf("Proto %d -> Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p, + map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 1 : 0, + NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET, + NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p])); } + printf("End of dump\n"); + return ncclSuccess; +} - // GDRCOPY flush support -#if defined (__x86_64__) - if (ncclGdrCopy != NULL && ncclParamGdrCopyFlushEnable() == 1) { - int* cudaPtr; - NCCLCHECK(ncclGdrCudaCalloc(&resources->devFlushMem, &cudaPtr, 1, &resources->gdrFlushDesc)); +static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { + // Setup device pointers + struct connectMap* map; + NCCLCHECK(ncclCalloc(&map, 1)); + send->transportResources = map; + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), map, sizeof(struct connectMap))); + + if (map->sameProcess) { + if (map->cudaDev != comm->cudaDev) { + // Enable P2P access + cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0); + if (err == cudaErrorPeerAccessAlreadyEnabled) { + cudaGetLastError(); + } else if (err != cudaSuccess) { + WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err)); + return ncclInternalError; + } + } + } else { + NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM)); + if (map->mems[NCCL_NET_MAP_DEVMEM].size) { + CUDACHECK(cudaIpcOpenMemHandle((void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess)); + map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = NULL; + } + if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) { + 
void** sharedDevMemPtr = comm->proxyState.sharedDevMems+send->proxyConn.localRank; + if (*sharedDevMemPtr == NULL) { + CUDACHECK(cudaIpcOpenMemHandle(sharedDevMemPtr, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess)); + } + map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = (char*)(*sharedDevMemPtr); + map->mems[NCCL_NET_MAP_SHARED_DEVMEM].cpuPtr = NULL; + } } -#endif + //NCCLCHECK(netDumpMap(map)); - recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; + struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); + void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; + send->conn.head = gdcMem ? (uint64_t*)gdcMem : &sendMem->head; + + struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); + send->conn.tail = &recvMem->tail; + send->conn.sizesFifo = recvMem->sizesFifo; // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree - recv->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL; - recv->conn.head = &resources->sendMem->head; + send->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL; - if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree not for p2p - int protoLoc[NCCL_NUM_PROTOCOLS]; - for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { - protoLoc[p] = resources->useGdr ? 
LOC_DEVMEM : LOC_HOSTMEM; - } + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) + send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); + return ncclSuccess; +} - int buffSizes[NCCL_NUM_PROTOCOLS]; - for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { - buffSizes[p] = recv->comm->buffSizes[p]; - resources->buffSizes[protoLoc[p]] += buffSizes[p]; - } +/* Connect to this peer */ +static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { + struct connectMap* map; + NCCLCHECK(ncclCalloc(&map, 1)); + recv->transportResources = map; + NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), map, sizeof(struct connectMap))); + //NCCLCHECK(netDumpMap(map)); - if (resources->buffSizes[LOC_DEVMEM]) { - NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM])); - } - if (resources->buffSizes[LOC_HOSTMEM]) { - NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM])); + struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem); + recv->conn.head = &sendMem->head; + + struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem); + void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr; + recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail; + recv->conn.sizesFifo = recvMem->sizesFifo; + // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree + recv->conn.offsFifo = map->shared ? 
recvMem->offsFifo : NULL; + + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) + recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]); + return ncclSuccess; +} + +static ncclResult_t sendFree(struct ncclConnector* send) { + struct connectMap* map = (struct connectMap*)(send->transportResources); + if (map->sameProcess == 0) { + NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); + if (map->mems[NCCL_NET_MAP_DEVMEM].size) { + CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); } + } + return ncclSuccess; +} - int offsets[LOC_COUNT]; - offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0; - for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { - resources->mhandlesProto[p] = resources->mhandles+protoLoc[p]; - recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]]; - offsets[protoLoc[p]] += buffSizes[p]; +static ncclResult_t recvFree(struct ncclConnector* recv) { + return ncclSuccess; +} + +#define NCCL_SHARED_STEPS 16 +static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int localRank, int type, int sameProcess, + int nChannels, char** gpuPtr, char** cpuPtr, int* size, cudaIpcMemHandle_t* ipc) { + if (cuda == 0 && sameProcess == 0) { + WARN("PXN should not use host buffers for data"); + return ncclInternalError; + } + struct ncclProxyProgressState* progressState = &comm->proxyState.progressState; + if (progressState->localPeers == NULL) { + NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks)); + } + struct ncclProxyPeer** localPeers = progressState->localPeers; + if (localPeers[localRank] == NULL) { + NCCLCHECK(ncclCalloc(localPeers+localRank, 1)); + } + struct ncclProxyPeer* peer = localPeers[localRank]; + struct ncclProxySharedP2p* state = type == 0 ? 
&peer->send : &peer->recv; + state->refcount++; + if (state->size == 0) { + state->size = nChannels*(NCCL_SHARED_STEPS/NCCL_STEPS)*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR; + } + + if (size) *size = state->size; + + if (cuda && state->cudaBuff == NULL) { + NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size)); + if (sameProcess == 0) { + CUDACHECK(cudaIpcGetMemHandle(&state->ipc, state->cudaBuff)); } } + if (!cuda && state->hostBuff == NULL) { + NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size)); + } + if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff; + if (sameProcess) { + if (gpuPtr) *gpuPtr = *cpuPtr; + } else { + if (gpuPtr) *gpuPtr = NULL; + if (ipc) memcpy(ipc, &state->ipc, sizeof(cudaIpcMemHandle_t)); + } + return ncclSuccess; +} + +static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) { + // Use different pools for different channels and also separate send/recv. + int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR); + int globalSlot = (channel*NCCL_SHARED_STEPS)+slot; + *offset = slotSize * globalSlot; + return ncclSuccess; +} + +static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm, int localRank, int type) { + if (comm->proxyState.progressState.localPeers == NULL) NCCLCHECK(ncclInternalError); + struct ncclProxyPeer* peer = comm->proxyState.progressState.localPeers[localRank]; + if (peer == NULL) NCCLCHECK(ncclInternalError;) + struct ncclProxySharedP2p* state = type == 0 ? 
&peer->send : &peer->recv; + if (state->size == 0) NCCLCHECK(ncclInternalError); + state->refcount--; + if (state->refcount == 0) { + if (state->cudaBuff) CUDACHECK(cudaFree(state->cudaBuff)); + if (state->hostBuff) NCCLCHECK(ncclCudaHostFree(state->hostBuff)); + } + if (peer->send.refcount || peer->recv.refcount) return ncclSuccess; + free(peer); + comm->proxyState.progressState.localPeers[localRank] = NULL; + for (int r=0; r<comm->localRanks; r++) { + if (comm->proxyState.progressState.localPeers[r]) return ncclSuccess; + } + // All peers are freed, free array + free(comm->proxyState.progressState.localPeers); + comm->proxyState.progressState.localPeers = NULL; + return ncclSuccess; +} - INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev, - resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : ""); - struct netConnectInfo* info = (struct netConnectInfo*) connectInfo; - NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm)); +static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels) { + int rank = comm->localRankToRank[connection->localRank]; + int sameProcess = comm->peerInfo[rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 
1 : 0; + NCCLCHECK(sharedBuffersInit(comm, 1, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL)); + return ncclSuccess; +} +static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct setupReq* req = (struct setupReq*) reqBuff; + if (reqSize != sizeof(struct setupReq)) return ncclInternalError; + + struct sendResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + connection->transportResources = resources; + + resources->rank = req->rank; + resources->localRank = req->localRank; + resources->remoteRank = req->remoteRank; + resources->netDev = req->netDev; + resources->shared = connection->shared = req->shared; + resources->useGdr = req->useGdr; + resources->channelId = req->channelId; + resources->connIndex = req->connIndex; + ncclNetProperties_t props; + NCCLCHECK(ncclNetGetProperties(req->netDev, &props)); + resources->maxRecvs = props.maxRecvs; + + // We don't return any data + if (respSize != 0) return ncclInternalError; + *done = 1; return ncclSuccess; } -ncclResult_t netSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { - // Setup device pointers - struct netSendResources* resources = (struct netSendResources*)send->transportResources; - struct netConnectInfo* info = (struct netConnectInfo*)connectInfo; +static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct setupReq* req = (struct setupReq*) reqBuff; + if (reqSize != sizeof(struct setupReq)) return ncclInternalError; + + struct recvResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + connection->transportResources = resources; + + resources->rank = req->rank; + resources->localRank = req->localRank; + resources->remoteRank = req->remoteRank; + resources->netDev 
= req->netDev; + resources->shared = connection->shared = req->shared; + resources->useGdr = req->useGdr; + resources->channelId = req->channelId; + resources->connIndex = req->connIndex; + ncclNetProperties_t props; + NCCLCHECK(ncclNetGetProperties(req->netDev, &props)); + resources->maxRecvs = props.maxRecvs; - // Connect to remote peer - NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm)); + if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; + NCCLCHECK(ncclNetListen(req->netDev, respBuff, &resources->netListenComm)); + *done = 1; + return ncclSuccess; +} + +static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + struct sendResources* resources = (struct sendResources*)(connection->transportResources); + if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError; if (resources->shared) { + // Shared buffers + struct ncclProxyProgressState* progressState = &comm->proxyState.progressState; + if (progressState->localPeers == NULL) { + NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks)); + } + struct ncclProxyPeer** localPeers = progressState->localPeers; + if (localPeers[resources->localRank] == NULL) { + NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1)); + } + connection->proxyAppendPtr = localPeers[resources->localRank]->send.proxyAppend+resources->channelId; + + if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { + // Connect or reuse connection for a netdev/remote rank. 
+ if (progressState->netComms[resources->netDev] == NULL) { + NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); + } + struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank; + if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, comms->sendComm+resources->channelId)); + resources->netSendComm = comms->sendComm[resources->channelId]; + if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; + } else { + NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm)); + } + } else { + // Connect to remote peer + NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm)); + connection->proxyAppendPtr = &connection->proxyAppend; + } + if (resources->netSendComm == NULL) { + *done = 0; + return ncclSuccess; + } + *done = 1; + + // Create structures + struct connectMap* map = &resources->map; + map->sameProcess = + comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0; + map->shared = resources->shared; + CUDACHECK(cudaGetDevice(&map->cudaDev)); + + if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr, comm->buffSizes[p], buffs[p]); + resources->buffSizes[p] = comm->buffSizes[p]; + } + } else { // Get shared buffers - int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM; - NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc)); - resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc; + int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; + struct connectMapMem* mapMem = map->mems+bank; + NCCLCHECK(sharedBuffersInit( + comm, resources->useGdr, resources->localRank, 0, map->sameProcess, comm->p2pnChannels, + &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipc)); + resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } - if (resources->buffSizes[LOC_DEVMEM]) { - NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM])); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); + + if (map->mems[NCCL_NET_MAP_DEVMEM].size) { + if (resources->shared == 0) { + if (!map->sameProcess) { + ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN); + } + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size)); + map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr; + } + if (!map->sameProcess) { + CUDACHECK(cudaIpcGetMemHandle(&map->mems[NCCL_NET_MAP_DEVMEM].ipc, map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr)); + } + } + if (map->sameProcess) { + NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); + map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; + } else { + NCCLCHECK(netCreateShm(map->mems+NCCL_NET_MAP_HOSTMEM)); } - if (resources->buffSizes[LOC_HOSTMEM]) { - NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM])); + if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) { + uint64_t *cpuPtr, *gpuPtr; + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, 
&resources->gdrDesc)); + + resources->gdcSync = cpuPtr; + struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; + gdcMem->cpuPtr = (char*)cpuPtr; + gdcMem->gpuPtr = (char*)gpuPtr; + gdcMem->size = sizeof(uint64_t); // sendMem->head + } + + resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); + resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); + + // Don't give credits yet in shared mode. + resources->sendMem->head = map->shared ? -NCCL_STEPS : 0; + for (int i=0; i<NCCL_STEPS; i++) resources->recvMem->sizesFifo[i] = -1; + + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); + if (resources->buffers[p]) { + NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); + } } + + //NCCLCHECK(netDumpMap(map)); + if (respSize != sizeof(struct connectMap)) return ncclInternalError; + memcpy(respBuff, map, sizeof(struct connectMap)); return ncclSuccess; } -/* Connect to this peer */ -ncclResult_t netRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { - // Setup device pointers - struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources; +static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + if (reqSize != sizeof(int)) return ncclInternalError; + struct recvResources* resources = (struct recvResources*)(connection->transportResources); + resources->proxyRank = *(int*)reqBuff; // Finish connection establishment from remote peer - NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm)); + if (resources->shared) { + // Shared buffers + struct ncclProxyProgressState* 
progressState = &comm->proxyState.progressState; + if (progressState->localPeers == NULL) { + NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks)); + } + struct ncclProxyPeer** localPeers = progressState->localPeers; + if (localPeers[resources->localRank] == NULL) { + NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1)); + } + connection->proxyAppendPtr = localPeers[resources->localRank]->recv.proxyAppend+resources->channelId; + + if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { + // Connect or reuse connection for a netdev/remote rank. + if (progressState->netComms[resources->netDev] == NULL) { + NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks)); + } + struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank; + if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(resources->netListenComm, comms->recvComm+resources->channelId)); + resources->netRecvComm = comms->recvComm[resources->channelId]; + if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; + } else { + NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm)); + } + } else { + // Connect to remote peer + NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm)); + connection->proxyAppendPtr = &connection->proxyAppend; + } + if (resources->netRecvComm == NULL) { + *done = 0; + return ncclSuccess; + } + *done = 1; NCCLCHECK(ncclNetCloseListen(resources->netListenComm)); - if (resources->shared) { + // Create structures + struct connectMap* map = &resources->map; + map->sameProcess = + comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 
1 : 0; + if (map->sameProcess == 0) return ncclInternalError; // We don't support remote proxy for recv + map->shared = resources->shared; + + if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, comm->buffSizes[p], buffs[p]); + resources->buffSizes[p] = comm->buffSizes[p]; + } + } else { // Get shared buffers - int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM; - NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc)); - resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc; + int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; + struct connectMapMem* mapMem = map->mems+bank; + NCCLCHECK(sharedBuffersInit( + comm, resources->useGdr, resources->localRank, 1, 1, comm->p2pnChannels, + &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL)); + resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size; + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); } - if (resources->buffSizes[LOC_DEVMEM]) { - NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM])); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem); + NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem); + + if (map->mems[NCCL_NET_MAP_DEVMEM].size) { + if (resources->shared == 0) { + NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size)); + map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr; + } + } + NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size)); + map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr; + if 
(ncclGdrCopy && map->sameProcess) { + uint64_t *cpuPtr, *gpuPtr; + NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc)); + + if (ncclParamGdrCopySyncEnable()) { + resources->gdcSync = cpuPtr; + struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM; + gdcMem->cpuPtr = (char*)cpuPtr; + gdcMem->gpuPtr = (char*)gpuPtr; + gdcMem->size = sizeof(uint64_t); + } + if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1; } - if (resources->buffSizes[LOC_HOSTMEM]) { - NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM])); + + resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); + resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); + if (resources->buffers[p]) { + NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p])); + } } + + //NCCLCHECK(netDumpMap(map)); + if (respSize != sizeof(struct connectMap)) return ncclInternalError; + memcpy(respBuff, map, sizeof(struct connectMap)); return ncclSuccess; } -ncclResult_t netSendFree(void* transportResources) { - struct netSendResources* resources = (struct netSendResources*)transportResources; - NCCLCHECK(ncclCudaHostFree(resources->sendMem)); - NCCLCHECK(ncclCudaHostFree(resources->recvMem)); - for (int l=0; l<LOC_COUNT; l++) { - if (resources->buffers[l]) - NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l])); +static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + struct sendResources* resources = (struct sendResources*)(connection->transportResources); + if (resources == NULL) { // NVB Preconnect + NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 0)); + return ncclSuccess; + } + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + if (resources->buffers[p]) { + NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[p])); + } } - if (resources->shared == 0) { - NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM])); - CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM])); + struct connectMapMem* mems = resources->map.mems; + if (resources->map.sameProcess) { + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); + } else { + NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, NULL, mems[NCCL_NET_MAP_HOSTMEM].size)); + } + CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); + if (resources->shared) { + NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 0)); + if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) { + struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank; + 
comms->sendRefCount[resources->channelId]--; + if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comms->sendComm[resources->channelId])); + } else { + NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); + } + } else { + NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); } - NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); free(resources); return ncclSuccess; } -ncclResult_t netRecvFree(void* transportResources) { - struct netRecvResources* resources = (struct netRecvResources*)transportResources; - // GDRCOPY support - if (resources->gdrFlushDesc) { - NCCLCHECK(ncclGdrCudaFree(resources->gdrFlushDesc)); - } - // GDRCOPY support - if (resources->gdrMemDesc) { - NCCLCHECK(ncclGdrCudaFree(resources->gdrMemDesc)); +static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + struct recvResources* resources = (struct recvResources*)(connection->transportResources); + if (resources == NULL) { // NVB Preconnect + NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 1)); + return ncclSuccess; } - NCCLCHECK(ncclCudaHostFree(resources->sendMem)); - NCCLCHECK(ncclCudaHostFree(resources->recvMem)); - for (int l=0; l<LOC_COUNT; l++) { - if (resources->buffers[l]) - NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[l])); + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { + if (resources->buffers[p]) { + NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[p])); + } } - if (resources->shared == 0) { - NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM])); - CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM])); + struct connectMapMem* mems = resources->map.mems; + NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr)); + CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr)); + if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc)); + if (resources->shared) { + NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 1)); + if 
(resources->maxRecvs > 1 && ncclParamNetSharedComms()) { + struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank; + comms->recvRefCount[resources->channelId]--; + if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comms->recvComm[resources->channelId])); + } else { + NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); + } + } else { + NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); } - NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); free(resources); return ncclSuccess; } static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps"); -ncclResult_t netSendProxy(struct ncclProxyArgs* args) { +static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; s<args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources); + struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->transmitted = sub->done = 0; + for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin); } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; + int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs); for (int s=0; s<args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; if (sub->done == sub->nsteps) continue; - struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources); - void* mhandle = *(resources->mhandlesProto[p]); - int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS; - char* localBuff = sub->connector->conn.buffs[p]; + struct 
sendResources* resources = (struct sendResources*) (sub->connection->transportResources); + void* mhandle = resources->mhandles[p]; + int stepSize = resources->buffSizes[p] / NCCL_STEPS; + char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSize = stepSize*args->sliceSteps; - if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR; - if (sub->sendbytes < buffSize) buffSize = sub->sendbytes; + if (sub->nbytes < buffSize) buffSize = sub->nbytes; // Post buffers to the GPU - if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) { + if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) { int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; if (resources->shared) { - char* ptr; - int sharedBuffSlot = sub->posted%NCCL_STEPS; - NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 0, sub->channel->id, sharedBuffSlot, s, &ptr)); - resources->recvMem->ptrsFifo[buffSlot] = ptr; + int sharedBuffSlot = sub->posted%maxDepth; + int offset; + NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset)); + resources->recvMem->offsFifo[buffSlot] = offset; __sync_synchronize(); - volatile uint64_t* sendHead = &resources->sendMem->head; + volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; sub->posted += args->sliceSteps; *sendHead = sub->base + sub->posted - NCCL_STEPS; + if (resources->gdcSync) wc_store_fence(); // Flush out WC write } else sub->posted += args->sliceSteps; + for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) { + ncclProfilingRecord(args, s, step, ncclProxyProfileSendGPUWait); + } args->idle = 0; continue; } @@ -352,7 +822,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) { // We have something to receive, let's check if it's completely ready. 
int size = sizesFifo[buffSlot]; - char* buff = resources->shared ? (char*)resources->recvMem->ptrsFifo[buffSlot] : localBuff+buffSlot*stepSize; + char* buff = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize; int ready = 1; if (p == NCCL_PROTO_LL128) { ready = resources->useGdr; @@ -379,13 +849,14 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { } if (ready) { // Data is ready, try to send. - NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, sub->requests+buffSlot)); + NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { - TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]); + TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]); sizesFifo[buffSlot] = -1; // Make sure size is reset to zero before we update the head. __sync_synchronize(); sub->transmitted += args->sliceSteps; + for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileSendWait); args->idle = 0; continue; } @@ -400,9 +871,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { if (done) { TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]); sub->done += args->sliceSteps; + for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd); if (resources->shared == 0) { - resources->sendMem->head = sub->base + sub->done; + volatile uint64_t* sendHead = resources->gdcSync ? 
resources->gdcSync : &resources->sendMem->head; + *sendHead = sub->base + sub->done; + if (resources->gdcSync) wc_store_fence(); // Flush out WC write } args->idle = 0; if (sub->done == sub->nsteps) { @@ -419,111 +893,203 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { return ncclSuccess; } -ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { +static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { + // Initialize subs and group them by same recvComm. + void* recvComm; + int groupSize = 0; + int maxRecvs = 1; for (int s=0; s<args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; - struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources); + if (groupSize == maxRecvs) { + groupSize = 0; + } else if (s>0) { // Find next sub with the same recvComm + int next; + for (next=s; next<args->nsubs; next++) { + struct recvResources* nextRes = (struct recvResources*) (args->subs[next].connection->transportResources); + if (nextRes->netRecvComm == recvComm) break; + } + if (next == args->nsubs) { // Not found + groupSize = 0; + } else if (s != next) { // We found a sub later with the same recvComm ; swap subs + struct ncclProxySubArgs temp; + memcpy(&temp, sub, sizeof(struct ncclProxySubArgs)); + memcpy(sub, args->subs+next, sizeof(struct ncclProxySubArgs)); + memcpy(args->subs+next, &temp, sizeof(struct ncclProxySubArgs)); + } + } + groupSize++; + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + maxRecvs = resources->maxRecvs; + recvComm = resources->netRecvComm; // Round to next multiple of sliceSteps sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->transmitted = sub->done = 0; + for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize; + for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, 
ncclProxyProfileBegin); } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; - for (int s=0; s<args->nsubs; s++) { - struct ncclProxySubArgs* sub = args->subs+s; - if (sub->done == sub->nsteps) continue; - struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources); - void* mhandle = *(resources->mhandlesProto[p]); - int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS; - char* localBuff = sub->connector->conn.buffs[p]; - int buffSize = stepSize*args->sliceSteps; - if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR; - if (sub->recvbytes < buffSize) buffSize = sub->recvbytes; + int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs); + for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) { + struct ncclProxySubArgs* subGroup = args->subs+s; + int subCount = 0; + void* ptrs[NCCL_PROXY_MAX_SUBS]; + int sizes[NCCL_PROXY_MAX_SUBS]; + int tags[NCCL_PROXY_MAX_SUBS]; + void* mhandles[NCCL_PROXY_MAX_SUBS]; - if ((sub->posted < sub->done + NCCL_STEPS) && (sub->posted < sub->nsteps)) { - int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; - char* ptr; - if (resources->shared) { - int sharedBuffSlot = sub->posted%NCCL_STEPS; - NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 1, sub->channel->id, sharedBuffSlot, s, &ptr)); - volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo; - ptrsFifo[buffSlot] = ptr; - } else { - ptr = localBuff+buffSlot*stepSize; + for (int i=0; i<subGroup->groupSize; i++) { + struct ncclProxySubArgs* sub = subGroup + i; + if (sub->posted < sub->nsteps) { + if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; } + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + int stepSize = resources->buffSizes[p] / NCCL_STEPS; + char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); + int buffSlot = 
(sub->base+sub->posted)%NCCL_STEPS; + if (resources->shared) { + int sharedBuffSlot = sub->posted%maxDepth; + int offset; + NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset)); + volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo; + offsFifo[buffSlot] = offset; + ptrs[subCount] = localBuff+offset; + } else { + ptrs[subCount] = localBuff+buffSlot*stepSize; + } + sizes[subCount] = stepSize*args->sliceSteps; + if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; + tags[subCount] = resources->remoteRank; + mhandles[subCount] = resources->mhandles[p]; + subCount++; } - NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, sub->requests+buffSlot)); - if (sub->requests[buffSlot] != NULL) { - TRACE(NCCL_NET, "recvProxy [%ld/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]); - sub->posted += args->sliceSteps; + } + if (subCount) { + uint64_t step = subGroup->posted; + struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); + void** requestPtr = subGroup->requests+(step%NCCL_STEPS); + NCCLCHECK(ncclNetIrecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); + if (*requestPtr) { + for (int i=0; i<subGroup->groupSize; i++) { + struct ncclProxySubArgs* sub = subGroup+i; + sub->posted += args->sliceSteps; + for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait); + } args->idle = 0; - continue; } } - if (sub->posted > sub->received) { - int buffSlot = (sub->base+sub->received)%NCCL_STEPS; - int done, size; - NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, &size)); - if (done) { - sub->received += args->sliceSteps; - if (size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) { - // Don't pass data to the GPU yet, flush first. 
+ } + if (args->idle == 0) return ncclSuccess; + for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) { + struct ncclProxySubArgs* subGroup = args->subs+s; + if (subGroup->posted > subGroup->received) { + uint64_t step = subGroup->received; + int done; + void* ptrs[NCCL_PROXY_MAX_SUBS]; + int sizes[NCCL_PROXY_MAX_SUBS]; + void* mhandles[NCCL_PROXY_MAX_SUBS]; + for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) sizes[i] = 0; + NCCLCHECK(ncclNetTest(subGroup->requests[step%NCCL_STEPS], &done, sizes)); + if (done) { + int useGdr = 0; + int totalSize = 0; + for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i]; + for (int i=0; i<subGroup->groupSize; i++) { + struct ncclProxySubArgs* sub = subGroup + i; + sub->received += args->sliceSteps; + for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait); + if (step < sub->nsteps) { + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + if (resources->useGdr) useGdr = 1; + } + } + subGroup->requests[step%NCCL_STEPS] = NULL; + if (totalSize > 0 && p == NCCL_PROTO_SIMPLE && useGdr) { // GDRCOPY support - if (resources->devFlushMem) { + struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); + if (resources->gdcFlush) { #if defined (__x86_64__) // Force a PCI-E read from GPU memory - asm volatile ("mov (%0), %%eax" :: "l"(resources->devFlushMem) : "%eax"); + asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax"); #else WARN("NET: GDR Flush only supported on x86_64"); return ncclInternalError; #endif - sub->requests[buffSlot] = NULL; } else { - volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo; - char* ptr = resources->shared ? 
(char*)(ptrsFifo[buffSlot]) : localBuff+buffSlot*stepSize; - NCCLCHECK(ncclNetIflush(resources->netRecvComm, ptr, size, mhandle, sub->requests+buffSlot)); + int subCount = 0; + for (int i=0; i<subGroup->groupSize; i++) { + struct ncclProxySubArgs* sub = subGroup + i; + if (step < sub->nsteps) { + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + int stepSize = resources->buffSizes[p] / NCCL_STEPS; + char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); + int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; + ptrs[subCount] = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize; + mhandles[subCount] = resources->mhandles[p]; + subCount++; + } + } + struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources); + NCCLCHECK(ncclNetIflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS))); } - } else { - sub->requests[buffSlot] = NULL; } args->idle = 0; - continue; } } - if (sub->received > sub->transmitted) { - // Progress flush operations - int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; + } + if (args->idle == 0) return ncclSuccess; + + for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) { + struct ncclProxySubArgs* subGroup = args->subs+s; + if (subGroup->received > subGroup->transmitted) { + uint64_t step = subGroup->transmitted; int done = 1; - if (sub->requests[buffSlot]) NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL)); + void* request = subGroup->requests[step%NCCL_STEPS]; + if (request) NCCLCHECK(ncclNetTest(request, &done, NULL)); if (done) { - sub->transmitted += args->sliceSteps; - __sync_synchronize(); - if (resources->devRecvMem) { - // GDRCOPY support: Write updated tail directly to the device memory - resources->devRecvMem->tail = sub->base + sub->transmitted; - wc_store_fence(); // Flush out WC write - } else { - 
resources->recvMem->tail = sub->base + sub->transmitted; + for (int i=0; i<subGroup->groupSize; i++) { + struct ncclProxySubArgs* sub = subGroup + i; + sub->transmitted += args->sliceSteps; + for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait); + if (step < sub->nsteps) { + __sync_synchronize(); + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail; + *recvTail = sub->base + sub->transmitted; + if (resources->gdcSync) wc_store_fence(); // Flush out WC write + } } args->idle = 0; - continue; } } - if (sub->transmitted > sub->done) { - volatile uint64_t* sendHead = &resources->sendMem->head; - uint64_t done = *sendHead; - while (done > sub->base + sub->done && - // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted. - sub->transmitted > sub->done) { - sub->done += args->sliceSteps; - args->idle = 0; - if (sub->done == sub->nsteps) { - resources->step = sub->base + sub->nsteps; - args->done++; + } + if (args->idle == 0) return ncclSuccess; + + for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) { + struct ncclProxySubArgs* subGroup = args->subs+s; + for (int i=0; i<subGroup->groupSize; i++) { + struct ncclProxySubArgs* sub = subGroup + i; + if (sub->done == sub->nsteps) continue; + if (sub->transmitted > sub->done) { + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + volatile uint64_t* sendHead = &resources->sendMem->head; + uint64_t done = *sendHead; + while (done > sub->base + sub->done && + // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted. 
+ sub->transmitted > sub->done) { + sub->done += args->sliceSteps; + for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd); + args->idle = 0; + if (sub->done == sub->nsteps) { + struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); + resources->step = sub->base + sub->nsteps; + args->done++; + break; + } } } } @@ -537,7 +1103,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { struct ncclTransport netTransport = { "NET", - netCanConnect, - { netSendSetup, netSendConnect, netSendFree, netSendProxy }, - { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy } + canConnect, + { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress }, + { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress } }; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index db27eae..4edff0f 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -20,26 +20,44 @@ #include <poll.h> #include <sys/types.h> #include <unistd.h> +#define ENABLE_TIMER 0 +#include "timer.h" #include "ibvwrap.h" #define USE_RDMA_WRITE 1 #define MAXNAMESIZE 64 static char ncclIbIfName[MAX_IF_NAME_SIZE+1]; -static union socketAddress ncclIbIfAddr; +static union ncclSocketAddress ncclIbIfAddr; + +struct ncclIbMr { + uintptr_t addr; + int pages; + int refs; + ibv_mr *mr; +}; + +struct ncclIbMrCache { + struct ncclIbMr *slots; + int capacity, population; +}; static int ncclNIbDevs = -1; struct ncclIbDev { + pthread_mutex_t lock; int device; uint64_t guid; uint8_t port; uint8_t link; int speed; ibv_context* context; + int pdRefs; + ibv_pd* pd; char devName[MAXNAMESIZE]; char* pciPath; int realPort; int maxQp; + struct ncclIbMrCache mrCache; }; #define MAX_IB_PORT 15 @@ -52,6 +70,7 @@ struct userIbDev { struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; struct userIbDev userIbDevs[MAX_IB_DEVS]; pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; +static int ncclIbRelaxedOrderingEnabled = 0; NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0); NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14); @@ -61,6 +80,7 @@ NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0); NCCL_PARAM(IbSl, "IB_SL", 0); NCCL_PARAM(IbTc, "IB_TC", 0); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); +NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); pthread_t ncclIbAsyncThread; static void* ncclIbAsyncThreadMain(void* args) { @@ -114,17 +134,28 @@ static int ncclIbSpeed(int speed) { return ibvSpeeds[firstBitSet(speed, sizeof(ibvSpeeds)/sizeof(int)-1)]; } +// Determine whether RELAXED_ORDERING is enabled and possible +static int ncclIbRelaxedOrderingCapable(void) { + int roMode = ncclParamIbPciRelaxedOrdering(); + ncclResult_t r = ncclInternalError; + if (roMode == 1 || roMode == 2) { + // Query IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support + r = 
wrap_ibv_reg_mr_iova2(NULL, NULL, NULL, 0, 0, 0); + } + return r == ncclInternalError ? 0 : 1; +} + ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { + if (ncclParamIbDisable()) return ncclInternalError; static int shownIbHcaEnv = 0; if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; } - if (ncclParamIbDisable()) return ncclInternalError; if (ncclNIbDevs == -1) { pthread_mutex_lock(&ncclIbLock); wrap_ibv_fork_init(); if (ncclNIbDevs == -1) { ncclNIbDevs = 0; - if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) { + if (ncclFindInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) { WARN("NET/IB : No IP interface found."); return ncclInternalError; } @@ -175,18 +206,26 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { } TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port, portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); + pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL); ncclIbDevs[ncclNIbDevs].device = d; ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; ncclIbDevs[ncclNIbDevs].port = port; ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); ncclIbDevs[ncclNIbDevs].context = context; + ncclIbDevs[ncclNIbDevs].pdRefs = 0; + ncclIbDevs[ncclNIbDevs].pd = NULL; strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort)); ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp; + ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0; + ncclIbDevs[ncclNIbDevs].mrCache.population = 0; + ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL; + + pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); + ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); ncclNIbDevs++; nPorts++; - 
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); } if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } } @@ -197,13 +236,16 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { } else { char line[1024]; line[0] = '\0'; + // Determine whether RELAXED_ORDERING is enabled and possible + ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable(); for (int d=0; d<ncclNIbDevs; d++) { snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName, ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); } line[1023] = '\0'; char addrline[SOCKET_NAME_MAXLEN+1]; - INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s ; OOB %s:%s", line, ncclIbIfName, socketToString(&ncclIbIfAddr, addrline)); + INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "", + ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline)); } pthread_mutex_unlock(&ncclIbLock); } @@ -231,11 +273,13 @@ ncclResult_t ncclIbGdrSupport(int ibDev) { return ncclSuccess; } -static ncclResult_t GetSocketAddr(union socketAddress* addr) { +static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) { memcpy(addr, &ncclIbIfAddr, sizeof(*addr)); return ncclSuccess; } +#define NCCL_NET_IB_MAX_RECVS 8 + ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { props->name = ncclIbDevs[dev].devName; props->pciPath = ncclIbDevs[dev].pciPath; @@ -247,18 +291,23 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { props->ptrSupport |= NCCL_PTR_CUDA; } props->speed = ncclIbDevs[dev].speed; + props->latency = 0; // Not set props->port = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort; props->maxComms = ncclIbDevs[dev].maxQp; + props->maxRecvs = NCCL_NET_IB_MAX_RECVS; return ncclSuccess; } -#define MAX_REQUESTS NCCL_NET_MAX_REQUESTS +// We need to support NCCL_NET_MAX_REQUESTS for each concurrent receive +#define 
MAX_REQUESTS (NCCL_NET_MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS) +static_assert(MAX_REQUESTS <= 256, "request id are encoded in wr_id and we need up to 8 requests ids per completion"); #define NCCL_IB_MAX_QPS 128 struct ncclIbQpInfo { uint32_t lid; uint8_t ib_port; + uint8_t link_layer; uint32_t qpn[NCCL_IB_MAX_QPS]; // For RoCE @@ -271,46 +320,83 @@ struct ncclIbQpInfo { uint64_t fifoAddr; }; +enum ncclIbCommState { + ncclIbCommStateStart = 0, + ncclIbCommStateConnect = 1, + ncclIbCommStateAccept = 3, + ncclIbCommStateSend = 4, + ncclIbCommStateRecv = 5, + ncclIbCommStateConnected = 6, +}; + +struct ncclIbCommStage { + enum ncclIbCommState state; + int offset; + void* buffer; + void* comm; +}; + struct ncclIbHandle { - union socketAddress connectAddr; + union ncclSocketAddress connectAddr; // Filled by the target + struct ncclIbCommStage stage; // Used by the other side when connecting }; +#define NCCL_NET_IB_REQ_UNUSED 0 +#define NCCL_NET_IB_REQ_SEND 1 +#define NCCL_NET_IB_REQ_RECV 2 +#define NCCL_NET_IB_REQ_FLUSH 3 + struct ncclIbRequest { - int used; - int type; struct ncclIbVerbs* verbs; + int type; int events; - int size; - union socketAddress *addr; + union ncclSocketAddress *addr; + int nreqs; + union { + struct { + int size; + void* data; + uint32_t lkey; + int offset; + } send; + struct { + int sizes[NCCL_NET_IB_MAX_RECVS]; + } recv; + }; }; struct ncclIbVerbs { - struct ibv_pd* pd; + int dev; + struct ibv_pd* pd; // duplicate of ncclIbDevs[dev].pd struct ibv_cq* cq; - uint64_t pad[2]; + uint64_t pad[1]; struct ncclIbRequest reqs[MAX_REQUESTS]; }; struct ncclIbListenComm { int dev; - int fd; + struct ncclSocket sock; + struct ncclIbCommStage stage; }; struct ncclIbSendFifo { uint64_t addr; int size; - uint32_t seq; uint32_t rkey; - uint32_t ready; - uint64_t pad[1]; // Pad FIFO element size to be 32-bytes + uint32_t nreqs; + uint32_t tag; + uint64_t idx; }; struct ncclIbSendComm { struct ncclIbVerbs verbs; - struct ncclIbSendFifo fifo[MAX_REQUESTS]; - uint32_t 
fifoHead; - int fd; - union socketAddress addr; + struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; + uint64_t fifoHead; + struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; + struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1]; + struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS]; + struct ncclSocket sock; + int ready; struct ibv_qp* qps[NCCL_IB_MAX_QPS]; int nqps; @@ -331,10 +417,10 @@ struct ncclIbGpuFlush { }; struct ncclIbRemFifo { - struct ncclIbSendFifo elems[MAX_REQUESTS]; + struct ncclIbSendFifo elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; + uint64_t fifoTail; uint64_t addr; uint32_t rkey; - uint32_t tail; uint32_t flags; struct ibv_mr* mr; struct ibv_sge sge; @@ -343,8 +429,7 @@ struct ncclIbRemFifo { struct ncclIbRecvComm { struct ncclIbVerbs verbs; struct ncclIbRemFifo remFifo; - int fd; - union socketAddress addr; + struct ncclSocket sock; int ready; struct ibv_qp* qps[NCCL_IB_MAX_QPS]; int nqps; @@ -354,17 +439,39 @@ static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbSendC NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1); -ncclResult_t ncclIbInitVerbs(ibv_context* ctx, struct ncclIbVerbs* verbs) { - NCCLCHECK(wrap_ibv_alloc_pd(&verbs->pd, ctx)); +ncclResult_t ncclIbInitVerbs(int dev, struct ibv_context* ctx, struct ncclIbVerbs* verbs) { + verbs->dev = dev; + + pthread_mutex_lock(&ncclIbDevs[dev].lock); + if (0 == ncclIbDevs[dev].pdRefs++) { + ncclResult_t res; + NCCLCHECKGOTO(wrap_ibv_alloc_pd(&ncclIbDevs[dev].pd, ctx), res, failure); + if (0) { + failure: + pthread_mutex_unlock(&ncclIbDevs[dev].lock); + return res; + } + } + verbs->pd = ncclIbDevs[dev].pd; + pthread_mutex_unlock(&ncclIbDevs[dev].lock); + // Recv requests can generate 2 completions (one for the post FIFO, one for the Recv). 
NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0)); return ncclSuccess; } ncclResult_t ncclIbDestroyVerbs(struct ncclIbVerbs* verbs) { + ncclResult_t res; NCCLCHECK(wrap_ibv_destroy_cq(verbs->cq)); - NCCLCHECK(wrap_ibv_dealloc_pd(verbs->pd)); - return ncclSuccess; + + pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); + if (0 == --ncclIbDevs[verbs->dev].pdRefs) { + NCCLCHECKGOTO(wrap_ibv_dealloc_pd(ncclIbDevs[verbs->dev].pd), res, returning); + } + res = ncclSuccess; +returning: + pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock); + return res; } ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int access_flags, struct ibv_qp** qp) { @@ -390,7 +497,7 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce return ncclSuccess; } -ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) { +ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTR; @@ -399,7 +506,7 @@ ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) { qpAttr.rq_psn = 0; qpAttr.max_dest_rd_atomic = 1; qpAttr.min_rnr_timer = 12; - if (info->lid == 0) { + if (info->link_layer == IBV_LINK_LAYER_ETHERNET) { qpAttr.ah_attr.is_global = 1; qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->spn; qpAttr.ah_attr.grh.dgid.global.interface_id = info->iid; @@ -418,7 +525,7 @@ ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) { return ncclSuccess; } -ncclResult_t ncclIbRtsQp(ibv_qp* qp) { +ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTS; @@ -431,33 +538,56 @@ ncclResult_t ncclIbRtsQp(ibv_qp* qp) { return ncclSuccess; } - ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { struct 
ncclIbListenComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large"); + memset(handle, 0, sizeof(struct ncclIbHandle)); comm->dev = dev; - NCCLCHECK(GetSocketAddr(&(handle->connectAddr))); - NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr)); + NCCLCHECK(GetSocketAddr(&comm->sock.addr)); + NCCLCHECK(ncclSocketListen(&comm->sock)); + memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress)); *listenComm = comm; return ncclSuccess; } ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { - struct ncclIbSendComm* comm; - NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm))); - struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; - NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr)); - *sendComm = comm; + enum ncclSocketState conState; + struct ncclIbCommStage* stage = &handle->stage; + struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm; + *sendComm = NULL; + + if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; + if (stage->state == ncclIbCommStateSend) goto ib_send; + if (stage->state != ncclIbCommStateStart) { + WARN("Error: trying to connect already connected sendComm"); + return ncclInternalError; + } - comm->addr = handle->connectAddr; + NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm))); + NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, NULL, 1)); + stage->comm = comm; + stage->state = ncclIbCommStateConnect; + NCCLCHECK(ncclSocketConnect(&comm->sock)); + +ib_connect_check: + /* since ncclSocketConnect is async, we must check if connection is complete */ + NCCLCHECK(ncclGetSocketState(&comm->sock, &conState)); + if (conState == ncclSocketConnecting) { + /* expect user to call again */ + return ncclSuccess; + } else if (conState == ncclSocketError) { + return 
ncclSystemError; + } // IB Setup - ibv_context* ctx = ncclIbDevs[dev].context; - NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs)); - uint8_t ib_port = ncclIbDevs[dev].port; + struct ibv_context* ctx; + ctx = ncclIbDevs[dev].context; + NCCLCHECK(ncclIbInitVerbs(dev, ctx, &comm->verbs)); + uint8_t ib_port; + ib_port = ncclIbDevs[dev].port; comm->nqps = ncclParamIbQpsPerConn(); for (int q=0; q<comm->nqps; q++) { NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, comm->qps+q)); @@ -472,13 +602,14 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { qpInfo.mtu = portAttr.active_mtu; // Prepare my fifo - NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); + NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); qpInfo.fifoRkey = comm->fifoMr->rkey; qpInfo.fifoAddr = (uint64_t)comm->fifo; // RoCE support qpInfo.lid = portAttr.lid; - if (qpInfo.lid) { // IB + qpInfo.link_layer = portAttr.link_layer; + if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB for (int q=0; q<comm->nqps; q++) INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid); } else { // RoCE @@ -490,7 +621,19 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid); } - NCCLCHECK(socketSend(comm->fd, &comm->addr, &qpInfo, sizeof(qpInfo))); + stage->state = ncclIbCommStateSend; + stage->offset = 0; + NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(qpInfo))); + memcpy(stage->buffer, &qpInfo, sizeof(qpInfo)); + +ib_send: + 
NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset)); + if (stage->offset != sizeof(qpInfo)) + return ncclSuccess; + + free(stage->buffer); + stage->state = ncclIbCommStateConnected; + *sendComm = comm; return ncclSuccess; } @@ -498,24 +641,53 @@ NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0); ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm; - struct ncclIbRecvComm* rComm; + struct ncclIbCommStage* stage = &lComm->stage; + struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm; + *recvComm = NULL; + + if (stage->state == ncclIbCommStateAccept) goto ib_accept; + if (stage->state == ncclIbCommStateRecv) goto ib_recv; + if (stage->state == ncclIbCommStateSend) goto ib_send; + if (stage->state != ncclIbCommStateStart) { + WARN("Listencomm in unknown state %d\n", stage->state); + return ncclInternalError; + } + NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm))); + stage->comm = rComm; + stage->state = ncclIbCommStateAccept; + lComm->sock.asyncFlag = 1; + rComm->sock.asyncFlag = 1; + +ib_accept: + NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock)); + if (rComm->sock.fd == -1) + return ncclSuccess; - socklen_t socklen = sizeof(union socketAddress); - SYSCHECKVAL(accept(lComm->fd, &rComm->addr.sa, &socklen), "accept", rComm->fd); struct ncclIbQpInfo remQpInfo; - NCCLCHECK(socketRecv(rComm->fd, &rComm->addr, &remQpInfo, sizeof(remQpInfo))); + stage->state = ncclIbCommStateRecv; + stage->offset = 0; + NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo))); +ib_recv: + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset)); + if (stage->offset != sizeof(remQpInfo)) + return ncclSuccess; + + /* copy back the received info */ + memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo)); // IB setup - ibv_context* ctx = 
ncclIbDevs[lComm->dev].context; - uint8_t ib_port = ncclIbDevs[lComm->dev].port; + struct ibv_context* ctx; + uint8_t ib_port; + ctx = ncclIbDevs[lComm->dev].context; + ib_port = ncclIbDevs[lComm->dev].port; struct ibv_port_attr portAttr; NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr)); union ibv_gid gid; NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid)); // QP Creation - NCCLCHECK(ncclIbInitVerbs(ctx, &rComm->verbs)); + NCCLCHECK(ncclIbInitVerbs(lComm->dev, ctx, &rComm->verbs)); rComm->nqps = ncclParamIbQpsPerConn(); for (int q=0; q<rComm->nqps; q++) { NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_REMOTE_WRITE, rComm->qps+q)); @@ -534,8 +706,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { // Retain remote fifo info and prepare my RDMA ops rComm->remFifo.rkey = remQpInfo.fifoRkey; rComm->remFifo.addr = remQpInfo.fifoAddr; - NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ)); - rComm->remFifo.sge.length = sizeof(struct ncclIbSendFifo); + NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ)); rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey; if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE; @@ -549,6 +720,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->gpuFlush.qp)); struct ncclIbQpInfo localQpInfo; localQpInfo.lid=portAttr.lid; + localQpInfo.link_layer=portAttr.link_layer; localQpInfo.ib_port=ib_port; localQpInfo.spn=gid.global.subnet_prefix; localQpInfo.iid=gid.global.interface_id; @@ -560,26 +732,39 @@ ncclResult_t ncclIbAccept(void* 
listenComm, void** recvComm) { // Fill Handle struct ncclIbQpInfo qpInfo; qpInfo.lid=portAttr.lid; + qpInfo.link_layer=portAttr.link_layer; qpInfo.ib_port=ib_port; for (int q=0; q<rComm->nqps; q++) qpInfo.qpn[q]=rComm->qps[q]->qp_num; qpInfo.spn=gid.global.subnet_prefix; qpInfo.iid=gid.global.interface_id; qpInfo.mtu=remQpInfo.mtu; - NCCLCHECK(socketSend(rComm->fd, &rComm->addr, &qpInfo, sizeof(qpInfo))); + stage->state = ncclIbCommStateSend; + stage->offset = 0; + if (stage->buffer) free(stage->buffer); + NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo))); + memcpy(stage->buffer, &qpInfo, sizeof(struct ncclIbQpInfo)); +ib_send: + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset)); + if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess; + + free(stage->buffer); *recvComm = rComm; + + /* reset lComm stage */ + stage->state = ncclIbCommStateStart; + stage->offset = 0; + stage->comm = NULL; + stage->buffer = NULL; return ncclSuccess; } ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** req) { for (int i=0; i<MAX_REQUESTS; i++) { struct ncclIbRequest* r = verbs->reqs+i; - if (r->used == 0) { - r->used = 1; - r->type = 0; + if (r->type == NCCL_NET_IB_REQ_UNUSED) { r->verbs = verbs; r->events = 1; - r->size = -1; r->addr = NULL; *req = r; return ncclSuccess; @@ -590,7 +775,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** return ncclInternalError; } ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) { - r->used = 0; + r->type = NCCL_NET_IB_REQ_UNUSED; return ncclSuccess; } @@ -599,9 +784,9 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) { // Do not block on this receive, return if not ready. 
int bytes = 0; - NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &remQpInfo, sizeof(remQpInfo), &bytes)); + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes)); if (bytes == 0) return ncclSuccess; // Try again later - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &remQpInfo, sizeof(remQpInfo), &bytes)); + NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes)); for (int q=0; q<comm->nqps; q++) { struct ibv_qp* qp = comm->qps[q]; @@ -610,7 +795,7 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) { } comm->ready = 1; // Block until this is done. It *should* not block indefinitely. - NCCLCHECK(socketSend(comm->fd, &comm->addr, &comm->ready, sizeof(int))); + NCCLCHECK(ncclSocketSend(&comm->sock, &comm->ready, sizeof(int))); return ncclSuccess; } @@ -618,39 +803,170 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) { ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) { // Do not block on this receive, return if not ready. 
int bytes = 0; - NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &comm->ready, sizeof(int), &bytes)); + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes)); if (bytes == 0) return ncclSuccess; // Try again later - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &comm->ready, sizeof(int), &bytes)); + NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes)); return ncclSuccess; } ncclResult_t ncclIbTest(void* request, int* done, int* size); -#define REG_ALIGN (4096) - ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) { static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset"); - struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; - uint64_t addr = (uint64_t)data; assert(size > 0); - // Deregister / register - uint64_t regAddr = addr & (~(REG_ALIGN-1)); - uint64_t regSize = addr+size - regAddr; - regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN; - struct ibv_mr* mr; - NCCLCHECK(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); - *mhandle = (void*)mr; - TRACE(NCCL_INIT,"regAddr %lx size %ld rkey %x", regAddr, regSize, mr->rkey); - return ncclSuccess; + static __thread uintptr_t pageSize = 0; + if (pageSize == 0) pageSize = sysconf(_SC_PAGESIZE); + + struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; + struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache; + uintptr_t addr = (uintptr_t)data & -pageSize; + int pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; + ncclResult_t res; + pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); + for (int slot=0; /*true*/; slot++) { + if (slot == cache->population) { // didn't find in cache + if (cache->population == cache->capacity) { // must grow cache + cache->capacity = 
cache->capacity < 32 ? 32 : 2*cache->capacity; + NCCLCHECKGOTO(ncclRealloc(&cache->slots, cache->population, cache->capacity), res, returning); + } + // Deregister / register + struct ibv_mr* mr; + unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ; + if (ncclIbRelaxedOrderingEnabled) { + // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support + NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, (uintptr_t)addr, flags|IBV_ACCESS_RELAXED_ORDERING), res, returning); + } + else { + NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning); + } + TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x", (unsigned long long)addr, (long long)pages*PageSize, mr->rkey); + cache->population += 1; + cache->slots[slot].addr = addr; + cache->slots[slot].pages = pages; + cache->slots[slot].refs = 1; + cache->slots[slot].mr = mr; + *mhandle = (void*)mr; + res = ncclSuccess; + goto returning; + } + else if (cache->slots[slot].addr == addr && cache->slots[slot].pages == pages) { + cache->slots[slot].refs += 1; + *mhandle = (void*)cache->slots[slot].mr; + res = ncclSuccess; + goto returning; + } + } +returning: + pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock); + return res; } ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { - NCCLCHECK(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle)); + struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; + struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache; + ncclResult_t res; + pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock); + for (int i=0; i < cache->population; i++) { + if (mhandle == cache->slots[i].mr) { + if (0 == --cache->slots[i].refs) { + memmove(&cache->slots[i], &cache->slots[--cache->population], sizeof(struct ncclIbMr)); + if (cache->population == 0) { + free(cache->slots); + cache->slots = NULL; + cache->capacity = 0; + } + NCCLCHECKGOTO(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle), res, 
returning); + } + res = ncclSuccess; + goto returning; + } + } + WARN("NET/IB: could not find mr %p inside cache of %d entries", mhandle, cache->population); + res = ncclInternalError; +returning: + pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock); + return res; +} + +ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { + struct ncclIbRequest** reqs = comm->fifoReqs[slot]; + volatile struct ncclIbSendFifo* slots = comm->fifo[slot]; + int nreqs = slots[0].nreqs; + if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; + + uint64_t wr_id = 0ULL; + + for (int r=0; r<nreqs; r++) { + struct ibv_send_wr* wr = comm->wrs+r; + memset(wr, 0, sizeof(struct ibv_send_wr)); + + struct ibv_sge* sge = comm->sges+r; + sge->addr=(uintptr_t)reqs[r]->send.data; + sge->lkey=reqs[r]->send.lkey; + + wr->opcode = IBV_WR_RDMA_WRITE; + wr->send_flags = 0; + wr->wr.rdma.remote_addr = slots[r].addr; + wr->wr.rdma.rkey = slots[r].rkey; + wr->next = wr+1; + wr_id += (reqs[r] - comm->verbs.reqs) << (r*8); + } + + // Write size as immediate data. In the case of multi-send, only write + // 0 or 1 as size to indicate whether there was data sent or received. + uint64_t immData = 0; + if (nreqs == 1) { + immData = reqs[0]->send.size; + } else { + uint8_t* multiImmData = (uint8_t*)&immData; + for (int r=0; r<nreqs; r++) { + multiImmData[r] = reqs[r]->send.size ? 1 : 0; + } + } + + struct ibv_send_wr* lastWr = comm->wrs+nreqs-1; + if (nreqs > 1 || reqs[0]->send.size > ncclParamIbArThreshold()) { + // When using adaptive routing, send the bulk of the data first as an + // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote + // completion. 
+ lastWr++; + memset(lastWr, 0, sizeof(struct ibv_send_wr)); + } + lastWr->wr_id = wr_id; + lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + lastWr->imm_data = immData; + lastWr->next = NULL; + lastWr->send_flags = IBV_SEND_SIGNALED; + + for (int q=0; q<comm->nqps; q++) { + for (int r=0; r<nreqs; r++) { + int chunkSize = std::max(8, DIVUP(reqs[r]->send.size, comm->nqps)); + int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize); + if (length <= 0) { + comm->wrs[r].sg_list = NULL; + comm->wrs[r].num_sge = 0; + } else { + comm->sges[r].length = length; + comm->wrs[r].sg_list = comm->sges+r; + comm->wrs[r].num_sge = 1; + } + } + struct ibv_send_wr* bad_wr; + NCCLCHECK(wrap_ibv_post_send(comm->qps[q], comm->wrs, &bad_wr)); + + for (int r=0; r<nreqs; r++) { + int chunkSize = std::max(8, DIVUP(reqs[r]->send.size, comm->nqps)); + reqs[r]->send.offset += chunkSize; + comm->sges[r].addr += chunkSize; + comm->wrs[r].wr.rdma.remote_addr += chunkSize; + } + } + return ncclSuccess; } -ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { +ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm)); if (comm->ready == 0) { *request = NULL; return ncclSuccess; } @@ -658,108 +974,84 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo struct ibv_mr* mr = (struct ibv_mr*)mhandle; // Wait for the receiver to have posted the corresponding receive - volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS); - volatile uint32_t * readyPtr = &slot->ready; - if (*readyPtr == 0) { *request = NULL; return ncclSuccess; } - - struct ncclIbRequest* req; - NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); - req->size = size; - req->addr = &comm->addr; + int nreqs = 0; + volatile struct ncclIbSendFifo* slots; + + int slot = 
(comm->fifoHead)%MAX_REQUESTS; + struct ncclIbRequest** reqs = comm->fifoReqs[slot]; + slots = comm->fifo[slot]; + int idx = comm->fifoHead+1; + if (slots[0].idx != idx) { *request = NULL; return ncclSuccess; } + nreqs = slots[0].nreqs; + // Wait until all data has arrived + for (int r=1; r<nreqs; r++) while(slots[r].idx != idx); + __sync_synchronize(); // order the nreqsPtr load against tag/rkey/addr loads below + for (int r=0; r<nreqs; r++) { + if (reqs[r] != NULL || slots[r].tag != tag) continue; + + // Sanity checks to catch user collective call count/size mismatches + // plus any potential programming errors + if (size > slots[r].size || slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) { + char line[SOCKET_NAME_MAXLEN+1]; + WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error local size %d remote %d addr %lx rkey %x", + r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), size, slots[r].size, slots[r].addr, slots[r].rkey); + return ncclInternalError; + } + struct ncclIbRequest* req; + NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); + req->type = NCCL_NET_IB_REQ_SEND; + req->addr = &comm->sock.addr; + req->verbs = &comm->verbs; + req->nreqs = nreqs; + req->send.size = size; + req->send.data = data; + req->send.lkey = mr->lkey; + req->send.offset = 0; + req->addr = &comm->sock.addr; + req->events = comm->nqps; + *request = reqs[r] = req; + + // If this is a multi-recv, send only when all requests have matched. 
+ for (int r=0; r<nreqs; r++) { + if (reqs[r] == NULL) return ncclSuccess; + } - struct ibv_send_wr wr[2]; - memset(&wr[0], 0, sizeof(wr[0])); - wr[0].wr_id = (uint64_t)req; + TIME_START(0); + NCCLCHECK(ncclIbMultiSend(comm, slot)); - struct ibv_sge sge; - sge.addr=(uintptr_t)data; sge.lkey=mr->lkey; - -#if USE_RDMA_WRITE == 0 - wr[0].opcode = IBV_WR_SEND; - wr[0].send_flags = IBV_SEND_SIGNALED; -#else - __sync_synchronize(); // order the readyPtr load against rkey load below - // Sanity checks to catch user collective call count/size mismatches - // plus any potential programming errors - if (size > slot->size || slot->size < 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) { - char line[SOCKET_NAME_MAXLEN+1]; - WARN("NET/IB : peer %s collective mismatch error local size %d remote %d addr %lx rkey %x seq %x/%x", - socketToString(req->addr, line), size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead); - return ncclInternalError; + // Clear slots[0]->nreqs, as well as other fields to help debugging and sanity checks + memset((void*)slots, 0, sizeof(struct ncclIbSendFifo)); + memset(reqs, 0, NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbRequest*)); + comm->fifoHead++; + TIME_STOP(0); + return ncclSuccess; } - wr[0].opcode = IBV_WR_RDMA_WRITE_WITH_IMM; - wr[0].send_flags = IBV_SEND_SIGNALED; - wr[0].wr.rdma.remote_addr = slot->addr; - wr[0].wr.rdma.rkey = slot->rkey; - wr[0].imm_data = size; // Send the message size via imm_data - __sync_synchronize(); -#endif - // We must clear slot->ready, but reset other fields to aid - // debugging and sanity checks - slot->ready = 0; - slot->addr = 0ULL; - slot->rkey = slot->size = slot->seq = 0; - comm->fifoHead++; - - -#if USE_RDMA_WRITE - // When using adaptive routing, send the bulk of the data first as an - // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote - // completion. 
- if (size > ncclParamIbArThreshold()) { - memset(&wr[1], 0, sizeof(wr[1])); - memcpy(&wr[1], &wr[0], sizeof(wr[0])); - wr[1].sg_list = NULL; - wr[1].num_sge = 0; - wr[0].next = &wr[1]; - - wr[0].opcode = IBV_WR_RDMA_WRITE; - wr[1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM; - - wr[0].send_flags = 0; - wr[1].send_flags = IBV_SEND_SIGNALED; - } -#endif - - int chunkSize = std::max(8, DIVUP(size, comm->nqps)); - - int offset = 0; - for (int q=0; q<comm->nqps; q++) { - int length = std::min(size-offset, chunkSize); - if (length <= 0) { - wr[0].sg_list = NULL; - wr[0].num_sge = 0; - } else { - sge.length = length; - wr[0].sg_list = &sge; - wr[0].num_sge = 1; - } - struct ibv_send_wr* bad_wr; - NCCLCHECK(wrap_ibv_post_send(comm->qps[q], wr, &bad_wr)); - offset += chunkSize; - sge.addr += chunkSize; - wr[0].wr.rdma.remote_addr += chunkSize; - } - req->events = comm->nqps; - *request = req; + *request = NULL; return ncclSuccess; } -ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size, struct ncclIbRequest* req) { +ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) { struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); - int slot = comm->remFifo.tail%MAX_REQUESTS; - struct ncclIbSendFifo* localElem = comm->remFifo.elems + slot; - localElem->addr = addr; - localElem->rkey = rkey; - localElem->ready = 1; - localElem->size = size; // Sanity/Debugging - localElem->seq = comm->remFifo.tail; // Sanity/Debugging - wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*sizeof(struct ncclIbSendFifo); + int slot = comm->remFifo.fifoTail%MAX_REQUESTS; + struct ncclIbSendFifo* localElem = comm->remFifo.elems[slot]; + + for (int i=0; i<n; i++) { + localElem[i].addr = (uint64_t)data[i]; + struct ibv_mr* mr = (struct ibv_mr*)mhandles[i]; + localElem[i].rkey = mr->rkey; + localElem[i].nreqs = n; + localElem[i].size = sizes[i]; // Sanity/Debugging + localElem[i].tag = 
tags[i]; + localElem[i].idx = comm->remFifo.fifoTail+1; + } + + wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo); wr.wr.rdma.rkey = comm->remFifo.rkey; comm->remFifo.sge.addr = (uint64_t)localElem; + comm->remFifo.sge.length = n*sizeof(struct ncclIbSendFifo); wr.sg_list = &comm->remFifo.sge; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_WRITE; @@ -788,92 +1080,107 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t // if (slot == 0) { wr.send_flags |= IBV_SEND_SIGNALED; - wr.wr_id = (uint64_t)req; + wr.wr_id = req - comm->verbs.reqs; req->events++; } struct ibv_send_wr* bad_wr; NCCLCHECK(wrap_ibv_post_send(comm->qps[0], &wr, &bad_wr)); - comm->remFifo.tail++; + comm->remFifo.fifoTail++; return ncclSuccess; } -ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { +ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm)); if (comm->ready == 0) { *request = NULL; return ncclSuccess; } - - struct ibv_mr* mr = (struct ibv_mr*)mhandle; + if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); - req->size = size; - req->addr = &comm->addr; + req->type = NCCL_NET_IB_REQ_RECV; + req->addr = &comm->sock.addr; + req->nreqs = n; + for (int i=0; i<n; i++) req->recv.sizes[i] = 0; struct ibv_recv_wr wr; memset(&wr, 0, sizeof(wr)); - wr.wr_id = (uint64_t)req; + wr.wr_id = req - comm->verbs.reqs; wr.sg_list = NULL; wr.num_sge = 0; + TIME_START(1); for (int q=0; q<comm->nqps; q++) { struct ibv_qp* qp = comm->qps[q]; struct ibv_recv_wr* bad_wr; NCCLCHECK(wrap_ibv_post_recv(qp, &wr, &bad_wr)); } + TIME_STOP(1); req->events = comm->nqps; *request = req; // Post to FIFO to notify sender - 
NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size, req)); + TIME_START(2); + NCCLCHECK(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req)); + TIME_STOP(2); return ncclSuccess; } -ncclResult_t ncclIbIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { +ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; - if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess; + int last = -1; + for (int i=0; i<n; i++) if (sizes[i]) last = i; + if (comm->gpuFlush.enabled == 0 || last == -1) return ncclSuccess; + // Only flush once using the last non-zero receive struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); - req->addr = &comm->addr; - struct ibv_mr* mr = (struct ibv_mr*)mhandle; + req->type = NCCL_NET_IB_REQ_FLUSH; + req->addr = &comm->sock.addr; + struct ibv_mr* mr = (struct ibv_mr*)mhandles[last]; struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); - wr.wr_id = (uint64_t)req; + wr.wr_id = req - comm->verbs.reqs; - wr.wr.rdma.remote_addr = (uint64_t)data; + wr.wr.rdma.remote_addr = (uint64_t)data[last]; wr.wr.rdma.rkey = mr->rkey; wr.sg_list = &comm->gpuFlush.sge; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_READ; wr.send_flags = IBV_SEND_SIGNALED; + TIME_START(4); struct ibv_send_wr* bad_wr; NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &wr, &bad_wr)); + TIME_STOP(4); *request = req; return ncclSuccess; } -ncclResult_t ncclIbTest(void* request, int* done, int* size) { +ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { struct ncclIbRequest *r = (struct ncclIbRequest*)request; *done = 0; while (1) { if (r->events == 0) { *done = 1; - if (size) *size = r->size; + if (sizes && r->type == NCCL_NET_IB_REQ_RECV) { + for (int i=0; i<r->nreqs; i++) sizes[i] = r->recv.sizes[i]; + } NCCLCHECK(ncclIbFreeRequest(r)); return ncclSuccess; } int wrDone = 0; struct ibv_wc wcs[4]; + 
TIME_START(3); NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone)); + if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); } if (wrDone == 0) return ncclSuccess; for (int w=0; w<wrDone; w++) { @@ -881,23 +1188,31 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) { if (wc->status != IBV_WC_SUCCESS) { char line[SOCKET_NAME_MAXLEN+1]; WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d", - socketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err); + ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err); return ncclSystemError; } - struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc->wr_id; - if (doneReq) { - if (wc->opcode == IBV_WC_RECV) { - doneReq->size = wc->byte_len; -#if USE_RDMA_WRITE - } else if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { - if (doneReq->size == -1) - doneReq->size = wc->imm_data; - else - doneReq->size += wc->imm_data; -#endif + struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff); + if (req->type == NCCL_NET_IB_REQ_SEND) { + for (int i=0; i<req->nreqs; i++) { + struct ncclIbRequest* sendReq = r->verbs->reqs+((wc->wr_id >> (i*8)) & 0xff); + if ((sendReq->events <= 0)) return ncclInternalError; + sendReq->events--; + } + } else { + if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { + if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError; + if (req->nreqs > 1) { + // In the case of a multi recv, we only set sizes to 0 or 1. 
+ uint8_t* sizes = (uint8_t*)&wc->imm_data; + for (int i=0; i<req->nreqs; i++) { + req->recv.sizes[i] |= sizes[i]; + } + } else { + req->recv.sizes[0] += wc->imm_data; + } } - doneReq->events--; + req->events--; } } } @@ -906,20 +1221,21 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) { ncclResult_t ncclIbCloseSend(void* sendComm) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm) { - close(comm->fd); + close(comm->sock.fd); for (int q=0; q<comm->nqps; q++) if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q])); if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr)); NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs)); free(comm); } + TIME_PRINT("IB"); return ncclSuccess; } ncclResult_t ncclIbCloseRecv(void* recvComm) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm) { - close(comm->fd); + close(comm->sock.fd); for (int q=0; q<comm->nqps; q++) if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q])); if (comm->gpuFlush.enabled) { @@ -936,7 +1252,7 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) { ncclResult_t ncclIbCloseListen(void* listenComm) { struct ncclIbListenComm* comm = (struct ncclIbListenComm*)listenComm; if (comm) { - close(comm->fd); + close(comm->sock.fd); free(comm); } return ncclSuccess; diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index c045a8f..d92c46f 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -19,7 +19,7 @@ /* Init functions */ static int ncclNetIfs = -1; struct ncclSocketDev { - union socketAddress addr; + union ncclSocketAddress addr; char devName[MAX_IF_NAME_SIZE]; char* pciPath; }; @@ -40,8 +40,8 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) { pthread_mutex_lock(&ncclSocketLock); if (ncclNetIfs == -1) { char names[MAX_IF_NAME_SIZE*MAX_IFS]; - union socketAddress addrs[MAX_IFS]; - ncclNetIfs = findInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS); + union ncclSocketAddress addrs[MAX_IFS]; + ncclNetIfs = ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS); if (ncclNetIfs <= 0) { WARN("NET/Socket : no interface found"); return ncclInternalError; @@ -53,10 +53,10 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) { addrline[SOCKET_NAME_MAXLEN] = '\0'; for (int i=0; i<ncclNetIfs; i++) { strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE); - memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union socketAddress)); + memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union ncclSocketAddress)); NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath)); snprintf(line+strlen(line), MAX_LINE_LEN-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE, - socketToString(&addrs[i], addrline)); + ncclSocketToString(&addrs[i], addrline)); } line[MAX_LINE_LEN] = '\0'; INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line); @@ -97,12 +97,14 @@ ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) { props->guid = dev; props->ptrSupport = NCCL_PTR_HOST; NCCLCHECK(ncclSocketGetSpeed(props->name, &props->speed)); + props->latency = 0; // Not set props->port = 0; props->maxComms = 65536; + props->maxRecvs = 1; return ncclSuccess; } -ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) { +ncclResult_t GetSocketAddr(int dev, union ncclSocketAddress* addr) { if 
(dev >= ncclNetIfs) return ncclInternalError; memcpy(addr, &ncclSocketDevs[dev].addr, sizeof(*addr)); return ncclSuccess; @@ -118,18 +120,33 @@ ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) { NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2); NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2); +enum ncclSocketCommState { + ncclSocketCommStateStart = 0, + ncclSocketCommStateConnect = 1, + ncclSocketCommStateAccept = 3, + ncclSocketCommStateSend = 4, + ncclSocketCommStateRecv = 5, +}; + +struct ncclSocketCommStage { + enum ncclSocketCommState state; + uint8_t iteration; + struct ncclSocket* sock; + struct ncclSocketComm* comm; +}; + struct ncclSocketHandle { - union socketAddress connectAddr; + union ncclSocketAddress connectAddr; int nSocks; int nThreads; + struct ncclSocketCommStage stage; }; struct ncclSocketTask { int op; void* data; int size; - int fd; - union socketAddress *addr; + struct ncclSocket* sock; int offset; int used; ncclResult_t result; @@ -139,8 +156,7 @@ struct ncclSocketRequest { int op; void* data; int size; - int ctrlFd; - union socketAddress *addr; + struct ncclSocket* ctrlSock; int offset; int used; struct ncclSocketComm* comm; @@ -154,29 +170,30 @@ struct ncclSocketTaskQueue { struct ncclSocketTask* tasks; }; -enum threadState {start, stop}; - struct ncclSocketThreadResources { struct ncclSocketTaskQueue threadTaskQueue; - enum threadState state; + int stop; struct ncclSocketComm* comm; pthread_mutex_t threadLock; pthread_cond_t threadCond; }; struct ncclSocketListenComm { - int fd; + struct ncclSocket sock; + struct ncclSocketCommStage stage; int nSocks; int nThreads; + int dev; }; struct ncclSocketComm { - int ctrlFd; - union socketAddress addr; - int fds[MAX_SOCKETS]; + struct ncclSocket ctrlSock; + struct ncclSocket socks[MAX_SOCKETS]; + int dev; + int cudaDev; int nSocks; int nThreads; - int nextFd; + int nextSock; struct ncclSocketRequest requests[MAX_REQUESTS]; pthread_t helperThread[MAX_THREADS]; struct 
ncclSocketThreadResources threadResources[MAX_THREADS]; @@ -185,7 +202,6 @@ struct ncclSocketComm { void* persistentSocketThread(void *args_) { struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_; struct ncclSocketComm* comm = resource->comm; - volatile enum threadState* state = &resource->state; struct ncclSocketTaskQueue* myQueue = &resource->threadTaskQueue; int nSocksPerThread = comm->nSocks / comm->nThreads; while (1) { @@ -198,7 +214,7 @@ void* persistentSocketThread(void *args_) { for (int j=0; j<nSocksPerThread; j++) { struct ncclSocketTask* r = myQueue->tasks+i+j; if (r != NULL && r->used == 1 && r->offset < r->size) { - r->result = socketProgress(r->op, r->fd, r->addr, r->data, r->size, &r->offset); + r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset); if (r->result != ncclSuccess) { WARN("NET/Socket : socket progress error"); return NULL; @@ -211,12 +227,12 @@ void* persistentSocketThread(void *args_) { } if (idle) { pthread_mutex_lock(&resource->threadLock); - while (mark == myQueue->next && *state != stop) { // no new tasks, wait + while (mark == myQueue->next && resource->stop == 0) { // no new tasks, wait pthread_cond_wait(&resource->threadCond, &resource->threadLock); } pthread_mutex_unlock(&resource->threadLock); } - if (*state == stop) return NULL; + if (resource->stop) return NULL; } } @@ -271,17 +287,17 @@ end: ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) { NCCLCHECK(ncclCalloc(comm, 1)); - (*comm)->fd = -1; + (*comm)->sock.fd = -1; return ncclSuccess; } ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) { NCCLCHECK(ncclCalloc(comm, 1)); - (*comm)->ctrlFd = -1; + (*comm)->ctrlSock.fd = -1; for (int i=0; i < MAX_SOCKETS; i++) { - (*comm)->fds[i] = -1; + (*comm)->socks[i].fd = -1; } - (*comm)->nextFd = 0; + (*comm)->nextSock = 0; return ncclSuccess; } @@ -290,14 +306,18 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) { 
return ncclInternalError; } struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; - static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large"); + memset(handle, 0, sizeof(struct ncclSocketHandle)); + static_assert(sizeof(struct ncclSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large"); struct ncclSocketListenComm* comm; NCCLCHECK(ncclSocketNewListenComm(&comm)); - NCCLCHECK(GetSocketAddr(dev, &handle->connectAddr)); - NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr)); + NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr)); + NCCLCHECK(ncclSocketListen(&comm->sock)); + memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress)); NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads)); handle->nSocks = comm->nSocks; handle->nThreads = comm->nThreads; + comm->sock.asyncFlag = 1; + comm->dev = dev; *listenComm = comm; return ncclSuccess; } @@ -306,38 +326,99 @@ ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) { if (dev < 0) { // data transfer socket is based on specified dev return ncclInternalError; } - struct ncclSocketComm* comm; - NCCLCHECK(ncclSocketNewComm(&comm)); + + enum ncclSocketState conState; struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; + struct ncclSocketCommStage* stage = &handle->stage; + struct ncclSocketComm* comm = stage->comm; + uint8_t i = stage->iteration; + struct ncclSocket* sock = stage->sock; + *sendComm = NULL; + + if (stage->state == ncclSocketCommStateConnect) goto socket_connect_check; + if (stage->state == ncclSocketCommStateSend) goto socket_send; + + NCCLCHECK(ncclSocketNewComm(&comm)); + stage->comm = comm; comm->nSocks = handle->nSocks; comm->nThreads = handle->nThreads; - for (int i=0; i<comm->nSocks+1; i++) { - int tmpFd, offset=0; - NCCLCHECK(connectAddress(&tmpFd, &handle->connectAddr)); - NCCLCHECK(socketWait(NCCL_SOCKET_SEND, tmpFd, 
&handle->connectAddr, &i, sizeof(int), &offset)); - if (i == comm->nSocks) comm->ctrlFd = tmpFd; - else comm->fds[i] = tmpFd; + comm->dev = dev; + CUDACHECK(cudaGetDevice(&comm->cudaDev)); + for (; i<comm->nSocks+1; i++) { + sock = i == comm->nSocks ? &comm->ctrlSock : comm->socks+i; + NCCLCHECK(ncclSocketInit(sock, &handle->connectAddr, NULL, 1)); + + stage->sock = sock; + stage->state = ncclSocketCommStateConnect; + stage->iteration = i; + NCCLCHECK(ncclSocketConnect(sock)); + +socket_connect_check: + NCCLCHECK(ncclGetSocketState(sock, &conState)); + if (conState == ncclSocketConnecting) { + /* expect user to call again */ + return ncclSuccess; + } else if (conState == ncclSocketError) { + return ncclSystemError; + } + stage->state = ncclSocketCommStateSend; + +socket_send: + int done = 0; + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &i, sizeof(uint8_t), &done)); + if (done == 0) return ncclSuccess; } *sendComm = comm; - comm->addr = handle->connectAddr; return ncclSuccess; } ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) { struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm; - struct ncclSocketComm* rComm; + struct ncclSocketCommStage* stage = &lComm->stage; + struct ncclSocketComm* rComm = stage->comm; + uint8_t i = stage->iteration; + struct ncclSocket* sock = stage->sock; + + *recvComm = NULL; + if (stage->state == ncclSocketCommStateAccept) goto socket_accept; + if (stage->state == ncclSocketCommStateRecv) goto socket_recv; + NCCLCHECK(ncclSocketNewComm(&rComm)); + stage->comm = rComm; rComm->nSocks = lComm->nSocks; rComm->nThreads = lComm->nThreads; - for (int i=0; i<rComm->nSocks+1; i++) { - int tmpFd, sendSockIdx, offset=0; - socklen_t socklen = sizeof(union socketAddress); - SYSCHECKVAL(accept(lComm->fd, &rComm->addr.sa, &socklen), "accept", tmpFd); - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, tmpFd, &rComm->addr, &sendSockIdx, sizeof(int), &offset)); - if (sendSockIdx == rComm->nSocks) rComm->ctrlFd = 
tmpFd; - else rComm->fds[sendSockIdx] = tmpFd; + rComm->dev = lComm->dev; + CUDACHECK(cudaGetDevice(&rComm->cudaDev)); + lComm->sock.asyncFlag = 1; + for (; i<rComm->nSocks+1; i++) { + uint8_t sendSockIdx; + ncclCalloc(&sock, 1); + NCCLCHECK(ncclSocketInit(sock, NULL, NULL, 1)); + stage->sock = sock; + stage->state = ncclSocketCommStateAccept; + stage->iteration = i; +socket_accept: + NCCLCHECK(ncclSocketAccept(sock, &lComm->sock)); + if (sock->fd == -1) return ncclSuccess; + + stage->state = ncclSocketCommStateRecv; +socket_recv: + int done = 0; + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &sendSockIdx, sizeof(uint8_t), &done)); + if (done == 0) return ncclSuccess; + + if (sendSockIdx == rComm->nSocks) memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket)); + else memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket)); + + free(sock); } *recvComm = rComm; + + /* reset lComm state */ + stage->state = ncclSocketCommStateStart; + stage->iteration = 0; + stage->sock = NULL; + stage->comm = NULL; return ncclSuccess; } @@ -348,8 +429,7 @@ ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* dat r->op = op; r->data = data; r->size = size; - r->ctrlFd = comm->ctrlFd; - r->addr = &comm->addr; + r->ctrlSock = &comm->ctrlSock; r->used = 1; r->comm = comm; r->nSubs = 0; @@ -362,7 +442,7 @@ ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* dat } ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) { - int tid = comm->nextFd % comm->nThreads; + int tid = comm->nextSock % comm->nThreads; struct ncclSocketThreadResources* res = comm->threadResources+tid; struct ncclSocketTaskQueue* queue = &res->threadTaskQueue; // create helper threads and prepare per-thread task queue @@ -377,22 +457,21 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, pthread_mutex_init(&res->threadLock, NULL); 
pthread_cond_init(&res->threadCond, NULL); pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res); + ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->cudaDev); } struct ncclSocketTask* r = queue->tasks+queue->next; if (r->used == 0) { r->op = op; r->data = data; r->size = size; - r->fd = comm->fds[comm->nextFd]; - r->addr = &comm->addr; + r->sock = comm->socks+comm->nextSock; r->offset = 0; r->result = ncclSuccess; - comm->nextFd = (comm->nextFd + 1) % comm->nSocks; + comm->nextSock = (comm->nextSock + 1) % comm->nSocks; r->used = 1; *req = r; pthread_mutex_lock(&res->threadLock); queue->next = (queue->next+1)%queue->len; - res->state = start; pthread_cond_signal(&res->threadCond); pthread_mutex_unlock(&res->threadLock); return ncclSuccess; @@ -411,17 +490,17 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) { if (r->used == 1) { /* try to send/recv size */ int data = r->size; int offset = 0; - NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->addr, &data, sizeof(int), &offset)); + NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, &data, sizeof(int), &offset)); if (offset == 0) return ncclSuccess; /* Not ready -- retry later */ // Not sure we could ever receive less than 4 bytes, but just in case ... 
- if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->ctrlFd, r->addr, &data, sizeof(int), &offset)); + if (offset < sizeof(int)) NCCLCHECK(ncclSocketWait(r->op, r->ctrlSock, &data, sizeof(int), &offset)); // Check size is less or equal to the size provided by the user if (r->op == NCCL_SOCKET_RECV && data > r->size) { char line[SOCKET_NAME_MAXLEN+1]; - WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", socketToString(r->addr, line), data, r->size); + WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", ncclSocketToString(&r->ctrlSock->addr, line), data, r->size); return ncclInternalError; } r->size = data; @@ -459,7 +538,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) { } } else { // progress request using main thread if (r->offset < r->size) { - NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->addr, r->data, r->size, &r->offset)); + NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset)); } if (r->offset == r->size) { if (size) *size = r->size; @@ -476,19 +555,20 @@ ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** } ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } -ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { +ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm; NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclSocketRequest**)request)); return ncclSuccess; } -ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { +ncclResult_t ncclSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm; - NCCLCHECK(ncclSocketGetRequest(comm, 
NCCL_SOCKET_RECV, data, size, (struct ncclSocketRequest**)request)); + if (n != 1) return ncclInternalError; + NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclSocketRequest**)request)); return ncclSuccess; } -ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { +ncclResult_t ncclSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { // We don't support CUDA pointers, so we don't need a flush operation return ncclInternalError; } @@ -496,7 +576,7 @@ ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandl ncclResult_t ncclSocketCloseListen(void* opaqueComm) { struct ncclSocketListenComm* comm = (struct ncclSocketListenComm*)opaqueComm; if (comm) { - if (comm->fd != -1) close(comm->fd); + if (comm->sock.fd != -1) close(comm->sock.fd); free(comm); } return ncclSuccess; @@ -509,16 +589,16 @@ ncclResult_t ncclSocketClose(void* opaqueComm) { struct ncclSocketThreadResources* res = comm->threadResources+i; if (comm->helperThread[i]) { pthread_mutex_lock(&res->threadLock); - res->state = stop; + res->stop = 1; pthread_cond_signal(&res->threadCond); pthread_mutex_unlock(&res->threadLock); pthread_join(comm->helperThread[i], NULL); } free(res->threadTaskQueue.tasks); } - if (comm->ctrlFd != -1) close(comm->ctrlFd); + if (comm->ctrlSock.fd != -1) close(comm->ctrlSock.fd); for (int i=0; i<comm->nSocks; i++) { - if (comm->fds[i] != -1) close(comm->fds[i]); + if (comm->socks[i].fd != -1) close(comm->socks[i].fd); } free(comm); } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index ca59f3b..e71e157 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -7,31 +7,29 @@ #include "comm.h" #include "graph.h" #include "utils.h" -#include "bootstrap.h" + +struct ncclP2pBuff { + void* directPtr; + cudaIpcMemHandle_t devIpc; +}; struct p2pConnectInfo { int rank; int read; - void* directPtr; - cudaIpcMemHandle_t devIpc; + struct ncclP2pBuff p2pBuff; }; +static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large"); struct p2pSendResources { struct ncclSendMem* devMem; - void* ipcPtr; - int remoteId; - int memRank; - void* remIpcPtr; - void* bootstrap; + void* sendMemIpc; + void* recvMemIpc; }; struct p2pRecvResources { struct ncclRecvMem* devMem; - void* ipcPtr; - int remoteId; - int memRank; - void* remIpcPtr; - void* bootstrap; + void* sendMemIpc; + void* recvMemIpc; }; #include <sys/types.h> @@ -90,17 +88,23 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop return ncclSuccess; } - // Check that legacy IPC support is available if (p2p != 0) { + // Cached result of the legacyIPC detection + static int legacyIPC = -1; + if (legacyIPC >= 0) { + *ret = legacyIPC; + return ncclSuccess; + } + // Check that legacy IPC support is available (WSL WAR) char *dummy; cudaIpcMemHandle_t ipc; NCCLCHECK(ncclCudaCalloc(&dummy, CUDA_IPC_MIN)); if (cudaIpcGetMemHandle(&ipc, dummy) != cudaSuccess) { - INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported on dev %d(=%lx)", - cudaDev1, info1->busId); + INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported"); *ret = 0; } CUDACHECK(cudaFree(dummy)); + legacyIPC = *ret; return ncclSuccess; } @@ -120,6 +124,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \ } while (0) + // Setting this to non zero causes P2P to use Reads rather than Writes NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2); @@ -134,7 
+139,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* return ncclSuccess; } -static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct p2pConnectInfo* p2pInfo, void** devMem, void** ipcPtr) { +static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) { if (myInfo->pidHash == peerInfo->pidHash) { if (peerInfo->cudaDev != myInfo->cudaDev) { // Enable P2P access @@ -147,10 +152,10 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee return ncclInternalError; } } - *devMem = p2pInfo->directPtr; + *devMem = p2pBuff->directPtr; *ipcPtr = NULL; } else { - CUDACHECK(cudaIpcOpenMemHandle(devMem, p2pInfo->devIpc, cudaIpcMemLazyEnablePeerAccess)); + CUDACHECK(cudaIpcOpenMemHandle(devMem, p2pBuff->devIpc, cudaIpcMemLazyEnablePeerAccess)); *ipcPtr = *devMem; } return ncclSuccess; @@ -165,44 +170,40 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st int useRead, intermediateRank; NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); - struct p2pConnectInfo info; - // For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0) - info.read = (connIndex == 0) ? useRead : 0; - const char* useReadStr = info.read ? "/read" : ""; + static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); + struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; + info->read = useRead; + // For CollNet, use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0) + if (graph && connIndex == 1) info->read = 0; + const char* useReadStr = info->read ? 
"/read" : ""; int sendSize = sizeof(struct ncclSendMem); // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure - if (info.read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE]; + if (info->read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE]; ALIGN_SIZE(sendSize, CUDA_IPC_MIN); - resources->remoteId = -1; - resources->bootstrap = comm->bootstrap; if (intermediateRank == -1) { - NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, sendSize)); - info.rank = myInfo->rank; + info->rank = myInfo->rank; if (myInfo->pidHash == peerInfo->pidHash) { - send->conn.direct |= info.read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; + send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); } else { - send->conn.direct |= info.read ? NCCL_IPC_READ : NCCL_IPC_WRITE; - CUDACHECK(cudaIpcGetMemHandle(&info.devIpc, info.directPtr)); + send->conn.direct |= info->read ? 
NCCL_IPC_READ : NCCL_IPC_WRITE; INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); } } else { - NCCLCHECK(bootstrapRemAlloc(sendSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr)); - info.rank = intermediateRank; + info->rank = intermediateRank; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank, comm->peerInfo[intermediateRank].busId, useReadStr); } - resources->memRank = info.rank; - NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr)); + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn)); + NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); - static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); - memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo)); + NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc)); return ncclSuccess; } @@ -215,36 +216,32 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st int useRead, intermediateRank; NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); - struct p2pConnectInfo info; - // For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0) - info.read = (connIndex == 0) ? 
useRead : 0; + static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); + struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; + info->read = useRead; + // For CollNet, use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0) + if (graph && connIndex == 1) info->read = 0; - int recvSize = offsetof(struct ncclRecvMem, buff); + int recvSize = sizeof(struct ncclRecvMem); // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure - for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info.read && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p]; + for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info->read && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p]; ALIGN_SIZE(recvSize, CUDA_IPC_MIN); - resources->remoteId = -1; - resources->bootstrap = comm->bootstrap; if (intermediateRank == -1) { - NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, recvSize)); - info.rank = myInfo->rank; + info->rank = myInfo->rank; if (myInfo->pidHash == peerInfo->pidHash) { - recv->conn.direct |= info.read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; + recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; } else { - recv->conn.direct |= info.read ? NCCL_IPC_READ : NCCL_IPC_WRITE; - CUDACHECK(cudaIpcGetMemHandle(&info.devIpc, info.directPtr)); + recv->conn.direct |= info->read ? 
NCCL_IPC_READ : NCCL_IPC_WRITE; } } else { - NCCLCHECK(bootstrapRemAlloc(recvSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr)); - info.rank = intermediateRank; + info->rank = intermediateRank; } - resources->memRank = info.rank; - NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr)); + NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn)); + NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff))); - static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); - memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo)); + NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc)); return ncclSuccess; } @@ -254,16 +251,16 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co struct ncclRecvMem* remDevMem; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; - NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->remIpcPtr)); + NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc)); - int offset = 0; + char* buff = (char*)(remDevMem+1); for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { if (info->read && p == NCCL_PROTO_SIMPLE) { /* For P2P Read the SIMPLE buffer is local (ncclSendMem) */ - send->conn.buffs[p] = resources->devMem->buff; + send->conn.buffs[p] = (char*)(resources->devMem+1); } else { - send->conn.buffs[p] = remDevMem->buff + offset; - offset += send->comm->buffSizes[p]; + send->conn.buffs[p] = buff; + buff += send->comm->buffSizes[p]; } } send->conn.tail = &remDevMem->tail; @@ -279,16 +276,16 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn struct 
ncclSendMem* remDevMem; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; - NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->remIpcPtr)); + NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc)); - int offset = 0; + char* buff = (char*)(resources->devMem+1); for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { if (info->read && p == NCCL_PROTO_SIMPLE) { /* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */ - recv->conn.buffs[p] = remDevMem->buff; + recv->conn.buffs[p] = (char*)(remDevMem+1); } else { - recv->conn.buffs[p] = resources->devMem->buff + offset; - offset += recv->comm->buffSizes[p]; + recv->conn.buffs[p] = buff; + buff += recv->comm->buffSizes[p]; } } recv->conn.tail = &resources->devMem->tail; @@ -298,39 +295,49 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn return ncclSuccess; } -ncclResult_t p2pSendFree(void* resources) { - struct p2pSendResources* sendRes = (struct p2pSendResources*)resources; - if (sendRes->ipcPtr) - CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr)); - if (sendRes->remIpcPtr) - CUDACHECK(cudaIpcCloseMemHandle(sendRes->remIpcPtr)); - if (sendRes->remoteId != -1) { - NCCLCHECK(bootstrapRemFree(sendRes->remoteId, sendRes->memRank, sendRes->bootstrap)); - sendRes->devMem = NULL; - } - CUDACHECK(cudaFree(sendRes->devMem)); - free(sendRes); +ncclResult_t p2pSendFree(struct ncclConnector* send) { + struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources; + if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); + if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); + free(resources); return ncclSuccess; } -ncclResult_t p2pRecvFree(void* resources) { - struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources; - if (recvRes->ipcPtr) - 
CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr)); - if (recvRes->remIpcPtr) - CUDACHECK(cudaIpcCloseMemHandle(recvRes->remIpcPtr)); - if (recvRes->remoteId != -1) { - NCCLCHECK(bootstrapRemFree(recvRes->remoteId, recvRes->memRank, recvRes->bootstrap)); - recvRes->devMem = NULL; +ncclResult_t p2pRecvFree(struct ncclConnector* recv) { + struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources; + if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc)); + if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc)); + free(resources); + return ncclSuccess; +} + +static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + if (reqSize != sizeof(int)) return ncclInternalError; + int size = *((int*)reqBuff); + if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError; + struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff; + NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size)); + connection->transportResources = p2pBuff->directPtr; + cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr); + if (res != cudaSuccess) { + WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res)); + cudaFree(p2pBuff->directPtr); + free(p2pBuff); + CUDACHECK(res); } - CUDACHECK(cudaFree(recvRes->devMem)); - free(recvRes); + *done = 1; + return ncclSuccess; +} + +static ncclResult_t p2pProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) { + // Do not check return code as CUDA may have already shut down + cudaFree(connection->transportResources); return ncclSuccess; } struct ncclTransport p2pTransport = { "P2P", p2pCanConnect, - { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL }, - { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL } + { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL }, + { 
p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL } }; diff --git a/src/transport/shm.cc b/src/transport/shm.cc index 98e25a9..974a2ab 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,12 +8,10 @@ #include "shm.h" struct shmConnectInfo { - uint64_t pidHash; - int id; - int sendRank; - int recvRank; + char shmName[7]; int shmSize; }; +static_assert(sizeof(shmConnectInfo) <= CONNECT_SIZE, "SHM Connect info is too large"); struct shmSendResources { int remShmSize; @@ -62,21 +60,17 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; - struct shmConnectInfo info; - info.id = channelId; - info.pidHash = myInfo->pidHash; - info.sendRank = myInfo->rank; - info.recvRank = peerInfo->rank; + static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big"); + struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank); - info.shmSize = resources->shmSize = sizeof(struct ncclSendMem); - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); - NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); + char shmPath[PATH_MAX]; + shmPath[0] = '\0'; + info->shmSize = resources->shmSize = sizeof(struct ncclSendMem); + NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 
1)); + TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize); + memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName)); INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); - static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big"); - memcpy(connectInfo, &info, sizeof(struct shmConnectInfo)); return ncclSuccess; } @@ -85,22 +79,18 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; - struct shmConnectInfo info; - info.id = channelId; - info.pidHash = myInfo->pidHash; - info.sendRank = peerInfo->rank; - info.recvRank = myInfo->rank; + static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big"); + struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank); - int shmSize = offsetof(struct ncclRecvMem, buff); + char shmPath[PATH_MAX]; + shmPath[0] = '\0'; + int shmSize = sizeof(struct ncclRecvMem); for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p]; - info.shmSize = resources->shmSize = shmSize; - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); - NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); + info->shmSize = resources->shmSize = shmSize; + NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); + TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize); + memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName)); - static_assert(sizeof(struct shmConnectInfo) <= 
sizeof(struct ncclConnect), "shm Connect Send Info is too big"); - memcpy(connectInfo, &info, sizeof(struct shmConnectInfo)); return ncclSuccess; } @@ -110,18 +100,18 @@ ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectIn struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; struct shmSendResources* resources = (struct shmSendResources*)send->transportResources; - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank); + char shmPath[PATH_MAX]; + sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName); resources->remShmSize = info->shmSize; - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); - NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); + NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); // Remove the file to ensure proper clean-up - NCCLCHECK(shmUnlink(shmName)); + NCCLCHECK(ncclShmUnlink(shmPath)); send->transportResources = resources; int offset = 0; for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { - send->conn.buffs[p] = resources->devRemHostMem->buff + offset; + send->conn.buffs[p] = (char*)(resources->devRemHostMem+1) + offset; offset += send->comm->buffSizes[p]; } send->conn.tail = &resources->devRemHostMem->tail; @@ -135,35 +125,35 @@ ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank); + char shmPath[PATH_MAX]; + sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName); 
resources->remShmSize = info->shmSize; - TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); - NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); - NCCLCHECK(shmUnlink(shmName)); + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize); + NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); + NCCLCHECK(ncclShmUnlink(shmPath)); recv->conn.head = &resources->devRemHostMem->head; int offset = 0; for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { - recv->conn.buffs[p] = resources->devHostMem->buff + offset; + recv->conn.buffs[p] = (char*)(resources->devHostMem+1) + offset; offset += recv->comm->buffSizes[p]; } recv->conn.tail = &resources->devHostMem->tail; return ncclSuccess; } -ncclResult_t shmSendFree(void* transportResources) { - struct shmSendResources* resources = (struct shmSendResources*)transportResources; - NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); - NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); +ncclResult_t shmSendFree(struct ncclConnector* send) { + struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources; + NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); + NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); free(resources); return ncclSuccess; } -ncclResult_t shmRecvFree(void* transportResources) { - struct shmRecvResources* resources = (struct shmRecvResources*)transportResources; - NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); - NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); +ncclResult_t shmRecvFree(struct ncclConnector* recv) { + struct shmRecvResources* resources = (struct 
shmRecvResources*)recv->transportResources; + NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize)); + NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize)); free(resources); return ncclSuccess; } @@ -171,6 +161,6 @@ ncclResult_t shmRecvFree(void* transportResources) { struct ncclTransport shmTransport = { "SHM", shmCanConnect, - { shmSendSetup, shmSendConnect, shmSendFree, NULL }, - { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL } + { shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL }, + { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL } };