github.com/NVIDIA/nccl.git
-rw-r--r--  makefiles/common.mk                      |    5
-rw-r--r--  makefiles/version.mk                     |    4
-rw-r--r--  src/Makefile                             |   16
-rw-r--r--  src/bootstrap.cc                         |  387
-rw-r--r--  src/channel.cc                           |    6
-rw-r--r--  src/collectives/device/all_gather.h      |   16
-rw-r--r--  src/collectives/device/all_reduce.h      |   88
-rw-r--r--  src/collectives/device/broadcast.h       |   18
-rw-r--r--  src/collectives/device/common.h          |  149
-rw-r--r--  src/collectives/device/common_kernel.h   |   11
-rw-r--r--  src/collectives/device/onerank_reduce.cu |   12
-rw-r--r--  src/collectives/device/primitives.h      |    4
-rw-r--r--  src/collectives/device/prims_ll.h        |   10
-rw-r--r--  src/collectives/device/prims_ll128.h     |   10
-rw-r--r--  src/collectives/device/prims_simple.h    |   56
-rw-r--r--  src/collectives/device/reduce.h          |   18
-rw-r--r--  src/collectives/device/reduce_scatter.h  |   16
-rw-r--r--  src/collectives/device/sendrecv.h        |  124
-rw-r--r--  src/collectives/sendrecv.cc              |    8
-rw-r--r--  src/debug.cc                             |   18
-rw-r--r--  src/enhcompat.cc                         |   28
-rw-r--r--  src/enqueue.cc                           |  451
-rw-r--r--  src/graph/connect.cc                     |    5
-rw-r--r--  src/graph/paths.cc                       |  167
-rw-r--r--  src/graph/search.cc                      |  221
-rw-r--r--  src/graph/topo.cc                        |   51
-rw-r--r--  src/graph/topo.h                         |   19
-rw-r--r--  src/graph/tuning.cc                      |   12
-rw-r--r--  src/graph/xml.cc                         |   20
-rw-r--r--  src/graph/xml.h                          |   10
-rw-r--r--  src/group.cc                             |  151
-rw-r--r--  src/include/alloc.h                      |   25
-rw-r--r--  src/include/bootstrap.h                  |    7
-rw-r--r--  src/include/checks.h                     |   80
-rw-r--r--  src/include/coll_net.h                   |    4
-rw-r--r--  src/include/collectives.h                |    4
-rw-r--r--  src/include/comm.h                       |   34
-rw-r--r--  src/include/debug.h                      |    7
-rw-r--r--  src/include/devcomm.h                    |  110
-rw-r--r--  src/include/enqueue.h                    |   16
-rw-r--r--  src/include/graph.h                      |    9
-rw-r--r--  src/include/ibvwrap.h                    |    6
-rw-r--r--  src/include/info.h                       |   16
-rw-r--r--  src/include/nccl_net.h                   |  120
-rw-r--r--  src/include/net.h                        |   56
-rw-r--r--  src/include/nvmlwrap.h                   |  125
-rw-r--r--  src/include/param.h                      |    3
-rw-r--r--  src/include/profiler.h                   |   37
-rw-r--r--  src/include/proxy.h                      |  191
-rw-r--r--  src/include/shm.h                        |   66
-rw-r--r--  src/include/socket.h                     |  467
-rw-r--r--  src/include/timer.h                      |   60
-rw-r--r--  src/include/transport.h                  |   19
-rw-r--r--  src/include/utils.h                      |    8
-rw-r--r--  src/init.cc                              |  402
-rw-r--r--  src/misc/argcheck.cc                     |   10
-rw-r--r--  src/misc/ibvwrap.cc                      |   22
-rw-r--r--  src/misc/nvmlwrap.cc                     |  397
-rw-r--r--  src/misc/profiler.cc                     |  115
-rw-r--r--  src/misc/shmutils.cc                     |   90
-rw-r--r--  src/misc/socket.cc                       |  552
-rw-r--r--  src/net.cc                               |  261
-rw-r--r--  src/proxy.cc                             | 1226
-rw-r--r--  src/transport.cc                         |   29
-rw-r--r--  src/transport/coll_net.cc                |  685
-rw-r--r--  src/transport/net.cc                     | 1112
-rw-r--r--  src/transport/net_ib.cc                  |  730
-rw-r--r--  src/transport/net_socket.cc              |  214
-rw-r--r--  src/transport/p2p.cc                     |  189
-rw-r--r--  src/transport/shm.cc                     |   94
70 files changed, 6366 insertions(+), 3343 deletions(-)
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 64f8d2d..1a1c2b6 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -23,7 +23,6 @@ CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
#$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR})
-
# You should define NVCC_GENCODE in your environment to the minimal set
# of archs to reduce compile time.
CUDA8_GENCODE = -gencode=arch=compute_35,code=sm_35 \
@@ -39,7 +38,7 @@ CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
# Include Ampere support if we're using CUDA11 or above
ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
- NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) $(CUDA11_GENCODE) $(CUDA11_PTX)
+ NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX)
# Include Volta support if we're using CUDA9 or above
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0)
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
diff --git a/makefiles/version.mk b/makefiles/version.mk
index 22bddce..e7fe35e 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
-NCCL_MINOR := 11
-NCCL_PATCH := 4
+NCCL_MINOR := 12
+NCCL_PATCH := 7
NCCL_SUFFIX :=
PKG_REVISION := 1
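
Note: the version bump above takes NCCL from 2.11.4 to 2.12.7. As a hedged, self-contained sketch (not part of this diff), an application can check at run time that the loaded libnccl is at least as new as the headers it was built against, using ncclGetVersion() and the NCCL_VERSION_CODE macro from nccl.h:

// Illustrative only: verify the runtime NCCL version against the build headers.
#include <cstdio>
#include <nccl.h>

int main() {
  int version = 0;
  if (ncclGetVersion(&version) != ncclSuccess) {
    std::fprintf(stderr, "ncclGetVersion failed\n");
    return 1;
  }
  std::printf("built against %d, running %d\n", NCCL_VERSION_CODE, version);
  // NCCL encodes the version as a single integer (e.g. 21207 for 2.12.7),
  // so a plain comparison detects an older runtime library.
  if (version < NCCL_VERSION_CODE) {
    std::fprintf(stderr, "runtime NCCL is older than the headers\n");
    return 1;
  }
  return 0;
}
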
diff --git a/src/Makefile b/src/Makefile
index a548840..65c8b28 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -9,8 +9,8 @@ include ../makefiles/version.mk
##### src files
INCEXPORTS := nccl.h nccl_net.h
-LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc \
- misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc \
+LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc enhcompat.cc net.cc \
+ misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc misc/socket.cc misc/shmutils.cc misc/profiler.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
@@ -74,14 +74,14 @@ $(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB)
ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
+null :=
+space := $(null) #
+comma := ,
+
$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB)
@printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@
mkdir -p $(LIBDIR)
- $(eval TMP := $(shell mktemp -d))
- cp $(LIBOBJ) $(TMP)
- cd $(TMP) && ar x $(DEVICELIB) && cd -
- ar cr $@ $(LIBOBJ) $(TMP)/*.o
- rm -Rf $(TMP)
+ printf "create $@\naddlib $(DEVICELIB)\naddmod $(subst $(space),$(comma),$(strip $(LIBOBJ)))\nsave\nend" | ar -M
$(PKGDIR)/nccl.pc : nccl.pc.in
mkdir -p $(PKGDIR)
diff --git a/src/bootstrap.cc b/src/bootstrap.cc
index ae9da9b..db1e70e 100644
--- a/src/bootstrap.cc
+++ b/src/bootstrap.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,13 +9,13 @@
#include "utils.h"
#include "bootstrap.h"
#include "net.h"
-#include "socket.h"
#include <unistd.h>
#include <sys/types.h>
+#include "proxy.h"
/* Init functions */
static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1];
-static union socketAddress bootstrapNetIfAddr;
+static union ncclSocketAddress bootstrapNetIfAddr;
static int bootstrapNetInitDone = 0;
pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER;
@@ -25,17 +25,17 @@ ncclResult_t bootstrapNetInit() {
if (bootstrapNetInitDone == 0) {
char* env = getenv("NCCL_COMM_ID");
if (env) {
- union socketAddress remoteAddr;
- if (GetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
+ union ncclSocketAddress remoteAddr;
+ if (ncclGetSocketAddrFromString(&remoteAddr, env) != ncclSuccess) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
- if (findInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
+ if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
WARN("NET/Socket : No usable listening interface found");
return ncclSystemError;
}
} else {
- int nIfs = findInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
+ int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1);
if (nIfs <= 0) {
WARN("Bootstrap : no socket interface found");
return ncclInternalError;
@@ -43,7 +43,7 @@ ncclResult_t bootstrapNetInit() {
}
char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2];
sprintf(line, " %s:", bootstrapNetIfName);
- socketToString(&bootstrapNetIfAddr, line+strlen(line));
+ ncclSocketToString(&bootstrapNetIfAddr, line+strlen(line));
INFO(NCCL_INIT, "Bootstrap : Using%s", line);
bootstrapNetInitDone = 1;
}
@@ -55,35 +55,28 @@ ncclResult_t bootstrapNetInit() {
/* Socket Interface Selection type */
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
-static ncclResult_t bootstrapNetAccept(int listenFd, int* recvFd, union socketAddress *addr) {
- struct sockaddr *saddr = &addr->sa;
- socklen_t socklen = sizeof(union socketAddress);
- SYSCHECKVAL(accept(listenFd, saddr, &socklen), "accept", *recvFd);
- return ncclSuccess;
-}
-
// Additional sync functions
-static ncclResult_t bootstrapNetSend(int fd, union socketAddress *addr, void* data, int size) {
- NCCLCHECK(socketSend(fd, addr, &size, sizeof(int)));
- NCCLCHECK(socketSend(fd, addr, data, size));
+static ncclResult_t bootstrapNetSend(struct ncclSocket* sock, void* data, int size) {
+ NCCLCHECK(ncclSocketSend(sock, &size, sizeof(int)));
+ NCCLCHECK(ncclSocketSend(sock, data, size));
return ncclSuccess;
}
-static ncclResult_t bootstrapNetRecv(int fd, union socketAddress *addr, void* data, int size) {
+static ncclResult_t bootstrapNetRecv(struct ncclSocket* sock, void* data, int size) {
int recvSize;
- NCCLCHECK(socketRecv(fd, addr, &recvSize, sizeof(int)));
+ NCCLCHECK(ncclSocketRecv(sock, &recvSize, sizeof(int)));
if (recvSize > size) {
WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
return ncclInternalError;
}
- NCCLCHECK(socketRecv(fd, addr, data, std::min(recvSize, size)));
+ NCCLCHECK(ncclSocketRecv(sock, data, std::min(recvSize, size)));
return ncclSuccess;
}
struct extInfo {
int rank;
int nranks;
- union socketAddress extAddressListenRoot;
- union socketAddress extAddressListen;
+ union ncclSocketAddress extAddressListenRoot;
+ union ncclSocketAddress extAddressListen;
};
#include <sys/resource.h>
@@ -97,24 +90,24 @@ static ncclResult_t setFilesLimit() {
}
static void *bootstrapRoot(void* args) {
- int listenFd = (uint64_t)args;
+ struct ncclSocket* listenSock = (struct ncclSocket*)args;
ncclResult_t res = ncclSuccess;
int nranks = 0, c = 0;
struct extInfo info;
- union socketAddress *rankAddresses = NULL;
- union socketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
- union socketAddress *zero = NULL;
+ union ncclSocketAddress *rankAddresses = NULL;
+ union ncclSocketAddress *rankAddressesRoot = NULL; // for initial rank <-> root information exchange
+ union ncclSocketAddress *zero = NULL;
NCCLCHECKGOTO(ncclCalloc(&zero, 1), res, out);
setFilesLimit();
TRACE(NCCL_INIT, "BEGIN");
/* Receive addresses from all ranks */
do {
- int tmpFd;
- union socketAddress addr;
- NCCLCHECKGOTO(bootstrapNetAccept(listenFd, &tmpFd, &addr), res, out);
- NCCLCHECKGOTO(bootstrapNetRecv(tmpFd, &addr, &info, sizeof(info)), res, out);
- close(tmpFd);
+ struct ncclSocket sock;
+ sock.abortFlag = NULL;
+ NCCLCHECKGOTO(ncclSocketAccept(&sock, listenSock), res, out);
+ NCCLCHECKGOTO(bootstrapNetRecv(&sock, &info, sizeof(info)), res, out);
+ close(sock.fd);
if (c == 0) {
nranks = info.nranks;
@@ -127,14 +120,14 @@ static void *bootstrapRoot(void* args) {
goto out;
}
- if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union socketAddress)) != 0) {
+ if (memcmp(zero, &rankAddressesRoot[info.rank], sizeof(union ncclSocketAddress)) != 0) {
WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks);
goto out;
}
// Save the connection handle for that rank
- memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union socketAddress));
- memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union socketAddress));
+ memcpy(rankAddressesRoot+info.rank, &info.extAddressListenRoot, sizeof(union ncclSocketAddress));
+ memcpy(rankAddresses+info.rank, &info.extAddressListen, sizeof(union ncclSocketAddress));
++c;
TRACE(NCCL_INIT, "Received connect from rank %d total %d/%d", info.rank, c, nranks);
@@ -144,15 +137,18 @@ static void *bootstrapRoot(void* args) {
// Send the connect handle for the next rank in the AllGather ring
for (int r=0; r<nranks; ++r) {
int next = (r+1) % nranks;
- int tmpSendFd;
- NCCLCHECKGOTO(connectAddress(&tmpSendFd, rankAddressesRoot+r), res, out);
- NCCLCHECKGOTO(bootstrapNetSend(tmpSendFd, rankAddressesRoot+r, rankAddresses+next, sizeof(union socketAddress)), res, out);
- close(tmpSendFd);
+ struct ncclSocket sock;
+ sock.abortFlag = NULL;
+ memcpy(&sock.addr, rankAddressesRoot+r, sizeof(union ncclSocketAddress));
+ NCCLCHECKGOTO(ncclSocketConnect(&sock), res, out);
+ NCCLCHECKGOTO(bootstrapNetSend(&sock, rankAddresses+next, sizeof(union ncclSocketAddress)), res, out);
+ close(sock.fd);
}
TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
out:
- close(listenFd);
+ close(listenSock->fd);
+ free(listenSock);
if (rankAddresses) free(rankAddresses);
if (rankAddressesRoot) free(rankAddressesRoot);
if (zero) free(zero);
@@ -162,28 +158,31 @@ out:
}
ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
- union socketAddress* connectAddr = (union socketAddress*) id;
- int listenFd;
- NCCLCHECK(createListenSocket(&listenFd, connectAddr));
+ struct ncclSocket* listenSock;
+ NCCLCHECK(ncclCalloc(&listenSock, 1));
+ memcpy(&listenSock->addr, id, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketListen(listenSock));
+ memcpy(id, &listenSock->addr, sizeof(union ncclSocketAddress));
pthread_t thread;
- pthread_create(&thread, NULL, bootstrapRoot, (void*)(uint64_t)listenFd);
+ pthread_create(&thread, NULL, bootstrapRoot, (void*)listenSock);
+ ncclSetThreadName(thread, "NCCL BootstrapR");
return ncclSuccess;
}
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
- static_assert(sizeof(union socketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
+ static_assert(sizeof(union ncclSocketAddress) < sizeof(ncclUniqueId), "NetId does not fit inside ncclUniqueId");
memset(id, 0, sizeof(ncclUniqueId));
- union socketAddress* connectAddr = (union socketAddress*) id;
+ union ncclSocketAddress* connectAddr = (union ncclSocketAddress*) id;
char* env = getenv("NCCL_COMM_ID");
if (env) {
INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", env);
- if (GetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
+ if (ncclGetSocketAddrFromString(connectAddr, env) != ncclSuccess) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
} else {
- memcpy(id, &bootstrapNetIfAddr, sizeof(union socketAddress));
+ memcpy(id, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
NCCLCHECK(bootstrapCreateRoot(id, false));
}
@@ -193,157 +192,51 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) {
struct unexConn {
int peer;
int tag;
- int fd;
- union socketAddress addr;
+ struct ncclSocket sock;
struct unexConn* next;
};
-// Remote allocator state
-struct remAllocState {
- int cudaDev;
- int listenFd;
- volatile int stop;
-};
-
-struct extState {
- int extListenFd;
- int extRingRecvFd;
- int extRingSendFd;
- union socketAddress extRingRecvAddr, extRingSendAddr;
- union socketAddress* peerCommAddresses;
- union socketAddress* peerAllocAddresses;
+struct bootstrapState {
+ struct ncclSocket listenSock;
+ struct ncclSocket ringRecvSocket;
+ struct ncclSocket ringSendSocket;
+ union ncclSocketAddress* peerCommAddresses;
+ union ncclSocketAddress* peerProxyAddresses;
struct unexConn* unexpectedConnections;
int cudaDev;
int rank;
int nranks;
-
- // Intermediate memory allocation service
- struct remAllocState* allocState;
- pthread_t allocThread;
+ volatile uint32_t *abortFlag;
};
-#define MAX_SEGMENTS 128
-
-static ncclResult_t remoteAlloc(void** ptr, int fd, union socketAddress *addr) {
- size_t size;
- NCCLCHECK(socketRecv(fd, addr, &size, sizeof(size_t)));
- cudaIpcMemHandle_t devIpc;
- NCCLCHECK(ncclCudaCalloc((char**)ptr, size));
- cudaError_t res = cudaIpcGetMemHandle(&devIpc, *ptr);
- if (res != cudaSuccess) {
- WARN("[Rem Allocator] cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
- cudaFree(*ptr);
- CUDACHECK(res);
- }
- // The CUDA IPC
- NCCLCHECK(socketSend(fd, addr, &devIpc, sizeof(cudaIpcMemHandle_t)));
- // And the direct pointer
- NCCLCHECK(socketSend(fd, addr, ptr, sizeof(void*)));
- return ncclSuccess;
-}
-
-#include <poll.h>
-
-// Service thread to allocate memory for other GPUs, used as intermediate step.
-void* ncclRemoteMemAllocationService(void* args) {
- struct remAllocState* state = (struct remAllocState *) args;
- if (cudaSetDevice(state->cudaDev) != cudaSuccess) {
- WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev);
- }
-
- // Prepare poll descriptor
- void* segments[MAX_SEGMENTS];
- struct pollfd pollfds[MAX_SEGMENTS+1];
- for (int s=0; s<MAX_SEGMENTS; s++) segments[s] = NULL;
- for (int s=0; s<MAX_SEGMENTS; s++) {
- pollfds[s].fd = -1;
- pollfds[s].events = POLLIN;
- }
- pollfds[MAX_SEGMENTS].fd = state->listenFd;
- pollfds[MAX_SEGMENTS].events = POLLIN;
-
- int nbuffers = 0;
- while (state->stop == 0 || (state->stop == 1 && nbuffers > 0)) {
- if (int error = poll(pollfds, MAX_SEGMENTS+1, 100/*ms*/) < 0) {
- WARN("[Rem Allocator] Poll failed with error %d", error);
- return NULL;
- }
- if (pollfds[MAX_SEGMENTS].revents) {
- int s = 0;
- union socketAddress addr;
- while (segments[s] != NULL && s < MAX_SEGMENTS) s++;
- if (bootstrapNetAccept(pollfds[MAX_SEGMENTS].fd, &pollfds[s].fd, &addr) != ncclSuccess) {
- pollfds[s].fd = -1;
- } else {
- if (s == MAX_SEGMENTS || (remoteAlloc(segments+s, pollfds[s].fd, &addr) != ncclSuccess)) {
- WARN("[Rem Allocator] Allocation failed (segment %d, fd %d)", s, pollfds[s].fd);
- close(pollfds[s].fd);
- pollfds[s].fd = -1;
- } else {
- nbuffers++;
- }
- }
- }
- for (int s=0; s<MAX_SEGMENTS; s++) {
- if (pollfds[s].revents & (POLLIN|POLLHUP)) {
- if (cudaFree(segments[s]) != cudaSuccess) {
- WARN("[Rem Allocator] cudaFree %p failed", segments[s]);
- }
- segments[s] = NULL;
- close(pollfds[s].fd);
- pollfds[s].fd = -1;
- nbuffers--;
- }
- }
- }
- for (int s=0; s<MAX_SEGMENTS; s++) {
- if (segments[s]) cudaFree(segments[s]);
- close(pollfds[s].fd);
- }
- close(state->listenFd);
- free(state);
- return NULL;
-}
-
-ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, cudaIpcMemHandle_t* ipc, void** ptr) {
- struct extState* state = (struct extState*)commState;
- int fd;
- ncclResult_t res;
- *id = -1;
- union socketAddress *addr = state->peerAllocAddresses+rank;
- NCCLCHECK(connectAddress(&fd, addr));
- NCCLCHECKGOTO(socketSend(fd, addr, &size, sizeof(size_t)), res, end);
- NCCLCHECKGOTO(socketRecv(fd, addr, ipc, sizeof(cudaIpcMemHandle_t)), res, end);
- NCCLCHECKGOTO(socketRecv(fd, addr, ptr, sizeof(void*)), res, end);
- *id = fd;
-end:
- return res;
-}
-
-ncclResult_t bootstrapRemFree(int id, int rank, void* commState) {
- SYSCHECK(close(id), "close");
- return ncclSuccess;
-}
-
-ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
- struct extState* state;
+ncclResult_t bootstrapInit(ncclUniqueId * id, struct ncclComm* comm) {
+ int rank = comm->rank;
+ int nranks = comm->nRanks;
+ struct bootstrapState* state;
NCCLCHECK(ncclCalloc(&state, 1));
state->rank = rank;
state->nranks = nranks;
- *commState = state;
+ state->abortFlag = comm->abortFlag;
+ comm->bootstrap = state;
TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks);
struct extInfo info = { 0 };
info.rank = rank;
info.nranks = nranks;
- int tmpSendFd, tmpRecvFd;
+ struct ncclSocket sock, listenSockRoot;
+ sock.abortFlag = listenSockRoot.abortFlag = comm->abortFlag;
+ sock.asyncFlag = listenSockRoot.asyncFlag = 0;
+
+ // Create socket for other ranks to contact me
+ memcpy(&state->listenSock.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketListen(&state->listenSock));
+ memcpy(&info.extAddressListen, &state->listenSock.addr, sizeof(union ncclSocketAddress));
- int extListenFdRoot;
- memcpy(&info.extAddressListen, &bootstrapNetIfAddr, sizeof(union socketAddress));
- memcpy(&info.extAddressListenRoot, &bootstrapNetIfAddr, sizeof(union socketAddress));
- NCCLCHECK(createListenSocket(&state->extListenFd, &info.extAddressListen));
- NCCLCHECK(createListenSocket(&extListenFdRoot, &info.extAddressListenRoot));
+ // Create socket for root to contact me
+ memcpy(&listenSockRoot.addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketListen(&listenSockRoot));
+ memcpy(&info.extAddressListenRoot, &listenSockRoot.addr, sizeof(union ncclSocketAddress));
// stagger connection times to avoid an overload of the root
if (nranks > 128) {
@@ -356,35 +249,36 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
}
// send info on my listening socket to root
- union socketAddress* rootAddr = (union socketAddress*)id;
- NCCLCHECK(connectAddress(&tmpSendFd, rootAddr));
- NCCLCHECK(bootstrapNetSend(tmpSendFd, rootAddr, &info, sizeof(info)));
- close(tmpSendFd);
+ memcpy(&sock.addr, id, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketConnect(&sock));
+ NCCLCHECK(bootstrapNetSend(&sock, &info, sizeof(info)));
+ close(sock.fd);
// get info on my "next" rank in the bootstrap ring from root
- union socketAddress addr;
- NCCLCHECK(bootstrapNetAccept(extListenFdRoot, &tmpRecvFd, &addr));
- NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &state->extRingSendAddr, sizeof(state->extRingSendAddr)));
- close(tmpRecvFd);
- close(extListenFdRoot);
+ NCCLCHECK(ncclSocketAccept(&sock, &listenSockRoot));
+ NCCLCHECK(bootstrapNetRecv(&sock, &state->ringSendSocket.addr, sizeof(union ncclSocketAddress)));
+ close(sock.fd);
+ close(listenSockRoot.fd);
- NCCLCHECK(connectAddress(&state->extRingSendFd, &state->extRingSendAddr));
+ NCCLCHECK(ncclSocketConnect(&state->ringSendSocket));
// Accept the connect request from the previous rank in the AllGather ring
- NCCLCHECK(bootstrapNetAccept(state->extListenFd, &state->extRingRecvFd, &state->extRingRecvAddr));
+ NCCLCHECK(ncclSocketAccept(&state->ringRecvSocket, &state->listenSock));
// AllGather all listen handlers
NCCLCHECK(ncclCalloc(&state->peerCommAddresses, nranks));
- memcpy(state->peerCommAddresses+rank, &info.extAddressListen, sizeof(union socketAddress));
- NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union socketAddress)));
-
- // Create the memory allocation service
- NCCLCHECK(ncclCalloc(&state->peerAllocAddresses, nranks));
- memcpy(state->peerAllocAddresses+rank, &bootstrapNetIfAddr, sizeof(union socketAddress));
- NCCLCHECK(ncclCalloc(&state->allocState, 1));
- CUDACHECK(cudaGetDevice(&state->allocState->cudaDev));
- NCCLCHECK(createListenSocket(&state->allocState->listenFd, state->peerAllocAddresses+rank));
- pthread_create(&state->allocThread, NULL, ncclRemoteMemAllocationService, state->allocState);
- NCCLCHECK(bootstrapAllGather(state, state->peerAllocAddresses, sizeof(union socketAddress)));
+ memcpy(state->peerCommAddresses+rank, &state->listenSock.addr, sizeof(union ncclSocketAddress));
+ NCCLCHECK(bootstrapAllGather(state, state->peerCommAddresses, sizeof(union ncclSocketAddress)));
+
+ // Create the service proxy
+ NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks));
+ struct ncclSocket* proxySocket;
+ NCCLCHECK(ncclCalloc(&proxySocket, 1));
+ proxySocket->abortFlag = NULL; // proxy is aborted through a message
+ memcpy(&proxySocket->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketListen(proxySocket));
+ memcpy(state->peerProxyAddresses+rank, &proxySocket->addr, sizeof(union ncclSocketAddress));
+ NCCLCHECK(bootstrapAllGather(state, state->peerProxyAddresses, sizeof(union ncclSocketAddress)));
+ NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses));
TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
@@ -392,7 +286,7 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
}
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
- struct extState* state = (struct extState*)commState;
+ struct bootstrapState* state = (struct bootstrapState*)commState;
char* data = (char*)allData;
int rank = state->rank;
int nranks = state->nranks;
@@ -408,9 +302,9 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
size_t sslice = (rank - i + nranks) % nranks;
// Send slice to the right
- NCCLCHECK(bootstrapNetSend(state->extRingSendFd, &state->extRingSendAddr, data+sslice*size, size));
+ NCCLCHECK(bootstrapNetSend(&state->ringSendSocket, data+sslice*size, size));
// Recv slice from the left
- NCCLCHECK(bootstrapNetRecv(state->extRingRecvFd, &state->extRingRecvAddr, data+rslice*size, size));
+ NCCLCHECK(bootstrapNetRecv(&state->ringRecvSocket, data+rslice*size, size));
}
TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size);
@@ -418,14 +312,15 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) {
}
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) {
- struct extState* state = (struct extState*)commState;
- int tmpSendFd;
- union socketAddress *addr = state->peerCommAddresses+peer;
- NCCLCHECK(connectAddress(&tmpSendFd, addr));
- NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, &state->rank, sizeof(int)));
- NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, &tag, sizeof(int)));
- NCCLCHECK(bootstrapNetSend(tmpSendFd, addr, data, size));
- close(tmpSendFd);
+ struct bootstrapState* state = (struct bootstrapState*)commState;
+ struct ncclSocket sock;
+ sock.abortFlag = state->abortFlag;
+ memcpy(&sock.addr, state->peerCommAddresses+peer, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketConnect(&sock));
+ NCCLCHECK(bootstrapNetSend(&sock, &state->rank, sizeof(int)));
+ NCCLCHECK(bootstrapNetSend(&sock, &tag, sizeof(int)));
+ NCCLCHECK(bootstrapNetSend(&sock, data, size));
+ close(sock.fd);
return ncclSuccess;
}
@@ -466,14 +361,13 @@ ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank,
return ncclSuccess;
}
-ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int tag, int fd, union socketAddress *addr) {
+ncclResult_t unexpectedEnqueue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
// New unex
struct unexConn* unex;
NCCLCHECK(ncclCalloc(&unex, 1));
unex->peer = peer;
unex->tag = tag;
- unex->fd = fd;
- unex->addr = *addr;
+ memcpy(&unex->sock, sock, sizeof(struct ncclSocket));
// Enqueue
struct unexConn* list = state->unexpectedConnections;
@@ -486,7 +380,7 @@ ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int tag, int fd
return ncclSuccess;
}
-int unexpectedDequeue(struct extState* state, int peer, int tag, union socketAddress *addr) {
+ncclResult_t unexpectedDequeue(struct bootstrapState* state, int peer, int tag, struct ncclSocket* sock) {
struct unexConn* elem = state->unexpectedConnections;
struct unexConn* prev = NULL;
while (elem) {
@@ -496,79 +390,72 @@ int unexpectedDequeue(struct extState* state, int peer, int tag, union socketAdd
} else {
prev->next = elem->next;
}
- int fd = elem->fd;
- *addr = elem->addr;
+ memcpy(sock, &elem->sock, sizeof(struct ncclSocket));
free(elem);
- return fd;
+ return ncclSuccess;
}
prev = elem;
elem = elem->next;
}
- return -1;
+ sock->fd = -1;
+ return ncclSuccess;
}
// We can't know who we'll receive from, so we need to receive everything at once
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) {
- struct extState* state = (struct extState*)commState;
+ struct bootstrapState* state = (struct bootstrapState*)commState;
- int tmpRecvFd;
- union socketAddress addr;
+ struct ncclSocket sock;
+ sock.abortFlag = state->abortFlag;
// Search unexpected connections first
- if ((tmpRecvFd = unexpectedDequeue(state, peer, tag, &addr)) != -1) {
- NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, ((char*)data), size));
- close(tmpRecvFd);
+ NCCLCHECK(unexpectedDequeue(state, peer, tag, &sock));
+ if (sock.fd != -1) {
+ NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size));
+ close(sock.fd);
return ncclSuccess;
}
// Then look for new connections
while (1) {
- union socketAddress addr;
- NCCLCHECK(bootstrapNetAccept(state->extListenFd, &tmpRecvFd, &addr));
+ NCCLCHECK(ncclSocketAccept(&sock, &state->listenSock));
int newPeer, newTag;
- NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &newPeer, sizeof(int)));
- NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, &newTag, sizeof(int)));
+ NCCLCHECK(bootstrapNetRecv(&sock, &newPeer, sizeof(int)));
+ NCCLCHECK(bootstrapNetRecv(&sock, &newTag, sizeof(int)));
if (newPeer == peer && newTag == tag) {
- NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &addr, ((char*)data), size));
- close(tmpRecvFd);
+ NCCLCHECK(bootstrapNetRecv(&sock, ((char*)data), size));
+ close(sock.fd);
return ncclSuccess;
}
// Unexpected connection. Save for later.
- NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, tmpRecvFd, &addr));
+ NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, &sock));
}
}
ncclResult_t bootstrapClose(void* commState) {
- struct extState* state = (struct extState*)commState;
+ struct bootstrapState* state = (struct bootstrapState*)commState;
if (state->unexpectedConnections != NULL) {
WARN("Unexpected connections are not empty");
return ncclInternalError;
}
- close(state->extListenFd);
- close(state->extRingSendFd);
- close(state->extRingRecvFd);
-
- state->allocState->stop = 1;
-
- // Join the allocThread so we catch resource leaks as being hung here
- // pthread_join(state->allocThread, nullptr);
+ close(state->listenSock.fd);
+ close(state->ringSendSocket.fd);
+ close(state->ringRecvSocket.fd);
free(state->peerCommAddresses);
- free(state->peerAllocAddresses);
free(state);
return ncclSuccess;
}
ncclResult_t bootstrapAbort(void* commState) {
- struct extState* state = (struct extState*)commState;
+ struct bootstrapState* state = (struct bootstrapState*)commState;
if (commState == NULL) return ncclSuccess;
- if (state->extListenFd) close(state->extListenFd);
- if (state->extRingSendFd) close(state->extRingSendFd);
- if (state->extRingRecvFd) close(state->extRingRecvFd);
- if (state->allocState) state->allocState->stop = 2;
+ if (state->listenSock.fd) close(state->listenSock.fd);
+ if (state->ringSendSocket.fd) close(state->ringSendSocket.fd);
+ if (state->ringRecvSocket.fd) close(state->ringRecvSocket.fd);
free(state->peerCommAddresses);
- free(state->peerAllocAddresses);
+ free(state->peerProxyAddresses);
free(state);
return ncclSuccess;
}
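
Note: the bootstrap rewrite above swaps raw file descriptors plus sockaddr pairs for struct ncclSocket, but keeps the wire framing: every bootstrap message is a 4-byte length followed by the payload, and the receiver warns when the sender's size exceeds the local buffer. A minimal sketch of that framing over plain POSIX sockets, with illustrative helper names (frameSend/frameRecv are stand-ins, not the ncclSocket API):

// Hedged sketch of the length-prefixed bootstrap framing, not NCCL code.
#include <algorithm>
#include <cstdio>
#include <sys/types.h>
#include <sys/socket.h>

static bool sendAll(int fd, const void* buf, size_t len) {
  const char* p = static_cast<const char*>(buf);
  while (len > 0) {
    ssize_t n = send(fd, p, len, 0);
    if (n <= 0) return false;
    p += n; len -= n;
  }
  return true;
}

static bool recvAll(int fd, void* buf, size_t len) {
  char* p = static_cast<char*>(buf);
  while (len > 0) {
    ssize_t n = recv(fd, p, len, 0);
    if (n <= 0) return false;
    p += n; len -= n;
  }
  return true;
}

bool frameSend(int fd, const void* data, int size) {
  return sendAll(fd, &size, sizeof(int)) && sendAll(fd, data, size);
}

bool frameRecv(int fd, void* data, int size) {
  int recvSize = 0;
  if (!recvAll(fd, &recvSize, sizeof(int))) return false;
  if (recvSize > size) {  // same truncation check as bootstrapNetRecv
    std::fprintf(stderr, "Message truncated: received %d bytes instead of %d\n", recvSize, size);
    return false;
  }
  return recvAll(fd, data, std::min(recvSize, size));
}
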
diff --git a/src/channel.cc b/src/channel.cc
index a07e38a..87cec65 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -64,13 +64,13 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
- if (peer->send[b].transportResources) NCCLCHECK(peer->send[b].transportComm->free(peer->send[b].transportResources));
+ if (peer->send[b].transportComm) NCCLCHECK(peer->send[b].transportComm->free(peer->send+b));
}
}
for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
- if (peer->recv[b].transportResources) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv[b].transportResources));
+ if (peer->recv[b].transportComm) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv+b));
}
}
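
Note: the freeChannel change above alters the transport free() contract: the connector itself is passed instead of only its transportResources pointer, and the guard moves from "resources set" to "transport comm set". A simplified, hypothetical illustration of the new contract (struct names are stand-ins, not NCCL's own types):

// Illustrative sketch of the changed free() signature, assuming nccl.h for ncclResult_t.
#include <nccl.h>

struct Connector;  // stands in for the NCCL connector type
struct TransportComm {
  ncclResult_t (*free)(Connector* conn);  // was: (*free)(void* transportResources)
};
struct Connector {
  TransportComm* transportComm;
  void* transportResources;
};

static ncclResult_t freeConnector(Connector* conn) {
  // The guard is now on transportComm, so transports whose resources pointer
  // is legitimately NULL still get a chance to clean up.
  if (conn->transportComm) return conn->transportComm->free(conn);
  return ncclSuccess;
}
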
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index 83b0da9..c86384c 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,9 +12,9 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
const int *ringRanks = ring->devUserRanks;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
@@ -22,12 +22,12 @@ namespace {
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*int(chunkSize);
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
T *inputBuf = (T*)args->sendbuff;
T *outputBuf = (T*)args->recvbuff;
- Primitives<T, RedOp, FanSymmetric<1>, 1, Proto>
- prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->coll.redOpArg);
+ Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
+ (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t realChunkSize;
@@ -36,7 +36,7 @@ namespace {
realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
- realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize;
+ realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
realChunkSize = int(realChunkSize);
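
Note: the device-side changes in this and the following files all follow the same work-element refactor: the per-element nThreads field is gone (the thread count now comes from a shared header's nWarps), and the nested coll.* fields (bid, nChannels, count, lastChunkSize, redOpArg, root) become top-level element fields. A rough host-side sketch of that layout, assuming simplified types (only the fields named in the diff are shown):

// Illustrative stand-in for the new work-element layout; not the real devcomm.h structs.
#include <cstddef>
#include <cstdint>

#define WARP_SIZE 32

struct WorkHeader { uint8_t nWarps; /* plus type, funcIndex, isLast, ... */ };
struct WorkElem {
  WorkHeader header;
  int bid;
  int nChannels;
  size_t count;
  size_t lastChunkSize;
  uint64_t redOpArg;
  int root;
  const void* sendbuff;
  void* recvbuff;
};

inline int workNthreads(const WorkElem& e) {
  return e.header.nWarps * WARP_SIZE;  // replaces the old args->nThreads field
}
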
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index c3171bf..41ef255 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,15 +12,15 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
int ringIx = ring->index;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1));
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*nranks*chunkSize;
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
int minChunkSize;
if (Proto::Id == NCCL_PROTO_LL)
@@ -30,8 +30,8 @@ namespace {
minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2;
}
- Primitives<T, RedOp, FanSymmetric<1>, 1, Proto> prims
- (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+ Primitives<T, RedOp, FanSymmetric<1>, 1, Proto, 0> prims
+ (tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t realChunkSize;
@@ -97,25 +97,25 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runTreeUpDown(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclTree *tree = &ncclShmem.channel.tree;
ssize_t chunkSize = int(
- Proto::Id == NCCL_PROTO_SIMPLE ? args->coll.lastChunkSize
+ Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize
/* LL & LL128 */ : Proto::calcBytePerStep()/sizeof(T));
const ssize_t minChunkSize = int(
Proto::Id == NCCL_PROTO_SIMPLE ? (nthreads-2*WARP_SIZE)*8*(sizeof(uint64_t)/sizeof(T))
/* LL & LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const ssize_t loopSize = int(nChannels*chunkSize);
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
if (loopSize > size)
chunkSize = divUp((int)size, int(nChannels*minChunkSize))*int(minChunkSize);
{ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
- Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto> prims
- (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+ Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
+ (tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg);
if (tree->up == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -140,8 +140,8 @@ namespace {
}
{ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
- Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto> prims
- (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+ Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto, 0> prims
+ (tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
if (tree->up == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -169,19 +169,19 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runTreeSplit(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclTree *tree = &ncclShmem.channel.tree;
ssize_t chunkSize = int(
- Proto::Id != NCCL_PROTO_LL ? args->coll.lastChunkSize
+ Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize
: Proto::calcBytePerStep()/sizeof(T));
const ssize_t minChunkSize = int(
Proto::Id == NCCL_PROTO_SIMPLE ? (nthreads - 2*WARP_SIZE)*8*(sizeof(uint64_t)/sizeof(T)) :
Proto::Id == NCCL_PROTO_LL ? nthreads*(Proto::calcBytePerGrain()/sizeof(T))
/* LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T))/8);
const ssize_t loopSize = int(nChannels*chunkSize);
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
int nthreadsSplit;
if (Proto::Id == NCCL_PROTO_SIMPLE) {
@@ -198,8 +198,8 @@ namespace {
if (tree->up == -1) {
// Reduce and broadcast. Max number of recv is 3, max number of send is 3
- Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto>
- prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+ Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto, 0>
+ prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
@@ -215,8 +215,8 @@ namespace {
* into DirectRecv and DirectSend capabilities, this ctor would have both=0,
* but the ctor above for tree roots would be DirectRecv=0 DirectSend=1.
*/
- Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/1, Proto>
- prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg, 0*Proto::MaxGroupWidth);
+ Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/1, Proto, 0>
+ prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth);
if (tree->down[0] == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -234,8 +234,8 @@ namespace {
}
else {
// Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
- Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto>
- prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg, 1*Proto::MaxGroupWidth);
+ Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/1, Proto, 0>
+ prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, 1*Proto::MaxGroupWidth);
if (tree->down[0] == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
@@ -278,11 +278,11 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
__device__ __forceinline__ void run(ncclWorkElem *args) {
static constexpr int COLLNET_COPY_THREADS = 96;
const int tid = threadIdx.x;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
struct ncclDirect* tree = &ncclShmem.channel.collTree;
- const ssize_t chunkSize = int(args->coll.lastChunkSize);
- const ssize_t size = args->coll.count;
+ const ssize_t chunkSize = int(args->lastChunkSize);
+ const ssize_t size = args->count;
const ssize_t loopSize = nChannels*tree->nHeads*chunkSize;
const int hasUp = (tree->up[0] >= 0) ? 1 : 0;
@@ -290,7 +290,7 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
const int nThreadsScatter = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 3*COLLNET_COPY_THREADS : 0);
const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
const int nThreadsBcast = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 2*COLLNET_COPY_THREADS);
- const int nThreadsReduce = args->nThreads - nThreadsScatter - nThreadsGather - nThreadsBcast;
+ const int nThreadsReduce = args->header.nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
const int tidStartBcast = nThreadsGather;
const int tidStartScatter = tidStartBcast + nThreadsBcast;
const int tidStartReduce = tidStartScatter + nThreadsScatter;
@@ -300,8 +300,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
// Scatter
int group = (2*Proto::MaxGroupWidth) | (1<<16);
- Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto>
- prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args);
+ Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
+ prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize;
int nelem = min(tree->nHeads*chunkSize, size-offset);
@@ -315,8 +315,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
int group = (3*Proto::MaxGroupWidth) | (1<<16);
if (hasDn) {
// Reduce, send to network
- Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto>
- prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args);
+ Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/1, Proto, 0>
+ prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -328,8 +328,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
}
} else {
// Directly send to network
- Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto>
- prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->coll.redOpArg, group);
+ Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
+ prims(tid-tidStartReduce, nThreadsReduce, nullptr, &tree->out, args->sendbuff, args->recvbuff, args->redOpArg, group);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -339,8 +339,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
} else if (tid < tidStartBcast && hasUp) {
// Gather
int group = (0*Proto::MaxGroupWidth) | (0<<16);
- Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto>
- prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args);
+ Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/1, Proto, 0>
+ prims(tid, nThreadsGather, tree->up, NULL, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize;
int nelem = min(tree->nHeads*chunkSize, size-offset);
@@ -350,8 +350,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
int group = (1*Proto::MaxGroupWidth) | (0<<16);
if (hasDn) {
// Recv from network, broadcast
- Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto>
- prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, args->sendbuff, args->recvbuff, args->coll.redOpArg, group, args);
+ Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/1, Proto, 0>
+ prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, args->sendbuff, args->recvbuff, args->redOpArg, group, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
@@ -359,8 +359,8 @@ struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET, NCCL_PROTO
}
} else {
// Recv from network (no post thread needed)
- Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto>
- prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->coll.redOpArg, group);
+ Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
+ prims(tid-tidStartBcast, nThreadsBcast, &tree->out, nullptr, args->sendbuff, args->recvbuff, args->redOpArg, group);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
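
Note: in the CollNet Direct all-reduce above, the block's threads are carved into gather | bcast | scatter | reduce groups depending on whether the rank has up/down peers. A host-side restatement of that partitioning (the formulas mirror the diff; the struct and main() are illustrative only):

// Hedged restatement of the thread split used by the CollNet all-reduce kernel.
#include <cstdio>

struct CollnetSplit {
  int gather, bcast, scatter, reduce;
  int startBcast, startScatter, startReduce;
};

CollnetSplit splitThreads(int nWarps, bool hasUp, bool hasDn) {
  const int WARP_SIZE = 32, COPY = 96;  // COLLNET_COPY_THREADS in the diff
  CollnetSplit s;
  s.scatter = WARP_SIZE + ((hasUp && hasDn) ? COPY : hasUp ? 3*COPY : 0);
  s.gather  =             ((hasUp && hasDn) ? COPY : hasUp ? 2*COPY : 0);
  s.bcast   = WARP_SIZE + ((hasUp && hasDn) ? COPY : hasUp ? 0 : 2*COPY);
  s.reduce  = nWarps*WARP_SIZE - s.scatter - s.gather - s.bcast;
  s.startBcast   = s.gather;                 // tids [0, gather) do the Gather
  s.startScatter = s.startBcast + s.bcast;
  s.startReduce  = s.startScatter + s.scatter;
  return s;
}

int main() {
  CollnetSplit s = splitThreads(/*nWarps=*/16, /*hasUp=*/true, /*hasDn=*/true);
  std::printf("gather=%d bcast=%d scatter=%d reduce=%d\n",
              s.gather, s.bcast, s.scatter, s.reduce);
  return 0;
}
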
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index 61c60b9..ba4ef56 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,22 +12,22 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1));
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const ssize_t loopSize = nChannels*chunkSize;
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
const int rank = ring->devUserRanks[0];
const int nextRank = ring->devUserRanks[1];
- const int root = args->coll.root;
+ const int root = args->root;
T *inputBuf = (T*)args->sendbuff;
T *outputBuf = (T*)args->recvbuff;
- Primitives<T, RedOp, FanSymmetric<1>, 0, Proto>
- prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->coll.redOpArg);
+ Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
+ prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t realChunkSize;
@@ -36,7 +36,7 @@ namespace {
realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
- realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize;
+ realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
realChunkSize = int(realChunkSize);
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index ff410d7..40a2303 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,6 +9,7 @@
#include "collectives.h"
#include "devcomm.h"
+#include "op128.h"
#if __CUDA_ARCH__ >= 800
#define COLL_UNROLL 8
@@ -23,11 +24,31 @@ __device__ inline bool barrierReduceAny(int bit) {
asm ("{"
".reg .pred barr_pred;"
"setp.eq.u32 barr_pred, %1, 1;"
- "bar.red.popc.u32 %0, 0, barr_pred;"
+ "bar.red.popc.u32 %0, 2, barr_pred;"
"}" : "=r"(popc) : "r"(bit));
return popc != 0;
}
+// Copy src to dst and fill extra size with zeroes
+template<typename Tdst, typename Tsrc>
+__device__ void copyToShmem(Tdst *dst, Tsrc const *src, int tid, int nthreads) {
+ static_assert(sizeof(Tdst)%(2*sizeof(uint64_t)) == 0 && sizeof(Tsrc)%(2*sizeof(uint64_t)) == 0,
+ "copyToShmem needs sizes which are multiple of 16B");
+ static_assert(sizeof(Tdst) >= sizeof(Tsrc), "Tdst size is too small");
+ static_assert(sizeof(Tdst) <= WARP_SIZE*2*sizeof(uint64_t), "copyToShmem limited to 512B to make sure it can always be done in one cycle");
+ uint64_t *d = reinterpret_cast<uint64_t*>(dst);
+ uint64_t const *s = reinterpret_cast<uint64_t const*>(src);
+ uint64_t *shmemPtr = shmemCvtPtr(d);
+ int offset = 2*tid;
+ uint64_t v0, v1;
+ if (offset >= sizeof(Tsrc)/sizeof(uint64_t)) {
+ v0 = v1 = 0ULL;
+ } else {
+ v0 = s[offset] ; v1 = s[offset+1];
+ }
+ if (offset < sizeof(Tdst)/sizeof(uint64_t)) storeShmem128(shmemPtr+offset, v0, v1);
+}
+
template<typename T>
__device__ int copyToShmem(T *dst, T const *src, int turn=0) {
static_assert(sizeof(uint64_t) <= alignof(T), "Uhoh");
@@ -67,41 +88,16 @@ struct RunWorkElement {
}
};
-#if CUDART_VERSION >= 11030
-__device__ constexpr int ncclWorkElemFactors[NCCL_NUM_ALGORITHMS] =
-#else
-static __device__ __constant__ int ncclWorkElemFactors[NCCL_NUM_ALGORITHMS] =
-#endif
-{/*Tree*/1, /*Ring and P2P*/1, /*CollNet*/NCCL_REG_ELEM_FACTOR};
-
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWork {
// This __forceinline__ is necessary. The compiler was inserting a function call
// here from the LL ncclKernel.
__device__ __forceinline__ void run(ncclWork *w) {
- int tid = threadIdx.x;
- /* Some invariants that must hold:
- * 1. All elems[] have same funcIndex.
- * 2. All elems[] have same nThreads.
- * 3. The thread-to-group relation (as in prims group numbers) is the same
- * for all elems[].
- *
- * If (1) isn't true then we might be in the wrong function since dispatch
- * on ncclFuncs[w->funcIndex] is how we got here.
- *
- * If (2) or (3) aren't true, then threads from different work elements
- * could race for barrier resources (barrier numbers 0...15) which is fatal.
- *
- * IMPORTANT!!! To ensure (3), implementations of
- * `RunWorkElement<Fn,T,RedOp,Algo,Proto>::run()` may only use the following
- * when deciding how to map threads to groups:
- * Fn, T, RedOp, Algo, Proto, nThreads
- *
- * This last one is difficult to enforce so I hope everyone reads this.
- */
- if (tid < w->elems[0].nThreads) {
- #pragma unroll 1
- for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].active != 0; e+=ncclWorkElemFactors[Algo])
+ int wid = threadIdx.x / WARP_SIZE;
+ int inc = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) / sizeof(ncclWorkElem) : 1;
+ #pragma unroll 1
+ for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e += inc) {
+ if (wid < w->header.nWarps)
RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(&w->elems[e]);
}
}
@@ -124,30 +120,51 @@ struct ncclShmemData {
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
};
uint64_t redOpArgs[NCCL_MAX_DIRECT_ARITY+1];
- ncclDevComm comm;
- ncclChannel channel;
- ncclWork work;
+ struct ncclDevComm comm;
+ struct ncclChannel channel;
+ uint64_t pad;
+ struct ncclWork work;
};
+static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "shmem.work needs to be 16B aligned");
+
+static __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
+ if (we->header.type != ncclWorkTypeUnused && we->redOpArgIsPtr) {
+ /* redOpArg is a pointer to the scalar value, so we'll dereference it
+ * here so that redOpArg holds the bits of the scalar going forward.
+ * The tricky thing is we don't know its type T since that's encoded in
+ * the funcIndex. Because it would be difficult to get sizeof(T) from
+ * funcIndex, we'll cheat and just dereference the largest possible size
+ * given the alignment of the pointer. We might be reading in more bytes
+ * than we need but that's harmless.
+ */
+ if (we->redOpArg%2 != 0)
+ we->redOpArg = *reinterpret_cast<uint8_t*>(we->redOpArg);
+ else if (we->redOpArg%4 != 0)
+ we->redOpArg = *reinterpret_cast<uint16_t*>(we->redOpArg);
+ else if (we->redOpArg%8 != 0)
+ we->redOpArg = *reinterpret_cast<uint32_t*>(we->redOpArg);
+ else
+ we->redOpArg = *reinterpret_cast<uint64_t*>(we->redOpArg);
+ }
+}
extern __shared__ ncclShmemData ncclShmem;
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex>
-__device__ void ncclKernel(ncclWorkElem first) {
+__device__ void ncclKernel(struct ncclDevComm* comm, ncclWorkElem first) {
int tid = threadIdx.x;
+ int nthreads = blockDim.x;
int bid = blockIdx.x;
- int turn = copyToShmem(&ncclShmem.comm, first.comm);
+ int turn = copyToShmem(&ncclShmem.comm, comm);
// get address of channel without incurring indirect load from ncclDevCom::channels
- ncclChannel *channel = &((ncclDevCommAndChannels*)first.comm)->channels[bid];
+ ncclChannel *channel = &((ncclDevCommAndChannels*)comm)->channels[bid];
turn = copyToShmem(&ncclShmem.channel, channel, turn);
// To optimize for latency, (only) the first operation is passed as argument.
- if (bid == 0 && first.active != 0) {
- turn = copyToShmem(&ncclShmem.work.elems[0], &first, turn);
- if (1 <= tid && tid < NCCL_MAX_WORK_ELEMENTS && tid % ncclWorkElemFactors[Algo] == 0) {
- ncclShmem.work.elems[tid].active = 0;
- ncclShmem.work.elems[tid].redOpArgIsPtr = 0;
- }
+ if (bid == 0 && first.header.type != ncclWorkTypeUnused) {
+ // Copy first elem to work and zero out the rest
+ copyToShmem(&ncclShmem.work, &first, tid, nthreads);
}
__syncthreads(); // publish ncclShmem
@@ -155,17 +172,17 @@ __device__ void ncclKernel(ncclWorkElem first) {
ncclWork *workFifoDev = ncclShmem.channel.workFifoDev;
int workFifoIx = ncclShmem.channel.index;
- if (bid == 0 && first.active != 0)
+ if (bid == 0 && first.header.type != ncclWorkTypeUnused)
goto SkipLoadWork;
while (true) {
- copyToShmem(&ncclShmem.work, &workFifoDev[workFifoIx]); // turn no longer helps
+ copyToShmem(&ncclShmem.work, &workFifoDev[workFifoIx], tid, nthreads);
{ // Check whether the last operation was aborted and make sure all threads exit
- int aborted = tid == 0 ? *ncclShmem.comm.abortFlag : 0;
+ int aborted = tid == 0 ? *comm->abortFlag : 0;
if (barrierReduceAny(aborted)) // publish ncclShmem.work
break;
if (tid == 0)
- workFifoHost[workFifoIx].elems[0].active = 0;
+ workFifoHost[workFifoIx].header.type = ncclWorkTypeUnused;
}
SkipLoadWork:
@@ -173,36 +190,20 @@ __device__ void ncclKernel(ncclWorkElem first) {
if (tid == 0)
channel->index = workFifoIx; // write back to real channel, not shmem shadow
- if (tid < NCCL_MAX_WORK_ELEMENTS && tid % ncclWorkElemFactors[Algo] == 0) {
- ncclWorkElem *we = &ncclShmem.work.elems[tid];
- if (we->redOpArgIsPtr && we->active != 0) {
- /* redOpArg is a pointer to the scalar value, so we'll dereference it
- * here so that redOpArg holds the bits of the scalar going forward.
- * The tricky thing is we don't know its type T since that's encoded in
- * the funcIndex. Because it would be difficult to get sizeof(T) from
- * funcIndex, we'll cheat and just dereference the largest possible size
- * given the alignment of the pointer. We might be reading in more bytes
- * than we need but that's harmless.
- */
- if (we->coll.redOpArg%2 != 0)
- we->coll.redOpArg = *reinterpret_cast<uint8_t*>(we->coll.redOpArg);
- else if (we->coll.redOpArg%4 != 0)
- we->coll.redOpArg = *reinterpret_cast<uint16_t*>(we->coll.redOpArg);
- else if (we->coll.redOpArg%8 != 0)
- we->coll.redOpArg = *reinterpret_cast<uint32_t*>(we->coll.redOpArg);
- else
- we->coll.redOpArg = *reinterpret_cast<uint64_t*>(we->coll.redOpArg);
- }
+ __syncwarp();
+ if (ncclShmem.work.header.type == ncclWorkTypeColl) {
+ if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]);
+ } else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) {
+ if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem);
}
__syncthreads();
- if (ncclShmem.work.elems[0].funcIndex == FnIndex)
+ if (ncclShmem.work.header.funcIndex == FnIndex)
RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
else
- ncclFuncs[ncclShmem.work.elems[0].funcIndex]();
+ ncclFuncs[ncclShmem.work.header.funcIndex]();
- if (ncclShmem.work.elems[0].active == 2)
- break;
+ if (ncclShmem.work.header.isLast) break;
__syncthreads();
}
}
@@ -210,8 +211,8 @@ __device__ void ncclKernel(ncclWorkElem first) {
// Only generate kernels for SUM
#if NCCL_OP == 0
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
-__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(ncclWorkElem first) { \
- ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex>(first); \
+__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem first) { \
+ ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex>(comm, first); \
}
#else
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fInded)
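Note: the ncclRedopPtrDeref hunk above relies on a small alignment trick: the element type T is not known at that point, so it loads the widest unit the pointer's alignment allows and lets the reduction use only the bytes it needs (the source comment notes the extra bytes are harmless). A minimal standalone restatement of the idea, purely illustrative and not part of the patch:

static __device__ uint64_t loadWidestAligned(uintptr_t p) {
  // Read the widest unit the address alignment permits; narrower values are
  // zero-extended into the 64-bit result, wider reads would be misaligned.
  if (p % 2 != 0) return *reinterpret_cast<uint8_t*>(p);        // byte-aligned only
  else if (p % 4 != 0) return *reinterpret_cast<uint16_t*>(p);  // 2-byte aligned
  else if (p % 8 != 0) return *reinterpret_cast<uint32_t*>(p);  // 4-byte aligned
  else return *reinterpret_cast<uint64_t*>(p);                  // 8-byte aligned
}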
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index dcf1f66..c21d373 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -16,10 +16,11 @@
// Define min for ssize_t
static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
-template <typename T>
-inline __device__ void loadPtr(void** ptr, T* &v) {
- asm volatile("ld.volatile.global.u64 %0, [%1];"
- : "=l"(v) : "l"(ptr));
+inline __device__ int loadInt(int* ptr) {
+ int v;
+ asm volatile("ld.volatile.global.u32 %0, [%1];"
+ : "=r"(v) : "l"(ptr));
+ return v;
}
typedef uint64_t PackType;
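Note: loadInt replaces the old loadPtr helper because, as the prims_simple.h changes further down show, the connection FIFO now carries 4-byte offsets into a known buffer rather than 8-byte pointers. The .volatile qualifier keeps the load from being cached or reordered, which matters because the slot is updated asynchronously by another agent (typically the CPU-side proxy). A hedged usage sketch, not taken from the patch:

__device__ void* resolveFifoSlot(char* eltsFifo, int* offsFifo, uint64_t step) {
  // Fetch the byte offset published for this step and turn it into a pointer
  // into the elements buffer (the real consumer is waitPeer in prims_simple.h).
  int byteOffset = loadInt(offsFifo + (step % NCCL_STEPS));
  return eltsFifo + byteOffset;
}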
diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu
index f451582..b7dc3e9 100644
--- a/src/collectives/device/onerank_reduce.cu
+++ b/src/collectives/device/onerank_reduce.cu
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -16,11 +16,11 @@ namespace {
int tid = threadIdx.x;
int tn = blockDim.x;
#pragma unroll 1
- for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].active != 0; e++) {
+ for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].header.type != ncclWorkTypeUnused; e++) {
ncclWorkElem *we = &w->elems[e];
- intptr_t eltN = we->coll.count;
- int bid = we->coll.bid;
- int bn = we->coll.nChannels;
+ intptr_t eltN = we->count;
+ int bid = we->bid;
+ int bn = we->nChannels;
T const *src = (T const*)we->sendbuff;
T *dst = (T*)we->recvbuff;
@@ -36,7 +36,7 @@ namespace {
src += i0;
dst += i0;
ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 1>
- (tid, tn, &(we->coll.redOpArg), true, 1, &src, 1, &dst, i1-i0);
+ (tid, tn, &(we->redOpArg), true, 1, &src, 1, &dst, i1-i0);
}
}
}
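Note: the loop above splits a single rank's eltN elements across bn thread blocks, each block reducing only its own slice before calling ReduceOrCopyMulti. The slice computation itself sits outside this hunk; a generic sketch of that kind of per-block partition (an assumption for illustration, not copied from the file) looks like:

// Hypothetical split of eltN elements over bn blocks for block id bid.
intptr_t per = (eltN + bn - 1) / bn;                 // divUp
intptr_t i0 = (intptr_t)bid * per;
intptr_t i1 = i0 + per < eltN ? i0 + per : eltN;
// The hunk above then advances src/dst by i0 and reduces i1-i0 elements.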
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index 8f63447..ccc0d22 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -109,7 +109,7 @@ struct FanSymmetric {
};
// The primitives class. Specialized per protocol in the other headers.
-template<typename T, typename RedOp, typename Fan, int Direct, typename Proto>
+template<typename T, typename RedOp, typename Fan, int Direct, typename Proto, int P2p>
class Primitives;
// Used by LL & LL128 to implement direct members in the naive way.
diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h
index 8fa84e5..afed3df 100644
--- a/src/collectives/device/prims_ll.h
+++ b/src/collectives/device/prims_ll.h
@@ -1,12 +1,12 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
-template<typename T, typename RedOp, typename Fan, int Direct>
-class Primitives<T, RedOp, Fan, Direct, ProtoLL>:
- public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL>> {
+template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
+class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
+ public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
@@ -41,7 +41,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL>:
inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
inline __device__ void barrier() {
- asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(1+group));
+ asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
}
uint32_t abort = 0;
diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h
index 3c049d1..8090385 100644
--- a/src/collectives/device/prims_ll128.h
+++ b/src/collectives/device/prims_ll128.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,9 +8,9 @@
#define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)
-template<typename T, typename RedOp, typename Fan, int Direct>
-class Primitives<T, RedOp, Fan, Direct, ProtoLL128>:
- public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128>> {
+template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
+class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
+ public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
@@ -49,7 +49,7 @@ class Primitives<T, RedOp, Fan, Direct, ProtoLL128>:
inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; }
inline __device__ void barrier() {
- asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(1+group));
+ asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
}
uint32_t abort = 0;
diff --git a/src/collectives/device/prims_simple.h b/src/collectives/device/prims_simple.h
index c30ff40..fd61dc4 100644
--- a/src/collectives/device/prims_simple.h
+++ b/src/collectives/device/prims_simple.h
@@ -1,13 +1,13 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
template<typename T, typename RedOp, typename Fan, int Direct,
- int SlicePerChunk, int StepPerSlice, int Unroll>
+ int SlicePerChunk, int StepPerSlice, int Unroll, int P2p>
class Primitives<
- T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll>
+ T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll>, P2p
> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
@@ -18,7 +18,7 @@ class Primitives<
RolePostSend = 0x10,
RolePostRecv = 0x20,
Aborted = 0x40,
- PtrsFifoEnabled = 0x80,
+ OffsFifoEnabled = 0x80,
SizesFifoEnabled = 0x100,
DirectWrite = 0x200,
DirectRead = 0x400,
@@ -32,10 +32,10 @@ class Primitives<
int flags;
int group;
uint64_t step;
+ int *connOffsFifoPtr; // (flags & OffsFifoEnabled)
union {
- void **connPtrsFifoPtr; // (flags & PtrsFifoEnabled)
T *userBuff; // (flags & (RoleInput|RoleOutput))
- T *connEltsFifo; // !(flags & (PtrsFifoEnabled|RoleInput|RoleOutput))
+ T *connEltsFifo; // !(flags & (RoleInput|RoleOutput))
};
union {
int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled)
@@ -49,14 +49,14 @@ class Primitives<
if (nthreads == WARP_SIZE)
__syncwarp();
else
- asm volatile("bar.sync %0, %1;" :: "r"(group+1), "r"(nthreads));
+ asm volatile("bar.sync %0, %1;" :: "r"(15-group), "r"(nthreads));
flags |= ThreadsSynced;
}
inline __device__ void subBarrier() {
if (nworkers == nthreads)
barrier();
else
- asm volatile("bar.sync %0, %1;" :: "r"(group+2), "r"(nworkers));
+ asm volatile("bar.sync %0, %1;" :: "r"(8-group), "r"(nworkers));
}
inline __device__ bool checkAbort(int &spins) {
@@ -89,8 +89,8 @@ class Primitives<
void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
: (ncclShmem.groups[group].srcs + Src);
- if (flags & PtrsFifoEnabled)
- loadPtr(connPtrsFifoPtr + step%NCCL_STEPS, ptrs[index]);
+ if (flags & OffsFifoEnabled)
+ ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
else if (isSendNotRecv && DirectSend) {
if (flags & DirectWrite) {
ptrs[index] = directBuff + remoteIx + offset;
@@ -232,6 +232,8 @@ class Primitives<
}
// Scatter/Gather generic op
+ // skip: my own rank order in the buffer chunks
+ // shift: peer offset to avoid all ranks sending to or receiving from same peer
template <int DirectRecv1, int DirectSend1, int Recv, int Send>
__device__ __forceinline__ void
ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int skip, int shift, bool postOp) {
@@ -254,14 +256,17 @@ class Primitives<
waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
subBarrier();
#pragma unroll
+ // Loop over peers
for (int j=0; j<fan.nsend(); j++) {
int i = (j+shift)%fan.nsend();
int peerOffset = i*peerElem;
+ // Skip the data I am responsible for reducing myself
if (skip >= 0 && i >= skip) peerOffset += peerElem;
const T* src0 = (T*)ncclShmem.groups[group].srcs[0] + peerOffset;
int realPeerSize = min(realSize, totalElem-peerOffset);
if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
ReduceOrCopyMulti<Unroll, RedOp, T, 1, 1, 1, 1, PreOpN>(tid, nworkers, ncclShmem.redOpArgs, false, 1, &src0, 1, (T**)ncclShmem.groups[group].dsts+i, realPeerSize);
+ // Mark for threadfence at the end
if (tid == 0) ncclShmem.groups[group].totalSendSize[slice] += realPeerSize;
}
}
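Note: to make the new skip/shift comments concrete, here is a trace of the send loop above under illustrative values fan.nsend() = 4, shift = 1, skip = 2; the numbers simply follow the code as written:

// j : i=(j+shift)%4 : chunk = peerOffset/peerElem (i is bumped past `skip`)
// 0 :       1        :   1
// 1 :       2        :   3
// 2 :       3        :   4
// 3 :       0        :   0
// Chunk 2 (my own contribution, per `skip`) is never targeted, and `shift`
// rotates the starting peer so ranks do not all hit the same peer first.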
@@ -289,6 +294,7 @@ class Primitives<
}
}
barrier();
+ // If we actually sent something, issue a threadfence
if (Send && (flags & RolePostSend) && ncclShmem.groups[group].totalSendSize[slice] > 0 && index == 0)
__threadfence_system();
__syncwarp();
@@ -310,18 +316,18 @@ class Primitives<
ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
connStepPtr = conn->tail;
connStepCache = *connStepPtr;
- flags |= (conn->ptrsFifo != nullptr) ? PtrsFifoEnabled : 0;
+ flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
if (Direct) {
// User buffers have been registered
if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
- if (connIndex == 1) {
+ if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
- if (connIndex == 1) {
+ if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
// direct read not allowed in non-register case
@@ -330,10 +336,9 @@ class Primitives<
}
}
}
- if (flags & PtrsFifoEnabled)
- connPtrsFifoPtr = conn->ptrsFifo;
- else
- connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+ if (flags & OffsFifoEnabled)
+ connOffsFifoPtr = conn->offsFifo;
+ connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
}
}
}
@@ -350,11 +355,10 @@ class Primitives<
ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
connStepPtr = conn->head;
connStepCache = *connStepPtr;
- flags |= (conn->ptrsFifo != nullptr) ? PtrsFifoEnabled : 0;
- if (flags & PtrsFifoEnabled)
- connPtrsFifoPtr = conn->ptrsFifo;
- else
- connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
+ flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
+ if (flags & OffsFifoEnabled)
+ connOffsFifoPtr = conn->offsFifo;
+ connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
if (conn->sizesFifo != nullptr) {
flags |= SizesFifoEnabled;
@@ -362,14 +366,14 @@ class Primitives<
} else if (Direct) {
// User buffers have been registered
if ((conn->direct & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
- if (connIndex == 1) {
+ if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->direct & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
- if (connIndex == 1) {
+ if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce use direct pull
} else {
// direct read not allowed in non-register case
@@ -427,7 +431,7 @@ class Primitives<
loadRecvConn(&ncclShmem.channel.devPeers[peer], connIndex, e);
loadSendConn(&ncclShmem.channel.devPeers[peer], connIndex, e);
- setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkRegElem*)e);
+ setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
}
__device__ ~Primitives() {
@@ -444,7 +448,7 @@ class Primitives<
barrier();
}
- __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkRegElem* e) {
+ __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) {
if (flags & RoleInput) {
userBuff = (T*)inputBuf;
ncclShmem.redOpArgs[0] = redOpArg; // scalar for local input
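Note: the barrier changes in this file (and the matching ones in prims_ll.h and prims_ll128.h above) still use PTX named barriers, just with a different id assignment per group: 15-group for the whole group, 8-group for the worker subset. The mechanism they rely on is that a thread block has 16 independent named barriers (ids 0..15) and that bar.sync with an explicit thread count only synchronizes that many threads on that id, so disjoint warp groups can rendezvous on different ids without waiting on each other. A minimal sketch of the pattern, assuming the caller passes an id no other group uses and a count that is a multiple of the warp size:

__device__ void groupBarrier(int barId, int nThreadsInGroup) {
  // Only the nThreadsInGroup threads that name this barrier id take part;
  // other groups use other ids and are unaffected.
  asm volatile("bar.sync %0, %1;" :: "r"(barId), "r"(nThreadsInGroup));
}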
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index fbc5be9..8dc867b 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,21 +12,21 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1));
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*chunkSize;
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
const int rank = ncclShmem.comm.rank;
const int prevRank = ring->devUserRanks[nranks-1];
- const int root = args->coll.root;
+ const int root = args->root;
- Primitives<T, RedOp, FanSymmetric<1>, 0, Proto>
- prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+ Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
+ prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int {
int realChunkSize;
@@ -35,7 +35,7 @@ namespace {
realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
- realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize;
+ realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
return realChunkSize;
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index 0334448..3f38b1a 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -12,9 +12,9 @@ namespace {
template<typename T, typename RedOp, typename Proto>
__device__ __forceinline__ void runRing(ncclWorkElem *args) {
const int tid = threadIdx.x;
- const int nthreads = args->nThreads;
- const int bid = args->coll.bid;
- const int nChannels = args->coll.nChannels;
+ const int nthreads = args->header.nWarps*WARP_SIZE;
+ const int bid = args->bid;
+ const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
int const *ringRanks = ring->devUserRanks;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
@@ -22,10 +22,10 @@ namespace {
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*chunkSize;
- const ssize_t size = args->coll.count;
+ const ssize_t size = args->count;
- Primitives<T, RedOp, FanSymmetric<1>, 0, Proto>
- prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->coll.redOpArg);
+ Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
+ prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t realChunkSize;
@@ -34,7 +34,7 @@ namespace {
realChunkSize = roundUp(realChunkSize, (nthreads-WARP_SIZE)*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
- realChunkSize = size-gridOffset < loopSize ? args->coll.lastChunkSize : chunkSize;
+ realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
realChunkSize = int(realChunkSize);
diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h
index 76f49c0..be0dbc5 100644
--- a/src/collectives/device/sendrecv.h
+++ b/src/collectives/device/sendrecv.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,73 +10,67 @@
template<typename T, typename RedOp>
struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
- __device__ __forceinline__ void run(ncclWork *work) {
- int tid = threadIdx.x;
- int group = 0;
- const int rank = ncclShmem.comm.rank;
- const int nRanks = ncclShmem.comm.nRanks;
- using Proto = ProtoSimple<1, 1>;
-
- for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
- ncclWorkElem *args = &work->elems[s];
- int nThreadsSegment = args->p2p.nThreads;
- if (args->active == 0 || nThreadsSegment == 0) break;
-
- int nThreadsSplit = (nThreadsSegment - (nThreadsSegment > 128 ? WARP_SIZE : 0))/2;
- int groupRecv = group;
- group += Proto::calcGroupWidth(/*send=*/false, nThreadsSplit);
- int groupSend = group;
- group += Proto::calcGroupWidth(/*send=*/true, nThreadsSegment - nThreadsSplit);
+ __device__ __forceinline__ void runSend(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
+ if (args->peer == ncclShmem.comm.rank) {
+ struct ncclWorkElemP2p* recvArgs = args-1;
+ if (args->buff != recvArgs->buff) {
+ ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nthreads, nullptr, false, 1, (const T**)&args->buff, 1, (T**)&recvArgs->buff, args->count);
+ }
+ } else {
+ using Proto = ProtoSimple<1, 1>;
+ ssize_t const count = args->count;
+ int const chunkSize = args->chunkSize/sizeof(T);
+ int const peer = args->peer;
+ Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto, 1> prims
+ (tid, nthreads, nullptr, &peer, args->buff, nullptr, /*redOpArg(ignored)=*/0, group);
+ ssize_t offset = 0;
+ do {
+ int nelem = min(chunkSize, count-offset);
+ prims.directSend(offset, offset, nelem);
+ offset += nelem;
+ } while(offset < count);
+ }
+ }
- if (tid < nThreadsSegment) {
- // Compute pointers
- T const* sendbuff = (const T*)args->sendbuff;
- T* recvbuff = (T*)args->recvbuff;
- ssize_t const sendCount = args->p2p.sendCount;
- ssize_t const recvCount = args->p2p.recvCount;
- int const delta = args->p2p.delta;
+ __device__ __forceinline__ void runRecv(const int tid, const int nthreads, const int group, struct ncclWorkElemP2p* args) {
+ if (args->peer != ncclShmem.comm.rank) {
+ using Proto = ProtoSimple<1, 1>;
+ ssize_t const count = args->count;
+ int const chunkSize = args->chunkSize/sizeof(T);
+ int const peer = args->peer;
+ Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto, 1> prims
+ (tid, nthreads, &peer, nullptr, nullptr, args->buff, /*redOpArg(ignored)=*/0, group);
+ ssize_t offset = 0;
+ do {
+ int nelem = min(chunkSize, count-offset);
+ prims.directRecv(offset, nelem);
+ offset += nelem;
+ } while(offset < count);
+ }
+ }
- if (delta == 0) {
- if (sendbuff != recvbuff) {
- ReduceOrCopyMulti<COLL_UNROLL, RedOp, T, 1, 1, 1, 1, 0>(tid, nThreadsSegment, nullptr, false, 1, &sendbuff, 1, &recvbuff, sendCount);
- }
- }
- else {
- if ((tid < nThreadsSplit) && recvCount >= 0) {
- int const peer = (rank - delta + nRanks)%nRanks;
- int const t0 = 0;
- int const nt = nThreadsSplit;
- int const chunkSize = args->p2p.recvChunkSize/sizeof(T);
- Primitives<T, RedOp, FanAsymmetric<1, 0>, 1, Proto> prims
- (tid-t0, nt, &peer, nullptr, nullptr, recvbuff, /*redOpArg(ignored)=*/0, groupRecv);
- ssize_t offset = 0;
- do {
- int nelem = roundUp(chunkSize, nt*(sizeof(uint64_t)/sizeof(T)));
- nelem = min(chunkSize, recvCount-offset);
- prims.directRecv(offset, nelem);
- offset += nelem;
- } while(offset < recvCount);
- }
+ __device__ __forceinline__ void run(ncclWork *work) {
+ struct ncclWorkElemP2p* args = work->p2pElems;
+ int ngroups = args->ngroups;
+ int tid = threadIdx.x;
+ int wid = tid / WARP_SIZE;
+ // This has to work even when each group gets 2.5 warps (i.e. 8 groups, meaning 3
+ // warps for send and 2 warps for recv).
+ // warpStarts were rounded down by integer division, but for the group number we need to round the other way around,
+ // so we mirror wid and then mirror the group back.
+ #define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
+ int group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
+ args += group;
+ if (args->header.type == ncclWorkTypeUnused) return;
- if ((tid >= nThreadsSplit) && sendCount >= 0) {
- int const peer = (rank + delta)%nRanks;
- int const t0 = nThreadsSplit;
- int const nt = nThreadsSegment - nThreadsSplit;
- int const chunkSize = args->p2p.sendChunkSize/sizeof(T);
- Primitives<T, RedOp, FanAsymmetric<0, 1>, 1, Proto> prims
- (tid-t0, nt, nullptr, &peer, sendbuff, nullptr, /*redOpArg(ignored)=*/0, groupSend);
- ssize_t offset = 0;
- do {
- int nelem = roundUp(chunkSize, nt*(sizeof(uint64_t)/sizeof(T)));
- nelem = min(chunkSize, sendCount-offset);
- prims.directSend(offset, offset, nelem);
- offset += nelem;
- } while(offset < sendCount);
- }
- }
- break;
- }
- tid -= nThreadsSegment;
+ tid -= args->warpStart * WARP_SIZE;
+ int nthreads = args->nWarps * WARP_SIZE;
+ group |= 1<<16; // Used to select connIndex 1
+ if (tid >= nthreads || args->peer == -1) return;
+ if ((group%2) == 0) {
+ runRecv(tid, nthreads, group, args);
+ } else {
+ runSend(tid, nthreads, group, args);
}
}
};
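Note: the warp-to-group mapping in run() above is easier to see with numbers. Assuming NWARPS = 20 (e.g. a 640-thread block with 32-thread warps) and ngroups = 8, group = ngroups-1 - (NWARPS-1-wid)*ngroups/NWARPS gives:

// wid   :  0  1 |  2  3  4 |  5  6 |  7  8  9 | 10 11 | 12 13 14 | 15 16 | 17 18 19
// group :  0  0 |  1  1  1 |  2  2 |  3  3  3 |  4  4 |  5  5  5 |  6  6 |  7  7  7
// Even groups (recv) get 2 warps, odd groups (send) get 3, matching the
// "2.5 warps per group" comment: mirroring wid around NWARPS-1 before the
// integer division pushes the rounding surplus onto the odd (send) groups.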
diff --git a/src/collectives/sendrecv.cc b/src/collectives/sendrecv.cc
index 65222a5..0e9ca4f 100644
--- a/src/collectives/sendrecv.cc
+++ b/src/collectives/sendrecv.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -13,8 +13,8 @@ NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataTyp
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
- struct ncclInfo info = { ncclFuncSendRecv, "Send",
- sendbuff, NULL, count, datatype, ncclSum, peer, comm, stream, /* Args */
+ struct ncclInfo info = { ncclFuncSend, "Send",
+ NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
@@ -28,7 +28,7 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NVTX3_FUNC_RANGE_IN(nccl_domain);
- struct ncclInfo info = { ncclFuncSendRecv, "Recv",
+ struct ncclInfo info = { ncclFuncRecv, "Recv",
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
ncclResult_t ret;
diff --git a/src/debug.cc b/src/debug.cc
index 795c401..9060abb 100644
--- a/src/debug.cc
+++ b/src/debug.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -167,3 +167,19 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
}
pthread_mutex_unlock(&ncclDebugLock);
}
+
+NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
+
+void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
+ // pthread_setname_np is a nonstandard GNU extension
+ // and needs the following feature test macro
+#ifdef _GNU_SOURCE
+ if (ncclParamSetThreadName() != 1) return;
+ char threadName[NCCL_THREAD_NAMELEN];
+ va_list vargs;
+ va_start(vargs, fmt);
+ vsnprintf(threadName, NCCL_THREAD_NAMELEN, fmt, vargs);
+ va_end(vargs);
+ pthread_setname_np(thread, threadName);
+#endif
+}
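Note: ncclSetThreadName is gated twice: at compile time by _GNU_SOURCE (pthread_setname_np is a GNU extension) and at run time by the SetThreadName parameter, which under NCCL's NCCL_PARAM convention maps to the NCCL_SET_THREAD_NAME environment variable and defaults to off. A hypothetical call site, for illustration only (the thread handle and name below are made up):

// Name a helper thread so it is recognizable in tools such as `top -H` or gdb
// (Linux limits thread names to 16 bytes including the terminating NUL).
ncclSetThreadName(helperThread, "NCCL-%s-%d", "helper", rank);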
diff --git a/src/enhcompat.cc b/src/enhcompat.cc
new file mode 100644
index 0000000..97f5a3f
--- /dev/null
+++ b/src/enhcompat.cc
@@ -0,0 +1,28 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+/* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */
+
+enum cudaError_t { cudaErrorStubLibrary = 34 };
+
+extern "C" {
+
+cudaError_t cudaStreamGetCaptureInfo_v2(...) __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorStubLibrary; }
+
+cudaError_t cudaUserObjectCreate(...) __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaUserObjectCreate(...) { return cudaErrorStubLibrary; }
+
+cudaError_t cudaGraphRetainUserObject(...) __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorStubLibrary; }
+
+cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; }
+
+cudaError_t cudaGetDriverEntryPoint(...) __attribute__((visibility("hidden"))) __attribute((weak));
+cudaError_t cudaGetDriverEntryPoint(...) { return cudaErrorStubLibrary; }
+
+}
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 4deac18..d28191b 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -156,21 +156,23 @@ static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** wor
}
int opIndex = channel->workFifoTail%NCCL_MAX_OPS;
struct ncclWork* w = channel->workFifo+opIndex;
- struct ncclWorkElem* e = w->elems;
- volatile uint8_t* activePtr = (volatile uint8_t*)&e->active;
- while (activePtr[0] != 0) sched_yield();
+ volatile uint8_t* typePtr = (volatile uint8_t*)&w->header.type;
+ while (typePtr[0] != ncclWorkTypeUnused) sched_yield();
memset(w, 0, sizeof(struct ncclWork));
// Initialize with work elem if provided
- if (base) memcpy(e, base, sizeof(struct ncclWorkElem));
- e->active = 1;
+ if (base) memcpy(w->elems, base, sizeof(struct ncclWorkElem));
channel->workFifoTail++;
channel->workCount++;
if (work) *work = w;
return ncclSuccess;
}
+// Finalize channel work FIFO states before launch
+// Called during dynamic enqueue
static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph) {
ncclComm_t comm = eqInfo->comm;
+ // Do not use comm->myParams in this function unless in non-graph mode
+ // In graph mode, enqueue is asynchronous with respect to capture, so myParams may have changed
struct cudaLaunchParams* params = comm->myParams;
// Only launch blocks where we have work to do.
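Note: getNextOp above is the host-side producer of the per-channel work FIFO whose consumer is the kernel loop in common.h earlier in this patch: the host spins until the slot's header.type has been returned to ncclWorkTypeUnused, then claims and refills it, while the device hands the slot back once the ncclWork has been copied into shared memory; header.isLast tells the kernel when to stop. A condensed, illustrative sketch of the two sides (fillWork is a hypothetical stand-in for the callers that populate the slot):

// Host (producer), per channel slot:
while (((volatile uint8_t*)&slot->header.type)[0] != ncclWorkTypeUnused) sched_yield();
memset(slot, 0, sizeof(struct ncclWork));     // slot is free: claim it
fillWork(slot);                               // hypothetical helper filling elems/header
// Device (consumer), per channel/block:
copyToShmem(&ncclShmem.work, &workFifoDev[ix], tid, nthreads);
if (tid == 0) workFifoHost[ix].header.type = ncclWorkTypeUnused;  // hand the slot back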
@@ -185,26 +187,24 @@ static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph
eqInfo->maxChannels = params->gridDim.x;
}
- // Set active = 2 for the last operation and add a no-op on empty channels (p2p case).
+ // Set isLast = 1 for the last operation and add a no-op on empty channels (p2p case).
for (int c=0; c<eqInfo->maxChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
if (channel->workCount == 0) {
struct ncclWork* w;
NCCLCHECK(getNextOp(channel, &w, NULL));
- struct ncclWorkElem* e = w->elems;
- e->comm = comm->devComm;
- e->funcIndex = FUNC_INDEX_P2P;
- e->p2p.nThreads = 0;
+ w->header.funcIndex = FUNC_INDEX_P2P;
+ w->header.type = ncclWorkTypeP2p;
+ w->header.nWarps = 0;
}
- channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active = 2;
+ channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].header.isLast = 1;
if (c == 0) {
// As we inline the first coll directly, we can free it immediately.
// Except P2P or aggregation or registration cases
struct ncclWork* work = channel->workFifo+((channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS);
- struct ncclWorkElem* elem = work->elems;
- if (elem->funcIndex != FUNC_INDEX_P2P && eqInfo->elemList->count() == 1 && elem->regUsed == 0)
- elem->active = 0;
+ if (work->header.type == ncclWorkTypeColl && eqInfo->elemList->count() == 1)
+ work->header.type = ncclWorkTypeUnused;
}
if (channel->gdrMemDesc) {
@@ -264,6 +264,8 @@ ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) {
return ncclSuccess;
}
+// Check dependency wrt outside streams or previous launches
+// Launch kernel in GROUP mode
ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) {
struct cudaLaunchParams* params = comm->myParams;
if (params->gridDim.x == 0) return ncclSuccess;
@@ -299,6 +301,7 @@ ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) {
return ncclSuccess;
}
+// Launch kernel in PARALLEL mode
ncclResult_t ncclLaunchKernel(ncclComm_t comm) {
struct cudaLaunchParams *params = comm->myParams;
if (params->gridDim.x == 0) return ncclSuccess;
@@ -321,6 +324,7 @@ ncclResult_t ncclLaunchKernel(ncclComm_t comm) {
return ncclSuccess;
}
+// Launch network proxy
static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) {
// Start the network proxies as soon as the kernel has been launched. We can't
// perform any CUDA call between the two or having a cudaFree between the CUDA
@@ -340,6 +344,7 @@ static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) {
return ncclSuccess;
}
+// Record done event for current launch
ncclResult_t ncclRecordEvents(ncclComm_t comm) {
struct cudaLaunchParams *params = comm->myParams;
@@ -358,6 +363,7 @@ ncclResult_t ncclRecordEvents(ncclComm_t comm) {
return ncclSuccess;
}
+// Reset parameter space for launch
ncclResult_t ncclLaunchReset(ncclComm_t comm) {
comm->userStreamSet = false;
@@ -371,6 +377,8 @@ ncclResult_t ncclLaunchReset(ncclComm_t comm) {
NCCLCHECK(ncclResetQueueInfo(comm->enqueueInfo));
}
+ // After capturing an op in graph mode or launching the op in non-graph mode,
+ // we can reset myParams for use in the next op
struct cudaLaunchParams *params = comm->myParams;
params->gridDim.x = params->blockDim.x = 0;
params->func = NULL;
@@ -388,6 +396,7 @@ ncclResult_t ncclLaunchReset(ncclComm_t comm) {
static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNetTypeSupport) {
if (info->comm->collNetSupport > 0) {
+ // Translate ncclAvg and PreMulSum
ncclRedOp_t netOp = info->op == ncclAvg || info->op >= ncclNumOps ? ncclSum : info->op;
NCCLCHECK(collNetReduceSupport(info->datatype, netOp, collNetTypeSupport));
} else {
@@ -396,6 +405,7 @@ static inline ncclResult_t getCollNetSupport(struct ncclInfo* info, int* collNet
return ncclSuccess;
}
+// numPipeOps: number of pipelined ops. Can be greater than 1 in aggregation mode. Used to adjust latency.
static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, int numPipeOps) {
struct ncclComm* comm = info->comm;
if (comm->nRanks == 1) {
@@ -432,6 +442,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
int nt = comm->maxThreads[info->algorithm][info->protocol];
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
if (info->algorithm == NCCL_ALGO_COLLNET) {
+ // CollNet channel tuning
int ncSwitch = 16;
bool flag = true;
while (ncSwitch >= 1 && flag) {
@@ -442,6 +453,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
ncSwitch /= 2;
}
} else {
+ // Ring/Tree channel tuning
while (info->nBytes < nc*nt*threadThreshold) {
if (nc >= 2) nc--;
else if ((nt % 128) == 0) nt/=2;
@@ -450,6 +462,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info, int collNetTypeSupport, i
}
if (info->protocol == NCCL_PROTO_SIMPLE) {
nt += WARP_SIZE; // Extra warp for sync
+ // More threads or sync warps needed due to split thread model
if (info->algorithm == NCCL_ALGO_TREE) nt += 3*WARP_SIZE;
if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE;
}
@@ -497,11 +510,10 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
return ncclSuccess;
}
-static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyArgs* proxyArgs /* output */) {
- work->comm = info->comm->devComm;
-
+static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWorkElem* work, struct ncclProxyOp* proxyOp /* output */) {
int collNetTypeSupport = 0;
- // Check whether algo and proto have been preset
+ // Check whether algo and proto have been preset (as in aggregation case)
+ // If so, skip the calculation
if (info->nChannels > 0 && info->nThreads > 0) goto comp_next;
NCCLCHECK(getCollNetSupport(info, &collNetTypeSupport));
NCCLCHECK(getAlgoInfo(info, collNetTypeSupport, 1));
@@ -511,22 +523,23 @@ comp_next:
NCCLCHECK(getPatternInfo(info));
NCCLCHECK(getLoopInfo(info));
+ work->header.type = ncclWorkTypeColl;
work->sendbuff = info->sendbuff;
work->recvbuff = info->recvbuff;
- work->coll.root = info->root;
- work->coll.count = info->count;
- work->coll.nChannels = info->nChannels;
- work->nThreads = info->nThreads;
- work->coll.redOpArg = info->opFull.scalarArg;
+ work->root = info->root;
+ work->count = info->count;
+ work->nChannels = info->nChannels;
+ work->header.nWarps = info->nThreads / WARP_SIZE;
+ work->redOpArg = info->opFull.scalarArg;
work->redOpArgIsPtr = info->opFull.scalarArgIsPtr;
if (info->comm->nRanks == 1) {
// one-rank reduce index
- work->funcIndex = 1 + int(info->datatype);
+ work->header.funcIndex = 1 + int(info->datatype);
return ncclSuccess;
}
- work->funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);
+ work->header.funcIndex = FUNC_INDEX(info->coll, info->opFull.op, info->datatype, info->algorithm, info->protocol);
int stepSize = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
@@ -542,22 +555,22 @@ comp_next:
while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2;
}
// Use lastChunkSize as chunkSize
- work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+ work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
// Optimize chunkSize / nSteps
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*64 && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 65536) chunkSize /= 2;
while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 32768) chunkSize /= 2;
// Use lastChunkSize as chunkSize
- work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+ work->lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
// Set direct direction for broadcast-gather (read or write)
work->direct = (info->nBytes / info->nChannels <= 1024*1024) ? NCCL_DIRECT_WRITE : NCCL_DIRECT_READ;
} else if (info->protocol == NCCL_PROTO_LL) {
const ssize_t sliceSize = stepSize*sizeof(uint64_t)/sizeof(union ncclLLFifoLine);
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
- work->coll.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
- ALIGN_SIZE(work->coll.lastChunkSize, info->nThreads*sizeof(uint64_t));
- work->coll.lastChunkSize /= ncclTypeSize(info->datatype);
+ work->lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), info->nChannels*info->nchunksPerLoop);
+ ALIGN_SIZE(work->lastChunkSize, info->nThreads*sizeof(uint64_t));
+ work->lastChunkSize /= ncclTypeSize(info->datatype);
} else if (info->algorithm == NCCL_ALGO_TREE && info->protocol == NCCL_PROTO_LL128) {
int nNodes = info->comm->nNodes;
float ppn = info->comm->nRanks / (float)nNodes;
@@ -565,7 +578,7 @@ comp_next:
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*64/ppn && chunkSize > 131072) chunkSize /= 2;
while (info->nBytes / (info->nChannels*chunkSize) < nstepsLL128*16/ppn && chunkSize > 32768) chunkSize /= 2;
// Use lastChunkSize as chunkSize
- work->coll.lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
+ work->lastChunkSize = chunkSize*NCCL_LL128_DATAELEMS/(NCCL_LL128_LINEELEMS*ncclTypeSize(info->datatype));
}
// Compute nSteps for proxies
@@ -574,25 +587,25 @@ comp_next:
if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS;
//if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol);
int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize)));
- proxyArgs->subs[0].nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
- proxyArgs->sliceSteps = sliceSteps;
- proxyArgs->chunkSteps = chunkSteps;
- proxyArgs->chunkSize = chunkSize;
- proxyArgs->protocol = info->protocol;
- proxyArgs->dtype = info->datatype;
- proxyArgs->redOp = info->algorithm != NCCL_ALGO_COLLNET ? ncclNumOps : // Only set redOp when using CollNet
+ proxyOp->nsteps = info->nstepsPerLoop * nLoops * chunkSteps;
+ proxyOp->sliceSteps = sliceSteps;
+ proxyOp->chunkSteps = chunkSteps;
+ proxyOp->chunkSize = chunkSize;
+ proxyOp->protocol = info->protocol;
+ proxyOp->dtype = info->datatype;
+ proxyOp->redOp = info->algorithm != NCCL_ALGO_COLLNET ? ncclNumOps : // Only set redOp when using CollNet
info->opFull.op==ncclDevPreMulSum || info->opFull.op==ncclDevSumPostDiv ? ncclSum : // Network sees avg as sum
info->op;
- proxyArgs->pattern = info->pattern;
- proxyArgs->root = info->root;
+ proxyOp->pattern = info->pattern;
+ proxyOp->root = info->root;
// This is used by P2P to reduce the receive buffer size. We don't use it in collectives
// because some protocols need to transmit more than the total size, plus they sometimes
// round up
- proxyArgs->subs[0].recvbytes = stepSize*proxyArgs->sliceSteps;
+ proxyOp->nbytes = stepSize*proxyOp->sliceSteps;
TRACE(NCCL_COLL,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d chunksize %d comm %p",
- proxyArgs->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
- nLoops, proxyArgs->subs[0].nsteps, chunkSize, info->comm);
+ proxyOp->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
+ nLoops, proxyOp->nsteps, chunkSize, info->comm);
return ncclSuccess;
}
@@ -607,6 +620,7 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) {
return ncclSuccess;
}
+// Handle structure for user buffer registration (IPC) exchange
struct ncclBuffRegHandle {
cudaIpcMemHandle_t sendBuffIpc;
cudaIpcMemHandle_t recvBuffIpc;
@@ -621,37 +635,48 @@ static ncclResult_t ncclRegBuffAndExchange(struct ncclInfo* info, struct ncclBuf
if (comm->localRanks == 1) return ncclSuccess;
if (comm->pfnCuMemGetAddressRange == NULL) return ncclSuccess; // CUDA toolkit or driver version too old
- struct ncclBuffRegHandle regHandles[NCCL_MAX_INTRA_RANKS];
+ ncclResult_t ret = ncclSuccess;
+ struct ncclBuffRegHandle regHandles[NCCL_MAX_LOCAL_RANKS];
// Get IPC handles
// Note: the handle only corresponds to the base address of the allocation
- CUDACHECK(cudaIpcGetMemHandle(&regHandles[comm->intraNodeRank].sendBuffIpc, (void*)info->sendbuff));
- CUDACHECK(cudaIpcGetMemHandle(&regHandles[comm->intraNodeRank].recvBuffIpc, (void*)info->recvbuff));
+ CUDACHECKGOTO(cudaIpcGetMemHandle(&regHandles[comm->localRank].sendBuffIpc, (void*)info->sendbuff), ret, reg_fallback);
+ CUDACHECKGOTO(cudaIpcGetMemHandle(&regHandles[comm->localRank].recvBuffIpc, (void*)info->recvbuff), ret, reg_fallback);
// Get offset of user buffer within allocation
void* baseAddr;
size_t size;
+ // Get base address
CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->sendbuff));
- regHandles[comm->intraNodeRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr;
+ regHandles[comm->localRank].sendBuffOffset = (char*)info->sendbuff - (char*)baseAddr;
CUDACHECK(comm->pfnCuMemGetAddressRange(&baseAddr, &size, (void*)info->recvbuff));
- regHandles[comm->intraNodeRank].recvBuffOffset = (char*)info->recvbuff - (char*)baseAddr;
- TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->intraNodeRank].recvBuffOffset);
+ regHandles[comm->localRank].recvBuffOffset = (char*)info->recvbuff - (char*)baseAddr;
+ TRACE(NCCL_COLL, "Base %p size %lu offset %ld", baseAddr, size, regHandles[comm->localRank].recvBuffOffset);
// Exchange handles within node
- NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, regHandles, sizeof(struct ncclBuffRegHandle)));
+ NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regHandles, sizeof(struct ncclBuffRegHandle)));
// Open handles at local process
for (int i=0; i<comm->localRanks; i++) {
- if (i == comm->intraNodeRank) {
+ // Skip myself
+ if (i == comm->localRank) {
regInfo->sendbuffsBase[i] = regInfo->recvbuffsBase[i] = NULL;
continue;
}
+ // Get base address of mapping
CUDACHECK(cudaIpcOpenMemHandle(regInfo->sendbuffsBase+i, regHandles[i].sendBuffIpc, cudaIpcMemLazyEnablePeerAccess));
CUDACHECK(cudaIpcOpenMemHandle(regInfo->recvbuffsBase+i, regHandles[i].recvBuffIpc, cudaIpcMemLazyEnablePeerAccess));
- // Get real address of buffer
+ // Get real buffer address by adding offset in the mapping
regInfo->sendbuffs[i] = (char*)regInfo->sendbuffsBase[i] + regHandles[i].sendBuffOffset;
regInfo->recvbuffs[i] = (char*)regInfo->recvbuffsBase[i] + regHandles[i].recvBuffOffset;
}
+ // Mark the operation as buffer-registered
regInfo->nBuffs = comm->localRanks;
TRACE(NCCL_COLL, "Rank %d exchanged %d buffers", comm->rank, regInfo->nBuffs);
return ncclSuccess;
+
+reg_fallback:
+ // If we cannot register these buffer types, just bypass this stage and continue without failing
+ (void)ret;
+ WARN("Unable to register user buffers");
+ return ncclSuccess;
}
// Compute enqueue element, save it in list
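Note: the registration path above is the standard CUDA IPC pattern: cudaIpcGetMemHandle describes the whole allocation (its base), so each rank also shares the user pointer's offset within that allocation, and peers rebuild the exact address after cudaIpcOpenMemHandle. A stripped-down two-process sketch of that pattern, illustrative only, with error handling and the exchange transport omitted (the patch exchanges handles with bootstrapIntraNodeAllGather and resolves the base through comm->pfnCuMemGetAddressRange):

// Exporting rank: the user pointer may sit inside a larger allocation.
CUdeviceptr base; size_t size;
cuMemGetAddressRange(&base, &size, (CUdeviceptr)userPtr);  // driver API equivalent
cudaIpcMemHandle_t handle;
cudaIpcGetMemHandle(&handle, userPtr);                     // handle refers to the allocation base
size_t offset = (char*)userPtr - (char*)(uintptr_t)base;
// ...send {handle, offset} to the importing rank...

// Importing rank: map the allocation and rebuild the exact pointer.
void* peerBase;
cudaIpcOpenMemHandle(&peerBase, handle, cudaIpcMemLazyEnablePeerAccess);
void* peerPtr = (char*)peerBase + offset;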
@@ -670,9 +695,8 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) {
// Compute cuda kernel arg and proxy arg templates
struct ncclQueueElem* eqElem;
NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem));
- struct ncclWorkElem* work = &eqElem->work;
- eqElem->proxyArgs.nsubs = 1;
- NCCLCHECK(computeColl(info, work, &eqElem->proxyArgs));
+ struct ncclWork* work = &eqElem->work;
+ NCCLCHECK(computeColl(info, work->elems, &eqElem->proxyOp));
// Determine grid size
struct cudaLaunchParams* params = comm->myParams;
@@ -681,14 +705,6 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) {
params->blockDim.x = std::max<unsigned>(params->blockDim.x, info->nThreads);
comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here
- // Inline the first kernel
- if (params->func == NULL) {
- params->func = ncclKerns[work->funcIndex];
- memcpy(&comm->args, work, sizeof(struct ncclWorkElem));
- comm->args.coll.bid = 0; // Only inline for channel 0
- comm->args.active = 2; // I am so far the last element; may be changed later in aggregation mode
- }
-
// Register and exchange input and output buffers
if (comm->usingCudaGraph && // only in CUDA graph mode
comm->graphRegister == 1 && // when registration is enabled
@@ -696,15 +712,26 @@ static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) {
comm->intraHighestTransportType == TRANSPORT_P2P && // only when all ranks can p2p each other
comm->intraRanks == 1) { // only in multi-process mode
NCCLCHECK(ncclRegBuffAndExchange(info, &eqElem->buffRegInfo));
- // Disable inline argument because we need kernel to copy the entire ncclWork from workFifo
- // because the registered addresses are in ncclWork
- if (eqElem->buffRegInfo.nBuffs > 0) comm->args.active = 0;
comm->enqueueInfo->nRegBuffs += eqElem->buffRegInfo.nBuffs;
+ work->header.type = ncclWorkTypeRegColl;
+ }
+
+ // Inline the first kernel
+ if (params->func == NULL) {
+ params->func = ncclKerns[work->header.funcIndex];
+ if (work->header.type == ncclWorkTypeColl) {
+ // Copy the first operation to the inline argument. Type may be set later to
+ // ncclWorkTypeUnused if we have more than one coll element.
+ memcpy(&comm->args, work->elems, sizeof(struct ncclWorkElem));
+ comm->args.bid = 0; // Only inline for channel 0
+ comm->args.header.isLast = 1; // I am so far the last element
+ }
}
return ncclSuccess;
}
+// Find the channel with the least enqueued work (counted in bytes)
static inline int findShortestChannel(ncclComm_t comm) {
size_t minSize = SIZE_MAX;
int minC = 0;
@@ -718,6 +745,7 @@ static inline int findShortestChannel(ncclComm_t comm) {
return minC;
}
+// Get next channel based on shortest-queue mode or round-robin mode
static inline int getNextChannel(ncclComm_t comm, int aggMode) {
int nextChannel = 0;
if (aggMode && comm->asyncAllocMode == ncclComm::SHORTEST_QUEUE) {
@@ -729,6 +757,8 @@ static inline int getNextChannel(ncclComm_t comm, int aggMode) {
return nextChannel;
}
+// Set up aggregated kernels
+// Op info has been previously saved in comm->asyncOps
ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
if (comm->asyncOpCount == 0) {
return ncclSuccess;
@@ -739,16 +769,22 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
NCCLCHECK(ncclSetupCollKernel(info));
} else {
// Aggregation
+ // Determine a per-channel chunk size used to divide an operation into multiple channels
size_t channelSize;
if (comm->channelSize > 0) {
+ // Set by user
channelSize = comm->channelSize;
} else if (comm->collNetSupport && comm->asyncOps[0].coll == ncclFuncAllReduce) {
+ // CollNet specific size (tuned based on experiments)
channelSize = 256 * 1024;
} else {
- channelSize = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks); // scale channel size based on nranks as latency increases
+ // Latency increases as scale increases
+ // We would thus want to increase the chunk size to compensate for the lost efficiency
+ channelSize = NCCL_AGG_CHANNEL_SIZE * std::min(16, comm->nRanks);
}
// Reduce the per-channel size if we cannot fully utilize the channels
while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2;
+ // Check whether the ops have the same reduce and data types (and hence can be packed into the same ncclWork)
int channelUsed = 0;
int homogeneous = 1;
int allCollNetSupport = comm->collNetSupport;
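Note: a quick worked example of the per-channel size reduction just above, using illustrative numbers rather than anything from the code:

// asyncTotalSize = 4 MB, nChannels = 32, channelSize starts at 512 KB:
//   4 MB < 512 KB * 32 (= 16 MB) -> halve to 256 KB
//   4 MB < 256 KB * 32 (=  8 MB) -> halve to 128 KB
//   4 MB < 128 KB * 32 (=  4 MB) is false -> stop at 128 KB
// i.e. the aggregated bytes are spread thinly enough that every channel still
// receives a full chunk.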
@@ -763,6 +799,7 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
if (allCollNetSupport > 0) NCCLCHECK(getCollNetSupport(info, &allCollNetSupport));
}
// Compute algo, proto, nthreads for the entire kernel
+ // Prepare a synthetic op info to calculate the collective algo
struct ncclInfo total;
total.comm = comm;
total.coll = comm->asyncOps[0].coll;
@@ -770,16 +807,18 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
total.nChannels = std::min(channelUsed, comm->nChannels);
int perChannelOps = DIVUP(channelUsed, total.nChannels);
if (homogeneous) NCCLCHECK(getAlgoInfo(&total, allCollNetSupport, perChannelOps));
+ // Set for each op
for (int c = 0; c < comm->asyncOpCount; c++) {
struct ncclInfo* info = comm->asyncOps+c;
if (homogeneous) {
+ // Set fields to skip the individual computeColl in ncclSetupCollKernel
info->algorithm = total.algorithm;
info->protocol = total.protocol;
info->nThreads = total.nThreads;
}
NCCLCHECK(ncclSetupCollKernel(info));
}
- comm->args.active = 0; // disable inline argument
+ comm->args.header.type = ncclWorkTypeUnused; // disable inline argument
}
// Reset counters
comm->asyncOpCount = 0;
@@ -787,6 +826,7 @@ ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) {
return ncclSuccess;
}
+// Store aggregated operations info
static ncclResult_t ncclSaveAsyncColl(struct ncclInfo* info) {
ncclComm_t comm = info->comm;
if (comm->asyncOpCount >= NCCL_MAX_OPS) {
@@ -805,25 +845,38 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
struct ncclComm* comm = info->comm;
int peer = info->root;
ssize_t nBytes = info->count*ncclTypeSize(info->datatype);
- if (info->opName[0] == 'S') { // Send
+ int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
+ int peerNode = comm->rankToNode[peer];
+ int peerIndex = comm->rankToLocalRank[peer];
+ int nsteps = comm->maxLocalRanks;
+ int rankIndex = comm->rankToLocalRank[comm->rank];
+ if (info->coll == ncclFuncSend) {
if (peer != comm->rank) {
- int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
+ int step = (nsteps + peerIndex - rankIndex)%nsteps;
+ int delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes;
+ if (comm->nNodes == 1) delta = (comm->nRanks + peer - comm->rank) % comm->nRanks;
+ // Mark channels that need pre-connect
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
- int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
- if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector
+ int shuffle = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
+ int channelId = (shuffle+comm->p2pChannels[c]) % comm->p2pnChannels;
+ if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
comm->connectSend[peer] |= (1<<channelId);
comm->connect = 1;
}
}
}
- NCCLCHECK(ncclSaveP2pInfo(comm->p2pSends[info->root], (void*)info->sendbuff, nBytes));
+ NCCLCHECK(ncclSaveP2pInfo(comm->p2pSends[info->root], info->recvbuff, nBytes));
comm->p2pSendCount++;
} else {
if (peer != comm->rank) {
- int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
+ int step = (nsteps + rankIndex - peerIndex)%nsteps;
+ int delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes;
+ if (comm->nNodes == 1) delta = (comm->nRanks - peer + comm->rank) % comm->nRanks;
+ // Mark channels that need pre-connect
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
- int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
- if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector
+ int shuffle = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
+ int channelId = (shuffle+comm->p2pChannels[c]) % comm->p2pnChannels;
+ if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
comm->connectRecv[peer] |= (1<<channelId);
comm->connect = 1;
}
@@ -835,134 +888,155 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) {
return ncclSuccess;
}
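The send path above folds the node-level delta and the intra-node step into one shuffle value before picking a channel. As a reading aid, here is a minimal standalone sketch of that mapping for the send case (the recv branch simply swaps the roles of rank and peer); the helper name and parameter list are illustrative, not an NCCL API:

    // Illustrative sketch of the channel choice made in ncclSaveP2p above.
    // nsteps corresponds to comm->maxLocalRanks and p2pGroupSize to
    // NCCL_MAX_WORK_ELEMENTS_P2P/2; channelOffset is comm->p2pChannels[c].
    static inline int p2pSendChannelId(int nNodes, int nRanks, int rank, int peer,
                                       int node, int peerNode,
                                       int rankIndex, int peerIndex,
                                       int nsteps, int p2pGroupSize,
                                       int channelOffset, int p2pnChannels) {
      int step  = (nsteps + peerIndex - rankIndex) % nsteps;
      int delta = (nNodes + peerNode - node) % nNodes;
      if (nNodes == 1) delta = (nRanks + peer - rank) % nRanks;          // single node: fall back to the rank delta
      int shuffle = (nNodes > 1) ? delta + (step / p2pGroupSize) : step; // group nearby local peers on the same channels
      return (shuffle + channelOffset) % p2pnChannels;
    }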
-enum { RingTree_Segment=0, P2P_Segment=1, CollNet_Segment=2 };
-static int getSegment(int type, int delta, struct ncclWork* work) {
- // Current ncclWork is full
- if (work->elems[NCCL_MAX_WORK_ELEMENTS-1].active != 0) return -1;
+static int getSegment(enum ncclWorkElemType type, enum ncclWorkElemSubType subType, int peer, struct ncclWork* work) {
+ if (work->header.type && (work->header.type != type)) return -1;
- if (type == P2P_Segment) { // P2P
- // Do not mix P2P and collective ops
- if (work->elems[0].funcIndex != FUNC_INDEX_P2P) return -1;
- for (int s=0; s<NCCL_MAX_WORK_ELEMENTS && work->elems[s].p2p.delta != delta; s++) {
- if (work->elems[s].active == 0) return s;
+ if (type == ncclWorkTypeP2p) { // P2P
+ int start = subType == ncclWorkSubTypeRecv ? 0 : 1;
+ for (int s=start; s<NCCL_MAX_WORK_ELEMENTS_P2P; s+=2) {
+ if (work->p2pElems[s].peer == -1) return s;
+ // Do not aggregate multiple sends to the same peer (or receives from the same peer)
+ if (work->p2pElems[s].peer == peer) return -1;
}
- } else if (type == CollNet_Segment) { // CollNet
- for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s+=NCCL_REG_ELEM_FACTOR) {
- if (work->elems[s].active == 0) return s;
+ } else if (type == ncclWorkTypeRegColl) { // CollNet
+ for (int s=0; s<NCCL_MAX_WORK_ELEMENTS_REG; s++) {
+ if (work->regElems[s].elem.header.type == ncclWorkTypeUnused) return s;
}
- } else { // Ring or Tree
+ } else if (type == ncclWorkTypeColl) { // Ring or Tree
for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
- if (work->elems[s].active == 0) return s;
+ if (work->elems[s].header.type == ncclWorkTypeUnused) return s;
}
}
return -1;
}
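The P2P branch of getSegment treats one ncclWork as interleaved recv/send slots: receives sit in even indices, sends in odd indices, and peer == -1 marks a free slot. A simplified sketch of that slot search, assuming the NCCL_MAX_WORK_ELEMENTS_P2P layout used above:

    // Sketch only: find the first free slot for one direction, mirroring the
    // P2P branch of getSegment. Even slots hold receives, odd slots hold sends.
    static inline int firstFreeP2pSlot(struct ncclWork* w, int isSend) {
      for (int s = isSend ? 1 : 0; s < NCCL_MAX_WORK_ELEMENTS_P2P; s += 2) {
        if (w->p2pElems[s].peer == -1) return s;   // unused slot
      }
      return -1;                                   // this work entry is full for that direction
      // (the real getSegment additionally refuses to aggregate a second op
      //  for the same peer and direction)
    }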
-static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElem* elem /* output */) {
- elem->comm = info->comm->devComm;
- elem->funcIndex = FUNC_INDEX_P2P;
- elem->nThreads = NCCL_MAX_NTHREADS;
- elem->sendbuff = info->sendbuff;
- elem->recvbuff = info->recvbuff;
- elem->p2p.sendCount = info->sendbytes;
- elem->p2p.recvCount = info->recvbytes;
- elem->p2p.sendChunkSize = info->sendChunkSize;
- elem->p2p.recvChunkSize = info->recvChunkSize;
- elem->p2p.delta = info->delta;
+// Compute kernel arguments for P2P ops
+static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElemP2p* elem /* output */) {
+ elem->header.type = ncclWorkTypeP2p;
+ elem->header.funcIndex = FUNC_INDEX_P2P;
+ elem->header.nWarps = NCCL_MAX_NTHREADS/WARP_SIZE;
+ elem->buff = info->recvbuff;
+ elem->subType = info->coll == ncclFuncSend ? ncclWorkSubTypeSend : ncclWorkSubTypeRecv;
+ elem->count = info->count;
+ elem->chunkSize = info->chunkSize;
+ elem->peer = info->root;
return ncclSuccess;
}
-static ncclResult_t enqueueSegOp(int type, struct ncclWorkElem* elem /* input */, struct ncclWork* work, int s,
+// Enqueue work elements into a segment of ncclWork
+// Supports both collectives (aggregated or not) and P2P
+static ncclResult_t enqueueSegOp(enum ncclWorkElemType type, struct ncclWork* elem /* input */, struct ncclWork* work, int s,
struct ncclBuffRegInfo* regInfo, struct ncclChannel* channel, struct ncclComm* comm) {
- // Copy element into corresponding segment of ncclWork
- memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem));
- work->elems[s].active = 1;
-
- // Determine nThreads at dynamic time
- if (type == P2P_Segment) {
- const int nsegments = s+1;
- int nThreads = 512;
- while (nsegments*nThreads > 512) nThreads /= 2;
- if (nThreads >= 128) nThreads += WARP_SIZE;
- for (int i=0; i<nsegments; i++) work->elems[i].p2p.nThreads = nThreads;
+
+ if (type == ncclWorkTypeP2p) {
+ memcpy(work->p2pElems+s, elem, sizeof(struct ncclWorkElemP2p));
+ int nelems = 0;
+ for (int i=0; i<NCCL_MAX_WORK_ELEMENTS_P2P; i++) {
+ if (work->p2pElems[i].header.type) nelems = i+1;
+ }
+
+ int ngroups = 1;
+ while (ngroups < nelems) ngroups *= 2;
+ int nWarps = 1;
+ while (nWarps*ngroups <= elem->header.nWarps/2) nWarps *= 2;
+
+ for (int i=0; i<ngroups; i++) {
+ work->p2pElems[i].ngroups = ngroups;
+ work->p2pElems[i].warpStart =
+ i*(NCCL_MAX_NTHREADS/WARP_SIZE)/ngroups;
+ int extraWarp = nWarps >= 2 ? i%2 : 0;
+ work->p2pElems[i].nWarps = nWarps + extraWarp;
+ }
+ return ncclSuccess;
}
+ memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem));
+
+ if (regInfo->nBuffs == 0) return ncclSuccess;
+
// Copy registered buffer addresses into ncclWork
- if (regInfo->nBuffs > 0) {
- struct ncclWorkRegElem* regElem = (struct ncclWorkRegElem*)(work->elems+s);
- // For CollNet
- for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
- int peer = channel->collTree.down[i];
- if (peer == -1) break;
- int j = comm->rankToIntraNodeRank[peer];
- if (j < 0) {
- WARN("Invalid intra-node rank %d for peer %d", j, peer);
- return ncclInternalError;
- }
- regElem->dnInputs[i] = regInfo->sendbuffs[j];
- regElem->dnOutputs[i] = regInfo->recvbuffs[j];
+ struct ncclWorkElemReg* regElem = (struct ncclWorkElemReg*)(work->elems+s);
+ // For CollNet
+ for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
+ int peer = channel->collTree.down[i];
+ if (peer == -1) break;
+ // Get intra-node slot
+ int j = comm->rankToLocalRank[peer];
+ if (j < 0) {
+ WARN("Invalid intra-node rank %d for peer %d", j, peer);
+ return ncclInternalError;
}
- for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
- int peer = channel->collTree.up[i];
- if (peer == -1) break;
- int j = comm->rankToIntraNodeRank[peer];
- if (j < 0) {
- WARN("Invalid intra-node rank %d for peer %d", j, peer);
- return ncclInternalError;
- }
- regElem->upOutputs[i] = regInfo->recvbuffs[j];
+ // Input buffer of leaf peer
+ regElem->dnInputs[i] = regInfo->sendbuffs[j];
+ // Output buffer of leaf peer
+ regElem->dnOutputs[i] = regInfo->recvbuffs[j];
+ }
+ for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) {
+ int peer = channel->collTree.up[i];
+ if (peer == -1) break;
+ int j = comm->rankToLocalRank[peer];
+ if (j < 0) {
+ WARN("Invalid intra-node rank %d for peer %d", j, peer);
+ return ncclInternalError;
}
- work->elems[s].regUsed = 1;
+ // Output buffer of root peer
+ regElem->upOutputs[i] = regInfo->recvbuffs[j];
}
+ work->elems[s].regUsed = 1;
return ncclSuccess;
}
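The P2P branch of enqueueSegOp splits the block's warps across the enqueued elements: the element count is rounded up to a power-of-two group count, each group then gets the largest power-of-two warp count such that the groups together stay within the block, and odd-indexed groups take one extra warp. A compact restatement of that arithmetic (illustrative helper, not part of NCCL):

    // Sketch of the warp split computed above for ncclWorkTypeP2p.
    // totalWarps corresponds to header.nWarps (NCCL_MAX_NTHREADS/WARP_SIZE).
    static inline void p2pWarpSplit(int nelems, int totalWarps, int* ngroups, int* nWarps) {
      int g = 1;
      while (g < nelems) g *= 2;            // round element count up to a power of two
      int w = 1;
      while (w*g <= totalWarps/2) w *= 2;   // double until the groups use more than half the warps
      *ngroups = g;
      *nWarps = w;                          // when w >= 2, odd-indexed groups get w+1 warps
    }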
+// Enqueue P2P op
ncclResult_t ncclEnqueueP2pKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem) {
- struct ncclWorkElem* workElem = &eqElem->work;
- struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs;
+ struct ncclWorkElemP2p* workElem = eqElem->work.p2pElems;
+ struct ncclProxyOp* proxyOp = &eqElem->proxyOp;
// Try to reuse last p2p operation if not full yet
- struct ncclChannel* channel = proxyArgs->subs[0].channel;
+ struct ncclChannel* channel = comm->channels+proxyOp->channelId;
int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS;
struct ncclWork* w = channel->workFifo+opIndex;
int segment = -1;
if (channel->workCount) {
// Try to pack more segments into a single operation
- segment = getSegment(P2P_Segment, workElem->p2p.delta, w);
+ segment = getSegment(ncclWorkTypeP2p, workElem->subType, workElem->peer, w);
}
if (segment == -1) {
NCCLCHECK(getNextOp(channel, &w, NULL));
- segment = 0;
+ segment = workElem->subType == ncclWorkSubTypeRecv ? 0 : 1;
+    // Initialize work as P2P; peer=-1 marks an unused p2p element.
+ w->header.type = ncclWorkTypeP2p;
+ for (int i=0; i<NCCL_MAX_WORK_ELEMENTS_P2P; i++) w->p2pElems[i].peer = -1;
}
+ //printf("%s to %d -> Channel %d OpCount %ld Segment %d\n", workElem->subType == ncclWorkSubTypeRecv ? "Recv" : "Send", proxyOp->root, channel->id, channel->workFifoTail-1, segment);
// store work element into FIFO
- NCCLCHECK(ncclProxySaveP2p(comm, proxyArgs));
- NCCLCHECK(enqueueSegOp(P2P_Segment, workElem, w, segment, &eqElem->buffRegInfo, channel, comm));
+ NCCLCHECK(ncclProxySaveP2p(comm, proxyOp));
+ NCCLCHECK(enqueueSegOp(ncclWorkTypeP2p, &eqElem->work, w, segment, &eqElem->buffRegInfo, channel, comm));
return ncclSuccess;
}
+// Setup P2P op
ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) {
ncclComm* comm = info->comm;
// Compute cuda kernel arg and proxy arg templates
struct ncclQueueElem* eqElem;
NCCLCHECK(comm->enqueueInfo->elemList->getNewElem(&eqElem));
// The proxy code will set and tune the send/recv chunk size, make sure to run it first.
- NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyArgs));
- NCCLCHECK(computeP2pWorkElem(info, &eqElem->work));
-
+ NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyOp));
+ NCCLCHECK(computeP2pWorkElem(info, eqElem->work.p2pElems));
+ // Compute grid size
int channelId = info->channelId;
struct cudaLaunchParams* params = comm->myParams;
params->gridDim.x = std::max<unsigned>(params->gridDim.x, channelId+1);
- params->blockDim.x = std::max<unsigned>(params->blockDim.x, eqElem->work.nThreads);
+ params->blockDim.x = std::max<unsigned>(params->blockDim.x, eqElem->work.header.nWarps*WARP_SIZE);
comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here
// Record the first kernel to launch
// Just for CUDA kernel to know this is a P2P operation
// The CUDA kernel does not use the inlined first work element as fastpath argument
if (params->func == NULL) {
- params->func = ncclKerns[eqElem->work.funcIndex];
- comm->args.comm = eqElem->work.comm;
- comm->args.active = 0;
+ params->func = ncclKerns[eqElem->work.header.funcIndex];
+ comm->args.header.type = ncclWorkTypeUnused;
}
return ncclSuccess;
}
@@ -970,24 +1044,24 @@ ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) {
// Dynamic enqueue function for collective kernels
// Supports both aggregated and non-aggregated modes
ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem, int aggMode) {
- struct ncclWorkElem* work = &eqElem->work;
- struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs;
+ struct ncclWork* work = &eqElem->work;
+ struct ncclWorkElem* elem = work->elems;
+ struct ncclProxyOp* proxyOp = &eqElem->proxyOp;
- int nChannels = work->coll.nChannels;
- size_t channelSize = work->coll.count*ncclTypeSize(proxyArgs->dtype)/work->coll.nChannels;
- int segmentType = proxyArgs->redOp == ncclNumOps ? RingTree_Segment : CollNet_Segment; // redOp is only set when using CollNet
+ int nChannels = elem->nChannels;
+ size_t channelSize = elem->count*ncclTypeSize(proxyOp->dtype)/elem->nChannels;
+ enum ncclWorkElemType workElemType = proxyOp->redOp == ncclNumOps ? ncclWorkTypeColl : ncclWorkTypeRegColl; // redOp is only set when using CollNet
for (int bid=0; bid<nChannels; bid++) {
int channelId = getNextChannel(comm, aggMode);
struct ncclChannel* channel = comm->channels+channelId;
// Proxy
- proxyArgs->subs[0].channel = channel;
- proxyArgs->opCount = comm->collOpCount;
- proxyArgs->commOpCount = comm->opCount;
- if (proxyArgs->subs[0].nsteps) NCCLCHECK(ncclProxySaveColl(proxyArgs, comm->nRanks));
+ proxyOp->channelId = channelId;
+ proxyOp->opCount = comm->collOpCount;
+ if (proxyOp->nsteps) NCCLCHECK(ncclProxySaveColl(comm, proxyOp, comm->nRanks));
- work->coll.bid = bid % nChannels;
+ elem->bid = bid % nChannels;
struct ncclWork* w = NULL;
int segment = -1;
if (aggMode && channel->workCount) {
@@ -996,9 +1070,9 @@ ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem*
w = channel->workFifo+opIndex;
// All elems in work must have same (funcIndex,nThreads),
// see "src/collectives/device/common.h"
- if (w->elems[0].funcIndex == work->funcIndex &&
- w->elems[0].nThreads == work->nThreads) {
- segment = getSegment(segmentType, 0, w);
+ if (w->header.funcIndex == work->header.funcIndex &&
+ w->header.nWarps == work->header.nWarps) {
+ segment = getSegment(workElemType, ncclWorkSubTypeUnused, 0, w);
}
}
if (segment == -1) {
@@ -1007,16 +1081,20 @@ ncclResult_t ncclEnqueueCollKernel(struct ncclComm* comm, struct ncclQueueElem*
}
// store work element into FIFO
- NCCLCHECK(enqueueSegOp(segmentType, work, w, segment, &eqElem->buffRegInfo, channel, comm));
+ NCCLCHECK(enqueueSegOp(workElemType, work, w, segment, &eqElem->buffRegInfo, channel, comm));
channel->totalSize += channelSize;
}
comm->collOpCount++;
return ncclSuccess;
}
+// Host setup node for CUDA Graph
+// Performs the enqueue job
template<int USING_CUDA_GRAPH>
void CUDART_CB ncclEnqueueHostSetup(void* arg) {
+ NVTX3_FUNC_RANGE_IN(nccl_domain);
ncclResult_t ret;
+ // All work for current launch has been captured in Queue Info
struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)arg;
ncclComm_t comm = eqInfo->comm;
int aggMode = eqInfo->elemList->count() > 1 ? 1 : 0;
@@ -1024,7 +1102,7 @@ void CUDART_CB ncclEnqueueHostSetup(void* arg) {
// Iterate through the element list
struct ncclQueueElem* eqElem = eqInfo->elemList->begin();
while (eqElem != NULL) {
- if (eqElem->work.funcIndex == FUNC_INDEX_P2P) {
+ if (eqElem->work.header.funcIndex == FUNC_INDEX_P2P) {
NCCLCHECKGOTO(ncclEnqueueP2pKernel(comm, eqElem), ret, cb_end);
} else {
NCCLCHECKGOTO(ncclEnqueueCollKernel(comm, eqElem, aggMode), ret, cb_end);
@@ -1045,6 +1123,8 @@ cb_end:
template void CUDART_CB ncclEnqueueHostSetup<0>(void*);
template void CUDART_CB ncclEnqueueHostSetup<1>(void*);
+// CUDA Graph helper thread
+// for de-registering user buffers
void* graphHelperFunc(void *args) {
struct ncclGraphHelperResources* res = (struct ncclGraphHelperResources*)args;
if (res == NULL) {
@@ -1058,8 +1138,10 @@ void* graphHelperFunc(void *args) {
volatile enum helperThreadState* state = &res->threadState;
volatile int* ipcTail = &res->ipcTail;
while (1) {
+    // Last IPC entry enqueued so far
int ipcTailMark = *ipcTail;
int ipcCount = 0;
+    // Close IPC handles up to the last entry
while (res->ipcHead != ipcTailMark) {
if (res->ipcBases[res->ipcHead] != NULL)
CUDACHECKIGNORE(cudaIpcCloseMemHandle(res->ipcBases[res->ipcHead]));
@@ -1069,6 +1151,7 @@ void* graphHelperFunc(void *args) {
}
TRACE(NCCL_COLL, "CUDA Graph helper thread closed %d IPC handles", ipcCount);
pthread_mutex_lock(&res->threadLock);
+ // Check for exit signal
while (res->ipcHead == *ipcTail && *state != ThreadStop) {
pthread_cond_wait(&res->threadCond, &res->threadLock);
}
@@ -1080,20 +1163,21 @@ void* graphHelperFunc(void *args) {
}
}
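The helper thread above follows a standard condition-variable handshake: it drains IPC handles up to the current tail, then sleeps until either the tail advances again or the owner signals ThreadStop. A minimal, self-contained sketch of that wait loop with generic names (not the NCCL structures):

    #include <pthread.h>
    struct ipcQueue {
      pthread_mutex_t lock;
      pthread_cond_t cond;
      volatile int head, tail;   // head: next handle to close, tail: last handle enqueued
      volatile int stop;         // set by the owner at teardown
    };
    // Sleep until new work arrives or shutdown is requested; the predicate is
    // re-checked after every wakeup, exactly as in graphHelperFunc above.
    static void ipcQueueWait(struct ipcQueue* q) {
      pthread_mutex_lock(&q->lock);
      while (q->head == q->tail && !q->stop) pthread_cond_wait(&q->cond, &q->lock);
      pthread_mutex_unlock(&q->lock);
    }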
+// Check if we are in CUDA Graph capture mode
ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) {
comm->usingCudaGraph = 0;
+ // Feature requires CUDA 11.3/R465 or above
#if CUDART_VERSION >= 11030
cudaStreamCaptureStatus captureStatus;
unsigned long long cudaGraphId;
+ ncclResult_t ret = ncclSuccess;
if (comm->driverVersion < 11030) {
- CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
- if (captureStatus != cudaStreamCaptureStatusNone) {
- WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
- return ncclInvalidUsage;
- }
- return ncclSuccess;
+    // Installed CUDA driver is older than the CUDA version NCCL was compiled with
+ // Enhanced compat fallback
+ goto enh_compat_end;
}
- CUDACHECK(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL));
+ // Get CUDA Graph handle
+ CUDACHECKGOTO(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL), ret, enh_compat_end);
if (captureStatus == cudaStreamCaptureStatusActive) {
if (cudaGraphId != comm->lastCudaGraphId) {
INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId);
@@ -1109,15 +1193,31 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) {
// Only create this thread when buffer registration is enabled
if ((!comm->graphHelperThread) && comm->graphRegister == 1 && comm->disableGraphHelper == 0) {
pthread_mutex_init(&comm->graphHelperResources->threadLock, NULL);
+ // Init signaling method between Graph destroy function and helper thread
pthread_cond_init(&comm->graphHelperResources->threadCond, NULL);
+ // Set state
comm->graphHelperResources->threadState = ThreadStart;
+ // Create thread
pthread_create(&comm->graphHelperThread, NULL, graphHelperFunc, comm->graphHelperResources);
+ // Name thread
+ ncclSetThreadName(comm->graphHelperThread, "NCCL GrHelper%2d", comm->cudaDev);
}
}
+ return ncclSuccess;
+
+enh_compat_end: // Enhanced compat fallback
+ (void)ret;
+ CUDACHECK(cudaStreamIsCapturing(comm->userStream, &captureStatus));
+ if (captureStatus != cudaStreamCaptureStatusNone) {
+ WARN("The installed CUDA driver is older than the minimum version (R465) required for NCCL's CUDA Graphs support");
+ return ncclInvalidUsage;
+ }
+  // If we are not in capture mode, we can ignore the driver version being older than required
#endif
return ncclSuccess;
}
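For reference, a hypothetical caller-side sketch of how this path gets exercised: the application captures NCCL calls on the communicator's stream, and ncclGetCudaGraph picks up the active capture through cudaStreamGetCaptureInfo_v2. Buffer, count and handle names below are placeholders, and a CUDA 11.3+ driver is assumed:

    // Hypothetical usage sketch (application side), not NCCL code.
    static ncclResult_t captureAllReduce(ncclComm_t comm, cudaStream_t stream,
                                         const void* sendbuf, void* recvbuf, size_t count,
                                         cudaGraphExec_t* exec) {
      cudaGraph_t graph;
      CUDACHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
      NCCLCHECK(ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum, comm, stream));
      CUDACHECK(cudaStreamEndCapture(stream, &graph));
      CUDACHECK(cudaGraphInstantiate(exec, graph, NULL, NULL, 0));
      return ncclSuccess;
    }
    // Each subsequent cudaGraphLaunch(*exec, stream) replays the collective,
    // including the host setup node added by ncclCudaGraphHostSetup below.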
+// Create host setup node in CUDA Graph
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph) {
#if CUDART_VERSION >= 11030
struct ncclQueueInfo* eqInfo = comm->enqueueInfo;
@@ -1125,14 +1225,17 @@ ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph) {
// which CUDA graph would manage lifetime of
cudaUserObject_t object;
CUDACHECK(cudaUserObjectCreate(&object, eqInfo, ncclDestroyQueueInfo, 1/*initialRefcount*/, cudaUserObjectNoDestructorSync));
+ // Hand over ownership to CUDA Graph
CUDACHECK(cudaGraphRetainUserObject(graph, object, 1, cudaGraphUserObjectMove));
cudaHostFn_t fn = ncclEnqueueHostSetup<1>;
// Add a CPU node to the graph
cudaGraphNode_t setupNode;
+ // Function + parameter space for that function (i.e. enqueue info)
cudaHostNodeParams setupNodeParams = {fn, eqInfo};
int numDependencies = comm->lastSetupNode == NULL ? 0 : 1;
CUDACHECK(cudaGraphAddHostNode(&setupNode, graph, &comm->lastSetupNode, numDependencies, &setupNodeParams));
+ // Create dependency from last setup node in the same graph
CUDACHECK(cudaStreamUpdateCaptureDependencies(comm->userStream, &setupNode, 1, cudaStreamAddCaptureDependencies));
comm->lastSetupNode = setupNode;
return ncclSuccess;
@@ -1237,7 +1340,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
- if (info->coll == ncclFuncSendRecv) { //p2p stored separately
+ if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { //p2p stored separately
NCCLCHECKGOTO(ncclSaveP2p(info), ret, end);
} else {
NCCLCHECKGOTO(ncclSaveAsyncColl(info), ret, end);
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index a26611e..da9a360 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,6 +8,7 @@
#include "graph.h"
#include "trees.h"
#include "rings.h"
+#include "topo.h"
/******************************************************************/
/********************* Internode connection ***********************/
@@ -17,7 +18,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
struct ncclTopoRanks* topoRanks) {
int rank = comm->rank;
- int localRanks = comm->localRanks;
+ int localRanks = comm->topo->nodes[GPU].count;
int nChannels = comm->nChannels;
for (int c=0; c<nChannels; c++) {
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 64c54df..2bd52b0 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -171,20 +171,21 @@ static ncclResult_t getLocalCpu(struct ncclTopoSystem* system, int gpu, int* ret
return ncclSuccess;
}
-static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int i1, int t2, int i2) {
- struct ncclTopoNode* cpuNode = system->nodes[CPU].nodes+c;
+static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) {
+ struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix;
struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1;
int l=0;
// Node 1 -> CPU
- for (int i=0; i<srcNode->paths[CPU][c].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[CPU][c].list[i];
+ for (int i=0; i<srcNode->paths[tx][ix].count; i++) srcNode->paths[t2][i2].list[l++] = srcNode->paths[tx][ix].list[i];
// CPU -> Node 2
for (int i=0; i<cpuNode->paths[t2][i2].count; i++) srcNode->paths[t2][i2].list[l++] = cpuNode->paths[t2][i2].list[i];
// Update path characteristics
srcNode->paths[t2][i2].count = l;
- srcNode->paths[t2][i2].type = std::max(srcNode->paths[CPU][c].type, cpuNode->paths[t2][i2].type);
- srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width);
+ srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type);
+ if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN;
+ srcNode->paths[t2][i2].width = std::min(srcNode->paths[tx][ix].width, cpuNode->paths[t2][i2].width);
return ncclSuccess;
}
@@ -241,6 +242,8 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE
return ncclSuccess;
}
+NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0);
+
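NCCL_PARAM declares an environment-backed tunable: ncclParamIgnoreDisabledP2p() returns the value of NCCL_IGNORE_DISABLED_P2P, defaulting to 0. A simplified approximation of what the macro provides (the real definition in src/include/param.h also caches the value and logs overrides):

    #include <stdlib.h>
    #include <stdint.h>
    // Simplified sketch, not the actual macro expansion.
    static int64_t ncclParamIgnoreDisabledP2pSketch() {
      static int64_t value = INT64_MIN;              // sentinel: not read yet
      if (value == INT64_MIN) {
        const char* str = getenv("NCCL_IGNORE_DISABLED_P2P");
        value = str ? strtoll(str, NULL, 0) : 0;     // 0 is the default passed to NCCL_PARAM
      }
      return value;
    }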
int ncclTopoUserP2pLevel = -1;
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank) {
*p2p = 0;
@@ -256,13 +259,14 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
return ncclSuccess;
}
-
+ int intermediateIndex = -1;
// Set intermediate GPU rank, if routing through an intermediate GPU.
struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
if (path->count == 2) {
struct ncclTopoNode* intermediateNode = path->list[0]->remNode;
- if (intermediateNode->type == GPU && intermediateRank) {
- *intermediateRank = intermediateNode->gpu.rank;
+ if (intermediateNode->type == GPU) {
+ intermediateIndex = intermediateNode - system->nodes[GPU].nodes;
+ if (intermediateRank) *intermediateRank = intermediateNode->gpu.rank;
}
}
@@ -292,6 +296,38 @@ compare:
// Compute the PCI distance and compare with the p2pLevel.
if (path->type <= p2pLevel) *p2p = 1;
+ if (*p2p == 1) {
+ // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to
+ // validate against NVML at all since they are pretending to be on other hw.
+ if (g1 != g2 && ncclParamIgnoreDisabledP2p() != 2) {
+ int indexes[3] = {-1,-1,-1};
+ int verticeN = 0;
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+
+ indexes[verticeN++] = system->nodes[GPU].nodes[g1].gpu.dev;
+ if (intermediateIndex != -1) indexes[verticeN++] = system->nodes[GPU].nodes[intermediateIndex].gpu.dev;
+ indexes[verticeN++] = system->nodes[GPU].nodes[g2].gpu.dev;
+
+ for (int i=1; i < verticeN; i++) {
+ nvmlGpuP2PStatus_t status;
+ status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusRead;
+ bool good = status == NVML_P2P_STATUS_OK;
+ status = ncclNvmlDevicePairs[indexes[i-1]][indexes[i-0]].p2pStatusWrite;
+ good &= status == NVML_P2P_STATUS_OK;
+ if (!good) {
+ if (ncclParamIgnoreDisabledP2p()) {
+ *p2p = 0;
+ } else if (path->type <= PATH_NVB) {
+ WARN("P2P is disabled between NVLINK connected GPUs %d and %d. This should not be the case given their connectivity, and is probably due to a hardware issue. If you still want to proceed, you can set NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
+ return ncclUnhandledCudaError;
+ } else if (path->type < PATH_SYS) {
+          INFO(NCCL_INIT, "P2P is disabled between connected GPUs %d and %d. You can suppress this message with NCCL_IGNORE_DISABLED_P2P=1.", indexes[i-1], indexes[i-0]);
+ }
+ }
+ }
+ }
+ }
+
if (path->type == PATH_NVL) {
struct ncclTopoNode* gpu2 = system->nodes[GPU].nodes+g2;
// Enable P2P Read for Ampere/NVLink only
@@ -342,6 +378,14 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL"));
if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel;
int distance = gpu->paths[NET][n].type;
+ if (distance == PATH_PXN) {
+ // In case of PXN, use the intermediate GPU distance instead
+ int proxyRank, g;
+ NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netDev, &proxyRank));
+ NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g));
+ struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g;
+ distance = proxyGpu->paths[NET][n].type;
+ }
if (distance > netGdrLevel) {
INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
return ncclSuccess;
@@ -352,6 +396,77 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
return ncclSuccess;
}
+ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank) {
+ // Get GPU and NET
+ int n, g;
+ NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n));
+ NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ struct ncclTopoLinkList* path = gpu->paths[NET]+n;
+ if (path->type == PATH_PXN) {
+ struct ncclTopoNode* node;
+ int type = NVS;
+ for (int i=0; i<path->count && type == NVS; i++) {
+ node = path->list[i]->remNode;
+ type = node->type;
+ }
+ if (type != GPU) {
+ WARN("Could not find intermediate GPU between GPU rank %d and NIC %d\n", rank, netDev);
+ return ncclInternalError;
+ }
+ *intermediateRank = node->gpu.rank;
+ } else {
+ *intermediateRank = rank;
+ }
+ return ncclSuccess;
+}
+
+NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0);
+
+// Net v4 plugins don't have non-blocking connect/accept, so we can't use
+// remote proxies without risking deadlocks.
+int ncclPxnDisable() {
+ static int pxnDisable = -1;
+ if (pxnDisable == -1) {
+ if (ncclNetVersion() == 4) {
+ INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
+ pxnDisable = 1;
+ } else {
+ pxnDisable = ncclParamPxnDisable();
+ }
+ }
+ return pxnDisable;
+}
+
+ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks) {
+ struct ncclTopoSystem* system = comm->topo;
+ *nranks = 0;
+ *intermediateRanks = NULL;
+ if (system->nodes[NET].count == 0) return ncclSuccess;
+
+ int nr = 0;
+ int* ranks = NULL;
+ for (int rank=0; rank<comm->nRanks; rank++) {
+ int netDev, proxyRank;
+ NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netDev, &proxyRank));
+ if (proxyRank == comm->rank) continue;
+ int useGdr;
+ NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netDev, 1, &useGdr));
+ if (useGdr == 0) continue;
+ int found = 0;
+ for (int r=0; r<nr; r++) {
+ if (ranks[r] == proxyRank) found = 1;
+ }
+ if (!found) {
+ NCCLCHECK(ncclRealloc(&ranks, nr, nr+1));
+ ranks[nr++] = proxyRank;
+ }
+ }
+ *nranks = nr;
+ *intermediateRanks = ranks;
+ return ncclSuccess;
+}
+
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
// Precompute paths between GPUs/NICs.
@@ -376,7 +491,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
// Divert all traffic through the CPU
int cpu;
NCCLCHECK(getLocalCpu(system, g, &cpu));
- NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
+ NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
}
}
@@ -403,6 +518,29 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
NCCLCHECK(ncclTopoSetPaths(netNode, system));
for (int g=0; g<system->nodes[GPU].count; g++) {
+ // Check whether we can access the NIC through another NVLink-connected GPU (PXN)
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ if (ncclPxnDisable() != 1 && gpu->paths[NET][n].type > PATH_PXB) {
+ for (int p=0; p<system->nodes[GPU].count; p++) {
+ if (p == g) continue;
+ struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+p;
+
+ // To ensure proper balancing, use only a local GPU which advertised that NIC as its preferred one.
+ int netDev;
+ NCCLCHECK(ncclTopoGetLocalNet(system, peerNode->gpu.rank, &netDev));
+ // Make sure we can allocate memory on that GPU.
+ if (netDev != netNode->id) continue;
+
+ // PXN = PCI + NVLink.
+ if (netNode->paths[GPU][p].type > PATH_PXB || peerNode->paths[GPU][g].type > PATH_NVL) continue;
+
+ // We can use that GPU as relay to communicate with that NIC.
+ // Only enabling it in the GPU->NIC direction for now to favor
+ // receiving locally and sending remotely (consistent with net.cc)
+ NCCLCHECK(addInterStep(system, GPU, p, GPU, g, NET, n));
+ break;
+ }
+ }
// Update path when we dont want to / can't use GPU Direct RDMA.
int gdr;
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
@@ -410,8 +548,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
NCCLCHECK(getLocalCpu(system, g, &localCpu));
- NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g));
- NCCLCHECK(addCpuStep(system, localCpu, GPU, g, NET, n));
+ NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
+ NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
}
}
}
@@ -454,7 +592,6 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
}
- comm->localRanks = system->nodes[GPU].count;
if (system->nodes[GPU].count == comm->nRanks) {
for (int n=system->nodes[NET].count-1; n>=0; n--)
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
@@ -469,6 +606,8 @@ void ncclTopoFree(struct ncclTopoSystem* system) {
free(system);
}
+NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", 2);
+
static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*local gpu index*/, int peerRank, int* nChannels) {
int peer;
struct ncclTopoLinkList* path = NULL;
@@ -488,7 +627,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclTopoSystem* system, int g /*
}
} else {
// Remote rank, use network
- *nChannels = 1;
+ *nChannels = ncclParamNChannelsPerNetPeer();
}
return ncclSuccess;
}
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 8894bd1..d70b6a7 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -254,10 +254,10 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time);
 // Try to keep all searches within one second
-#define NCCL_SEARCH_GLOBAL_TIMEOUT (3ULL<<19)
-#define NCCL_SEARCH_TIMEOUT (1<<18)
-#define NCCL_SEARCH_TIMEOUT_TREE (1<<17)
-#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<10)
+#define NCCL_SEARCH_GLOBAL_TIMEOUT (1ULL<<18)
+#define NCCL_SEARCH_TIMEOUT (1<<14)
+#define NCCL_SEARCH_TIMEOUT_TREE (1<<14)
+#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<8)
#define FORCED_ORDER_PCI 1
#define FORCED_ORDER_REPLAY 2
@@ -305,6 +305,57 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoG
return ncclSuccess;
}
+// Build a list of the best NETs to try.
+//
+// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
+// index when trying to get back to the NIC.
+//
+// The list is built the following way:
+// 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
+// 2. For each GPU, once that list of NICs with a given distance is prepared, shuffle the list
+// based on the GPU NVML index so that e.g. GPU 1 chooses NIC 1 first instead of NIC 0 which
+//    might have been chosen by GPU 0 (case with multiple independent communicators per node)
+// 3. Then add the NETs to the final list if they were not already added by another closer GPU.
+
+ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) {
+ int netCount = 0;
+ int localNetCount;
+ int* localNets;
+ NCCLCHECK(ncclCalloc(&localNets, system->nodes[NET].count));
+
+ for (int t=0; t <= typeInter; t++) {
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ if (gpu != -1 && gpu != g) continue;
+ localNetCount = 0;
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ struct ncclTopoLinkList* paths = gpu->paths[NET];
+ for (int n=0; n<system->nodes[NET].count; n++) {
+ if (paths[n].type == t) localNets[localNetCount++] = n;
+ }
+ if (localNetCount == 0) continue;
+ // Shuffle by gpu NVML device number so that GPUs on the same PCI switch
+ // with multiple NICs don't use the same one as first choice.
+ for (int r=0; r<system->nodes[GPU].nodes[g].gpu.dev % localNetCount; r++) {
+ int net0 = localNets[0];
+ for (int i=0; i<localNetCount-1; i++) localNets[i] = localNets[i+1];
+ localNets[localNetCount-1] = net0;
+ }
+ // Append NICs to list
+ for (int i=0; i<localNetCount; i++) {
+ int n = localNets[i];
+ int found = 0;
+ while (nets[found] != n && found<netCount) found++;
+ if (found == netCount) nets[netCount++] = n;
+ }
+ }
+ }
+
+ *netCountRet = netCount;
+ free(localNets);
+
+ return ncclSuccess;
+}
+
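As a worked example of the shuffle above (hypothetical topology): with two GPUs behind one PCI switch and two NICs at the same distance, the rotation by gpu.dev yields

    GPU dev 0: localNets = {NET0, NET1}  -> NET0 tried first
    GPU dev 1: localNets = {NET1, NET0}  -> NET1 tried first

so two single-GPU communicators created on that node spread across both NICs instead of both starting from NET0.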
ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
if ((*time) <= 0) return ncclSuccess;
(*time)--;
@@ -333,7 +384,12 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
int startNetIndex;
NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
- for (int n=0; n<system->nodes[NET].count; n++) {
+ int netcount;
+ int* nets;
+ NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
+ NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netcount));
+ for (int i=0; i<netcount; i++) {
+ int n = nets[i];
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
@@ -359,6 +415,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
graph->speedInter = speedInterSave;
}
}
+ free(nets);
}
} else if (step < system->nodes[GPU].count-1) {
// Go to next GPU
@@ -393,65 +450,12 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
return ncclSuccess;
}
-// Select only NICs with the maximum bandwidth w.r.t. GPUs, and sort them by distance.
-ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int* nets, int* netcountRet) {
- float* maxwidths;
- int* minhops;
- int netcount = 0;
- NCCLCHECK(ncclCalloc(&minhops, system->nodes[NET].count));
- NCCLCHECK(ncclCalloc(&maxwidths, system->nodes[NET].count));
- for (int n=0; n<system->nodes[NET].count; n++) {
- maxwidths[n] = 0.0;
- minhops[n] = 255;
- struct ncclTopoNode* net = system->nodes[NET].nodes+n;
- struct ncclTopoLinkList* paths = net->paths[GPU];
- for (int g=0; g<system->nodes[GPU].count; g++) {
- if (paths[g].width > maxwidths[n] || (paths[g].width == maxwidths[n] && paths[g].count < minhops[n])) {
- maxwidths[n] = paths[g].width;
- minhops[n] = paths[g].count;
- }
- }
- if (netcount && maxwidths[nets[0]] > maxwidths[n]) continue; // Do not keep NICs with lower BW
- if (netcount && maxwidths[nets[0]] < maxwidths[n]) netcount = 0; // Remove all NICs with lower BW
- int index;
- for (index = 0; index < netcount; index++) {
- if (minhops[n] < minhops[nets[index]]) break;
- }
- // Insert net at index
- // Shift all nets with higher nhops
- for (int i = netcount; i>index; i--) nets[i] = nets[i-1];
- // Insert this net at index
- nets[index] = n;
- netcount++;
- }
-
- *netcountRet = netcount;
-
- // Then shuffle NICs with the same nhops based on the GPU device number, so that when we have
- // 2 NICs and 2 GPUs and create communicators with only one GPU, we will use both NICs.
- for (int start = 0; start < netcount;) {
- int end = start+1;
- while (end < netcount && minhops[nets[end]] == minhops[nets[start]]) end++;
- // Shuffle
- for (int r=0; r<system->nodes[GPU].nodes[0].gpu.dev % (end-start); r++) {
- int netStart = nets[start];
- for (int i=start; i<end-1; i++) nets[i] = nets[i+1];
- nets[end-1] = netStart;
- }
- start = end;
- }
-
- free(minhops);
- free(maxwidths);
- return ncclSuccess;
-}
-
ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
const int speed = graph->speedInter;
int* nets;
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
int netcount;
- NCCLCHECK(ncclTopoSelectNets(system, nets, &netcount));
+ NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netcount));
for (int i=0; i<netcount; i++) {
int n = nets[i];
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
@@ -461,6 +465,8 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
if (net->net.maxChannels == 0) continue;
graph->inter[graph->nChannels*2] = net->id;
+ graph->latencyInter = net->net.latency;
+
for (int i=0; i<system->nodes[NET].count; i++) {
if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
(system->nodes[NET].nodes[i].net.port == net->net.port)) {
@@ -587,7 +593,18 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
/* User defined graph from XML file */
/************************************/
-struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "NVB", PATH_NVB}, { "LOC", PATH_LOC }, { NULL, 0 } };
+struct kvDict kvDictLinkType[] = {
+ { "LOC", PATH_LOC },
+ { "NVL", PATH_NVL },
+ { "NVB", PATH_NVB },
+ { "PIX", PATH_PIX },
+ { "PXB", PATH_PXB },
+ { "PXN", PATH_PXN },
+ { "PHB", PATH_PHB },
+ { "SYS", PATH_SYS },
+ { NULL, 0 }
+};
+
ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int* inter = graph->inter+2*c;
@@ -627,6 +644,7 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc
NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->speedIntra));
NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->speedInter));
+ if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0;
const char* str;
NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str));
NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType));
@@ -685,6 +703,7 @@ ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTop
NCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels));
NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->speedIntra));
NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->speedInter));
+ NCCLCHECK(xmlSetAttrFloat(xmlGraph, "latencyinter", graph->latencyInter));
const char* str;
NCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType));
NCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str));
@@ -712,10 +731,14 @@ float speedArrayInter[] = { 48.0, 30.0, 24.0, 22.0, 18.0, 15.0, 12.0, 10.0, 9.0,
#define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
#define NSPEEDSINTER (sizeof(speedArrayInter)/sizeof(float))
+NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
+
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
+ graph->crossNic = ncclParamCrossNic();
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
graph->speedIntra = graph->speedInter = 0;
+ graph->latencyInter = 0;
if (graph->crossNic == 2) graph->crossNic = 0;
graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
graph->typeInter = PATH_PIX;
@@ -802,19 +825,13 @@ search:
goto search;
}
tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
- if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXB)) {
+
+ if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
tmpGraph.typeInter += 1;
goto search;
}
tmpGraph.typeInter = PATH_PIX;
- // Try a simpler tree
- if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
- tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
- goto search;
- }
- tmpGraph.pattern = graph->pattern;
-
if (crossNic && tmpGraph.crossNic == 0) {
// Try again with crossNic if permitted
tmpGraph.crossNic = crossNic;
@@ -822,6 +839,13 @@ search:
}
tmpGraph.crossNic = 0;
+ // Try a simpler tree
+ if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) {
+ tmpGraph.pattern = NCCL_TOPO_PATTERN_TREE;
+ goto search;
+ }
+ tmpGraph.pattern = graph->pattern;
+
// Decrease speed until we find a solution
if ((speedIndex < nspeeds-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->speedInter > .49))) {
tmpGraph.speedInter = tmpGraph.speedIntra = speedArray[++speedIndex];
@@ -915,17 +939,66 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
return ncclSuccess;
}
-ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* dev) {
+// 0: don't use PXN for P2P, 1: use PXN if needed, 2: use PXN as much as possible to maximize aggregation
+NCCL_PARAM(P2pPxnLevel, "P2P_PXN_LEVEL", 2);
+
+#include "comm.h"
+ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* dev, int* proxyRank) {
if (graph) {
// Honor the net device in the graph
int channel = channelId%graph->nChannels;
- int ngpus = system->nodes[GPU].count;
+ int ngpus = comm->topo->nodes[GPU].count;
int index = graph->intra[channel*ngpus] == rank ? 0 : 1;
*dev = graph->inter[channel*2+index];
+ NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
+ } else if (peerRank == -1) {
+ return ncclInternalError;
} else {
- int64_t id;
- NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, rr));
- *dev = id;
+ // Start with our local NIC and local Rank
+ NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, dev));
+ *proxyRank = rank;
+
+ int pxnLevel = ncclPxnDisable() == 1 ? 0 : ncclParamP2pPxnLevel();
+ // See whether we can use the remote rank preferred device.
+ if (ncclParamCrossNic() == 0 || (pxnLevel != 0)) {
+ int netDev = comm->peerInfo[peerRank].netDev;
+ int n;
+ // Check that device exists on our node
+ if (ncclParamCrossNic() == 0) {
+ if (ncclTopoIdToIndex(comm->topo, NET, netDev, &n) != ncclSuccess) {
+ WARN("Rank %d requires NIC %d but that NIC is not available for rank %d", peerRank, netDev, rank);
+ return ncclInvalidUsage;
+ }
+ *dev = netDev;
+ }
+ if (pxnLevel == 1) {
+ int g, n;
+ NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g));
+ NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
+ struct ncclTopoNode* gpu = comm->topo->nodes[GPU].nodes+g;
+ if (gpu->paths[NET][n].type <= PATH_PXN) {
+ *dev = netDev;
+ NCCLCHECK(ncclTopoGetIntermediateRank(comm->topo, rank, *dev, proxyRank));
+ }
+ } else if (pxnLevel == 2) {
+ // Check whether we can access it through our node-local GPU for that NIC.
+ for (int r=0; r<comm->localRanks; r++) {
+ int peerRank = comm->localRankToRank[r];
+ if (comm->peerInfo[peerRank].netDev == netDev) {
+ int g1, g2, n;
+ NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1));
+ NCCLCHECK(ncclTopoRankToIndex(comm->topo, peerRank, &g2));
+ NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netDev, &n));
+ struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2;
+ if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) {
+ *proxyRank = peerRank;
+ *dev = netDev;
+ return ncclSuccess;
+ }
+ }
+ }
+ }
+ }
}
return ncclSuccess;
}
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index 1d34286..83f125f 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -20,8 +20,8 @@
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
-const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "SYS", "NET" };
-const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PHB", "SYS" };
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" };
+const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" };
/******************************************************************/
/******************* Graph Creation Functions *********************/
@@ -121,6 +121,7 @@ ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNo
n->net.asic = 0ULL;
n->net.port = NCCL_TOPO_UNDEF;
n->net.width = 0.0;
+ n->net.latency = 0.0;
}
*node = n;
return ncclSuccess;
@@ -332,13 +333,14 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s
ncclDebugNoWarn = NCCL_GRAPH;
int mbps;
- if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0;
+ NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0));
if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
net->net.width = mbps / 8000.0;
- if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0;
- if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
- if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
- if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
+ if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0;
+ NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0));
+ NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0));
+ NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS));
+ NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0));
ncclDebugNoWarn = 0;
NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
@@ -578,6 +580,16 @@ static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attr
}
return ncclSuccess;
}
+static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ snprintf(node->attrs[index].value, MAX_STR_LEN, "%f", value);
+ }
+ return ncclSuccess;
+}
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
@@ -614,7 +626,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
// Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
// so we start with collnet so that it has precedence.
int netDevCount = 0;
- if (ncclCollNet) {
+ if (collNetSupport()) {
NCCLCHECK(collNetDevices(&netDevCount));
for (int n=0; n<netDevCount; n++) {
ncclNetProperties_t props;
@@ -643,6 +655,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
+ NCCLCHECK(xmlInitAttrFloat(netNode, "latency", props.latency));
NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
@@ -662,7 +675,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
return ncclSuccess;
}
-ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr) {
+ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id) {
int g;
NCCLCHECK(ncclTopoRankToIndex(system, rank, &g));
int minType = PATH_SYS;
@@ -679,6 +692,13 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_
}
if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
}
+ if (count == 0) {
+ *id = -1;
+ free(nets);
+ return ncclSuccess;
+ }
+
+ int rr = system->nodes[GPU].nodes[g].gpu.dev;
*id = nets[rr%count];
free(nets);
return ncclSuccess;
@@ -778,3 +798,14 @@ ncclResult_t ncclTopoGetCompCap(struct ncclTopoSystem* system, int* ccMin, int*
if (ccMax) *ccMax = max;
return ncclSuccess;
}
+
+ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank) {
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
+ *localRank = g;
+ return ncclSuccess;
+ }
+ }
+ WARN("Could not find local GPU with rank %d\n", rank);
+ return ncclInternalError;
+}
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 304b496..ada1732 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -43,9 +43,10 @@ extern const char* topoNodeTypeStr[];
// Skipping 2 for PATH_NVB
#define LINK_PCI 3
// Skipping 4 for PATH_PXB
-// Skipping 5 for PATH_PHB
-#define LINK_SYS 6
-#define LINK_NET 7
+// Skipping 5 for PATH_PXN
+// Skipping 6 for PATH_PHB
+#define LINK_SYS 7
+#define LINK_NET 8
extern const char* topoLinkTypeStr[];
#define PATH_LOC 0
@@ -53,8 +54,10 @@ extern const char* topoLinkTypeStr[];
#define PATH_NVB 2
#define PATH_PIX 3
#define PATH_PXB 4
-#define PATH_PHB 5
-#define PATH_SYS 6
+#define PATH_PXN 5
+#define PATH_PHB 6
+#define PATH_SYS 7
+#define PATH_DIS 7
extern const char* topoPathTypeStr[];
struct ncclTopoNode;
@@ -93,6 +96,7 @@ struct ncclTopoNode {
uint64_t asic;
int port;
float width;
+ float latency;
int gdrSupport;
int collSupport;
int maxChannels;
@@ -132,8 +136,7 @@ ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id)
ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width);
ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
-
-ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_t* id, int rr);
+ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int netDev, int* intermediateRank);
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* nChannels);
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index e30a927..b07ca38 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -66,7 +66,7 @@ static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
/* PCI */
{ /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 1.0, 1.9, 8.0 } },
/* NET */
- { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 9.6 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } }
+ { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 28 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 28 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } }
};
// LL128 max BW per channel
@@ -80,8 +80,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
- comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
- getNthreads("NCCL_NTHREADS", ncclParamNthreads(), NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
+ comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
@@ -112,7 +111,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
nRanks;
- int nInterSteps = coll == ncclFuncAllReduce ? 2*(nNodes-1) :
+ int nInterSteps = coll == ncclFuncAllReduce ? (nNodes > 1 ? 2*nNodes :0) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 :
nNodes;
@@ -138,7 +137,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->latencies[coll][a][p] = baseLat[a][p];
float intraLat = hwLat[intraHw[a]][a][p];
- float interLat = hwLat[NCCL_HW_NET][a][p];
+ float interLat = graphs[a]->latencyInter ? graphs[a]->latencyInter : hwLat[NCCL_HW_NET][a][p];
+
if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8;
if (a == NCCL_ALGO_RING) {
float lat = hwLat[hw[a]][a][p];
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index 8f50301..838a7f5 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -602,7 +602,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
if (busId == NULL || cudaDeviceGetByPCIBusId(&dev, busId) != cudaSuccess) dev = -1;
} else {
- NCCLCHECK(wrapNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev));
+ NCCLCHECK(ncclNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev));
}
NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev));
}
@@ -617,7 +617,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
CUDACHECK(cudaGetDeviceProperties(&devProp, dev));
cudaMajor = devProp.major; cudaMinor = devProp.minor;
} else {
- NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor));
+ NCCLCHECK(ncclNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor));
}
NCCLCHECK(xmlSetAttrInt(gpuNode, "sm", cudaMajor*10+cudaMinor));
}
@@ -638,15 +638,15 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
for (int l=0; l<maxNvLinks; ++l) {
// Check whether we can use this NVLink for P2P
unsigned canP2P;
- if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
+ if ((ncclNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
// Make sure the Nvlink is up. The previous call should have trained the link.
nvmlEnableState_t isActive;
- if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+ if ((ncclNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
// Try to figure out what's on the other side of the NVLink
nvmlPciInfo_t remoteProc;
- if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
+ if (ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
// Make a lower case copy of the bus ID for calling ncclDeviceType
// PCI system path is in lower case
@@ -701,13 +701,7 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
NCCLCHECK(xmlSetAttrIfUnset(node, "class", "0x03"));
NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
nvmlDevice_t nvmlDev = NULL;
- static int nvmlInit = 0;
- if (nvmlInit == 0) {
- nvmlInit = (wrapNvmlSymbols() != ncclSuccess || wrapNvmlInit() != ncclSuccess) ? 2 : 1;
- }
- if (nvmlInit == 1) {
- if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
- }
+ if (ncclNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
return ncclSuccess;
}
diff --git a/src/graph/xml.h b/src/graph/xml.h
index 0c16b95..73f777d 100644
--- a/src/graph/xml.h
+++ b/src/graph/xml.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -94,6 +94,14 @@ static ncclResult_t xmlGetAttrInt(struct ncclXmlNode* node, const char* attrName
return ncclSuccess;
}
+static ncclResult_t xmlGetAttrIntDefault(struct ncclXmlNode* node, const char* attrName, int* value, int defaultValue) {
+ const char* str;
+ NCCLCHECK(xmlGetAttr(node, attrName, &str));
+ *value = str ? strtol(str, NULL, 0) : defaultValue;
+ return ncclSuccess;
+}
+
+
static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) {
const char* str;
NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
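The new xmlGetAttrIntDefault() helper parses an integer attribute but falls back to a caller-supplied default when the attribute is missing. A self-contained analogue of that fallback pattern (illustration only, not the NCCL helper itself):

#include <cstdlib>
#include <cstdio>

// Standalone analogue of xmlGetAttrIntDefault(): parse the string if present,
// otherwise return the supplied default. strtol with base 0 accepts hex too.
static int attrIntOrDefault(const char* str, int defaultValue) {
  return str ? (int)strtol(str, NULL, 0) : defaultValue;
}

int main() {
  printf("%d\n", attrIntOrDefault("0x10", 1)); // -> 16
  printf("%d\n", attrIntOrDefault(NULL, 1));   // -> 1 (attribute missing)
  return 0;
}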
diff --git a/src/group.cc b/src/group.cc
index 217e76d..0e8f19e 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -52,21 +52,6 @@ struct ncclAsyncArgs {
thread_local struct ncclAsyncArgs ncclGroupArgs[MAX_ASYNC_OPS];
-#define NCCLCHECKTHREAD(a) do { \
- if ((args->ret = (a)) != ncclSuccess) { \
- INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
- return args; \
- } \
-} while(0)
-
-#define CUDACHECKTHREAD(a) do { \
- if ((a) != cudaSuccess) { \
- INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
- args->ret = ncclUnhandledCudaError; \
- return args; \
- } \
-} while(0)
-
void* ncclAsyncThreadMain(void* args_) {
struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_;
NCCLCHECKTHREAD(args->init.func(args->init.newcomm, args->init.ndev, args->init.commId, args->init.myrank, args->init.cudaDev));
@@ -116,15 +101,19 @@ ncclResult_t ncclGroupStart() {
return ncclSuccess;
}
-static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int channelId, ssize_t recvbytes, void* recvbuff, ssize_t sendbytes, const void* sendbuff) {
- struct ncclInfo info = { ncclFuncSendRecv, "SendRecv",
- sendbuff, recvbuff, (size_t)std::max<ssize_t>(sendbytes,recvbytes), ncclInt8, ncclSum, -1, comm, comm->userStream, /* Args */
+static ncclResult_t scheduleSend(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff) {
+ struct ncclInfo info = { ncclFuncSend, "Send",
+ NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
+ 1, 1 };
+ info.channelId = channelId;
+ NCCLCHECK(ncclSetupP2pKernel(&info));
+ return ncclSuccess;
+}
+static ncclResult_t scheduleRecv(struct ncclComm* comm, int peer, int channelId, size_t count, void* buff) {
+ struct ncclInfo info = { ncclFuncRecv, "Recv",
+ NULL, buff, count, ncclInt8, ncclSum, peer, comm, comm->userStream, /* Args */
1, 1 };
- info.delta = delta;
info.channelId = channelId;
- info.sendbytes = sendbytes;
- info.recvbytes = recvbytes;
- if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage;
NCCLCHECK(ncclSetupP2pKernel(&info));
return ncclSuccess;
}
@@ -134,7 +123,7 @@ void* ncclAsyncThreadPreconnect(void* args_) {
struct ncclComm* comm = args->coll.comm;
CUDACHECKTHREAD(cudaSetDevice(comm->cudaDev));
if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
- NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, 0));
+ NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, 1));
return args;
}
@@ -216,8 +205,10 @@ ncclResult_t ncclGroupEnd() {
struct ncclAsyncArgs* args = ncclGroupArgs+i;
if (args->funcType == ASYNC_FUNC_COLL) {
struct ncclComm* comm = args->coll.comm;
- int rank = comm->rank;
- int nRanks = comm->nRanks;
+ int node = comm->node;
+ int nNodes = comm->nNodes;
+ int localRank = comm->localRank;
+ int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
// Compute how much to split operations
// Natural step size matching buffer steps.
@@ -233,50 +224,70 @@ ncclResult_t ncclGroupEnd() {
while (comm->p2pSendCount > 0 || comm->p2pRecvCount > 0) {
// schedule delta 0, +1, -1, +2, -2, ...
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
- for (int d=0; d<=nRanks/4; d++) {
- int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, (nRanks-(nRanks/2-d))%nRanks };
+ for (int d=0; d<=nNodes/4; d++) {
+ int deltas[4] = { d, (nNodes-d)%nNodes, nNodes/2-d, (nNodes-(nNodes/2-d))%nNodes };
int index = 0;
int delta = deltas[index];
sched_delta:
- uint32_t from = (rank+nRanks-delta)%nRanks;
- uint32_t to = (rank+delta)%nRanks;
- struct ncclP2Pinfo* recv = comm->p2pRecvs[from] ? comm->p2pRecvs[from]->getNext() : NULL;
- struct ncclP2Pinfo* send = comm->p2pSends[to] ? comm->p2pSends[to]->getNext() : NULL;
- if (recv != NULL || send != NULL) {
- ssize_t totRecvBytes = -1, totSendBytes = -1;
- if (recv != NULL) totRecvBytes = recv->nbytes;
- if (send != NULL) totSendBytes = send->nbytes;
- ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
- ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
-
- ssize_t sendOffset = 0;
- ssize_t recvOffset = 0;
- int sendRemaining = 1, recvRemaining = 1;
- int chunk = 0;
- do {
- int channelId = (delta+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
- ssize_t recvbytes = totRecvBytes-recvOffset;
- ssize_t sendbytes = totSendBytes-sendOffset;
- if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
- if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
- // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
- // (total size == 0), otherwise set size to -1 so that the kernel skips the operation.
- if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1;
- if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1;
- if (sendbytes >= 0 || recvbytes >= 0) {
- NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
- recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
- sendbytes, send ? ((const char*)(send->buff)) + sendOffset : NULL), ret, group_cleanup);
+ uint32_t recvNode = (node+nNodes-delta)%nNodes;
+ uint32_t sendNode = (node+delta)%nNodes;
+ int steps = comm->maxLocalRanks;
+ for (int s=0; s<steps; s++) {
+ int recvIndex = (localRank-s+steps)%steps;
+ int recvPeer = recvIndex<comm->nodeRanks[recvNode].localRanks ? comm->nodeRanks[recvNode].localRankToRank[recvIndex] : -1;
+ int sendIndex = (localRank+s)%steps;
+ int sendPeer = sendIndex<comm->nodeRanks[sendNode].localRanks ? comm->nodeRanks[sendNode].localRankToRank[sendIndex] : -1;
+ struct ncclP2Pinfo* recv = recvPeer != -1 && comm->p2pRecvs[recvPeer] ? comm->p2pRecvs[recvPeer]->getNext() : NULL;
+ struct ncclP2Pinfo* send = sendPeer != -1 && comm->p2pSends[sendPeer] ? comm->p2pSends[sendPeer]->getNext() : NULL;
+ if (recv != NULL || send != NULL) {
+ ssize_t totRecvBytes = -1, totSendBytes = -1;
+ if (recv != NULL) totRecvBytes = recv->nbytes;
+ if (send != NULL) totSendBytes = send->nbytes;
+ if (recv) comm->p2pRecvCount--;
+ if (send) comm->p2pSendCount--;
+ if (recvPeer == comm->rank) { // Check self send/recv
+ if (sendPeer != comm->rank) { WARN("Sendrecv schedule not aligned for self"); ret = ncclInternalError; goto group_cleanup; }
+ if (send && recv == NULL) { WARN("Trying to send to self without a matching recv"); ret = ncclInvalidUsage; goto group_cleanup; }
+ if (send == NULL && recv) { WARN("Trying to recv to self without a matching send"); ret = ncclInvalidUsage; goto group_cleanup; }
}
- recvOffset += recvChunkSize;
- sendOffset += sendChunkSize;
- chunk++;
- } while (sendRemaining || recvRemaining);
- if (recv) comm->p2pRecvCount--;
- if (send) comm->p2pSendCount--;
+ void* recvBuff = recv ? recv->buff : NULL;
+ void* sendBuff = send ? send->buff : NULL;
+ // After we recycle p2pSend/Recv, we're no longer allowed to dereference send or recv, only use them as boolean NULL/not NULL.
+ if (recv && comm->p2pRecvs[recvPeer]->peakNext() == NULL) comm->p2pRecvs[recvPeer]->recycle();
+ if (send && comm->p2pSends[sendPeer]->peakNext() == NULL) comm->p2pSends[sendPeer]->recycle();
+
+ ssize_t recvChunkSize = getP2pChunkSize(totRecvBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
+ ssize_t sendChunkSize = getP2pChunkSize(totSendBytes, nChannelsMin, nChannelsMax, stepSize, SENDRECV_SLICEFACTOR*stepSize);
+
+ ssize_t sendOffset = 0;
+ ssize_t recvOffset = 0;
+ int sendRemaining = 1, recvRemaining = 1;
+ int chunk = 0;
+ do {
+ // Shuffle channels with s intra-node, and delta inter-node. Inter-node, make sure
+ // to use multiple channels to guarantee progress on all ranks from the same node.
+ int shuffle = comm->nNodes > 1 ? delta+(s/p2pGroupSize) : s;
+ int channelId = (shuffle+comm->p2pChannels[chunk%comm->p2pnChannelsPerPeer]) % comm->p2pnChannels;
+ ssize_t recvbytes = totRecvBytes-recvOffset;
+ ssize_t sendbytes = totSendBytes-sendOffset;
+ if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
+ if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
+ // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
+ // (total size == 0), otherwise clear the pointer so the operation is skipped.
+ if (sendbytes <= 0 && totSendBytes != 0) send = NULL;
+ if (recvbytes <= 0 && totRecvBytes != 0) recv = NULL;
+ if (recv) {
+ NCCLCHECKGOTO(scheduleRecv(comm, recvPeer, channelId, recvbytes, ((char*)recvBuff)+recvOffset), ret, group_cleanup);
+ }
+ if (send) {
+ NCCLCHECKGOTO(scheduleSend(comm, sendPeer, channelId, sendbytes, ((char*)sendBuff)+sendOffset), ret, group_cleanup);
+ }
+ recvOffset += recvChunkSize;
+ sendOffset += sendChunkSize;
+ chunk++;
+ } while (sendRemaining || recvRemaining);
+ }
}
- if (recv == NULL && comm->p2pRecvs[from]) comm->p2pRecvs[from]->recycle();
- if (send == NULL && comm->p2pSends[to]) comm->p2pSends[to]->recycle();
index++;
if (index == 1 && deltas[1] == deltas[0]) index++;
if (index == 2 && deltas[2] == deltas[0]) index++;
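To make the scheduling order above concrete: node deltas are visited as 0, +1, -1, +2, -2, ..., and the index checks skip duplicates (0 only once, and +n/2 / -n/2 only once when n is even). A standalone sketch that reproduces that order, for illustration only:

#include <cstdio>

// Prints the order in which node deltas are scheduled, using the same
// duplicate-skipping rules as the loop in ncclGroupEnd() above.
static void printDeltaSchedule(int nNodes) {
  for (int d = 0; d <= nNodes / 4; d++) {
    int deltas[4] = { d, (nNodes - d) % nNodes, nNodes / 2 - d, (nNodes - (nNodes / 2 - d)) % nNodes };
    for (int index = 0; index < 4; index++) {
      if (index == 1 && deltas[1] == deltas[0]) continue;
      if (index == 2 && deltas[2] == deltas[0]) continue;
      if (index == 3 && (deltas[3] == deltas[2] || deltas[3] == deltas[1])) continue;
      printf("%d ", deltas[index]);
    }
  }
  printf("\n");
}

int main() {
  printDeltaSchedule(4);  // prints: 0 2 1 3
  printDeltaSchedule(8);  // prints: 0 4 1 7 3 5 2 6
  return 0;
}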
@@ -382,16 +393,6 @@ group_cleanup:
}
comm->p2pSendCount = comm->p2pRecvCount = 0;
}
- /* Free all proxy ops in state->nextOps */
- struct ncclProxyState* state = &comm->proxyState;
- pthread_mutex_lock(&state->poolMutex);
- for (struct ncclProxyArgs *op = state->nextOps; op; op = op->next) {
- op->next = state->pool;
- state->pool = op;
- }
- pthread_mutex_unlock(&state->poolMutex);
- state->nextOps = NULL;
-
ncclLaunchReset(comm);
}
}
diff --git a/src/include/alloc.h b/src/include/alloc.h
index 0791592..14bccf9 100644
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -30,16 +30,37 @@ static inline ncclResult_t ncclCudaHostFree(void* ptr) {
}
template <typename T>
-static ncclResult_t ncclCalloc(T** ptr, size_t nelem) {
+static ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
void* p = malloc(nelem*sizeof(T));
if (p == NULL) {
WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
return ncclSystemError;
}
+ //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p);
memset(p, 0, nelem*sizeof(T));
*ptr = (T*)p;
return ncclSuccess;
}
+#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
+
+template <typename T>
+static ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) {
+ if (nelem < oldNelem) return ncclInternalError;
+ if (nelem == oldNelem) return ncclSuccess;
+
+ T* oldp = *ptr;
+ T* p = (T*)malloc(nelem*sizeof(T));
+ if (p == NULL) {
+ WARN("Failed to malloc %ld bytes", nelem*sizeof(T));
+ return ncclSystemError;
+ }
+ memcpy(p, oldp, oldNelem*sizeof(T));
+ free(oldp);
+ memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T));
+ *ptr = (T*)p;
+ INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr);
+ return ncclSuccess;
+}
template <typename T>
static ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
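The new ncclRealloc() template grows an array while keeping the old elements and zero-filling the added tail, and it refuses to shrink. A self-contained analogue of that grow-and-zero pattern, not the NCCL function itself:

#include <cstdlib>
#include <cstring>
#include <cstdio>

// Standalone analogue of ncclRealloc(): copy old entries, zero the new tail.
static int growZeroed(int** ptr, size_t oldNelem, size_t nelem) {
  if (nelem < oldNelem) return -1;          // shrinking is not supported
  if (nelem == oldNelem) return 0;
  int* p = (int*)malloc(nelem * sizeof(int));
  if (p == NULL) return -1;
  memcpy(p, *ptr, oldNelem * sizeof(int));  // preserve existing entries
  memset(p + oldNelem, 0, (nelem - oldNelem) * sizeof(int));
  free(*ptr);
  *ptr = p;
  return 0;
}

int main() {
  int* table = (int*)calloc(4, sizeof(int));
  for (int i = 0; i < 4; i++) table[i] = i + 1;
  growZeroed(&table, 4, 8);                 // table is now {1,2,3,4,0,0,0,0}
  for (int i = 0; i < 8; i++) printf("%d ", table[i]);
  printf("\n");
  free(table);
  return 0;
}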
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index 77ac12b..a787c0b 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,17 @@
#define NCCL_BOOTSTRAP_H_
#include "nccl.h"
+#include "comm.h"
ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
-ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
+ncclResult_t bootstrapInit(ncclUniqueId* id, struct ncclComm* comm);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
-ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, cudaIpcMemHandle_t* ipc, void** ptr);
-ncclResult_t bootstrapRemFree(int id, int rank, void* commState);
ncclResult_t bootstrapClose(void* commState);
ncclResult_t bootstrapAbort(void* commState);
#endif
diff --git a/src/include/checks.h b/src/include/checks.h
index 131c079..9624608 100644
--- a/src/include/checks.h
+++ b/src/include/checks.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -60,6 +60,49 @@
} \
} while(true)
+#define SYSCHECKGOTO(statement, res, label) do { \
+ if ((statement) == -1) { \
+ /* Print the back trace*/ \
+ res = ncclSystemError; \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ goto label; \
+ } \
+} while (0);
+
+#define NEQCHECK(statement, value) do { \
+ if ((statement) != value) { \
+ /* Print the back trace*/ \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \
+ return ncclSystemError; \
+ } \
+} while (0);
+
+#define NEQCHECKGOTO(statement, value, res, label) do { \
+ if ((statement) != value) { \
+ /* Print the back trace*/ \
+ res = ncclSystemError; \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ goto label; \
+ } \
+} while (0);
+
+#define EQCHECK(statement, value) do { \
+ if ((statement) == value) { \
+ /* Print the back trace*/ \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, ncclSystemError); \
+ return ncclSystemError; \
+ } \
+} while (0);
+
+#define EQCHECKGOTO(statement, value, res, label) do { \
+ if ((statement) == value) { \
+ /* Print the back trace*/ \
+ res = ncclSystemError; \
+ INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ goto label; \
+ } \
+} while (0);
+
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t res = call; \
@@ -79,4 +122,39 @@
} \
} while (0);
+#define NCCLWAIT(call, cond, abortFlagPtr) do { \
+ volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
+ ncclResult_t res = call; \
+ if (res != ncclSuccess) { \
+ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ return ncclInternalError; \
+ } \
+ if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
+} while (!(cond));
+
+#define NCCLWAITGOTO(call, cond, abortFlagPtr, res, label) do { \
+ volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
+ res = call; \
+ if (res != ncclSuccess) { \
+ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ goto label; \
+ } \
+ if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, res, label); \
+} while (!(cond));
+
+#define NCCLCHECKTHREAD(a) do { \
+ if ((args->ret = (a)) != ncclSuccess) { \
+ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
+ return args; \
+ } \
+} while(0)
+
+#define CUDACHECKTHREAD(a) do { \
+ if ((a) != cudaSuccess) { \
+ INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
+ args->ret = ncclUnhandledCudaError; \
+ return args; \
+ } \
+} while(0)
+
#endif
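Among the macros added above, NCCLWAIT() re-issues a non-blocking call until a caller-supplied condition holds, checking an optional abort flag on every iteration. A self-contained sketch of that retry-until-ready pattern, with a dummy step function standing in for a real non-blocking NCCL call:

#include <cstdio>
#include <cstdint>

// Retry a non-blocking step until it reports ready, or bail out on error
// or when the abort flag is raised. Mirrors the NCCLWAIT() control flow.
static int pollUntilReady(int (*step)(int*), volatile uint32_t* abortFlag) {
  int ready = 0;
  do {
    if (step(&ready) != 0) return -1;        // propagate errors from the call
    if (abortFlag && *abortFlag) return -1;  // abort requested externally
  } while (!ready);
  return 0;
}

// Dummy non-blocking step: becomes ready on the third attempt.
static int fakeStep(int* ready) {
  static int attempts = 0;
  *ready = (++attempts >= 3);
  return 0;
}

int main() {
  volatile uint32_t abortFlag = 0;
  printf("result: %d\n", pollUntilReady(fakeStep, &abortFlag));  // -> 0
  return 0;
}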
diff --git a/src/include/coll_net.h b/src/include/coll_net.h
index 0d17b76..c2d831e 100644
--- a/src/include/coll_net.h
+++ b/src/include/coll_net.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -29,6 +29,6 @@ static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK
static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
-static int collNetSupport() { return ncclCollNet != NULL ? 1 : 0; }
+static int collNetSupport() { return ncclCollNet != nullptr ? 1 : 0; }
#endif
diff --git a/src/include/collectives.h b/src/include/collectives.h
index 5fde721..d65c6ae 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -36,7 +36,7 @@ struct ncclDevRedOpFull {
/* Declare all collective operations */
#define DECL5(func, algo, proto, devredop, type) \
extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
- extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(ncclWorkElem c); \
+ extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, struct ncclWorkElem c); \
#define CONCAT(a,b) a##b
#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(t, f)
diff --git a/src/include/comm.h b/src/include/comm.h
index bcbc695..4b55dc6 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -31,8 +31,6 @@ struct cudaLaunchParams {
#define NCCL_LL128_THREAD_THRESHOLD 8
#define NCCL_SIMPLE_THREAD_THRESHOLD 64
-#define NCCL_MAX_INTRA_RANKS 32
-
struct ncclSendMem {
union {
struct {
@@ -41,10 +39,10 @@ struct ncclSendMem {
void* ptrExchange;
uint64_t redOpArgExchange[2];
char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)];
+ int offsFifo[NCCL_STEPS];
};
char pad3[MEM_ALIGN];
};
- char buff[1]; // Actually larger than that
};
struct ncclRecvMem {
@@ -53,18 +51,18 @@ struct ncclRecvMem {
uint64_t tail;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[NCCL_STEPS];
- void* ptrsFifo[NCCL_STEPS];
+ int offsFifo[NCCL_STEPS];
+ int flush; // For GDRCopy-based flush
};
char pad4[MEM_ALIGN];
};
- char buff[1]; // Actually larger than that
};
typedef cudaError_t(*pfn_cuMemGetAddressRange_t)(void**, size_t*, void*);
enum helperThreadState {ThreadStart, ThreadStop};
-#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_INTRA_RANKS*NCCL_MAX_OPS)
+#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS)
struct ncclGraphHelperResources {
ncclComm* comm;
@@ -82,6 +80,11 @@ struct ncclUserRedOp {
ncclDevRedOpFull opFull;
};
+struct ncclNodeRanks {
+ int localRanks;
+ int* localRankToRank;
+};
+
struct ncclComm {
struct ncclChannel channels[MAXCHANNELS];
@@ -102,12 +105,14 @@ struct ncclComm {
int node;
int nNodes;
-
- // Intra-node rank info
- int intraNodeGlobalRanks[NCCL_MAX_INTRA_RANKS];
+ int localRank;
int localRanks;
- int intraNodeRank;
- int8_t* rankToIntraNodeRank;
+ int maxLocalRanks;
+ int* rankToNode;
+ int* rankToLocalRank;
+ int* localRankToRank;
+ // localRanks and localRanktoRank for all nodes
+ struct ncclNodeRanks* nodeRanks;
enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode;
cudaStream_t userStream;
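The new rank-mapping fields above replace the fixed-size intra-node arrays: rankToNode and rankToLocalRank map a global rank to its node and position on that node, while nodeRanks holds the reverse mapping per node. A small standalone sketch of how such tables relate to each other, with an example placement that is purely illustrative and not NCCL's initialization code:

#include <cstdio>

#define NRANKS 6
int main() {
  int rankToNode[NRANKS] = { 0, 0, 1, 1, 1, 0 };  // example rank -> node placement
  int rankToLocalRank[NRANKS];
  int localRanks[2] = { 0, 0 };
  int localRankToRank[2][NRANKS];

  // Derive the per-rank local index and the per-node rank lists.
  for (int r = 0; r < NRANKS; r++) {
    int node = rankToNode[r];
    rankToLocalRank[r] = localRanks[node];
    localRankToRank[node][localRanks[node]++] = r;
  }
  for (int r = 0; r < NRANKS; r++)
    printf("rank %d -> node %d, local rank %d\n", r, rankToNode[r], rankToLocalRank[r]);
  for (int n = 0; n < 2; n++) {
    printf("node %d ranks:", n);
    for (int i = 0; i < localRanks[n]; i++) printf(" %d", localRankToRank[n][i]);
    printf("\n");
  }
  return 0;
}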
@@ -161,14 +166,13 @@ struct ncclComm {
// Storage for deferred intra-process launch
struct cudaLaunchParams * intraParams;
struct cudaLaunchParams *myParams;
+ pthread_t* intraThreads;
int* intraCudaDevs;
int* intraCGMode; // Whether we can use CUDA9 CGMD or not
int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not
struct ncclWorkElem args;
- void* argsptr;
+ void* argsptrs[2];
- // Global proxy thread
- pthread_t proxyThread;
struct ncclProxyState proxyState;
// Whether this communicator uses collNet
diff --git a/src/include/debug.h b/src/include/debug.h
index 6ce90ee..7af38fd 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -16,6 +16,9 @@
#include <string.h>
#include <pthread.h>
+// Conform to pthread and NVTX standard
+#define NCCL_THREAD_NAMELEN 16
+
extern int ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugOutputLock;
@@ -37,4 +40,6 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch;
#define TRACE(...)
#endif
+void ncclSetThreadName(pthread_t thread, const char *fmt, ...);
+
#endif
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
index 676ffda..8ff9d4b 100644
--- a/src/include/devcomm.h
+++ b/src/include/devcomm.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,8 +11,8 @@
#include "align.h"
#include <stdint.h>
-#define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
-typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclNumFuncs} ncclFunc_t;
+#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
+typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS];
#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
@@ -90,16 +90,22 @@ struct ncclConnInfo {
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
int *sizesFifo; // Sizes fifo from GPU to proxy
- void* *ptrsFifo; // Buffer fifo from proxy to GPU
+ int *offsFifo; // Buffer fifo from proxy to GPU
uint64_t step; // Keep where we are
uint64_t llLastCleaning;
};
+struct ncclProxyConnector {
+ int rank;
+ int localRank;
+ struct ncclProxyConnection* connection;
+ struct ncclComm* comm;
+};
+
struct ncclConnector {
int connected;
- struct ncclProxyArgs *proxyAppend;
- struct ncclProxyArgs **proxyAppendPtr;
+ struct ncclProxyConnector proxyConn;
struct ncclTransportComm* transportComm;
void* transportResources;
struct ncclConnInfo conn;
@@ -147,63 +153,89 @@ struct ncclPeer {
struct ncclDevComm;
-#define NCCL_MAX_WORK_ELEMENTS 8
-#define NCCL_MAX_GROUPS (NCCL_MAX_WORK_ELEMENTS*2)
-
/* ncclWork is to be a power of two, currently 8x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclWorkElem. */
-struct ncclWorkElem {
- // Header
- struct ncclDevComm* comm;
- uint16_t nThreads;
+#define NCCL_WORK_SIZE 512
+
+enum ncclWorkElemType : uint8_t {
+ ncclWorkTypeUnused=0,
+ ncclWorkTypeColl=1,
+ ncclWorkTypeP2p=2,
+ ncclWorkTypeRegColl=3
+};
+enum ncclWorkElemSubType : uint8_t {
+ ncclWorkSubTypeUnused =0,
+ ncclWorkSubTypeSend,
+ ncclWorkSubTypeRecv
+};
+
+struct ncclWorkElemHeader {
uint16_t funcIndex;
+ enum ncclWorkElemType type;
+ unsigned nWarps:5;
+ unsigned isLast:1;
+};
+
+struct ncclWorkElem {
+ struct ncclWorkElemHeader header;
uint8_t regUsed;
uint8_t direct;
- uint8_t active, redOpArgIsPtr;
+ uint8_t redOpArgIsPtr;
const void * sendbuff;
void * recvbuff;
- // Op-specific fields.
- union {
- struct {
- size_t count;
- size_t lastChunkSize;
- uint32_t root;
- uint8_t bid;
- uint8_t nChannels;
- uint64_t redOpArg;
- } coll;
- struct {
- size_t sendCount;
- size_t recvCount;
- int sendChunkSize;
- int recvChunkSize;
- int32_t delta;
- uint16_t nThreads;
- } p2p;
- uint64_t align[4];
- };
+ size_t count;
+ size_t lastChunkSize;
+ uint32_t root;
+ uint8_t bid;
+ uint8_t nChannels;
+ uint64_t redOpArg;
+ uint64_t pad;
+};
+static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElem) == 0, "ncclWork size must be a multiple of ncclWorkElem size");
+
+struct ncclWorkElemP2p {
+ struct ncclWorkElemHeader header;
+ int32_t peer;
+ void* buff;
+ size_t count;
+ int chunkSize;
+ uint8_t ngroups;
+ uint8_t warpStart;
+ uint8_t nWarps;
+ enum ncclWorkElemSubType subType;
};
-static_assert(sizeof(struct ncclWorkElem) == (0x10*sizeof(int)), "ncclWorkElem must have a pow2 size");
+static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemP2p) == 0, "ncclWork size must be a multiple of ncclWorkElemP2p size");
-struct ncclWorkRegElem {
+struct ncclWorkElemReg {
struct ncclWorkElem elem;
void* dnInputs[NCCL_MAX_DIRECT_ARITY+1];
void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1];
void* upOutputs[NCCL_MAX_DIRECT_ARITY+1];
};
-#define NCCL_REG_ELEM_FACTOR 4
-static_assert(sizeof(struct ncclWorkRegElem) == (NCCL_REG_ELEM_FACTOR*sizeof(struct ncclWorkElem)), "ncclWorkRegElem size must be pow2 times ncclWorkElem size");
+static_assert(NCCL_WORK_SIZE % sizeof(struct ncclWorkElemReg) == 0, "ncclWork size must be a multiple of ncclWorkElemReg size");
+static_assert(sizeof(struct ncclWorkElemReg) % sizeof(struct ncclWorkElem) == 0, "ncclWorkElemReg size must be a multiple of ncclWorkElem size");
+
+#define NCCL_MAX_WORK_ELEMENTS (NCCL_WORK_SIZE/sizeof(struct ncclWorkElem))
+#define NCCL_MAX_WORK_ELEMENTS_P2P (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemP2p))
+#define NCCL_MAX_WORK_ELEMENTS_REG (NCCL_WORK_SIZE/sizeof(struct ncclWorkElemReg))
+// Number of named barriers supported by CUDA
+#define NCCL_MAX_GROUPS 16
struct ncclWork {
union {
+ char pad[NCCL_WORK_SIZE];
+ struct ncclWorkElemHeader header;
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
- struct ncclWorkRegElem regElems[NCCL_MAX_WORK_ELEMENTS/NCCL_REG_ELEM_FACTOR];
+ struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P];
+ struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG];
};
};
+static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "ncclWork size needs to be well aligned");
+
struct ncclChannel {
union {
struct {
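The sizing scheme above fixes ncclWork at a 512-byte slot and derives the per-type element counts from sizeof, with the static_asserts guaranteeing the divisions are exact. A back-of-the-envelope sketch; the element sizes below are assumptions for illustration, not values measured from the real structs:

#include <cstdio>

int main() {
  const int workSize = 512;
  const int elemSize = 64;       // assumed sizeof(struct ncclWorkElem)
  const int p2pElemSize = 32;    // assumed sizeof(struct ncclWorkElemP2p)
  printf("coll elements per ncclWork: %d\n", workSize / elemSize);     // 8 under these assumptions
  printf("p2p elements per ncclWork:  %d\n", workSize / p2pElemSize);  // 16 under these assumptions
  // Because the divisions are exact, each union member fills the slot with no remainder.
  return 0;
}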
diff --git a/src/include/enqueue.h b/src/include/enqueue.h
index 962896e..02a9adb 100644
--- a/src/include/enqueue.h
+++ b/src/include/enqueue.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -31,17 +31,17 @@ ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph);
ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph);
struct ncclBuffRegInfo {
- void* sendbuffsBase[NCCL_MAX_INTRA_RANKS];
- void* recvbuffsBase[NCCL_MAX_INTRA_RANKS];
- void* sendbuffs[NCCL_MAX_INTRA_RANKS];
- void* recvbuffs[NCCL_MAX_INTRA_RANKS];
+ void* sendbuffsBase[NCCL_MAX_LOCAL_RANKS];
+ void* recvbuffsBase[NCCL_MAX_LOCAL_RANKS];
+ void* sendbuffs[NCCL_MAX_LOCAL_RANKS];
+ void* recvbuffs[NCCL_MAX_LOCAL_RANKS];
int nBuffs;
};
// Enqueue information (for kernel and proxy) for each operation
struct ncclQueueElem {
- struct ncclWorkElem work;
- struct ncclProxyArgs proxyArgs;
+ struct ncclWork work;
+ struct ncclProxyOp proxyOp;
struct ncclBuffRegInfo buffRegInfo;
};
@@ -87,7 +87,7 @@ static void ncclDestroyQueueInfo(void* ptr) {
// but currently the destroy function of CUDA objects does not allow CUDA API calls
while (eqElem != NULL) {
for (int i=0; i<eqElem->buffRegInfo.nBuffs; i++) {
- if (i == eqInfo->comm->intraNodeRank) continue;
+ if (i == eqInfo->comm->localRank) continue;
CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.sendbuffsBase[i]));
CUDACHECKIGNORE(cudaIpcCloseMemHandle(eqElem->buffRegInfo.recvbuffsBase[i]));
}
diff --git a/src/include/graph.h b/src/include/graph.h
index 4b7a836..898b903 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -30,9 +30,12 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
// Query topology
-ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net);
+ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
+int ncclPxnDisable();
+ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
+ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
// Find CPU affinity
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
@@ -48,6 +51,7 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
+ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int* id);
#define NCCL_TOPO_MAX_NODES 256
@@ -70,6 +74,7 @@ struct ncclTopoGraph {
int nChannels;
float speedIntra;
float speedInter;
+ float latencyInter;
int typeIntra;
int typeInter;
int sameChannels;
diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h
index 4ec1ac6..63555ba 100644
--- a/src/include/ibvwrap.h
+++ b/src/include/ibvwrap.h
@@ -4,7 +4,7 @@
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
*
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -328,7 +328,8 @@ enum ibv_access_flags {
IBV_ACCESS_REMOTE_WRITE = (1<<1),
IBV_ACCESS_REMOTE_READ = (1<<2),
IBV_ACCESS_REMOTE_ATOMIC = (1<<3),
- IBV_ACCESS_MW_BIND = (1<<4)
+ IBV_ACCESS_MW_BIND = (1<<4),
+ IBV_ACCESS_RELAXED_ORDERING = (1<<20),
};
struct ibv_pd {
@@ -1065,6 +1066,7 @@ ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context)
ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access);
+ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
diff --git a/src/include/info.h b/src/include/info.h
index 2e99e9c..3461cc7 100644
--- a/src/include/info.h
+++ b/src/include/info.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,7 +11,7 @@
#include "devcomm.h"
#include "collectives.h"
-typedef enum {
+typedef enum : uint8_t {
ncclPatternRing,
ncclPatternRingTwice,
ncclPatternPipelineFrom,
@@ -19,7 +19,9 @@ typedef enum {
ncclPatternTreeUp,
ncclPatternTreeDown,
ncclPatternTreeUpDown,
- ncclPatternCollTreeUpDown
+ ncclPatternCollTreeUpDown,
+ ncclPatternSend,
+ ncclPatternRecv
} ncclPattern_t;
// Used to pass NCCL call information between functions
@@ -32,7 +34,7 @@ struct ncclInfo {
size_t count;
ncclDataType_t datatype;
ncclRedOp_t op;
- int root;
+ int root; // peer for p2p operations
ncclComm_t comm;
cudaStream_t stream;
// Algorithm details
@@ -48,11 +50,7 @@ struct ncclInfo {
size_t nBytes;
int nstepsPerLoop;
int nchunksPerLoop;
- ssize_t sendbytes;
- ssize_t recvbytes;
- int recvChunkSize;
- int sendChunkSize;
- uint32_t delta;
+ int chunkSize;
int channelId;
};
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index 389c1ea..ce61672 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,7 +10,7 @@
#include "nccl.h"
#include <stdint.h>
-#define NCCL_NET_HANDLE_MAXSIZE 64
+#define NCCL_NET_HANDLE_MAXSIZE 128
#define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2
@@ -31,10 +31,114 @@ typedef struct {
int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
int speed; // Port speed in Mbps.
int port; // Port number.
+ float latency; // Network latency
int maxComms; // Maximum number of comms we can create
-}ncclNetProperties_v4_t;
+ int maxRecvs; // Maximum number of grouped receives.
+}ncclNetProperties_v5_t;
-typedef ncclNetProperties_v4_t ncclNetProperties_t;
+typedef ncclNetProperties_v5_t ncclNetProperties_t;
+
+typedef struct {
+ // Name of the network (mainly for logs)
+ const char* name;
+ // Initialize the network.
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+ // Return the number of adapters.
+ ncclResult_t (*devices)(int* ndev);
+ // Get various device properties.
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
+ // Create a receiving object and provide a handle to connect to it. The
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+ // between ranks to create a connection.
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+ // Connect to a handle and return a sending comm object for that peer.
+ // This call must not block for the connection to be established, and instead
+ // should return successfully with sendComm == NULL with the expectation that
+ // it will be called again until sendComm != NULL.
+ ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+ // Finalize connection establishment after remote peer has called connect.
+ // This call must not block for the connection to be established, and instead
+ // should return successfully with recvComm == NULL with the expectation that
+ // it will be called again until recvComm != NULL.
+ ncclResult_t (*accept)(void* listenComm, void** recvComm);
+ // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+ // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+ ncclResult_t (*deregMr)(void* comm, void* mhandle);
+ // Asynchronous send to a peer.
+ // May return request == NULL if the call cannot be performed (or would block)
+ ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+ // Asynchronous recv from a peer.
+ // May return request == NULL if the call cannot be performed (or would block)
+ ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+ // visible to the GPU
+ ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+ // Test whether a request is complete. If size is not NULL, it returns the
+ // number of bytes sent/received.
+ ncclResult_t (*test)(void* request, int* done, int* sizes);
+ // Close and free send/recv comm objects
+ ncclResult_t (*closeSend)(void* sendComm);
+ ncclResult_t (*closeRecv)(void* recvComm);
+ ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v5_t;
+
+typedef ncclNet_v5_t ncclNet_t;
+
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v5
+
+typedef struct {
+ // Name of the collective network (mainly for logs)
+ const char* name;
+ // Initialize the collective network.
+ ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+ // Return the number of adapters capable of doing collective operations.
+ // If ndev returns 0, all other functions might be set to NULL.
+ ncclResult_t (*devices)(int* ndev);
+ // Get various device properties.
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v5_t* props);
+ // Create a receiving object and provide a handle to connect to it. The
+ // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+ // between ranks to create connections.
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+ // Create a group for collective operations. handles have been created
+ // using listen() above. rank indicates caller's rank in the collective network.
+ ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+ // Returns whether a reduction operation on a data type is supported.
+ // 1 for supported, 0 otherwise.
+ ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+ // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+ ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+ // Performs an asynchronous allreduce operation on the collective group.
+ // May return request == NULL if the call cannot be performed (or would block).
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+ ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+ // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+ // visible to the GPU
+ ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+ // Test whether a request is complete. If size is not NULL, it returns the
+ // number of bytes sent/received.
+ ncclResult_t (*test)(void* request, int* done, int* size);
+ // Close and free collective comm objects
+ ncclResult_t (*closeColl)(void* collComm);
+ ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v5_t;
+
+typedef ncclCollNet_v5_t ncclCollNet_t;
+
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v5
+
+typedef struct {
+ char* name; // Used mostly for logging.
+ char* pciPath; // Path to the PCI device in /sys.
+ uint64_t guid; // Unique identifier for the NIC chip. Important for
+ // cards with multiple PCI functions (Physical or virtual).
+ int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
+ int speed; // Port speed in Mbps.
+ int port; // Port number.
+ int maxComms; // Maximum number of comms we can create
+} ncclNetProperties_v4_t;
typedef struct {
// Name of the network (mainly for logs)
@@ -75,10 +179,6 @@ typedef struct {
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v4_t;
-typedef ncclNet_v4_t ncclNet_t;
-
-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4
-
typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
@@ -117,8 +217,4 @@ typedef struct {
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v4_t;
-typedef ncclCollNet_v4_t ncclCollNet_t;
-
-#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4
-
#endif // end include guard
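The v5 network API above adds tagged sends, grouped receives (maxRecvs), and a per-device latency field, and moves the plugin entry point to ncclNetPlugin_v5. A minimal sketch of how an external plugin could export that symbol, with stub implementations only; the stub names and values are illustrative, and the remaining entry points are deliberately left as NULL placeholders rather than a working transport:

#include "nccl_net.h"  // assumes this header is on the include path

static ncclResult_t stubInit(ncclDebugLogger_t logFunction) { (void)logFunction; return ncclSuccess; }
static ncclResult_t stubDevices(int* ndev) { *ndev = 1; return ncclSuccess; }
static ncclResult_t stubGetProperties(int dev, ncclNetProperties_v5_t* props) {
  (void)dev;
  props->name = (char*)"stub"; props->pciPath = NULL; props->guid = 0;
  props->ptrSupport = NCCL_PTR_HOST; props->speed = 10000; props->port = 0;
  props->latency = 0; props->maxComms = 1; props->maxRecvs = 1;
  return ncclSuccess;
}

// Only the query entry points are filled in; a real plugin must also provide
// listen/connect/accept, memory registration, isend/irecv/iflush, test and close.
ncclNet_v5_t NCCL_PLUGIN_SYMBOL = {
  "stub",
  stubInit,
  stubDevices,
  stubGetProperties,
  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
};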
diff --git a/src/include/net.h b/src/include/net.h
index ef553e2..0cc5067 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,10 +9,14 @@
#include "nccl.h"
#include "nccl_net.h"
+#include "checks.h"
extern ncclNet_t* ncclNet;
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
+ncclResult_t ncclNetInit();
+int ncclNetVersion();
+
// Translation to external API
static const char* ncclNetName() { return ncclNet->name; }
static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
@@ -22,56 +26,16 @@ static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCC
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; }
-static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetIflush(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, data, size, mhandle, request)); return ncclSuccess; }
-static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, tag, mhandle, request)); return ncclSuccess; }
+static ncclResult_t ncclNetIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, n, data, sizes, tags, mhandles, request)); return ncclSuccess; }
+static ncclResult_t ncclNetIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { NCCLCHECK(ncclNet->iflush(recvComm, n, data, sizes, mhandles, request)); return ncclSuccess; }
+static ncclResult_t ncclNetTest(void* request, int* done, int* sizes) { NCCLCHECK(ncclNet->test(request, done, sizes)); return ncclSuccess; }
static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
// Test whether the current GPU support GPU Direct RDMA.
-#define GPU_BUF_SIZE (2*1024*1024)
-static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
- int netDevs;
- NCCLCHECK(ncclNetDevices(&netDevs));
- *gdrSupport = 0;
- for (int dev=0; dev<netDevs; dev++) {
- // Find a net device which is GDR-capable
- ncclNetProperties_t props;
- NCCLCHECK(ncclNet->getProperties(dev, &props));
- if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
-
- // Allocate memory on the GPU and try to register it on the NIC.
- void *lComm = NULL, *sComm = NULL, *rComm = NULL;
- ncclNetHandle_t handle;
- void* gpuPtr = NULL;
- void* mHandle = NULL;
- ncclResult_t ret;
- ncclDebugNoWarn = NCCL_NET;
- NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
- NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
- NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
- CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
- if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
- NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
- NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
- NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
- *gdrSupport = 1;
- }
- ncclDebugNoWarn = 0;
- CUDACHECK(cudaFree(gpuPtr));
-cleanup4:
- NCCLCHECK(ncclNetCloseRecv(rComm));
-cleanup3:
- NCCLCHECK(ncclNetCloseSend(sComm));
-cleanup2:
- NCCLCHECK(ncclNetCloseListen(lComm));
-cleanup1:
- break;
- }
- return ncclSuccess;
-}
+ncclResult_t ncclGpuGdrSupport(int* gdrSupport);
extern ncclNet_t ncclNetIb;
extern ncclNet_t ncclNetSocket;
diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h
index 21ee82e..29731dd 100644
--- a/src/include/nvmlwrap.h
+++ b/src/include/nvmlwrap.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,59 +9,13 @@
#include "nccl.h"
-// The NVML library doesn't appear to be thread safe
-#include <pthread.h>
-extern pthread_mutex_t nvmlLock;
-#define NVMLLOCK() pthread_mutex_lock(&nvmlLock)
-#define NVMLUNLOCK() pthread_mutex_unlock(&nvmlLock)
-
-#define NVMLLOCKCALL(cmd, ret) do { \
- NVMLLOCK(); \
- ret = cmd; \
- NVMLUNLOCK(); \
-} while(false)
-
-#define NVMLCHECK(cmd) do { \
- nvmlReturn_t e; \
- NVMLLOCKCALL(cmd, e); \
- if( e != NVML_SUCCESS ) { \
- WARN("NVML failure '%s'", nvmlErrorString(e)); \
- return ncclSystemError; \
- } \
-} while(false)
-
-//#define NVML_DIRECT 1
-#ifdef NVML_DIRECT
-#include "nvml.h"
+//#define NCCL_NVML_DIRECT 1
+#ifndef NCCL_NVML_DIRECT
+#define NCCL_NVML_DIRECT 0
+#endif
-static ncclResult_t wrapNvmlSymbols(void) { return ncclSuccess; }
-static ncclResult_t wrapNvmlInit(void) { NVMLCHECK(nvmlInit()); return ncclSuccess; }
-static ncclResult_t wrapNvmlShutdown(void) { NVMLCHECK(nvmlShutdown()); return ncclSuccess; }
-static ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
- NVMLCHECK(nvmlDeviceGetHandleByPciBusId(pciBusId, device));
- return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
- NVMLCHECK(nvmlDeviceGetIndex(device, index));
- return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
- NVMLCHECK(nvmlDeviceGetNvLinkState(device, link, isActive));
- return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
- NVMLCHECK(nvmlDeviceGetNvLinkRemotePciInfo(device, link, pci));
- return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
- nvmlNvLinkCapability_t capability, unsigned int *capResult) {
- NVMLCHECK(nvmlDeviceGetNvLinkCapability(device, link, capability, capResult));
- return ncclSuccess;
-}
-static ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
- NVMLCHECK(nvmlDeviceGetCudaComputeCapability(device, major, minor));
- return ncclSuccess;
-}
+#if NCCL_NVML_DIRECT
+#include "nvml.h"
#else
// Dynamically handle dependencies on NVML
@@ -129,21 +83,56 @@ typedef struct nvmlPciInfo_st
unsigned int reserved2;
unsigned int reserved3;
} nvmlPciInfo_t;
-/* End of nvml.h */
-
-ncclResult_t wrapNvmlSymbols(void);
-ncclResult_t wrapNvmlInit(void);
-ncclResult_t wrapNvmlShutdown(void);
-ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
-ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
-ncclResult_t wrapNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
-ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
-ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
-ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
- nvmlNvLinkCapability_t capability, unsigned int *capResult);
-ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
-
-#endif // NVML_DIRECT
+/* P2P Capability Index Status*/
+typedef enum nvmlGpuP2PStatus_enum
+{
+ NVML_P2P_STATUS_OK = 0,
+ NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED,
+ NVML_P2P_STATUS_GPU_NOT_SUPPORTED,
+ NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED,
+ NVML_P2P_STATUS_DISABLED_BY_REGKEY,
+ NVML_P2P_STATUS_NOT_SUPPORTED,
+ NVML_P2P_STATUS_UNKNOWN
+} nvmlGpuP2PStatus_t;
+
+/* P2P Capability Index*/
+typedef enum nvmlGpuP2PCapsIndex_enum
+{
+ NVML_P2P_CAPS_INDEX_READ = 0,
+ NVML_P2P_CAPS_INDEX_WRITE,
+ NVML_P2P_CAPS_INDEX_NVLINK,
+ NVML_P2P_CAPS_INDEX_ATOMICS,
+ NVML_P2P_CAPS_INDEX_PROP,
+ NVML_P2P_CAPS_INDEX_UNKNOWN
+} nvmlGpuP2PCapsIndex_t;
+/* End of nvml.h */
+#endif // NCCL_NVML_DIRECT
+
+constexpr int ncclNvmlMaxDevices = 32;
+struct ncclNvmlDeviceInfo {
+ nvmlDevice_t handle;
+ int computeCapabilityMajor, computeCapabilityMinor;
+};
+struct ncclNvmlDevicePairInfo {
+ nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite;
+};
+extern int ncclNvmlDeviceCount;
+extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices];
+extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices];
+
+// All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly.
+// Outsiders need only call it if they want to inspect the ncclNvml global
+// tables above.
+ncclResult_t ncclNvmlEnsureInitialized();
+
+ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
+ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
+ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
+ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
+ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
+ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult);
+ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
+ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
#endif // End include guard
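The ncclNvml* wrappers above replace the wrapNvml* functions with a lazily initialized, cached view of the local GPUs. An illustrative fragment of how the cached table could be inspected after forcing initialization; it assumes compilation inside the NCCL source tree so that checks.h and debug.h are available:

#include "nvmlwrap.h"
#include "checks.h"
#include "debug.h"

static ncclResult_t printComputeCapabilities() {
  NCCLCHECK(ncclNvmlEnsureInitialized());  // fills ncclNvmlDeviceCount and ncclNvmlDevices
  for (int i = 0; i < ncclNvmlDeviceCount; i++) {
    INFO(NCCL_INIT, "GPU %d: sm_%d%d", i,
         ncclNvmlDevices[i].computeCapabilityMajor,
         ncclNvmlDevices[i].computeCapabilityMinor);
  }
  return ncclSuccess;
}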
diff --git a/src/include/param.h b/src/include/param.h
index 49c4606..7f749fb 100644
--- a/src/include/param.h
+++ b/src/include/param.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -38,6 +38,7 @@ static void setEnvFile(const char* fileName) {
strncpy(envValue, line+s, 1023);
envValue[1023]='\0';
setenv(envVar, envValue, 0);
+ //printf("%s : %s->%s\n", fileName, envVar, envValue);
}
if (line) free(line);
fclose(file);
diff --git a/src/include/profiler.h b/src/include/profiler.h
new file mode 100644
index 0000000..103af99
--- /dev/null
+++ b/src/include/profiler.h
@@ -0,0 +1,37 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_PROFILER_H_
+#define NCCL_PROFILER_H_
+
+#include "proxy.h"
+
+enum ncclProxyProfileState {
+ ncclProxyProfileBegin = 0,
+
+ ncclProxyProfileSendGPUWait = 1,
+ ncclProxyProfileSendWait = 2,
+
+ ncclProxyProfileRecvWait = 1,
+ ncclProxyProfileRecvFlushWait = 2,
+ ncclProxyProfileRecvGPUWait = 3,
+
+ ncclProxyProfileEnd = 4,
+
+ ncclProxyProfileSleep = 8,
+ ncclProxyProfileWakeup = 9,
+
+ ncclProxyProfileIdle = 16,
+ ncclProxyProfileActive = 17,
+
+ ncclProxyProfileAppend = 24,
+ ncclProxyProfileAppendEnd = 25
+};
+
+ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state);
+void ncclProfilingDump();
+
+#endif
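
As a rough sketch of the assumed usage (not code from this commit), the proxy progress loop tags each sub-operation step with one of the states above via ncclProfilingRecord, and ncclProfilingDump() later writes the collected events out; the real call sites live in src/transport/net.cc.

// Illustration only: a recv-side progress step tagging its phases.
// args/sub/step come from the surrounding ncclProxyArgs processing.
static ncclResult_t recvStepProfiled(struct ncclProxyArgs* args, int sub, int step) {
  NCCLCHECK(ncclProfilingRecord(args, sub, step, ncclProxyProfileBegin));
  NCCLCHECK(ncclProfilingRecord(args, sub, step, ncclProxyProfileRecvWait));
  // ... network receive posted and completed here ...
  NCCLCHECK(ncclProfilingRecord(args, sub, step, ncclProxyProfileRecvGPUWait));
  NCCLCHECK(ncclProfilingRecord(args, sub, step, ncclProxyProfileEnd));
  return ncclSuccess;
}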
diff --git a/src/include/proxy.h b/src/include/proxy.h
index 58a58b2..c7ca0aa 100644
--- a/src/include/proxy.h
+++ b/src/include/proxy.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,27 +7,47 @@
#ifndef NCCL_PROXY_H_
#define NCCL_PROXY_H_
+#include "devcomm.h"
+#include "info.h"
+#include "socket.h"
#include <pthread.h>
enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress };
struct ncclProxyArgs;
-typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*);
+typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclComm*, struct ncclProxyArgs*);
#define NCCL_PROXY_MAX_SUBS MAXCHANNELS
static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements");
+struct ncclProxyOp {
+ struct ncclProxyConnection* connection;
+ int channelId;
+ int nsteps;
+ ssize_t nbytes;
+ int root;
+ int next;
+
+ uint64_t opCount;
+ int sliceSteps;
+ int chunkSteps;
+ int chunkSize;
+ ncclDataType_t dtype;
+ ncclRedOp_t redOp;
+ ncclPattern_t pattern; // uint8_t
+ uint8_t protocol;
+ uint16_t pad;
+};
+static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch");
+
struct ncclProxySubArgs {
- struct ncclChannel* channel;
- struct ncclConnector* connector;
+ struct ncclProxyConnection* connection;
+ int channelId;
int nsteps;
- ssize_t sendbytes;
- ssize_t recvbytes;
- int sendChunkSize;
- int recvChunkSize;
- int delta;
+ ssize_t nbytes;
+ int peer;
- // Internal state
+ int groupSize; // Number of consecutive sub operations sharing the same recvComm
uint64_t base;
uint64_t posted;
uint64_t received;
@@ -36,23 +56,22 @@ struct ncclProxySubArgs {
uint64_t done;
uint64_t end;
void* requests[NCCL_STEPS];
+ void* profilingEvents[NCCL_STEPS];
};
struct ncclProxyArgs {
- proxyProgressFunc_t progress;
struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS];
+ proxyProgressFunc_t progress;
int nsubs;
int done;
+ uint64_t opCount;
int sliceSteps;
int chunkSteps;
int chunkSize;
- uint64_t opCount;
- uint64_t commOpCount;
- int protocol;
ncclDataType_t dtype;
ncclRedOp_t redOp;
ncclPattern_t pattern;
- int root;
+ uint8_t protocol;
int state;
char* sharedBuff[NCCL_STEPS];
int sharedSize[NCCL_STEPS];
@@ -60,39 +79,104 @@ struct ncclProxyArgs {
int idle;
// Element linking
- pthread_mutex_t mutex;
struct ncclProxyArgs* next;
struct ncclProxyArgs* nextPeer;
struct ncclProxyArgs** proxyAppendPtr;
};
+#define NCCL_MAX_NETDEVS 128
+
+// ProxyOps are used to communicate between main thread and service thread
+// Make sure we have enough to store two full rounds of operations on all channels.
+// Otherwise we'd be unable to post half of them to free new elements.
+#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P)
+#define NCCL_MAX_LOCAL_RANKS 64
+struct ncclProxyOpsPool {
+ struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS];
+ volatile int nextOps;
+ volatile int nextOpsEnd;
+ volatile int freeOps[NCCL_MAX_LOCAL_RANKS];
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+};
+
+struct ncclProxyOps {
+ ncclProxyOpsPool* pool;
+ int count;
+ int freeOp;
+ int nextOps;
+ int nextOpsEnd;
+};
+
+struct ncclProxySharedP2p {
+ int refcount;
+ int size;
+ char* cudaBuff;
+ char* hostBuff;
+ cudaIpcMemHandle_t ipc;
+ struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv
+};
-struct ncclProxySharedBuffers {
+struct ncclProxySharedCollNet {
int size;
char* cudaBuff;
char* hostBuff;
- struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv
- // Collnet sharing is technically per device, but for now MAXDEVICES == MAXCHANNELS.
- struct ncclProxyArgs* proxyAppendCollNet[2*MAXCHANNELS];
- void* collNetResources;
+ struct ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS];
+ void* resources;
+};
+
+struct ncclProxyPeer {
+ struct ncclProxySharedP2p send;
+ struct ncclProxySharedP2p recv;
+};
+
+struct ncclSharedNetComms {
+ void* sendComm[MAXCHANNELS];
+ void* recvComm[MAXCHANNELS];
+ int sendRefCount[MAXCHANNELS];
+ int recvRefCount[MAXCHANNELS];
};
struct ncclProxyPool;
-struct ncclProxyState {
- pthread_cond_t cond;
- pthread_mutex_t opsMutex;
- pthread_mutex_t poolMutex;
- bool stop;
- struct ncclProxySharedBuffers sharedBuffs;
- struct ncclProxyArgs* ops; // Running operations, used by proxy thread
- struct ncclProxyArgs* postedOps; // Posted operations, shared between proxy and main thread, locked with opsMutex
- struct ncclProxyArgs* postedOpsEnd;
- struct ncclProxyArgs* nextOps; // Pending operations, used by main thread (could still be cancelled)
- struct ncclProxyArgs* nextOpsEnd;
- struct ncclProxyArgs* pool; // Free operations for main thread
- struct ncclProxyArgs* poolFreed; // Freed operations by the progress thread
- struct ncclProxyArgs* poolReturned; // Shared between main and progress thread, lock with poolMutex
+struct ncclProxyProgressState {
+ // Used by main threads to send work to progress thread
+ struct ncclProxyOpsPool* opsPool;
+ char opsPoolShmSuffix[6];
+ pthread_t thread;
+ bool stop;
+ struct ncclProxyPeer** localPeers;
+ struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS];
+ struct ncclProxySharedCollNet collNet;
+ struct ncclProxyArgs* active;
+ struct ncclProxyArgs* pool;
struct ncclProxyPool* pools;
+ int nextOps;
+};
+
+struct ncclProxyState {
+ // Service thread
+ pthread_t thread;
+ struct ncclSocket* listenSock;
+ int stop;
+
+ // Used by main thread
+ union ncclSocketAddress* peerAddresses;
+ struct ncclSocket* peerSocks;
+ struct ncclProxyOps* proxyOps;
+ void** sharedDevMems;
+
+ // Progress thread
+ struct ncclProxyProgressState progressState;
+};
+
+struct ncclProxyConnection {
+ int send, transport, shared;
+ int localRank;
+ struct ncclSocket* sock;
+ struct ncclTransportComm* tcomm;
+ struct ncclProxyArgs *proxyAppend;
+ struct ncclProxyArgs **proxyAppendPtr;
+ void* transportResources;
};
typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*);
@@ -103,26 +187,25 @@ enum proxyMode {
proxyTo = 2
};
-ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks);
-ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args);
-ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args);
+ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* proxyOp, int nranks);
+ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp);
+ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* proxyOp);
ncclResult_t ncclProxyStart(struct ncclComm* comm);
+ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses);
ncclResult_t ncclProxyCreate(struct ncclComm* comm);
-ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
-
-ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr);
-ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr);
-ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr);
-ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm);
-
-#include <unistd.h>
-
-// Spin wait until func evaluates to true
-template<typename FUNC>
-inline void transportProxyWait(const FUNC& func) {
- while (!func()) {
- sched_yield();
- }
-}
+ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn);
+enum ncclProxyMsgType {
+ ncclProxyMsgInit = 1,
+ ncclProxyMsgSharedInit = 2,
+ ncclProxyMsgSetup = 3,
+ ncclProxyMsgConnect = 4,
+ ncclProxyMsgStart = 5,
+ ncclProxyMsgClose = 6,
+ ncclProxyMsgAbort = 7,
+ ncclProxyMsgStop = 8
+};
+ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize);
+ncclResult_t ncclProxyDestroy(struct ncclComm* comm);
+ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm);
#endif
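
ncclProxyCall sends one of the message types above over the per-peer socket to the proxy service thread, which dispatches it to the transport's proxySetup/proxyConnect handler and returns the response buffer. A hedged sketch of that request/response shape, with hypothetical message structs (each transport defines its own layouts in src/transport/*.cc):

// Illustration only: exampleSetupReq/exampleSetupResp are hypothetical.
struct exampleSetupReq { int channelId; int bufferSize; };
struct exampleSetupResp { uint64_t remoteHandle; };

static ncclResult_t exampleSetupThroughProxy(struct ncclProxyConnector* proxyConn, int channelId) {
  struct exampleSetupReq req = { channelId, 1 << 22 };
  struct exampleSetupResp resp;
  // Blocks until the service thread has run the transport's proxySetup
  // handler for this connection and filled the response buffer.
  NCCLCHECK(ncclProxyCall(proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &resp, sizeof(resp)));
  (void)resp.remoteHandle;
  return ncclSuccess;
}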
diff --git a/src/include/shm.h b/src/include/shm.h
index 7334f16..08dc849 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,65 +7,9 @@
#ifndef NCCL_SHM_H_
#define NCCL_SHM_H_
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-// Change functions behavior to match other SYS functions
-static int shm_allocate(int fd, const int shmsize) {
- int err = posix_fallocate(fd, 0, shmsize);
- if (err) { errno = err; return -1; }
- return 0;
-}
-static int shm_map(int fd, const int shmsize, void** ptr) {
- *ptr = mmap(NULL, shmsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- return (*ptr == MAP_FAILED) ? -1 : 0;
-}
-
-static ncclResult_t shmSetup(const char* shmname, const int shmsize, int* fd, void** ptr, int create) {
- SYSCHECKVAL(shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "shm_open", *fd);
- if (create) SYSCHECK(shm_allocate(*fd, shmsize), "posix_fallocate");
- SYSCHECK(shm_map(*fd, shmsize, ptr), "mmap");
- close(*fd);
- *fd = -1;
- if (create) memset(*ptr, 0, shmsize);
- return ncclSuccess;
-}
-
-static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPtr, void** devShmPtr, int create) {
- int fd = -1;
- void* ptr = MAP_FAILED;
- ncclResult_t res = ncclSuccess;
-
- NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError);
- CUDACHECKGOTO(cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped), res, cudaError);
- CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
-
- *shmPtr = ptr;
- return ncclSuccess;
-sysError:
- WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmname, shmsize);
-cudaError:
- if (fd != -1) close(fd);
- if (create) shm_unlink(shmname);
- if (ptr != MAP_FAILED) munmap(ptr, shmsize);
- *shmPtr = NULL;
- return res;
-}
-
-static ncclResult_t shmUnlink(const char* shmname) {
- if (shmname != NULL) SYSCHECK(shm_unlink(shmname), "shm_unlink");
- return ncclSuccess;
-}
-
-static ncclResult_t shmClose(void* shmPtr, void* devShmPtr, const int shmsize) {
- CUDACHECK(cudaHostUnregister(shmPtr));
- if (munmap(shmPtr, shmsize) != 0) {
- WARN("munmap of shared memory failed");
- return ncclSystemError;
- }
- return ncclSuccess;
-}
+#include "nccl.h"
+ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create);
+ncclResult_t ncclShmUnlink(const char* shmname);
+ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize);
#endif
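
The inline shm helpers removed above move out-of-line into src/misc/shmutils.cc behind this three-call API. A minimal sketch of the intended create/attach lifecycle, assuming a hypothetical path and size:

// Illustration only: creator side of a shared-memory segment using the
// new API. The attaching side calls ncclShmOpen with create=0 on the
// same path after it has been exchanged out of band.
static ncclResult_t exampleShmCreate(void** hostPtr, void** devPtr) {
  char shmPath[] = "/dev/shm/nccl-example";   // hypothetical name
  const int size = 1 << 20;
  NCCLCHECK(ncclShmOpen(shmPath, size, hostPtr, devPtr, /*create=*/1));
  // ... hand shmPath to the peer, use the mapping, then tear down:
  NCCLCHECK(ncclShmUnlink(shmPath));
  NCCLCHECK(ncclShmClose(*hostPtr, *devPtr, size));
  return ncclSuccess;
}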
diff --git a/src/include/socket.h b/src/include/socket.h
index 6ca5f7d..53fda4d 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,14 +7,13 @@
#ifndef NCCL_SOCKET_H_
#define NCCL_SOCKET_H_
+#include "nccl.h"
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
-#include <unistd.h>
#include <netdb.h>
-#include <ifaddrs.h>
-#include <net/if.h>
-#include "utils.h"
+#include <fcntl.h>
+#include <poll.h>
#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
@@ -24,438 +23,48 @@
#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV)
/* Common socket address storage structure for IPv4/IPv6 */
-union socketAddress {
+union ncclSocketAddress {
struct sockaddr sa;
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
};
-/* Format a string representation of a (union socketAddress *) socket address using getnameinfo()
- *
- * Output: "IPv4/IPv6 address<port>"
- */
-static inline const char *socketToString(union socketAddress *addr, char *buf) {
- if (buf == NULL || addr == NULL) return NULL;
- struct sockaddr *saddr = &addr->sa;
- if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
- char host[NI_MAXHOST], service[NI_MAXSERV];
- (void) getnameinfo(saddr, sizeof(union socketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV);
- sprintf(buf, "%s<%s>", host, service);
- return buf;
-}
-
-static inline uint16_t socketToPort(union socketAddress *addr) {
- struct sockaddr *saddr = &addr->sa;
- return ntohs(saddr->sa_family == AF_INET ? addr->sin.sin_port : addr->sin6.sin6_port);
-}
-
-/* Allow the user to force the IPv4/IPv6 interface selection */
-static inline int envSocketFamily(void) {
- int family = -1; // Family selection is not forced, will use first one found
- char* env = getenv("NCCL_SOCKET_FAMILY");
- if (env == NULL)
- return family;
-
- INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env);
-
- if (strcmp(env, "AF_INET") == 0)
- family = AF_INET; // IPv4
- else if (strcmp(env, "AF_INET6") == 0)
- family = AF_INET6; // IPv6
- return family;
-}
-
-static int findInterfaces(const char* prefixList, char* names, union socketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
-#ifdef ENABLE_TRACE
- char line[SOCKET_NAME_MAXLEN+1];
-#endif
- struct netIf userIfs[MAX_IFS];
- bool searchNot = prefixList && prefixList[0] == '^';
- if (searchNot) prefixList++;
- bool searchExact = prefixList && prefixList[0] == '=';
- if (searchExact) prefixList++;
- int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
-
- int found = 0;
- struct ifaddrs *interfaces, *interface;
- getifaddrs(&interfaces);
- for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
- if (interface->ifa_addr == NULL) continue;
-
- /* We only support IPv4 & IPv6 */
- int family = interface->ifa_addr->sa_family;
- if (family != AF_INET && family != AF_INET6)
- continue;
-
- TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, socketToString((union socketAddress *)interface->ifa_addr, line));
-
- /* Allow the caller to force the socket family type */
- if (sock_family != -1 && family != sock_family)
- continue;
-
- /* We also need to skip IPv6 loopback interfaces */
- if (family == AF_INET6) {
- struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
- if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
- }
-
- // check against user specified interfaces
- if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
- continue;
- }
-
- // Check that this interface has not already been saved
- // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
- bool duplicate = false;
- for (int i = 0; i < found; i++) {
- if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
- }
-
- if (!duplicate) {
- // Store the interface name
- strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
- // Store the IP address
- int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
- memcpy(addrs+found, interface->ifa_addr, salen);
- found++;
- }
- }
-
- freeifaddrs(interfaces);
- return found;
-}
-
-static bool matchSubnet(struct ifaddrs local_if, union socketAddress* remote) {
- /* Check family first */
- int family = local_if.ifa_addr->sa_family;
- if (family != remote->sa.sa_family) {
- return false;
- }
-
- if (family == AF_INET) {
- struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
- struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
- struct sockaddr_in& remote_addr = remote->sin;
- struct in_addr local_subnet, remote_subnet;
- local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
- remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
- return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
- } else if (family == AF_INET6) {
- struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
- struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
- struct sockaddr_in6& remote_addr = remote->sin6;
- struct in6_addr& local_in6 = local_addr->sin6_addr;
- struct in6_addr& mask_in6 = mask->sin6_addr;
- struct in6_addr& remote_in6 = remote_addr.sin6_addr;
- bool same = true;
- int len = 16; //IPv6 address is 16 unsigned char
- for (int c = 0; c < len; c++) { //Network byte order is big-endian
- char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
- char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
- if (c1 ^ c2) {
- same = false;
- break;
- }
- }
- // At last, we need to compare scope id
- // Two Link-type addresses can have the same subnet address even though they are not in the same scope
- // For Global type, this field is 0, so a comparison wouldn't matter
- same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
- return same;
- } else {
- WARN("Net : Unsupported address family type");
- return false;
- }
-}
-
-static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
-#ifdef ENABLE_TRACE
- char line[SOCKET_NAME_MAXLEN+1];
-#endif
- char line_a[SOCKET_NAME_MAXLEN+1];
- int found = 0;
- struct ifaddrs *interfaces, *interface;
- getifaddrs(&interfaces);
- for (interface = interfaces; interface && !found; interface = interface->ifa_next) {
- if (interface->ifa_addr == NULL) continue;
-
- /* We only support IPv4 & IPv6 */
- int family = interface->ifa_addr->sa_family;
- if (family != AF_INET && family != AF_INET6)
- continue;
-
- // check against user specified interfaces
- if (!matchSubnet(*interface, remoteAddr)) {
- continue;
- }
-
- // Store the local IP address
- int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
- memcpy(localAddrs+found, interface->ifa_addr, salen);
-
- // Store the interface name
- strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
-
- TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(localAddrs+found, line), socketToString(remoteAddr, line_a));
- found++;
- if (found == maxIfs) break;
- }
-
- if (found == 0) {
- WARN("Net : No interface found in the same subnet as remote address %s", socketToString(remoteAddr, line_a));
- }
- freeifaddrs(interfaces);
- return found;
-}
-
-static ncclResult_t GetSocketAddrFromString(union socketAddress* ua, const char* ip_port_pair) {
- if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
- WARN("Net : string is null");
- return ncclInvalidArgument;
- }
-
- bool ipv6 = ip_port_pair[0] == '[';
- /* Construct the sockaddress structure */
- if (!ipv6) {
- struct netIf ni;
- // parse <ip_or_hostname>:<port> string, expect one pair
- if (parseStringList(ip_port_pair, &ni, 1) != 1) {
- WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
- return ncclInvalidArgument;
- }
-
- struct addrinfo hints, *p;
- int rv;
- memset(&hints, 0, sizeof(hints));
- hints.ai_family = AF_UNSPEC;
- hints.ai_socktype = SOCK_STREAM;
-
- if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
- WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
- return ncclInvalidArgument;
- }
-
- // use the first
- if (p->ai_family == AF_INET) {
- struct sockaddr_in& sin = ua->sin;
- memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
- sin.sin_family = AF_INET; // IPv4
- //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address
- sin.sin_port = htons(ni.port); // port
- } else if (p->ai_family == AF_INET6) {
- struct sockaddr_in6& sin6 = ua->sin6;
- memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
- sin6.sin6_family = AF_INET6; // IPv6
- sin6.sin6_port = htons(ni.port); // port
- sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
- sin6.sin6_scope_id = 0; // should be global scope, set to 0
- } else {
- WARN("Net : unsupported IP family");
- return ncclInvalidArgument;
- }
-
- freeaddrinfo(p); // all done with this structure
-
- } else {
- int i, j = -1, len = strlen(ip_port_pair);
- for (i = 1; i < len; i++) {
- if (ip_port_pair[i] == '%') j = i;
- if (ip_port_pair[i] == ']') break;
- }
- if (i == len) {
- WARN("Net : No valid [IPv6]:port pair found");
- return ncclInvalidArgument;
- }
- bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope
-
- char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
- memset(ip_str, '\0', sizeof(ip_str));
- memset(port_str, '\0', sizeof(port_str));
- memset(if_name, '\0', sizeof(if_name));
- strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1);
- strncpy(port_str, ip_port_pair+i+2, len-i-1);
- int port = atoi(port_str);
- if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name
-
- struct sockaddr_in6& sin6 = ua->sin6;
- sin6.sin6_family = AF_INET6; // IPv6
- inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address
- sin6.sin6_port = htons(port); // port
- sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
- sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope
- }
- return ncclSuccess;
-}
-
-static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
- static int shownIfName = 0;
- int nIfs = 0;
- // Allow user to force the INET socket family selection
- int sock_family = envSocketFamily();
- // User specified interface
- char* env = getenv("NCCL_SOCKET_IFNAME");
- if (env && strlen(env) > 1) {
- INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
- // Specified by user : find or fail
- if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
- nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
- } else {
- // Try to automatically pick the right one
- // Start with IB
- nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
- // else see if we can get some hint from COMM ID
- if (nIfs == 0) {
- char* commId = getenv("NCCL_COMM_ID");
- if (commId && strlen(commId) > 1) {
- INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
- // Try to find interface that is in the same subnet as the IP in comm id
- union socketAddress idAddr;
- GetSocketAddrFromString(&idAddr, commId);
- nIfs = findInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
- }
- }
- // Then look for anything else (but not docker or lo)
- if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
- // Finally look for docker, then lo.
- if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
- if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
- }
- return nIfs;
-}
-
-static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) {
- /* IPv4/IPv6 support */
- int family = localAddr->sa.sa_family;
- int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
-
- /* Create socket and bind it to a port */
- int sockfd = socket(family, SOCK_STREAM, 0);
- if (sockfd == -1) {
- WARN("Net : Socket creation failed : %s", strerror(errno));
- return ncclSystemError;
- }
-
- if (socketToPort(localAddr)) {
- // Port is forced by env. Make sure we get the port.
- int opt = 1;
-#if defined(SO_REUSEPORT)
- SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
-#else
- SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
-#endif
- }
-
- // localAddr port should be 0 (Any port)
- SYSCHECK(bind(sockfd, &localAddr->sa, salen), "bind");
-
- /* Get the assigned Port */
- socklen_t size = salen;
- SYSCHECK(getsockname(sockfd, &localAddr->sa, &size), "getsockname");
-
-#ifdef ENABLE_TRACE
- char line[SOCKET_NAME_MAXLEN+1];
- TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(localAddr, line));
-#endif
-
- /* Put the socket in listen mode
- * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
- */
- SYSCHECK(listen(sockfd, 16384), "listen");
- *fd = sockfd;
- return ncclSuccess;
-}
-
-static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
- char line[SOCKET_NAME_MAXLEN+1];
- /* IPv4/IPv6 support */
- int family = remoteAddr->sa.sa_family;
- if (family != AF_INET && family != AF_INET6) {
- WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
- socketToString(remoteAddr, line), family, AF_INET, AF_INET6);
- return ncclInternalError;
- }
- int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
-
- /* Connect to a hostname / port */
- *fd = socket(family, SOCK_STREAM, 0);
- if (*fd == -1) {
- WARN("Net : Socket creation failed : %s", strerror(errno));
- return ncclSystemError;
- }
-
- const int one = 1;
- SYSCHECK(setsockopt(*fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
-
- /* const int bufsize = 128*1024;
- SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
- SYSCHECK(setsockopt(*fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
-
- TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(remoteAddr, line));
+enum ncclSocketState {
+ ncclSocketConnecting = 0,
+ ncclSocketConnected = 1,
+ ncclSocketError = 2,
+ ncclSocketStateNum = 3
+} ;
+
+struct ncclSocket {
+ int fd;
+ union ncclSocketAddress addr;
+ volatile uint32_t* abortFlag;
+ int asyncFlag;
+ enum ncclSocketState state;
+};
- int ret;
- int timedout_retries = 0;
- int refused_retries = 0;
-retry:
- SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret);
- if (ret == 0) return ncclSuccess;
- if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) {
- if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
- (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
- if (refused_retries % 1000 == 0) INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno));
- usleep(SLEEP_INT);
- goto retry;
- }
- }
- WARN("Net : Connect to %s failed : %s", socketToString(remoteAddr, line), strerror(errno));
- return ncclSystemError;
-}
+const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf);
+ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
+int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs);
+int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs);
+// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call
+ncclResult_t ncclSocketListen(struct ncclSocket* sock);
+// Connect to sock->addr. sock->fd is set after a successful call.
+ncclResult_t ncclSocketConnect(struct ncclSocket* sock);
+// Return socket connection state.
+ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state);
+// Accept an incoming connection from listenSocket->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr.
+ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket);
#define NCCL_SOCKET_SEND 0
#define NCCL_SOCKET_RECV 1
-static ncclResult_t socketProgressOpt(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset, int block) {
- int bytes = 0;
- char* data = (char*)ptr;
- char line[SOCKET_NAME_MAXLEN+1];
- do {
- if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
- if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
- if (op == NCCL_SOCKET_RECV && bytes == 0) {
- WARN("Net : Connection closed by remote peer %s", socketToString(addr, line));
- return ncclSystemError;
- }
- if (bytes == -1) {
- if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
- WARN("Net : Call to recv from %s failed : %s", socketToString(addr, line), strerror(errno));
- return ncclSystemError;
- } else {
- bytes = 0;
- }
- }
- (*offset) += bytes;
- } while (bytes > 0 && (*offset) < size);
- return ncclSuccess;
-}
-
-static ncclResult_t socketProgress(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset) {
- return socketProgressOpt(op, fd, addr, ptr, size, offset, 0);
-}
-
-static ncclResult_t socketWait(int op, int fd, union socketAddress *addr, void* ptr, int size, int* offset) {
- while (*offset < size)
- NCCLCHECK(socketProgressOpt(op, fd, addr, ptr, size, offset, 1));
- return ncclSuccess;
-}
-
-static ncclResult_t socketSend(int fd, union socketAddress *addr, void* ptr, int size) {
- int offset = 0;
- NCCLCHECK(socketWait(NCCL_SOCKET_SEND, fd, addr, ptr, size, &offset));
- return ncclSuccess;
-}
-
-static ncclResult_t socketRecv(int fd, union socketAddress *addr, void* ptr, int size) {
- int offset = 0;
- NCCLCHECK(socketWait(NCCL_SOCKET_RECV, fd, addr, ptr, size, &offset));
- return ncclSuccess;
-}
+ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
+ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset);
+ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
+ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
+ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed);
+/* initialize a socket. */
+ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0);
#endif
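
The static fd-based helpers removed above are replaced by an API built around struct ncclSocket. A hedged sketch of the two sides of a connection (addresses, abort flag and error handling simplified; the helper functions are illustrative only):

// Illustration only: listen/accept side.
static ncclResult_t exampleServer(union ncclSocketAddress* localAddr, volatile uint32_t* abortFlag) {
  struct ncclSocket listenSock, sock;
  int value;
  NCCLCHECK(ncclSocketInit(&listenSock, localAddr, abortFlag));
  NCCLCHECK(ncclSocketListen(&listenSock));         // sets listenSock.fd and the bound port
  NCCLCHECK(ncclSocketAccept(&sock, &listenSock));  // peer address ends up in sock.addr
  NCCLCHECK(ncclSocketRecv(&sock, &value, sizeof(int)));
  return ncclSuccess;
}

// Illustration only: connect side.
static ncclResult_t exampleClient(union ncclSocketAddress* remoteAddr, volatile uint32_t* abortFlag) {
  struct ncclSocket sock;
  int value = 42;
  NCCLCHECK(ncclSocketInit(&sock, remoteAddr, abortFlag));
  NCCLCHECK(ncclSocketConnect(&sock));
  NCCLCHECK(ncclSocketSend(&sock, &value, sizeof(int)));
  return ncclSuccess;
}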
diff --git a/src/include/timer.h b/src/include/timer.h
new file mode 100644
index 0000000..284fec6
--- /dev/null
+++ b/src/include/timer.h
@@ -0,0 +1,60 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_TIMER_H_
+#define NCCL_TIMER_H_
+#if ENABLE_TIMER
+#include <unistd.h>
+#include <sys/time.h>
+#include <x86intrin.h>
+static double freq = -1;
+static void calibrate() {
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+ uint64_t timeCycles = __rdtsc();
+ double time = - tv.tv_sec*1E6 - tv.tv_usec;
+ uint64_t total = 0ULL;
+ for (int i=0; i<10000; i++) total += __rdtsc();
+ gettimeofday(&tv, NULL);
+ timeCycles = __rdtsc() - timeCycles;
+ time += tv.tv_sec*1E6 + tv.tv_usec;
+ freq = timeCycles/time;
+}
+static inline double gettime() {
+ if (freq == -1) calibrate();
+ return __rdtsc()/freq;
+}
+static uint64_t counts[8];
+static double times[8];
+static double startTimes[8];
+#define TIME_START(index) do { \
+ counts[index]++; \
+ startTimes[index] = gettime(); \
+} while (0);
+
+#define TIME_STOP(index) do { \
+ times[index] += gettime() - startTimes[index]; \
+} while (0);
+
+#define TIME_CANCEL(index) do { \
+ counts[index]--; \
+} while (0);
+
+#define TIME_PRINT(name) do { \
+ printf("%s stats", name); \
+ for (int i=0; i<8; i++) { \
+ if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \
+ counts[i] = 0; \
+ } \
+ printf("\n"); \
+} while (0);
+#else
+#define TIME_START(index) while(0);
+#define TIME_STOP(index) while(0);
+#define TIME_CANCEL(index) while(0);
+#define TIME_PRINT(name)
+#endif
+#endif
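
These macros compile to TSC-based accumulators only when ENABLE_TIMER is defined and to nothing otherwise. A small usage sketch (the function name is illustrative):

// Illustration only: timing a hot loop with slot 0 of the 8 counters.
static void exampleTimedLoop() {
  for (int i = 0; i < 1000; i++) {
    TIME_START(0);
    // ... section being measured ...
    TIME_STOP(0);
  }
  TIME_PRINT("example");   // per-slot total time, count and average (microseconds)
}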
diff --git a/src/include/transport.h b/src/include/transport.h
index e64dfbf..043a415 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,12 +11,14 @@
#include "graph.h"
#include "nvmlwrap.h"
#include "core.h"
-#include "proxy.h"
-#define NTRANSPORTS 3
+#define NTRANSPORTS 4
#define TRANSPORT_P2P 0
#define TRANSPORT_SHM 1
#define TRANSPORT_NET 2
+#define TRANSPORT_COLLNET 3
+
+#include "proxy.h"
extern struct ncclTransport ncclTransports[];
@@ -28,11 +30,14 @@ struct ncclComm;
struct ncclPeerInfo {
int rank;
int cudaDev;
+ int netDev;
int gdrSupport;
uint64_t hostHash;
uint64_t pidHash;
dev_t shmDev;
int64_t busId;
+ struct ncclComm* comm;
+ int cudaCompCap;
};
#define CONNECT_SIZE 128
@@ -43,8 +48,12 @@ struct ncclConnect {
struct ncclTransportComm {
ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex);
ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
- ncclResult_t (*free)(void*);
- ncclResult_t (*proxy)(struct ncclProxyArgs*);
+ ncclResult_t (*free)(struct ncclConnector*);
+ ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels);
+ ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
+ ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done);
+ ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclComm* comm);
+ ncclResult_t (*proxyProgress)(struct ncclComm* comm, struct ncclProxyArgs*);
};
struct ncclTransport {
diff --git a/src/include/utils.h b/src/include/utils.h
index 739a774..f08ff37 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,6 +8,7 @@
#define NCCL_UTILS_H_
#include "nccl.h"
+#include "checks.h"
#include <stdint.h>
int ncclCudaCompCap();
@@ -94,6 +95,11 @@ class ncclRecyclableList {
return rv;
}
+ T* peakNext() {
+ if (cursor == NULL || cursor == tail) return NULL;
+ return &cursor->data;
+ }
+
// Recycle the list without freeing the space
void recycle() {
tail = cursor = head;
diff --git a/src/init.cc b/src/init.cc
index 1684cc9..4da8dfd 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -46,90 +46,6 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
-ncclNet_t* ncclNet = NULL;
-ncclCollNet_t* ncclCollNet = NULL;
-
-// Returns ncclInternalError if anything fails, causing that network to be ignored.
-ncclResult_t initNet(ncclNet_t* net) {
- int ndev;
- if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
- if (net->devices(&ndev) != ncclSuccess) return ncclInternalError;
- if (ndev <= 0) return ncclSystemError;
- return ncclSuccess;
-}
-
-ncclResult_t initCollNet(ncclCollNet_t* collnet) {
- int ndev;
- if (collnet->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
- if (collnet->devices(&ndev) != ncclSuccess) return ncclInternalError;
- if (ndev <= 0) return ncclSystemError;
- return ncclSuccess;
-}
-
-ncclResult_t initNetPlugin(ncclNet_t** net, ncclCollNet_t** collnet) {
- char ncclNetPluginName[128];
- const char* envPluginName = getenv("NCCL_NET_PLUGIN");
- if (envPluginName && strlen(envPluginName)) {
- snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName);
- INFO(NCCL_INIT, "Plugin name set by env to %s\n", ncclNetPluginName);
- } else {
- sprintf(ncclNetPluginName, "libnccl-net.so");
- }
- void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
- if (netPluginLib == NULL) {
- // dlopen does not guarantee to set errno, but dlerror only gives us a
- // string, so checking errno doesn't hurt to try to provide a better
- // error message
- if (errno == ENOENT) {
- INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName);
- } else {
- INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
- }
- return ncclSuccess;
- }
- *net = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
- if (*net == NULL) {
- INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
- if (netPluginLib != NULL) dlclose(netPluginLib);
- return ncclSuccess;
- }
- // Check for CollNet
- *collnet = (ncclCollNet_t*) dlsym(netPluginLib, STR(NCCL_COLLNET_PLUGIN_SYMBOL));
- if (*collnet == NULL) {
- INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_COLLNET_PLUGIN_SYMBOL) " symbol.");
- }
- return ncclSuccess;
-}
-
-ncclResult_t initNet() {
- // Always initialize bootstrap network
- NCCLCHECK(bootstrapNetInit());
-
- // Initialize main communication network
- ncclNet_t* nets[3] = { NULL, &ncclNetIb, &ncclNetSocket };
- ncclCollNet_t* collNets[3] = { NULL, NULL, NULL };
- NCCLCHECK(initNetPlugin(nets+0, collNets+0));
- char* netName = getenv("NCCL_NET");
-
- for (int i=0; i<3; i++) {
- if (nets[i] == NULL) continue;
- if (netName && strcmp(netName, nets[i]->name) != 0) continue;
- // net plugin is already initialized
- if (initNet(nets[i]) != ncclSuccess) continue;
- ncclNet = nets[i];
- if (collNets[i] && initCollNet(collNets[i]) == ncclSuccess) {
- ncclCollNet = collNets[i];
- }
- break;
- }
-
- if (ncclNet == NULL) {
- WARN("Error: network %s not found.", netName ? netName : "");
- return ncclInvalidUsage;
- }
- return ncclSuccess;
-}
-
// GDRCOPY support: Off by default
NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0);
@@ -155,7 +71,7 @@ static ncclResult_t ncclInit() {
initEnv();
initGdrCopy();
maxLocalSizeBytes = ncclKernMaxLocalSize();
- NCCLCHECK(initNet());
+ NCCLCHECK(ncclNetInit());
INFO(NCCL_INIT, "Using network %s", ncclNetName());
initialized = true;
}
@@ -194,6 +110,9 @@ static ncclResult_t commFree(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
+ // First stop all threads before we free anything.
+ NCCLCHECK(ncclProxyDestroy(comm));
+
delete[] comm->userRedOps;
free(comm->connectSend);
@@ -208,6 +127,10 @@ static ncclResult_t commFree(ncclComm_t comm) {
free(comm->peerInfo);
ncclTopoFree(comm->topo);
+ for (int n=0; n<comm->nNodes; n++) free(comm->nodeRanks[n].localRankToRank);
+ free(comm->nodeRanks);
+ free(comm->rankToNode);
+ free(comm->rankToLocalRank);
if (comm->bootstrap)
NCCLCHECK(bootstrapClose(comm->bootstrap));
@@ -231,8 +154,16 @@ static ncclResult_t commFree(ncclComm_t comm) {
int isLast;
NCCLCHECK(ncclCpuBarrierIn(comm, &isLast));
if (isLast) {
+ // Wait for all service threads to be done. We could not
+ // do it earlier because it could have blocked and prevented
+ // other ranks in the process from calling ncclCommDestroy
+ for (int i=0; i<comm->intraRanks; i++) {
+ void* ret;
+ if (comm->intraThreads[i]) pthread_join(comm->intraThreads[i], &ret);
+ }
free(comm->intraBarrier);
free(comm->intraParams);
+ free(comm->intraThreads);
free(comm->intraCudaDevs);
free(comm->intraCGMode);
free(comm->intraCC);
@@ -291,7 +222,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
comm->hostDevComm.abortFlag = comm->abortFlag;
*comm->abortFlag = 0;
- comm->argsptr = &comm->args;
+ comm->argsptrs[0] = &comm->devComm;
+ comm->argsptrs[1] = &comm->args;
comm->collNetSupport = 0;
NCCLCHECK(ncclCalloc(&comm->asyncOps, NCCL_MAX_OPS));
@@ -329,10 +261,6 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
NCCLCHECK(ncclCalloc(&comm->p2pSends, comm->nRanks));
NCCLCHECK(ncclCalloc(&comm->p2pRecvs, comm->nRanks));
- // Create a map between global rank and intra-node rank
- NCCLCHECK(ncclCalloc(&comm->rankToIntraNodeRank, comm->nRanks));
- memset(comm->rankToIntraNodeRank, -1, comm->nRanks*sizeof(comm->rankToIntraNodeRank[0]));
-
// Mark channels as non initialized.
for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1;
@@ -389,6 +317,8 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
info->busId = comm->busId;
NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
+ info->comm = comm;
+ info->cudaCompCap = ncclCudaCompCap();
return ncclSuccess;
}
@@ -418,7 +348,7 @@ void* waitForNonNullPtr(void* p) {
ncclResult_t initParams(struct ncclComm* comm) {
struct cudaLaunchParams* params = comm->myParams = comm->intraParams+comm->intraRank;
- params->args = &comm->argsptr;
+ params->args = comm->argsptrs;
params->stream = NULL;
params->sharedMem = 0;
params->blockDim.x = 0; params->blockDim.y = params->blockDim.z = 1;
@@ -440,6 +370,7 @@ ncclResult_t ncclCommSetIntraProc(struct ncclComm* comm, int rank, int ranks, st
bar[0] = bar[1] = 0;
comm->intraBarrier = bar;
NCCLCHECK(ncclCalloc(&comm->intraParams, comm->intraRanks));
+ NCCLCHECK(ncclCalloc(&comm->intraThreads, comm->intraRanks));
NCCLCHECK(ncclCalloc(&comm->intraCudaDevs, comm->intraRanks));
int* CGMode;
NCCLCHECK(ncclCalloc(&CGMode, 1));
@@ -452,11 +383,13 @@ ncclResult_t ncclCommSetIntraProc(struct ncclComm* comm, int rank, int ranks, st
} else {
comm->intraBarrier = (int*)waitForNonNullPtr(&comm0->intraBarrier);
comm->intraParams = (struct cudaLaunchParams*)waitForNonNullPtr(&comm0->intraParams);
+ comm->intraThreads = (pthread_t*)waitForNonNullPtr(&comm0->intraThreads);
comm->intraCudaDevs = (int*)waitForNonNullPtr(&comm0->intraCudaDevs);
comm->intraCGMode = (int*)waitForNonNullPtr(&comm0->intraCGMode);
comm->intraCC = (int*)waitForNonNullPtr(&comm0->intraCC);
}
comm->intraCudaDevs[comm->intraRank] = comm->cudaDev;
+ comm->intraThreads[comm->intraRank] = comm->proxyState.thread;
NCCLCHECK(initParams(comm));
int cgMdLaunch = 0;
@@ -508,7 +441,6 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
return ncclSuccess;
}
-NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
NCCL_PARAM(CollNetNodeThreshold, "COLLNET_NODE_THRESHOLD", 2);
NCCL_PARAM(NvbPreconnect, "NVB_PRECONNECT", 1);
@@ -522,75 +454,19 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int nranks = comm->nRanks;
uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
- NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
+ NCCLCHECK(bootstrapInit(commId, comm));
// AllGather1 - begin
- struct {
- struct ncclPeerInfo peerInfo;
- struct ncclComm* comm;
- int cudaCompCap;
- } *allGather1Data;
-
- NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
- allGather1Data[rank].comm = comm;
- allGather1Data[rank].cudaCompCap = ncclCudaCompCap();
- struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
- NCCLCHECK(fillInfo(comm, myInfo, commHash));
- NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
-
NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root
- for (int i = 0; i < nranks; i++) {
- memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
- if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
- WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
- return ncclInvalidUsage;
- }
- }
+ NCCLCHECK(fillInfo(comm, comm->peerInfo+rank, commHash));
+ NCCLCHECK(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)));
- // Compute intra ranks and minimum CUDA Compute capabilities of intra-node GPUs and all GPUs
- int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
- int intraNodeRank0 = -1, intraNodeRank = -1, intraNodeRanks = 0;
- int myCompCap = allGather1Data[rank].cudaCompCap;
- int minCompCap = myCompCap, maxCompCap = myCompCap;
for (int i = 0; i < nranks; i++) {
- if (allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) {
- // Rank is on same node
- if (intraNodeRanks == 0) intraNodeRank0 = i;
- if (i == rank) intraNodeRank = intraNodeRanks;
- comm->intraNodeGlobalRanks[intraNodeRanks] = i;
- comm->rankToIntraNodeRank[i] = intraNodeRanks;
- intraNodeRanks++;
- if (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash) {
- // Rank is in same process
- if (intraProcRanks == 0) intraProcRank0 = i;
- if (i == rank) intraProcRank = intraProcRanks;
- intraProcRanks++;
- }
+ if ((i != rank) && (comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) && (comm->peerInfo[i].busId == comm->peerInfo[rank].busId)) {
+ WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, comm->peerInfo[rank].busId);
+ return ncclInvalidUsage;
}
- minCompCap = std::min(allGather1Data[i].cudaCompCap, minCompCap);
- maxCompCap = std::max(allGather1Data[i].cudaCompCap, maxCompCap);
- }
- TRACE(NCCL_INIT,"hostHash[%d] %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d",
- rank, allGather1Data[rank].peerInfo.hostHash, intraNodeRank, intraNodeRanks, intraNodeRank0);
- TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
- rank, allGather1Data[rank].peerInfo.pidHash, intraProcRank, intraProcRanks, intraProcRank0);
- if (intraProcRank == -1 || intraProcRank0 == -1 || allGather1Data[intraProcRank0].comm == NULL) {
- WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
- rank, allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash,
- intraProcRank, intraProcRanks, intraProcRank0);
- return ncclInternalError;
- }
- if (intraNodeRank == -1 || intraNodeRank0 == -1 || intraNodeRanks == 0) {
- WARN("Failed to determine intra node ranks rank %d hostHash %lx pidHash %lx intraNodeRank %d intraNodeRanks %d intraNodeRank0 %d",
- rank, allGather1Data[rank].peerInfo.hostHash, allGather1Data[rank].peerInfo.pidHash,
- intraNodeRank, intraNodeRanks, intraNodeRank0);
- return ncclInternalError;
}
- struct ncclComm* intraProcRank0Comm = allGather1Data[intraProcRank0].comm;
- uint64_t intraNodeRank0pidHash = allGather1Data[intraNodeRank0].peerInfo.pidHash;
- comm->intraNodeRank = intraNodeRank;
-
- free(allGather1Data);
// AllGather1 - end
@@ -607,11 +483,23 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Print final topology
NCCLCHECK(ncclTopoPrint(comm->topo));
+ // Set Affinity to a CPU local to our GPU, so that all memory we allocate
+ // on the host is local.
+ NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity));
+ cpu_set_t affinitySave;
+ if (CPU_COUNT(&comm->cpuAffinity)) {
+ sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+ }
+ ncclResult_t ret;
+
+ // Launch proxy service thread
+ NCCLCHECK(ncclProxyCreate(comm));
+
// Get rings and trees
struct ncclTopoGraph ringGraph;
ringGraph.id = 0;
ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
- ringGraph.crossNic = ncclParamCrossNic();
ringGraph.collNet = 0;
ringGraph.minChannels = 1;
ringGraph.maxChannels = MAXCHANNELS/2;
@@ -621,7 +509,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
struct ncclTopoGraph treeGraph;
treeGraph.id = 1;
treeGraph.pattern = NCCL_TOPO_PATTERN_BALANCED_TREE;
- treeGraph.crossNic = ncclParamCrossNic();
treeGraph.collNet = 0;
treeGraph.minChannels = 1;
treeGraph.maxChannels = ringGraph.nChannels;
@@ -632,7 +519,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
collNetGraph.id = 2;
collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
collNetGraph.collNet = 1;
- collNetGraph.crossNic = ncclParamCrossNic();
collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
@@ -644,10 +530,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Determine local CollNet support before all-gather
if (ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1;
- if (intraNodeRanks > 8) {
- if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node");
- comm->collNetSupport = 0;
- }
// AllGather3 - begin
struct ncclGraphInfo {
@@ -661,6 +543,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
};
struct {
+ int netDev;
int collNetSupport;
struct ncclGraphInfo tree;
struct ncclGraphInfo ring;
@@ -669,6 +552,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
} *allGather3Data;
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
+ NCCLCHECK(ncclTopoGetLocalNet(comm->topo, rank, &allGather3Data[rank].netDev));
allGather3Data[rank].tree.pattern = treeGraph.pattern;
allGather3Data[rank].tree.nChannels = treeGraph.nChannels;
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
@@ -701,45 +585,77 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int *nodesFirstRank, *nodesTreePatterns;
NCCLCHECK(ncclCalloc(&nodesFirstRank, nranks));
NCCLCHECK(ncclCalloc(&nodesTreePatterns, nranks));
- for (int i=0; i<nranks; i++) {
- int node = -1;
- int firstRank = allGather3Data[i].topoRanks.ringRecv[0];
- for (int n=0; n<comm->nNodes; n++) {
- if (nodesFirstRank[n] == firstRank) node = n;
- }
- if (node == -1) {
- node = comm->nNodes++;
+ NCCLCHECK(ncclCalloc(&comm->rankToNode, comm->nRanks));
+ for (int r=0; r<nranks; r++) {
+ int node;
+ int firstRank = allGather3Data[r].topoRanks.ringRecv[0];
+ for (node=0; node<comm->nNodes && nodesFirstRank[node] != firstRank; node++);
+ if (node == comm->nNodes) {
+ comm->nNodes++;
nodesFirstRank[node] = firstRank;
// Record tree pattern of each node as they can be different depending on sm arch
- nodesTreePatterns[node] = allGather3Data[i].tree.pattern;
+ nodesTreePatterns[node] = allGather3Data[r].tree.pattern;
}
- if (i == comm->rank) comm->node = node;
+ comm->rankToNode[r] = node;
+ }
+ // Now that we know nNodes, alloc nodeRanks and compute localRanks for each node
+ NCCLCHECK(ncclCalloc(&comm->nodeRanks, comm->nNodes));
+ NCCLCHECK(ncclCalloc(&comm->rankToLocalRank, comm->nRanks));
+ for (int r=0; r<comm->nRanks; r++) {
+ int node = comm->rankToNode[r];
+ comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks;
+ comm->nodeRanks[node].localRanks++;
+ }
+ // Allocate ranks arrays for each node
+ for (int n=0; n<comm->nNodes; n++) {
+ NCCLCHECK(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks));
+ comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks);
+ comm->nodeRanks[n].localRanks = 0;
+ }
+ // And fill the ranks arrays
+ for (int r=0; r<comm->nRanks; r++) {
+ int node = comm->rankToNode[r];
+ comm->nodeRanks[node].localRankToRank[comm->nodeRanks[node].localRanks++] = r;
+ }
+ comm->node = comm->rankToNode[rank];
+ comm->localRankToRank = comm->nodeRanks[comm->node].localRankToRank;
+ comm->localRank = comm->rankToLocalRank[rank];
+ comm->localRanks = comm->nodeRanks[comm->node].localRanks;
+
+ TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d",
+ rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+ if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) {
+ WARN("Failed to determine local ranks rank %d hostHash %lx pidHash %lx localRank %d localRanks %d localRank0 %d",
+ rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+ comm->localRank, comm->localRanks, comm->localRankToRank[0]);
+ return ncclInternalError;
}
int nChannelsOrig = comm->nChannels;
struct ncclTopoRanks** allTopoRanks;
NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
for (int i=0; i<nranks; i++) {
+ comm->peerInfo[i].netDev = allGather3Data[i].netDev;
allTopoRanks[i] = &allGather3Data[i].topoRanks;
// Make sure we align all ranks so that the tuning is consistent across ranks
treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels);
treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
- treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
- treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
+ treeGraph.typeIntra = std::max(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
+ treeGraph.typeInter = std::max(allGather3Data[i].tree.typeInter, treeGraph.typeInter);
ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels);
ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
- ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
- ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
+ ringGraph.typeIntra = std::max(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
+ ringGraph.typeInter = std::max(allGather3Data[i].ring.typeInter, ringGraph.typeInter);
collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels);
collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
- collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
- collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
+ collNetGraph.typeIntra = std::max(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
+ collNetGraph.typeInter = std::max(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport);
}
@@ -750,12 +666,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
for (int i=0; i<comm->nChannels; i++) memcpy(comm->channels+comm->nChannels+i, comm->channels+nChannelsOrig+i, sizeof(struct ncclChannel));
}
- // Determine CollNet support after all-gather now that we know nNodes
- int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
- if (comm->nNodes < collNetNodeThreshold) {
- if (comm->collNetSupport == 1)
+ // Determine CollNet support after all-gather now that we know nNodes and each node localRanks
+ if (comm->collNetSupport == 1) {
+ int collNetNodeThreshold = ncclParamCollNetNodeThreshold();
+ if (comm->nNodes < collNetNodeThreshold) {
INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold);
- comm->collNetSupport = 0;
+ comm->collNetSupport = 0;
+ }
+ for (int n=0; n<comm->nNodes; n++) {
+ if (comm->nodeRanks[n].localRanks > NCCL_MAX_DIRECT_ARITY+1) {
+ WARN("CollNet currently only supports up to %d GPUs per node, disabling CollNet", NCCL_MAX_DIRECT_ARITY+1);
+ comm->collNetSupport = 0;
+ break;
+ }
+ }
}
int *rings;
@@ -782,16 +706,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
line[1023] = '\0';
INFO(NCCL_INIT, "Trees%s", line);
- // Set Affinity to a CPU local the our GPU, so that all memory we allocate
- // on the host is local.
- NCCLCHECK(ncclTopoGetCpuAffinity(comm->topo, comm->rank, &comm->cpuAffinity));
- cpu_set_t affinitySave;
- if (CPU_COUNT(&comm->cpuAffinity)) {
- sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
- sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
- }
- ncclResult_t ret;
-
NCCLCHECK(computeBuffSizes(comm));
// Connect with prev/next for each ring
@@ -818,7 +732,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Check if we can setup CollNet
if (comm->collNetSupport > 0) {
int collNetSetupFail = 0;
- int highestTypes[NCCL_MAX_INTRA_RANKS] = {TRANSPORT_P2P};
+ int highestTypes[NCCL_MAX_LOCAL_RANKS] = {TRANSPORT_P2P};
// Find all head ranks
int nHeads = collNetGraph.nChannels;
int *heads;
@@ -858,8 +772,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
// Exchange highest intra-node transport type among ranks
// because we need to know whether all ranks can p2p each other to determine whether we can directly read/write registered user buffer
- comm->intraHighestTransportType = highestTypes[comm->intraNodeRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
- NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, highestTypes, sizeof(int)));
+ comm->intraHighestTransportType = highestTypes[comm->localRank] = highestTransportType0 > highestTransportType1 ? highestTransportType0 : highestTransportType1;
+ NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, highestTypes, sizeof(int)));
for (int i=0; i<comm->localRanks; i++) {
if (highestTypes[i] > comm->intraHighestTransportType)
comm->intraHighestTransportType = highestTypes[i];
@@ -877,7 +791,15 @@ collnet_cleanup:
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
// Compute time models for algorithm and protocol combinations
- NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
+ do {
+ int myCompCap = comm->peerInfo[rank].cudaCompCap;
+ int minCompCap = myCompCap, maxCompCap = myCompCap;
+ for (int i = 0; i < nranks; i++) {
+ minCompCap = std::min(comm->peerInfo[i].cudaCompCap, minCompCap);
+ maxCompCap = std::max(comm->peerInfo[i].cudaCompCap, maxCompCap);
+ }
+ NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
+ } while(0);
// Compute nChannels per peer for p2p
NCCLCHECK(ncclTopoComputeP2pChannels(comm));
@@ -892,28 +814,68 @@ collnet_cleanup:
int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
- if (comm->channels[channelId].peers[peer].recv[0].connected == 0) { // P2P uses only 1 connector
+ if (comm->channels[channelId].peers[peer].recv[1].connected == 0) { // P2P uses only 1 connector
comm->connectRecv[peer] |= (1<<channelId);
}
}
delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks;
for (int c=0; c<comm->p2pnChannelsPerPeer; c++) {
int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels;
- if (comm->channels[channelId].peers[peer].send[0].connected == 0) { // P2P uses only 1 connector
+ if (comm->channels[channelId].peers[peer].send[1].connected == 0) { // P2P uses only 1 connector
comm->connectSend[peer] |= (1<<channelId);
}
}
}
- NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 0));
+ NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1));
free(nvbPeers);
}
- NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, intraProcRank0Comm));
+ // Connect to local net proxy
+ struct ncclProxyConnector proxyConn;
+ NCCLCHECK(ncclTopoGetLocalRank(comm->topo, comm->rank, &proxyConn.localRank));
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, comm->rank, &proxyConn));
+ NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
+
+ // Then to remote ones when using PXN
+ if (ncclPxnDisable() == 0) {
+ int nranks;
+ int* pxnPeers;
+ NCCLCHECK(ncclTopoGetPxnRanks(comm, &pxnPeers, &nranks));
+ for (int r=0; r<nranks; r++) {
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, pxnPeers[r], &proxyConn));
+ NCCLCHECK(ncclProxyCall(&proxyConn, ncclProxyMsgSharedInit, &comm->p2pnChannels, sizeof(int), NULL, 0));
+ }
+ free(pxnPeers);
+ }
+
+ do {
+ // Compute intra-process ranks
+ int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
+ for (int i = 0; i < nranks; i++) {
+ if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
+ && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
+ // Rank is in same process
+ if (intraProcRanks == 0) intraProcRank0 = i;
+ if (i == rank) intraProcRank = intraProcRanks;
+ intraProcRanks++;
+ }
+ }
+ TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+ rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0);
+ if (intraProcRank == -1 || intraProcRank0 == -1 || comm->peerInfo[intraProcRank0].comm == NULL) {
+ WARN("Failed to determine intra proc ranks rank %d hostHash %lx pidHash %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d",
+ rank, comm->peerInfo[rank].hostHash, comm->peerInfo[rank].pidHash,
+ intraProcRank, intraProcRanks, intraProcRank0);
+ return ncclInternalError;
+ }
+ NCCLCHECK(ncclCommSetIntraProc(comm, intraProcRank, intraProcRanks, comm->peerInfo[intraProcRank0].comm));
+ } while(0);
/* Local intra-node barrier */
- NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->intraNodeGlobalRanks, intraNodeRank, intraNodeRanks, (int)intraNodeRank0pidHash));
+ NCCLCHECK(bootstrapBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]));
- if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
+ // Unlink proxy shm to make sure it will be properly cleaned up.
+ NCCLCHECK(ncclProxyShmUnlink(comm));
// We should have allocated all buffers, collective fifos, ... we can
// restore the affinity.
@@ -937,6 +899,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
TRACE(NCCL_INIT, "Setting cudaLimitStackSize to %zi", maxLocalSizeBytes);
CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, maxLocalSizeBytes));
}
+ *newcomm = NULL;
NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
@@ -1028,6 +991,12 @@ static ncclResult_t ncclGraphHelperDestroy(ncclComm* comm) {
}
static ncclResult_t commDestroy(ncclComm_t comm) {
+ // Try and prevent a double free of the comm struct (user error)
+ if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
+ WARN("comm %p has already been destroyed", comm);
+ return ncclInvalidArgument;
+ }
+
int savedDevice;
CUDACHECK(cudaGetDevice(&savedDevice));
int commDevice = comm->cudaDev;
@@ -1039,19 +1008,18 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, comm->rank, *comm->abortFlag, comm->fatalError);
CUDACHECK(cudaStreamSynchronize(comm->groupStream));
- NCCLCHECK(ncclProxyDestroy(comm));
+
ncclDestroyQueueInfo(comm->enqueueInfo);
#if CUDART_VERSION >= 11030
NCCLCHECK(ncclGraphHelperDestroy(comm));
#endif
INFO(NCCL_COLL, "Created %d queue info, destroyed %d", comm->nQueueInfoCreated, comm->nQueueInfoDestroyed);
+
NCCLCHECK(commFree(comm));
if (savedDevice != commDevice)
CUDACHECK(cudaSetDevice(savedDevice));
- TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, comm->rank);
-
return ncclSuccess;
}
@@ -1061,15 +1029,13 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
- TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
-
- // Try and prevent a double free of the comm struct (user error)
- if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
- WARN("comm %p has already been destroyed", comm);
- return ncclInvalidArgument;
- }
+ int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
+ int64_t busId = comm->busId;
+ TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId);
- return commDestroy(comm);
+ NCCLCHECK(commDestroy(comm));
+ INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Destroy COMPLETE", comm, rank, nranks, cudaDev, busId);
+ return ncclSuccess;
}
NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
@@ -1078,10 +1044,16 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
+ int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
+ int64_t busId = comm->busId;
+ TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, busId);
+
// Ask anything that might still be running on the device to quit
*comm->abortFlag = 1;
- return commDestroy(comm);
+ NCCLCHECK(commDestroy(comm));
+ INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Abort COMPLETE", comm, rank, nranks, cudaDev, busId);
+ return ncclSuccess;
}
NCCL_API(const char*, ncclGetErrorString, ncclResult_t code);
diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc
index fe4e760..1c5ba3c 100644
--- a/src/misc/argcheck.cc
+++ b/src/misc/argcheck.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -63,12 +63,8 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
}
if (info->comm->checkPointers) {
- if (info->coll == ncclFuncSendRecv) {
- if (strcmp(info->opName, "Send") == 0) {
- NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", "Send"));
- } else {
- NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", "Recv"));
- }
+ if ((info->coll == ncclFuncSend || info->coll == ncclFuncRecv) && info->count > 0) {
+ NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "buff", info->opName));
} else {
// Check CUDA device pointers
if (info->coll != ncclFuncBroadcast || info->comm->rank == info->root) {
diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc
index 439712e..e1aabac 100644
--- a/src/misc/ibvwrap.cc
+++ b/src/misc/ibvwrap.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -29,6 +29,7 @@ int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int at
struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
+struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
@@ -65,7 +66,7 @@ ncclResult_t wrap_ibv_symbols(void) {
}
}
-#define LOAD_SYM(handle, symbol, funcptr) do { \
+#define LOAD_SYM(handle, symbol, funcptr) do { \
cast = (void**)&funcptr; \
tmp = dlvsym(handle, symbol, IBVERBS_VERSION); \
if (tmp == NULL) { \
@@ -75,6 +76,12 @@ ncclResult_t wrap_ibv_symbols(void) {
*cast = tmp; \
} while (0)
+// Attempt to load a specific symbol version - fail silently
+#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \
+ cast = (void**)&funcptr; \
+ *cast = dlvsym(handle, symbol, version); \
+ } while (0)
+
LOAD_SYM(ibvhandle, "ibv_get_device_list", ibv_internal_get_device_list);
LOAD_SYM(ibvhandle, "ibv_free_device_list", ibv_internal_free_device_list);
LOAD_SYM(ibvhandle, "ibv_get_device_name", ibv_internal_get_device_name);
@@ -89,6 +96,8 @@ ncclResult_t wrap_ibv_symbols(void) {
LOAD_SYM(ibvhandle, "ibv_alloc_pd", ibv_internal_alloc_pd);
LOAD_SYM(ibvhandle, "ibv_dealloc_pd", ibv_internal_dealloc_pd);
LOAD_SYM(ibvhandle, "ibv_reg_mr", ibv_internal_reg_mr);
+ // Cherry-pick the ibv_reg_mr_iova2 API from IBVERBS 1.8
+ LOAD_SYM_VERSION(ibvhandle, "ibv_reg_mr_iova2", ibv_internal_reg_mr_iova2, "IBVERBS_1.8");
LOAD_SYM(ibvhandle, "ibv_dereg_mr", ibv_internal_dereg_mr);
LOAD_SYM(ibvhandle, "ibv_create_cq", ibv_internal_create_cq);
LOAD_SYM(ibvhandle, "ibv_destroy_cq", ibv_internal_destroy_cq);
@@ -116,6 +125,7 @@ teardown:
ibv_internal_alloc_pd = NULL;
ibv_internal_dealloc_pd = NULL;
ibv_internal_reg_mr = NULL;
+ ibv_internal_reg_mr_iova2 = NULL;
ibv_internal_dereg_mr = NULL;
ibv_internal_create_cq = NULL;
ibv_internal_destroy_cq = NULL;
@@ -260,6 +270,14 @@ struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t len
return ibv_internal_reg_mr(pd, addr, length, access);
}
+ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) {
+ if (ibv_internal_reg_mr_iova2 == NULL) {
+ return ncclInternalError;
+ }
+ if (ret == NULL) { return ncclSuccess; } // Assume dummy call
+ IBV_PTR_CHECK(ibv_internal_reg_mr_iova2, ibv_internal_reg_mr_iova2(pd, addr, length, iova, access), *ret, NULL, "ibv_reg_mr_iova2");
+}
+
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
IBV_INT_CHECK_RET_ERRNO(ibv_internal_dereg_mr, ibv_internal_dereg_mr(mr), 0, "ibv_dereg_mr");
}
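
For context, the new wrap_ibv_reg_mr_iova2() returns ncclInternalError when the versioned IBVERBS_1.8 symbol was not found, which lets callers fall back to the classic registration path. A minimal caller sketch, illustrative only and not part of this commit (the helper name is hypothetical), using the two wrappers shown above:

// Illustrative sketch: register with an explicit IOVA when ibv_reg_mr_iova2 is
// available, otherwise fall back to the plain ibv_reg_mr wrapper.
static ncclResult_t regMrWithFallback(struct ibv_pd* pd, void* addr, size_t len,
                                      uint64_t iova, int access, struct ibv_mr** mr) {
  if (wrap_ibv_reg_mr_iova2(mr, pd, addr, len, iova, access) == ncclSuccess) return ncclSuccess;
  // Versioned symbol missing or the call failed: use the legacy API instead.
  *mr = wrap_direct_ibv_reg_mr(pd, addr, len, access);
  return (*mr == NULL) ? ncclSystemError : ncclSuccess;
}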
diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc
index e83392d..5db7c6b 100644
--- a/src/misc/nvmlwrap.cc
+++ b/src/misc/nvmlwrap.cc
@@ -1,219 +1,262 @@
/*************************************************************************
- * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "nvmlwrap.h"
+#include "checks.h"
+#include "debug.h"
-#ifndef NVML_DIRECT
-#include <dlfcn.h>
-#include "core.h"
-
-static enum { nvmlUninitialized, nvmlInitializing, nvmlInitialized, nvmlError } nvmlState = nvmlUninitialized;
-
-static nvmlReturn_t (*nvmlInternalInit)(void);
-static nvmlReturn_t (*nvmlInternalShutdown)(void);
-static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device);
-static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index);
-static const char* (*nvmlInternalErrorString)(nvmlReturn_t r);
-static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
-static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
-static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link,
- nvmlNvLinkCapability_t capability, unsigned int *capResult);
-static nvmlReturn_t (*nvmlInternalDeviceGetCudaComputeCapability)(nvmlDevice_t device, int* major, int* minor);
-
-// Used to make the NVML library calls thread safe
-pthread_mutex_t nvmlLock = PTHREAD_MUTEX_INITIALIZER;
-
-ncclResult_t wrapNvmlSymbols(void) {
- if (nvmlState == nvmlInitialized)
- return ncclSuccess;
- if (nvmlState == nvmlError)
- return ncclSystemError;
-
- if (__sync_bool_compare_and_swap(&nvmlState, nvmlUninitialized, nvmlInitializing) == false) {
- // Another thread raced in front of us. Wait for it to be done.
- while (nvmlState == nvmlInitializing) pthread_yield();
- return (nvmlState == nvmlInitialized) ? ncclSuccess : ncclSystemError;
- }
+#include <initializer_list>
+#include <memory>
+#include <mutex>
- static void* nvmlhandle = NULL;
- void* tmp;
- void** cast;
+int ncclNvmlDeviceCount = 0;
+ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices];
+ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices];
- nvmlhandle=dlopen("libnvidia-ml.so.1", RTLD_NOW);
- if (!nvmlhandle) {
- WARN("Failed to open libnvidia-ml.so.1");
- goto teardown;
- }
+#if NCCL_NVML_DIRECT
+ #define NCCL_NVML_FN(name, rettype, arglist) constexpr rettype(*pfn_##name)arglist = name;
+#else
+ #include <dlfcn.h>
+ #define NCCL_NVML_FN(name, rettype, arglist) rettype(*pfn_##name)arglist = nullptr;
+#endif
-#define LOAD_SYM(handle, symbol, funcptr) do { \
- cast = (void**)&funcptr; \
- tmp = dlsym(handle, symbol); \
- if (tmp == NULL) { \
- WARN("dlsym failed on %s - %s", symbol, dlerror());\
- goto teardown; \
- } \
- *cast = tmp; \
- } while (0)
-
-#define LOAD_SYM_OPTIONAL(handle, symbol, funcptr) do {\
- cast = (void**)&funcptr; \
- tmp = dlsym(handle, symbol); \
- if (tmp == NULL) { \
- INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \
- } \
- *cast = tmp; \
- } while (0)
-
- LOAD_SYM(nvmlhandle, "nvmlInit", nvmlInternalInit);
- LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown);
- LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId);
- LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex);
- LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString);
- LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState);
- LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo);
- LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability);
- LOAD_SYM(nvmlhandle, "nvmlDeviceGetCudaComputeCapability", nvmlInternalDeviceGetCudaComputeCapability);
-
- nvmlState = nvmlInitialized;
- return ncclSuccess;
+namespace {
+ NCCL_NVML_FN(nvmlInit, nvmlReturn_t, ())
+ NCCL_NVML_FN(nvmlInit_v2, nvmlReturn_t, ())
+ NCCL_NVML_FN(nvmlShutdown, nvmlReturn_t, ())
+ NCCL_NVML_FN(nvmlDeviceGetCount, nvmlReturn_t, (unsigned int*))
+ NCCL_NVML_FN(nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*))
+ NCCL_NVML_FN(nvmlDeviceGetHandleByPciBusId, nvmlReturn_t, (const char* pciBusId, nvmlDevice_t* device))
+ NCCL_NVML_FN(nvmlDeviceGetHandleByIndex, nvmlReturn_t, (unsigned int index, nvmlDevice_t *device))
+ NCCL_NVML_FN(nvmlDeviceGetIndex, nvmlReturn_t, (nvmlDevice_t device, unsigned* index))
+ NCCL_NVML_FN(nvmlErrorString, char const*, (nvmlReturn_t r))
+ NCCL_NVML_FN(nvmlDeviceGetNvLinkState, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive))
+ NCCL_NVML_FN(nvmlDeviceGetNvLinkRemotePciInfo, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci))
+ NCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult))
+ NCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor))
+ NCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus))
-teardown:
- nvmlInternalInit = NULL;
- nvmlInternalShutdown = NULL;
- nvmlInternalDeviceGetHandleByPciBusId = NULL;
- nvmlInternalDeviceGetIndex = NULL;
- nvmlInternalDeviceGetNvLinkState = NULL;
- nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL;
- nvmlInternalDeviceGetNvLinkCapability = NULL;
-
- if (nvmlhandle != NULL) dlclose(nvmlhandle);
- nvmlState = nvmlError;
- return ncclSystemError;
+ std::mutex lock; // NVML has had some thread safety bugs
+ bool initialized = false;
+ thread_local bool threadInitialized = false;
+ ncclResult_t initResult;
}
+ncclResult_t ncclNvmlEnsureInitialized() {
+ // Optimization to avoid repeatedly grabbing the lock when we only want to
+ // read from the global tables.
+ if (threadInitialized) return initResult;
+ threadInitialized = true;
+
+ std::lock_guard<std::mutex> locked(lock);
-ncclResult_t wrapNvmlInit(void) {
- if (nvmlInternalInit == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclInternalError;
+ if (initialized) return initResult;
+ initialized = true;
+
+ #if !NCCL_NVML_DIRECT
+ if (pfn_nvmlInit == nullptr) {
+ void *libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
+ if (libhandle == nullptr) {
+ WARN("Failed to open libnvidia-ml.so.1");
+ initResult = ncclSystemError;
+ return initResult;
+ }
+
+ struct Symbol { void **ppfn; char const *name; };
+ std::initializer_list<Symbol> symbols = {
+ {(void**)&pfn_nvmlInit, "nvmlInit"},
+ {(void**)&pfn_nvmlInit_v2, "nvmlInit_v2"},
+ {(void**)&pfn_nvmlShutdown, "nvmlShutdown"},
+ {(void**)&pfn_nvmlDeviceGetCount, "nvmlDeviceGetCount"},
+ {(void**)&pfn_nvmlDeviceGetCount_v2, "nvmlDeviceGetCount_v2"},
+ {(void**)&pfn_nvmlDeviceGetHandleByPciBusId, "nvmlDeviceGetHandleByPciBusId"},
+ {(void**)&pfn_nvmlDeviceGetHandleByIndex, "nvmlDeviceGetHandleByIndex"},
+ {(void**)&pfn_nvmlDeviceGetIndex, "nvmlDeviceGetIndex"},
+ {(void**)&pfn_nvmlErrorString, "nvmlErrorString"},
+ {(void**)&pfn_nvmlDeviceGetNvLinkState, "nvmlDeviceGetNvLinkState"},
+ {(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"},
+ {(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"},
+ {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"},
+ {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"}
+ };
+ for(Symbol sym: symbols) {
+ *sym.ppfn = dlsym(libhandle, sym.name);
+ }
}
- nvmlReturn_t ret = nvmlInternalInit();
- if (ret != NVML_SUCCESS) {
- WARN("nvmlInit() failed: %s",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
+ #endif
+
+ #if NCCL_NVML_DIRECT
+ bool have_v2 = true;
+ #else
+ bool have_v2 = pfn_nvmlInit_v2 != nullptr; // if this compare is done in the NCCL_NVML_DIRECT=1 case then GCC warns about it never being null
+ #endif
+ nvmlReturn_t res1 = (have_v2 ? pfn_nvmlInit_v2 : pfn_nvmlInit)();
+ if (res1 != NVML_SUCCESS) {
+ WARN("nvmlInit%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
+ initResult = ncclSystemError;
+ return initResult;
}
- return ncclSuccess;
-}
-ncclResult_t wrapNvmlShutdown(void) {
- if (nvmlInternalShutdown == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclInternalError;
+ unsigned int ndev;
+ res1 = (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount)(&ndev);
+ if (res1 != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetCount%s() failed: %s", have_v2 ? "_v2" :"", pfn_nvmlErrorString(res1));
+ initResult = ncclSystemError;
+ return initResult;
}
- nvmlReturn_t ret = nvmlInternalShutdown();
- if (ret != NVML_SUCCESS) {
- WARN("nvmlShutdown() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
+
+ ncclNvmlDeviceCount = int(ndev);
+ if (ncclNvmlMaxDevices < ncclNvmlDeviceCount) {
+ WARN("nvmlDeviceGetCount() reported more devices (%d) than the internal maximum (ncclNvmlMaxDevices=%d)", ncclNvmlDeviceCount, ncclNvmlMaxDevices);
+ initResult = ncclInternalError;
+ return initResult;
}
- return ncclSuccess;
-}
-ncclResult_t wrapNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
- if (nvmlInternalDeviceGetHandleByPciBusId == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclInternalError;
+ for(int a=0; a < ncclNvmlDeviceCount; a++) {
+ res1 = pfn_nvmlDeviceGetHandleByIndex(a, &ncclNvmlDevices[a].handle);
+ if (res1 != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetHandleByIndex(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
+ initResult = ncclSystemError;
+ return initResult;
+ }
+
+ res1 = pfn_nvmlDeviceGetCudaComputeCapability(ncclNvmlDevices[a].handle, &ncclNvmlDevices[a].computeCapabilityMajor, &ncclNvmlDevices[a].computeCapabilityMinor);
+ if (res1 != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetCudaComputeCapability(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
+ initResult = ncclSystemError;
+ return initResult;
+ }
}
- nvmlReturn_t ret;
- NVMLLOCKCALL(nvmlInternalDeviceGetHandleByPciBusId(pciBusId, device), ret);
- if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceGetHandleByPciBusId() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
+
+ for(int a=0; a < ncclNvmlDeviceCount; a++) {
+ for(int b=0; b < ncclNvmlDeviceCount; b++) {
+ nvmlDevice_t da = ncclNvmlDevices[a].handle;
+ nvmlDevice_t db = ncclNvmlDevices[b].handle;
+
+ res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_READ, &ncclNvmlDevicePairs[a][b].p2pStatusRead);
+ if (res1 != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1));
+ initResult = ncclSystemError;
+ return initResult;
+ }
+
+ res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_WRITE, &ncclNvmlDevicePairs[a][b].p2pStatusWrite);
+ if (res1 != NVML_SUCCESS) {
+ WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1));
+ initResult = ncclSystemError;
+ return initResult;
+ }
+ }
}
+
+ initResult = ncclSuccess;
+ return initResult;
+}
+
+#define NVMLCHECK(name, ...) do { \
+ nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \
+ if (e44241808 != NVML_SUCCESS) { \
+ WARN(#name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
+ return ncclSystemError; \
+ } \
+} while(0)
+
+#define NVMLTRY(name, ...) do { \
+ if (!NCCL_NVML_DIRECT && pfn_##name == nullptr) \
+ return ncclInternalError; /* missing symbol is not a warned error */ \
+ nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__); \
+ if (e44241808 != NVML_SUCCESS) { \
+ if (e44241808 != NVML_ERROR_NOT_SUPPORTED) \
+ INFO(NCCL_INIT, #name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
+ return ncclSystemError; \
+ } \
+} while(0)
+
+ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+ std::lock_guard<std::mutex> locked(lock);
+ NVMLCHECK(nvmlDeviceGetHandleByPciBusId, pciBusId, device);
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
- if (nvmlInternalDeviceGetIndex == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclInternalError;
- }
- nvmlReturn_t ret;
- NVMLLOCKCALL(nvmlInternalDeviceGetIndex(device, index), ret);
- if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceGetIndex() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
+ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+ *device = ncclNvmlDevices[index].handle;
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
- if (nvmlInternalDeviceGetNvLinkState == NULL) {
- /* Do not warn, this symbol is optional. */
- return ncclInternalError;
- }
- nvmlReturn_t ret;
- NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkState(device, link, isActive), ret);
- if (ret != NVML_SUCCESS) {
- if (ret != NVML_ERROR_NOT_SUPPORTED)
- INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
+ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+ for (int d=0; d < ncclNvmlDeviceCount; d++) {
+ if (ncclNvmlDevices[d].handle == device) {
+ *index = d;
+ return ncclSuccess;
+ }
}
+ return ncclInvalidArgument;
+}
+
+ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+ std::lock_guard<std::mutex> locked(lock);
+ NVMLTRY(nvmlDeviceGetNvLinkState, device, link, isActive);
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
- if (nvmlInternalDeviceGetNvLinkRemotePciInfo == NULL) {
- /* Do not warn, this symbol is optional. */
- return ncclInternalError;
- }
- nvmlReturn_t ret;
- NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci), ret);
- if (ret != NVML_SUCCESS) {
- if (ret != NVML_ERROR_NOT_SUPPORTED)
- INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
+ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+ std::lock_guard<std::mutex> locked(lock);
+ NVMLTRY(nvmlDeviceGetNvLinkRemotePciInfo, device, link, pci);
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
- nvmlNvLinkCapability_t capability, unsigned int *capResult) {
- if (nvmlInternalDeviceGetNvLinkCapability == NULL) {
- /* Do not warn, this symbol is optional. */
- return ncclInternalError;
- }
- nvmlReturn_t ret;
- NVMLLOCKCALL(nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult), ret);
- if (ret != NVML_SUCCESS) {
- if (ret != NVML_ERROR_NOT_SUPPORTED)
- INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
- }
+ncclResult_t ncclNvmlDeviceGetNvLinkCapability(
+ nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability,
+ unsigned int *capResult
+ ) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+ std::lock_guard<std::mutex> locked(lock);
+ NVMLTRY(nvmlDeviceGetNvLinkCapability, device, link, capability, capResult);
return ncclSuccess;
}
-ncclResult_t wrapNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
- if (nvmlInternalDeviceGetNvLinkCapability == NULL) {
- WARN("lib wrapper not initialized.");
- return ncclInternalError;
+ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+
+ for(int d=0; d < ncclNvmlDeviceCount; d++) {
+ if(device == ncclNvmlDevices[d].handle) {
+ *major = ncclNvmlDevices[d].computeCapabilityMajor;
+ *minor = ncclNvmlDevices[d].computeCapabilityMinor;
+ return ncclSuccess;
+ }
}
- nvmlReturn_t ret;
- NVMLLOCKCALL(nvmlInternalDeviceGetCudaComputeCapability(device, major, minor), ret);
- if (ret != NVML_SUCCESS) {
- WARN("nvmlDeviceGetCudaComputeCapability() failed: %s ",
- nvmlInternalErrorString(ret));
- return ncclSystemError;
+ return ncclInvalidArgument;
+}
+
+ncclResult_t ncclNvmlDeviceGetP2PStatus(
+ nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,
+ nvmlGpuP2PStatus_t* p2pStatus
+ ) {
+ NCCLCHECK(ncclNvmlEnsureInitialized());
+
+ if (p2pIndex == NVML_P2P_CAPS_INDEX_READ || p2pIndex == NVML_P2P_CAPS_INDEX_WRITE) {
+ int a = -1, b = -1;
+ for(int d=0; d < ncclNvmlDeviceCount; d++) {
+ if(device1 == ncclNvmlDevices[d].handle) a = d;
+ if(device2 == ncclNvmlDevices[d].handle) b = d;
+ }
+ if (a == -1 || b == -1) return ncclInvalidArgument;
+ if (p2pIndex == NVML_P2P_CAPS_INDEX_READ)
+ *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusRead;
+ else
+ *p2pStatus = ncclNvmlDevicePairs[a][b].p2pStatusWrite;
+ }
+ else {
+ std::lock_guard<std::mutex> locked(lock);
+ NVMLCHECK(nvmlDeviceGetP2PStatus, device1, device2, p2pIndex, p2pStatus);
}
return ncclSuccess;
}
-#endif
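
Note that the rewritten wrapper takes a one-time snapshot of every device handle, its CUDA compute capability and the pairwise P2P read/write status, so later lookups are served from ncclNvmlDevices/ncclNvmlDevicePairs without calling back into NVML. A hedged usage sketch, illustrative only and not part of this commit (device indices chosen arbitrarily):

// Illustrative sketch: query cached compute capability and P2P status between
// two GPUs through the ncclNvml* wrappers defined above.
static ncclResult_t probeGpuPair(void) {
  nvmlDevice_t dev0, dev1;
  NCCLCHECK(ncclNvmlDeviceGetHandleByIndex(0, &dev0));
  NCCLCHECK(ncclNvmlDeviceGetHandleByIndex(1, &dev1));

  int major, minor;
  NCCLCHECK(ncclNvmlDeviceGetCudaComputeCapability(dev0, &major, &minor));

  nvmlGpuP2PStatus_t p2p;
  // READ/WRITE indices are answered from the table filled in ncclNvmlEnsureInitialized().
  NCCLCHECK(ncclNvmlDeviceGetP2PStatus(dev0, dev1, NVML_P2P_CAPS_INDEX_READ, &p2p));
  return ncclSuccess;
}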
diff --git a/src/misc/profiler.cc b/src/misc/profiler.cc
new file mode 100644
index 0000000..145b18f
--- /dev/null
+++ b/src/misc/profiler.cc
@@ -0,0 +1,115 @@
+/*************************************************************************
+ * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "profiler.h"
+
+//#define PROFILE_PROXY 1
+#ifdef PROFILE_PROXY
+#include "timer.h"
+#include "alloc.h"
+
+static const char* profilingStateSendStr[] = { "BufferWait", "GPUWait", "SendWait", "", "End" };
+static const char* profilingStateRecvStr[] = { "BufferWait", "RecvWait", "FlushWait", "GPUWait", "End" };
+static const char* profilingEventStr[] = { "SendRecv", "Sleep", "Idle", "Append" };
+struct ncclProxyProfileEvent {
+ double timestamp[6];
+ uint64_t opCount;
+ int peer;
+ int step;
+ uint16_t channel;
+ uint8_t type; // send / recv
+ uint8_t opIndex;
+};
+
+struct ncclProxyProfileEvent* profilingEvents = NULL;
+int profilingIndex = 0;
+double profilingStart = 0;
+#define MAX_EVENTS 200000
+
+ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) {
+ if (profilingEvents == NULL) {
+ NCCLCHECK(ncclCalloc(&profilingEvents, MAX_EVENTS));
+ profilingStart = gettime();
+ }
+ struct ncclProxyProfileEvent* event = NULL;
+ if (state%8 == 0) {
+ if (profilingIndex == MAX_EVENTS) return ncclSuccess;
+ args->subs[sub].profilingEvents[step%NCCL_STEPS] = event = profilingEvents+profilingIndex++;
+ if (state == ncclProxyProfileBegin) {
+ // Proxy operation information
+ event->opCount = args->opCount;
+ event->channel = args->subs[sub].channelId;
+ event->peer = args->subs[sub].peer;
+ event->type = args->pattern;
+ event->step = step;
+ event->opIndex = (((uint64_t)args)/sizeof(struct ncclProxyArgs))%256;
+ } else event->peer = -state;
+ } else {
+ event = (struct ncclProxyProfileEvent*)args->subs[sub].profilingEvents[step%NCCL_STEPS];
+ if (state == ncclProxyProfileEnd) args->subs[sub].profilingEvents[step%NCCL_STEPS] = NULL;
+ if (state == ncclProxyProfileAppendEnd) event->opCount = args->opCount;
+ }
+ // Timestamp
+ event->timestamp[state%8] = gettime()-profilingStart;
+ return ncclSuccess;
+}
+
+void ncclProfilingDump() {
+ static int dumpDone = 0;
+ if (dumpDone) return;
+ dumpDone = 1;
+ const char* str = getenv("NCCL_PROXY_PROFILE");
+ if (!str) { free(profilingEvents); return; }
+ FILE* f = fopen(str, "w");
+ fprintf(f, "[\n");
+
+ for (int i=0; i<profilingIndex; i++) {
+ struct ncclProxyProfileEvent* e = profilingEvents+i;
+ const int sendrecv = e->peer >= 0;
+ const char* typeStr = sendrecv ? (e->type == ncclPatternSend ? "Send" : "Recv") :
+ profilingEventStr[-(e->peer/8)];
+
+
+ if (sendrecv) {
+ int state = ncclProxyProfileBegin;
+ const char** stateStr = e->type == ncclPatternSend ? profilingStateSendStr : profilingStateRecvStr;
+ fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f, \"args\": { \"opCount\": %ld, \"proxyOpIndex\":%d } },\n",
+ typeStr, e->peer, e->step, i, e->channel, e->timestamp[state], e->opCount, e->opIndex);
+
+ while (state<ncclProxyProfileEnd) {
+ if (e->timestamp[state]) {
+ const char* name = stateStr[state];
+ fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
+ name, i, e->channel, e->timestamp[state]);
+ state++;
+ while (e->timestamp[state] == 0) state++;
+ fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
+ name, i, e->channel, e->timestamp[state]);
+ }
+ }
+
+ fprintf(f, "{\"name\": \"%s-%d-%d\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": 1, \"ts\": %f },\n",
+ typeStr, e->peer, e->step, i, e->channel, e->timestamp[state]);
+ } else {
+ if (e->peer == -ncclProxyProfileAppend) {
+ fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f, \"args\": { \"added\": %ld } },\n",
+ typeStr, i, e->timestamp[0], e->opCount);
+ } else {
+ fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
+ typeStr, i, e->timestamp[0]);
+ }
+ fprintf(f, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": -1, \"tid\": 1, \"ts\": %f },\n",
+ typeStr, i, e->timestamp[1]);
+ }
+ }
+ fprintf(f, "{} ]\n");
+ fclose(f);
+ free(profilingEvents);
+}
+#else
+ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state) { return ncclSuccess; }
+void ncclProfilingDump() {}
+#endif
diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc
new file mode 100644
index 0000000..d6bc353
--- /dev/null
+++ b/src/misc/shmutils.cc
@@ -0,0 +1,90 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "shm.h"
+#include "checks.h"
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+// Change functions behavior to match other SYS functions
+static int shm_allocate(int fd, const int shmSize) {
+ int err = posix_fallocate(fd, 0, shmSize);
+ if (err) { errno = err; return -1; }
+ return 0;
+}
+static int shm_map(int fd, const int shmSize, void** ptr) {
+ *ptr = mmap(NULL, shmSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ return (*ptr == MAP_FAILED) ? -1 : 0;
+}
+
+static ncclResult_t ncclShmSetup(char* shmPath, const int shmSize, int* fd, void** ptr, int create) {
+ if (create) {
+ if (shmPath[0] == '\0') {
+ sprintf(shmPath, "/dev/shm/nccl-XXXXXX");
+ *fd = mkstemp(shmPath);
+ } else {
+ SYSCHECKVAL(open(shmPath, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "open", *fd);
+ }
+ if (ftruncate(*fd, shmSize) != 0) {
+ WARN("Error: failed to extend %s to %d bytes", shmPath, shmSize);
+ return ncclSystemError;
+ }
+ } else {
+ SYSCHECKVAL(open(shmPath, O_RDWR, S_IRUSR | S_IWUSR), "open", *fd);
+ }
+ *ptr = (char*)mmap(NULL, shmSize, PROT_READ|PROT_WRITE, MAP_SHARED, *fd, 0);
+ if (*ptr == MAP_FAILED) {
+ WARN("Could not map %s\n", shmPath);
+ return ncclSystemError;
+ }
+ close(*fd);
+ *fd = -1;
+ if (create) memset(*ptr, 0, shmSize);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclShmOpen(char* shmPath, const int shmSize, void** shmPtr, void** devShmPtr, int create) {
+ int fd = -1;
+ void* ptr = MAP_FAILED;
+ ncclResult_t res = ncclSuccess;
+
+ NCCLCHECKGOTO(ncclShmSetup(shmPath, shmSize, &fd, &ptr, create), res, sysError);
+ if (devShmPtr) {
+ CUDACHECKGOTO(cudaHostRegister(ptr, shmSize, cudaHostRegisterMapped), res, cudaError);
+ CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
+ }
+
+ *shmPtr = ptr;
+ return ncclSuccess;
+sysError:
+ WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmPath, shmSize);
+cudaError:
+ if (fd != -1) close(fd);
+ if (create) shm_unlink(shmPath);
+ if (ptr != MAP_FAILED) munmap(ptr, shmSize);
+ *shmPtr = NULL;
+ return res;
+}
+
+ncclResult_t ncclShmUnlink(const char* shmPath) {
+ if (shmPath != NULL) SYSCHECK(unlink(shmPath), "unlink");
+ return ncclSuccess;
+}
+
+ncclResult_t ncclShmClose(void* shmPtr, void* devShmPtr, const int shmSize) {
+ if (devShmPtr) CUDACHECK(cudaHostUnregister(shmPtr));
+ if (munmap(shmPtr, shmSize) != 0) {
+ WARN("munmap of shared memory failed");
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
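
ncclShmOpen() above either creates a segment (an empty path makes it mkstemp a /dev/shm/nccl-XXXXXX name) or attaches to an existing one, and optionally registers the mapping with CUDA so a device-side pointer is returned. A small creator-side sketch, illustrative only and not part of this commit (size and buffer length are arbitrary):

// Illustrative sketch: create a 1 MiB segment visible to both host and device,
// unlink its name once peers have attached, then unmap it.
static ncclResult_t shmExample(void) {
  char path[64] = "";                    // empty -> ncclShmOpen generates the name
  const int size = 1 << 20;
  void *hostPtr = NULL, *devPtr = NULL;
  NCCLCHECK(ncclShmOpen(path, size, &hostPtr, &devPtr, /*create=*/1));
  // ... share `path` with the attaching side, which calls ncclShmOpen(..., 0) ...
  NCCLCHECK(ncclShmUnlink(path));        // remove the name; existing mappings stay valid
  NCCLCHECK(ncclShmClose(hostPtr, devPtr, size));
  return ncclSuccess;
}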
diff --git a/src/misc/socket.cc b/src/misc/socket.cc
new file mode 100644
index 0000000..4e3295f
--- /dev/null
+++ b/src/misc/socket.cc
@@ -0,0 +1,552 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "socket.h"
+#include "utils.h"
+#include <stdlib.h>
+
+#include <unistd.h>
+#include <ifaddrs.h>
+#include <net/if.h>
+
+/* Format a string representation of a (union ncclSocketAddress *) socket address using getnameinfo()
+ *
+ * Output: "IPv4/IPv6 address<port>"
+ */
+const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf) {
+ if (buf == NULL || addr == NULL) return NULL;
+ struct sockaddr *saddr = &addr->sa;
+ if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; }
+ char host[NI_MAXHOST], service[NI_MAXSERV];
+ (void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, NI_NUMERICHOST|NI_NUMERICSERV);
+ sprintf(buf, "%s<%s>", host, service);
+ return buf;
+}
+
+static uint16_t socketToPort(union ncclSocketAddress *addr) {
+ struct sockaddr *saddr = &addr->sa;
+ return ntohs(saddr->sa_family == AF_INET ? addr->sin.sin_port : addr->sin6.sin6_port);
+}
+
+/* Allow the user to force the IPv4/IPv6 interface selection */
+static int envSocketFamily(void) {
+ int family = -1; // Family selection is not forced, will use first one found
+ char* env = getenv("NCCL_SOCKET_FAMILY");
+ if (env == NULL)
+ return family;
+
+ INFO(NCCL_ENV, "NCCL_SOCKET_FAMILY set by environment to %s", env);
+
+ if (strcmp(env, "AF_INET") == 0)
+ family = AF_INET; // IPv4
+ else if (strcmp(env, "AF_INET6") == 0)
+ family = AF_INET6; // IPv6
+ return family;
+}
+
+static int findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) {
+#ifdef ENABLE_TRACE
+ char line[SOCKET_NAME_MAXLEN+1];
+#endif
+ struct netIf userIfs[MAX_IFS];
+ bool searchNot = prefixList && prefixList[0] == '^';
+ if (searchNot) prefixList++;
+ bool searchExact = prefixList && prefixList[0] == '=';
+ if (searchExact) prefixList++;
+ int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
+
+ int found = 0;
+ struct ifaddrs *interfaces, *interface;
+ getifaddrs(&interfaces);
+ for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
+ if (interface->ifa_addr == NULL) continue;
+
+ /* We only support IPv4 & IPv6 */
+ int family = interface->ifa_addr->sa_family;
+ if (family != AF_INET && family != AF_INET6)
+ continue;
+
+ TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, ncclSocketToString((union ncclSocketAddress *) interface->ifa_addr, line));
+
+ /* Allow the caller to force the socket family type */
+ if (sock_family != -1 && family != sock_family)
+ continue;
+
+ /* We also need to skip IPv6 loopback interfaces */
+ if (family == AF_INET6) {
+ struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
+ if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
+ }
+
+ // check against user specified interfaces
+ if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
+ continue;
+ }
+
+ // Check that this interface has not already been saved
+ // getifaddrs() normal order appears to be: IPv4, IPv6 Global, IPv6 Link
+ bool duplicate = false;
+ for (int i = 0; i < found; i++) {
+ if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; }
+ }
+
+ if (!duplicate) {
+ // Store the interface name
+ strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize);
+ // Store the IP address
+ int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+ memcpy(addrs+found, interface->ifa_addr, salen);
+ found++;
+ }
+ }
+
+ freeifaddrs(interfaces);
+ return found;
+}
+
+static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) {
+ /* Check family first */
+ int family = local_if.ifa_addr->sa_family;
+ if (family != remote->sa.sa_family) {
+ return false;
+ }
+
+ if (family == AF_INET) {
+ struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
+ struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
+ struct sockaddr_in& remote_addr = remote->sin;
+ struct in_addr local_subnet, remote_subnet;
+ local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
+ remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
+ return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
+ } else if (family == AF_INET6) {
+ struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
+ struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
+ struct sockaddr_in6& remote_addr = remote->sin6;
+ struct in6_addr& local_in6 = local_addr->sin6_addr;
+ struct in6_addr& mask_in6 = mask->sin6_addr;
+ struct in6_addr& remote_in6 = remote_addr.sin6_addr;
+ bool same = true;
+ int len = 16; //IPv6 address is 16 unsigned char
+ for (int c = 0; c < len; c++) { //Network byte order is big-endian
+ char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
+ char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
+ if (c1 ^ c2) {
+ same = false;
+ break;
+ }
+ }
+ // At last, we need to compare scope id
+ // Two Link-type addresses can have the same subnet address even though they are not in the same scope
+ // For Global type, this field is 0, so a comparison wouldn't matter
+ same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
+ return same;
+ } else {
+ WARN("Net : Unsupported address family type");
+ return false;
+ }
+}
+
+int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) {
+#ifdef ENABLE_TRACE
+ char line[SOCKET_NAME_MAXLEN+1];
+#endif
+ char line_a[SOCKET_NAME_MAXLEN+1];
+ int found = 0;
+ struct ifaddrs *interfaces, *interface;
+ getifaddrs(&interfaces);
+ for (interface = interfaces; interface && !found; interface = interface->ifa_next) {
+ if (interface->ifa_addr == NULL) continue;
+
+ /* We only support IPv4 & IPv6 */
+ int family = interface->ifa_addr->sa_family;
+ if (family != AF_INET && family != AF_INET6)
+ continue;
+
+ // check against user specified interfaces
+ if (!matchSubnet(*interface, remoteAddr)) {
+ continue;
+ }
+
+ // Store the local IP address
+ int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+ memcpy(localAddrs+found, interface->ifa_addr, salen);
+
+ // Store the interface name
+ strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
+
+ TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, ncclSocketToString(localAddrs+found, line), ncclSocketToString(remoteAddr, line_a));
+ found++;
+ if (found == maxIfs) break;
+ }
+
+ if (found == 0) {
+ WARN("Net : No interface found in the same subnet as remote address %s", ncclSocketToString(remoteAddr, line_a));
+ }
+ freeifaddrs(interfaces);
+ return found;
+}
+
+ncclResult_t ncclGetSocketAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) {
+ if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
+ WARN("Net : string is null");
+ return ncclInvalidArgument;
+ }
+
+ bool ipv6 = ip_port_pair[0] == '[';
+ /* Construct the sockaddress structure */
+ if (!ipv6) {
+ struct netIf ni;
+ // parse <ip_or_hostname>:<port> string, expect one pair
+ if (parseStringList(ip_port_pair, &ni, 1) != 1) {
+ WARN("Net : No valid <IPv4_or_hostname>:<port> pair found");
+ return ncclInvalidArgument;
+ }
+
+ struct addrinfo hints, *p;
+ int rv;
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_family = AF_UNSPEC;
+ hints.ai_socktype = SOCK_STREAM;
+
+ if ( (rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
+ WARN("Net : error encountered when getting address info : %s", gai_strerror(rv));
+ return ncclInvalidArgument;
+ }
+
+ // use the first
+ if (p->ai_family == AF_INET) {
+ struct sockaddr_in& sin = ua->sin;
+ memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
+ sin.sin_family = AF_INET; // IPv4
+ //inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address
+ sin.sin_port = htons(ni.port); // port
+ } else if (p->ai_family == AF_INET6) {
+ struct sockaddr_in6& sin6 = ua->sin6;
+ memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
+ sin6.sin6_family = AF_INET6; // IPv6
+ sin6.sin6_port = htons(ni.port); // port
+ sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
+ sin6.sin6_scope_id = 0; // should be global scope, set to 0
+ } else {
+ WARN("Net : unsupported IP family");
+ return ncclInvalidArgument;
+ }
+
+ freeaddrinfo(p); // all done with this structure
+
+ } else {
+ int i, j = -1, len = strlen(ip_port_pair);
+ for (i = 1; i < len; i++) {
+ if (ip_port_pair[i] == '%') j = i;
+ if (ip_port_pair[i] == ']') break;
+ }
+ if (i == len) {
+ WARN("Net : No valid [IPv6]:port pair found");
+ return ncclInvalidArgument;
+ }
+ bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope
+
+ char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
+ memset(ip_str, '\0', sizeof(ip_str));
+ memset(port_str, '\0', sizeof(port_str));
+ memset(if_name, '\0', sizeof(if_name));
+ strncpy(ip_str, ip_port_pair+1, global_scope ? i-1 : j-1);
+ strncpy(port_str, ip_port_pair+i+2, len-i-1);
+ int port = atoi(port_str);
+ if (!global_scope) strncpy(if_name, ip_port_pair+j+1, i-j-1); // If not global scope, we need the intf name
+
+ struct sockaddr_in6& sin6 = ua->sin6;
+ sin6.sin6_family = AF_INET6; // IPv6
+ inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address
+ sin6.sin6_port = htons(port); // port
+ sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
+ sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope
+ }
+ return ncclSuccess;
+}
+
+int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
+ static int shownIfName = 0;
+ int nIfs = 0;
+ // Allow user to force the INET socket family selection
+ int sock_family = envSocketFamily();
+ // User specified interface
+ char* env = getenv("NCCL_SOCKET_IFNAME");
+ if (env && strlen(env) > 1) {
+ INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env);
+ // Specified by user : find or fail
+ if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
+ nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ } else {
+ // Try to automatically pick the right one
+ // Start with IB
+ nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ // else see if we can get some hint from COMM ID
+ if (nIfs == 0) {
+ char* commId = getenv("NCCL_COMM_ID");
+ if (commId && strlen(commId) > 1) {
+ INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId);
+ // Try to find interface that is in the same subnet as the IP in comm id
+ union ncclSocketAddress idAddr;
+ ncclGetSocketAddrFromString(&idAddr, commId);
+ nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs);
+ }
+ }
+ // Then look for anything else (but not docker or lo)
+ if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ // Finally look for docker, then lo.
+ if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
+ }
+ return nIfs;
+}
+
+ncclResult_t ncclSocketListen(struct ncclSocket* sock) {
+ /* IPv4/IPv6 support */
+ int family = sock->addr.sa.sa_family;
+ int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+ int flags;
+
+ /* Create socket and bind it to a port */
+ int fd = socket(family, SOCK_STREAM, 0);
+ if (fd == -1) {
+ WARN("Net : Socket creation failed : %s", strerror(errno));
+ return ncclSystemError;
+ }
+
+ if (socketToPort(&sock->addr)) {
+ // Port is forced by env. Make sure we get the port.
+ int opt = 1;
+#if defined(SO_REUSEPORT)
+ SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt");
+#else
+ SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt");
+#endif
+ }
+
+ /* make all new sockets non-blocking */
+ EQCHECK(flags = fcntl(fd, F_GETFL), -1);
+ SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
+
+ // addr port should be 0 (Any port)
+ SYSCHECK(bind(fd, &sock->addr.sa, salen), "bind");
+
+ /* Get the assigned Port */
+ socklen_t size = salen;
+ SYSCHECK(getsockname(fd, &sock->addr.sa, &size), "getsockname");
+
+#ifdef ENABLE_TRACE
+ char line[SOCKET_NAME_MAXLEN+1];
+ TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", ncclSocketToString(&sock->addr, line));
+#endif
+
+ /* Put the socket in listen mode
+ * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn
+ */
+ SYSCHECK(listen(fd, 16384), "listen");
+ sock->fd = fd;
+ return ncclSuccess;
+}
+
+static ncclResult_t getFdState(int fd, enum ncclSocketState* state) {
+ struct pollfd pfd;
+ int timeout = 1, ret;
+ socklen_t rlen = sizeof(int);
+
+ memset(&pfd, 0, sizeof(struct pollfd));
+ pfd.fd = fd;
+ pfd.events = POLLOUT;
+ SYSCHECK(ret = poll(&pfd, 1, timeout), "poll");
+ if (ret == 0) {
+ ret = EINPROGRESS;
+ } else {
+ /* check socket status */
+ EQCHECK(ret == 1 && (pfd.revents & POLLOUT), 0);
+ SYSCHECK(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt");
+ }
+
+ if (ret == EINPROGRESS)
+ *state = ncclSocketConnecting;
+ else if (ret == 0)
+ *state = ncclSocketConnected;
+ else
+ *state = ncclSocketError;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclGetSocketState(struct ncclSocket* sock, enum ncclSocketState* state) {
+ NCCLCHECK(getFdState(sock->fd, state));
+ sock->state = *state;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketConnect(struct ncclSocket* sock) {
+ char line[SOCKET_NAME_MAXLEN+1];
+ /* IPv4/IPv6 support */
+ int family = sock->addr.sa.sa_family;
+ if (family != AF_INET && family != AF_INET6) {
+ WARN("Net : connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)",
+ ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6);
+ return ncclInternalError;
+ }
+ int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+ int flags;
+
+ /* Connect to a hostname / port */
+ int fd = socket(family, SOCK_STREAM, 0);
+ if (fd == -1) {
+ WARN("Net : Socket creation failed : %s", strerror(errno));
+ return ncclSystemError;
+ }
+
+ const int one = 1;
+ SYSCHECK(setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt");
+
+ /* support non-blocking socket; by default, the socket is non-blocking */
+ EQCHECK(flags = fcntl(fd, F_GETFL), -1);
+ SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
+
+ /* const int bufsize = 128*1024;
+ SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char*)&bufsize, sizeof(int)), "setsockopt");
+ SYSCHECK(setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (char*)&bufsize, sizeof(int)), "setsockopt");*/
+
+ TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line));
+
+ int ret;
+ int timedout_retries = 0;
+ int refused_retries = 0;
+retry:
+ /* async connect; abort when error happens and abortFlag is present. */
+ ret = connect(fd, &sock->addr.sa, salen);
+
+ if (errno == EAGAIN || (errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) ||
+ (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) {
+ if (refused_retries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno));
+ usleep(SLEEP_INT);
+ goto retry;
+ } else if (errno == EINPROGRESS && !sock->asyncFlag) {
+ enum ncclSocketState state;
+ do {
+ if (sock->abortFlag) NEQCHECK(*sock->abortFlag, 0);
+ NCCLCHECK(getFdState(fd, &state));
+ } while (state == ncclSocketConnecting);
+ EQCHECK(state, ncclSocketError);
+ ret = 0;
+ }
+
+ if (ret == 0 || (errno == EINPROGRESS && sock->asyncFlag)) {
+ sock->fd = fd;
+ return ncclSuccess;
+ }
+
+ WARN("Net : Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
+ return ncclSystemError;
+}
+
+ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listenSocket) {
+ socklen_t socklen = sizeof(union ncclSocketAddress);
+ int tmpFd = sock->fd = -1;
+
+ do {
+ if (listenSocket->abortFlag) NEQCHECK(*listenSocket->abortFlag, 0);
+ tmpFd = accept(listenSocket->fd, &sock->addr.sa, &socklen);
+ } while ((errno == EAGAIN || errno == EWOULDBLOCK) && tmpFd == -1 && !listenSocket->asyncFlag);
+
+ if (!listenSocket->asyncFlag) {
+ EQCHECK(tmpFd, -1);
+ } else if (tmpFd == -1 && errno != EAGAIN && errno != EWOULDBLOCK) {
+ return ncclSystemError;
+ }
+
+ sock->fd = tmpFd;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, volatile uint32_t* abortFlag, int asyncFlag) {
+ if (sock == NULL)
+ return ncclSuccess;
+
+ sock->fd = -1;
+ if (addr) {
+ memcpy(&sock->addr, addr, sizeof(union ncclSocketAddress));
+ } else {
+ memset(&sock->addr, 0, sizeof(union ncclSocketAddress));
+ }
+ sock->abortFlag = abortFlag;
+ sock->asyncFlag = asyncFlag;
+ sock->state = ncclSocketStateNum;
+ return ncclSuccess;
+}
+
+static ncclResult_t ncclSocketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) {
+ int bytes = 0;
+ *closed = 0;
+ char* data = (char*)ptr;
+ char line[SOCKET_NAME_MAXLEN+1];
+ do {
+ if (op == NCCL_SOCKET_RECV) bytes = recv(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
+ if (op == NCCL_SOCKET_SEND) bytes = send(sock->fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT);
+ if (op == NCCL_SOCKET_RECV && bytes == 0) {
+ *closed = 1;
+ return ncclSuccess;
+ }
+ if (bytes == -1) {
+ if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
+ WARN("Net : Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno));
+ return ncclSystemError;
+ } else {
+ bytes = 0;
+ }
+ }
+ (*offset) += bytes;
+ if (sock->abortFlag && *sock->abortFlag != 0) {
+ INFO(NCCL_NET, "Socket progress: abort called");
+ return ncclSystemError;
+ }
+ } while (bytes > 0 && (*offset) < size);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+ int closed;
+ NCCLCHECK(ncclSocketProgressOpt(op, sock, ptr, size, offset, 0, &closed));
+ if (closed) {
+ char line[SOCKET_NAME_MAXLEN+1];
+ WARN("Net : Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line));
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) {
+ while (*offset < size)
+ NCCLCHECK(ncclSocketProgress(op, sock, ptr, size, offset));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size) {
+ int offset = 0;
+ NCCLCHECK(ncclSocketWait(NCCL_SOCKET_SEND, sock, ptr, size, &offset));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) {
+ int offset = 0;
+ NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, sock, ptr, size, &offset));
+ return ncclSuccess;
+}
+
+// Receive or detect connection closed
+ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed) {
+ int offset = 0;
+ *closed = 0;
+ while (offset < size) {
+ NCCLCHECK(ncclSocketProgressOpt(NCCL_SOCKET_RECV, sock, ptr, size, &offset, 0, closed));
+ if (*closed) return ncclSuccess;
+ }
+ return ncclSuccess;
+}
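
Taken together, the helpers above form a small blocking client API. A hypothetical usage sketch, assuming the address has already been filled in (for example from a bootstrap handle) and the NCCLCHECK/socket helpers above are in scope; exchangeMagic and the magic value are illustrative only:

#include <unistd.h>
#include <stdint.h>

static ncclResult_t exchangeMagic(union ncclSocketAddress* addr, volatile uint32_t* abortFlag) {
  struct ncclSocket sock;
  uint64_t magic = 0x4e43434cULL, peerMagic = 0;
  NCCLCHECK(ncclSocketInit(&sock, addr, abortFlag, /*asyncFlag=*/0));
  NCCLCHECK(ncclSocketConnect(&sock));                      // blocking: polls until connected or aborted
  NCCLCHECK(ncclSocketSend(&sock, &magic, sizeof(magic)));  // send/recv loop until the full size is transferred
  NCCLCHECK(ncclSocketRecv(&sock, &peerMagic, sizeof(peerMagic)));
  close(sock.fd);
  return ncclSuccess;
}
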
diff --git a/src/net.cc b/src/net.cc
new file mode 100644
index 0000000..5f68021
--- /dev/null
+++ b/src/net.cc
@@ -0,0 +1,261 @@
+#include "net.h"
+#include "bootstrap.h"
+#include "checks.h"
+
+#include <string.h>
+#include <errno.h>
+#include <dlfcn.h>
+//#include <sys/types.h>
+//#include <sys/stat.h>
+//#include <unistd.h>
+
+ncclNet_t *ncclNet;
+ncclCollNet_t *ncclCollNet;
+
+static ncclNet_v5_t ncclNet_v4_as_v5;
+static ncclNet_v4_t *ncclNet_v4;
+static ncclCollNet_v5_t ncclCollNet_v4_as_v5;
+static ncclCollNet_v4_t *ncclCollNet_v4;
+
+static ncclResult_t ncclNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
+ ncclNetProperties_v4_t p4;
+ ncclResult_t ans = ncclNet_v4->getProperties(dev, &p4);
+ if (ans != ncclSuccess) return ans;
+ props->name = p4.name;
+ props->pciPath = p4.pciPath;
+ props->guid = p4.guid;
+ props->ptrSupport = p4.ptrSupport;
+ props->speed = p4.speed;
+ props->port = p4.port;
+ props->maxComms = p4.maxComms;
+ props->maxRecvs = 1;
+ props->latency = 0;
+ return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_v4_as_v5_isend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
+ return ncclNet_v4->isend(sendComm, data, size, mhandle, request);
+}
+
+static ncclResult_t ncclNet_v4_as_v5_irecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
+ if (n == 0) return ncclSuccess;
+ if (n != 1) return ncclInvalidArgument;
+ return ncclNet_v4->irecv(recvComm, data[0], sizes[0], mhandles[0], request);
+}
+
+static ncclResult_t ncclNet_v4_as_v5_iflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
+ if (n == 0) return ncclSuccess;
+ if (n != 1) return ncclInvalidArgument;
+ return ncclNet_v4->iflush(recvComm, data[0], sizes[0], mhandles[0], request);
+}
+
+// We use a wrapper around the v4 init to copy over the struct contents
+// post-init since they may not be initialized beforehand.
+static ncclResult_t ncclNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
+ NCCLCHECK(ncclNet_v4->init(logfn));
+ ncclNet_v4_as_v5.name = ncclNet_v4->name;
+ ncclNet_v4_as_v5.devices = ncclNet_v4->devices;
+ ncclNet_v4_as_v5.getProperties = ncclNet_v4_as_v5_getProperties;
+ ncclNet_v4_as_v5.listen = ncclNet_v4->listen;
+ ncclNet_v4_as_v5.connect = ncclNet_v4->connect;
+ ncclNet_v4_as_v5.accept = ncclNet_v4->accept;
+ ncclNet_v4_as_v5.regMr = ncclNet_v4->regMr;
+ ncclNet_v4_as_v5.deregMr = ncclNet_v4->deregMr;
+ ncclNet_v4_as_v5.isend = ncclNet_v4_as_v5_isend;
+ ncclNet_v4_as_v5.irecv = ncclNet_v4_as_v5_irecv;
+ ncclNet_v4_as_v5.iflush = ncclNet_v4_as_v5_iflush;
+ ncclNet_v4_as_v5.test = ncclNet_v4->test;
+ ncclNet_v4_as_v5.closeSend = ncclNet_v4->closeSend;
+ ncclNet_v4_as_v5.closeRecv = ncclNet_v4->closeRecv;
+ ncclNet_v4_as_v5.closeListen = ncclNet_v4->closeListen;
+ return ncclSuccess;
+}
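
The copy-after-init step above exists because a v4 plugin is free to populate its exported table lazily. A sketch of such a hypothetical plugin (exInit/exDevices are made-up names) shows why reading the members before init() has run could capture NULL pointers:

extern ncclNet_v4_t ncclNetPlugin_v4;                   // symbol dlsym'd by the loader below
static ncclResult_t exDevices(int* ndev) { *ndev = 1; return ncclSuccess; }
static ncclResult_t exInit(ncclDebugLogger_t logfn) {
  ncclNetPlugin_v4.devices = exDevices;                 // filled in at init time, not at load time
  return ncclSuccess;
}
ncclNet_v4_t ncclNetPlugin_v4 = { "example", exInit };  // remaining members stay NULL until init()
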
+
+static ncclResult_t ncclCollNet_v4_as_v5_getProperties(int dev, ncclNetProperties_v5_t* props) {
+ ncclNetProperties_v4_t p4;
+ ncclResult_t ans = ncclCollNet_v4->getProperties(dev, &p4);
+ if (ans != ncclSuccess) return ans;
+ props->name = p4.name;
+ props->pciPath = p4.pciPath;
+ props->guid = p4.guid;
+ props->ptrSupport = p4.ptrSupport;
+ props->speed = p4.speed;
+ props->port = p4.port;
+ props->maxComms = p4.maxComms;
+ props->maxRecvs = 1;
+ props->latency = 0;
+ return ncclSuccess;
+}
+
+// We use a wrapper around the v4 init to copy over the struct contents
+// post-init since they may not be initialized beforehand.
+static ncclResult_t ncclCollNet_v4_as_v5_init(ncclDebugLogger_t logfn) {
+ NCCLCHECK(ncclCollNet_v4->init(logfn));
+ ncclCollNet_v4_as_v5.name = ncclCollNet_v4->name;
+ ncclCollNet_v4_as_v5.devices = ncclCollNet_v4->devices;
+ ncclCollNet_v4_as_v5.getProperties = ncclCollNet_v4_as_v5_getProperties;
+ ncclCollNet_v4_as_v5.listen = ncclCollNet_v4->listen;
+ ncclCollNet_v4_as_v5.connect = ncclCollNet_v4->connect;
+ ncclCollNet_v4_as_v5.reduceSupport = ncclCollNet_v4->reduceSupport;
+ ncclCollNet_v4_as_v5.regMr = ncclCollNet_v4->regMr;
+ ncclCollNet_v4_as_v5.deregMr = ncclCollNet_v4->deregMr;
+ ncclCollNet_v4_as_v5.iallreduce = ncclCollNet_v4->iallreduce;
+ ncclCollNet_v4_as_v5.iflush = ncclCollNet_v4->iflush;
+ ncclCollNet_v4_as_v5.test = ncclCollNet_v4->test;
+ ncclCollNet_v4_as_v5.closeColl = ncclCollNet_v4->closeColl;
+ ncclCollNet_v4_as_v5.closeListen = ncclCollNet_v4->closeListen;
+ return ncclSuccess;
+}
+
+static void initPlugin(ncclNet_v5_t** net, ncclCollNet_v5_t** collnet) {
+ char ncclNetPluginName[128];
+ const char* envPluginName = getenv("NCCL_NET_PLUGIN");
+ if (envPluginName && strlen(envPluginName)) {
+ snprintf(ncclNetPluginName, 128, "libnccl-net-%s.so", envPluginName);
+ INFO(NCCL_INIT, "Plugin name set by env to %s\n", ncclNetPluginName);
+ } else {
+ sprintf(ncclNetPluginName, "libnccl-net.so");
+ }
+ void* netPluginLib = dlopen(ncclNetPluginName, RTLD_NOW | RTLD_LOCAL);
+ if (netPluginLib == nullptr) {
+    // dlopen is not guaranteed to set errno, but dlerror only gives us a
+    // string, so checking errno is a harmless way to try to provide a better
+    // error message
+ if (errno == ENOENT) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (%s), using internal implementation", ncclNetPluginName);
+ } else {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", errno, dlerror());
+ }
+ return;
+ }
+
+ *net = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5");
+ if (*net == nullptr) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v5 symbol.");
+ ncclNet_v4 = (ncclNet_v4_t*)dlsym(netPluginLib, "ncclNetPlugin_v4");
+ if (ncclNet_v4 == nullptr) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v4 symbol.");
+ if (netPluginLib != nullptr) dlclose(netPluginLib);
+ return;
+ }
+ *net = &ncclNet_v4_as_v5;
+ ncclNet_v4_as_v5.init = ncclNet_v4_as_v5_init;
+ }
+
+ // Check for CollNet
+ *collnet = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5");
+ if (*collnet == nullptr) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v5 symbol.");
+ ncclCollNet_v4 = (ncclCollNet_v4_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v4");
+ if (ncclCollNet_v4 == nullptr) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v4 symbol.");
+ } else {
+ *collnet = &ncclCollNet_v4_as_v5;
+ ncclCollNet_v4_as_v5.init = ncclCollNet_v4_as_v5_init;
+ }
+ }
+ return;
+}
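
On the other side of this dlopen, an external plugin is just a shared library named libnccl-net-<name>.so (selected with NCCL_NET_PLUGIN=<name>, or libnccl-net.so by default) that exports the ncclNetPlugin_v5 symbol. A minimal, hypothetical stub under those assumptions; reporting zero devices makes ncclNetInit() below skip it and fall back to the built-in IB/socket transports:

#include "nccl_net.h"   // header shipped with NCCL

static ncclResult_t exInit(ncclDebugLogger_t logfn) { return ncclSuccess; }
static ncclResult_t exDevices(int* ndev) { *ndev = 0; return ncclSuccess; }

extern "C" ncclNet_v5_t ncclNetPlugin_v5 = {
  "example",   // name matched against NCCL_NET and printed in the logs
  exInit,
  exDevices,
  // getProperties, listen, connect, ... omitted in this sketch
};
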
+
+ncclResult_t ncclNetInit() {
+ // Always initialize bootstrap network
+ NCCLCHECK(bootstrapNetInit());
+
+ // Initialize main communication network
+ ncclNet_t* nets[3] = { nullptr, &ncclNetIb, &ncclNetSocket };
+ ncclCollNet_t* collNets[3] = { nullptr, nullptr, nullptr };
+ initPlugin(&nets[0], &collNets[0]);
+ char* netName = getenv("NCCL_NET");
+ bool ok = false;
+
+ for (int i=0; i<3; i++) {
+ if (nets[i] == nullptr) continue;
+ if (netName && strcmp(netName, nets[i]->name) != 0) continue;
+
+ // net plugin is already initialized
+ int ndev;
+ if (nets[i]->init(ncclDebugLog) != ncclSuccess) continue;
+ if (nets[i]->devices(&ndev) != ncclSuccess) continue;
+ if (ndev <= 0) continue;
+ ncclNet = nets[i];
+ ok = true;
+
+ if (collNets[i]) {
+ do {
+ if (collNets[i]->init(ncclDebugLog) != ncclSuccess) break;
+ if (collNets[i]->devices(&ndev) != ncclSuccess) break;
+ if (ndev <= 0) break;
+ ncclCollNet = collNets[i];
+ } while(0);
+ }
+ break;
+ }
+
+ if (!ok) {
+ WARN("Error: network %s not found.", netName ? netName : "");
+ return ncclInvalidUsage;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
+ constexpr int GPU_BUF_SIZE = 2*1024*1024;
+#if CUDART_VERSION >= 11030
+ // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute
+ int driverVersion;
+ CUDACHECK(cudaDriverGetVersion(&driverVersion));
+ if (driverVersion >= 11030) {
+ int cudaDev, attr = 0;
+ CUDACHECK(cudaGetDevice(&cudaDev));
+ CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev));
+ *gdrSupport = attr;
+ return ncclSuccess;
+ }
+#endif
+ int netDevs;
+ NCCLCHECK(ncclNetDevices(&netDevs));
+ *gdrSupport = 0;
+ for (int dev=0; dev<netDevs; dev++) {
+ // Find a net device which is GDR-capable
+ ncclNetProperties_t props;
+ NCCLCHECK(ncclNetGetProperties(dev, &props));
+ if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
+
+ // Allocate memory on the GPU and try to register it on the NIC.
+ void *lComm = NULL, *sComm = NULL, *rComm = NULL;
+ ncclNetHandle_t handle;
+ void* gpuPtr = NULL;
+ void* mHandle = NULL;
+ ncclResult_t ret;
+ ncclDebugNoWarn = NCCL_NET;
+ NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1);
+ while (sComm == NULL) {
+ NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2);
+ }
+ while (rComm == NULL) {
+ NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3);
+ }
+ CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4);
+ if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
+ NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
+ NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
+ NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
+ *gdrSupport = 1;
+ }
+ ncclDebugNoWarn = 0;
+ CUDACHECK(cudaFree(gpuPtr));
+cleanup4:
+ NCCLCHECK(ncclNetCloseRecv(rComm));
+cleanup3:
+ NCCLCHECK(ncclNetCloseSend(sComm));
+cleanup2:
+ NCCLCHECK(ncclNetCloseListen(lComm));
+cleanup1:
+ break;
+ }
+ return ncclSuccess;
+}
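
The CUDA 11.3+ fast path above boils down to a single device-attribute query, with no NIC probing. A standalone sketch (assumes a CUDA 11.3+ toolkit and driver):

#include <cuda_runtime.h>
#include <stdio.h>

int main() {
  int dev = 0, gdr = 0;
  cudaGetDevice(&dev);
#if CUDART_VERSION >= 11030
  cudaDeviceGetAttribute(&gdr, cudaDevAttrGPUDirectRDMASupported, dev);
#endif
  printf("GPUDirect RDMA supported: %d\n", gdr);
  return 0;
}
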
+
+int ncclNetVersion() {
+ return (ncclNet == &ncclNet_v4_as_v5) ? 4 : 5;
+}
diff --git a/src/proxy.cc b/src/proxy.cc
index e5d2eab..7d4f811 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,6 +7,11 @@
#include "comm.h"
#include "info.h"
#include "collectives.h"
+#include "socket.h"
+#include "shm.h"
+#include "profiler.h"
+#define ENABLE_TIMER 0
+#include "timer.h"
enum { proxyRecv=0, proxySend=1 };
@@ -14,7 +19,7 @@ static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, in
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true;
/* In chains, one rank does not need a proxy. Let's figure out which one it is */
- // Which index in the reorganized rings should we compare root against */
+ /* Which index in the reorganized rings should we compare root against */
const int myrank = 0, nextrank = 1, prevrank = nranks-1;
int index = pattern == ncclPatternPipelineFrom ?
/* no recv / no send if root = */
@@ -24,47 +29,30 @@ static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, in
return (root != rank);
}
-#define PROXYARGS_ALLOCATE_SIZE 128
+#define PROXYARGS_ALLOCATE_SIZE NCCL_MAX_OPS
struct ncclProxyPool {
struct ncclProxyPool *next;
struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE];
};
-static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) {
- struct ncclProxyState* state = &comm->proxyState;
+static ncclResult_t allocateArgs(struct ncclProxyProgressState* state, struct ncclProxyArgs** argsptr) {
struct ncclProxyArgs* elem;
if (state->pool == NULL) {
- // Check whether there are freed elements
- if (state->poolReturned) {
- pthread_mutex_lock(&state->poolMutex);
- state->pool = state->poolReturned;
- state->poolReturned = NULL;
- pthread_mutex_unlock(&state->poolMutex);
- } else {
- // Allocate a new pool of elements. Make sure we allocate the memory close
- // to the network thread
- struct ncclProxyPool* newPool;
- cpu_set_t affinitySave;
- if (CPU_COUNT(&comm->cpuAffinity)) {
- sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
- sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
- }
- NCCLCHECK(ncclCalloc(&newPool, 1));
- if (CPU_COUNT(&comm->cpuAffinity)) {
- sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
- }
+ // Allocate a new pool of elements. Make sure we allocate the memory close
+ // to the network thread
+ struct ncclProxyPool* newPool;
+ NCCLCHECK(ncclCalloc(&newPool, 1));
- struct ncclProxyArgs* newElems = newPool->elems;
- // Chain newly allocated elements
- for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
- if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
- }
- // Add them all to the pool list
- state->pool = newElems;
- // Save the pool memory block for later resource release
- newPool->next = state->pools;
- state->pools = newPool;
+ struct ncclProxyArgs* newElems = newPool->elems;
+ // Chain newly allocated elements
+ for (int i=0; i<PROXYARGS_ALLOCATE_SIZE; i++) {
+ if (i+1 < PROXYARGS_ALLOCATE_SIZE) newElems[i].next = newElems+i+1;
}
+ // Add them all to the pool list
+ state->pool = newElems;
+ // Save the pool memory block for later resource release
+ newPool->next = state->pools;
+ state->pools = newPool;
}
elem = state->pool;
state->pool = state->pool->next;
@@ -82,241 +70,393 @@ static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** a
#define OP_INDEX(op) ((op) ? (op)-state->pools->elems : -1)
#define OP_SEEN 0x100000
-ncclResult_t dumpProxyState(struct ncclProxyState* state) {
-#ifdef DEBUG_PROXY
- struct ncclProxyArgs* op = state->ops;
+
+ncclResult_t getOpIndex(struct ncclProxyArgs* op, struct ncclProxyProgressState* state, int* poolIndex, int* opIndex) {
+ struct ncclProxyPool* pool = state->pools;
+ int p = 0;
+ while (pool) {
+ uint64_t o = op-pool->elems;
+ if (o < PROXYARGS_ALLOCATE_SIZE) {
+ *opIndex = o;
+ *poolIndex = p;
+ return ncclSuccess;
+ }
+ pool = pool->next;
+ p++;
+ }
+ WARN("Could not find pool of op %p\n", op);
+ return ncclInternalError;
+}
+
+ncclResult_t printProxyOp(struct ncclProxyArgs* op, int poolIndex, int opIndex) {
+ printf("[%d-%d|%ld| %s", poolIndex, opIndex, op->opCount, op->pattern == ncclPatternSend ? "Send" : op->pattern == ncclPatternRecv ? "Recv" : "Coll");
+ for (int s=0; s<op->nsubs; s++) {
+ struct ncclProxySubArgs* sub = op->subs+s;
+ if (op->state == ncclProxyOpProgress) {
+ char status = ' ';
+ if (op->pattern == ncclPatternRecv) {
+ if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init
+ else if (sub->received < sub->posted) status = 'R'; // Receiving
+ else if (sub->received < sub->transmitted) status = 'R'; // Receiving
+ else if (sub->transmitted < sub->received) status = 'F'; // Flushing
+ else if (sub->done < sub->transmitted) status = 'G'; // Waiting on GPU
+ else status = 'D'; // Done
+ } else if (op->pattern == ncclPatternSend) {
+ if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) status = 'I'; // Init
+ else if (sub->transmitted < sub->posted) status = 'G'; // Waiting on GPU
+ else if (sub->done < sub->transmitted) status = 'S'; // Sending
+ else status = 'D'; // Done
+ }
+ printf(" %d%c/%d", sub->peer, status, sub->channelId);
+ } else {
+ printf(" %d/%d", sub->peer, sub->channelId);
+ }
+ }
+ printf("]");
+ return ncclSuccess;
+}
+ncclResult_t dumpProxyState(struct ncclProxyProgressState* state) {
+ struct ncclProxyArgs* op = state->active;
+ int poolIndex, opIndex;
+ printf("ACTIVE OPS\n");
while (op) {
- if (op->idle & OP_SEEN) {
- WARN("Active list loop at element %ld", OP_INDEX(op));
- }
- op->idle |= OP_SEEN;
- printf("[%ld(%ld/%d)]", OP_INDEX(op), op->opCount, op->nsubs);
- if (op->nextPeer) {
- printf("(%ld)", OP_INDEX(op->nextPeer));
- struct ncclProxyArgs* n = op->nextPeer;
- n->idle |= OP_SEEN;
- while (n->nextPeer) {
- n = n->nextPeer;
- n->idle |= OP_SEEN;
+ NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
+ if (op->state & OP_SEEN) {
+ WARN("List loop at element %d-%d", poolIndex, opIndex);
+ }
+ NCCLCHECK(printProxyOp(op, poolIndex, opIndex));
+ op->state |= OP_SEEN;
+ printf("\n");
+ struct ncclProxyArgs* nextOp = op->nextPeer;
+ while (nextOp) {
+ NCCLCHECK(getOpIndex(nextOp, state, &poolIndex, &opIndex));
+ if (nextOp->state & OP_SEEN) {
+ WARN("List loop at element %d-%d", poolIndex, opIndex);
}
+ printf("| `-> ");
+ NCCLCHECK(printProxyOp(nextOp, poolIndex, opIndex));
+ nextOp->state |= OP_SEEN;
+ printf("\n");
+ if (nextOp->next) {
+ WARN("Inactive op has next set!\n");
+ }
+ nextOp = nextOp->nextPeer;
}
+ if (op->nextPeer == NULL) printf("|\n");
+ op = op->next;
+ printf("v\n");
+ }
+ printf("[X]\n");
+
+# if 0
+ printf("FREE OPS\n");
+ op = state->pool;
+ while (op) {
+ NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
+ if (op->state & OP_SEEN) {
+ WARN("List loop at element %d-%d", poolIndex, opIndex);
+ }
+ NCCLCHECK(printProxyOp(op, poolIndex, opIndex));
+ op->state |= OP_SEEN;
printf("->");
op = op->next;
}
printf("[X]\n");
+#else
+ op = state->pool;
+ while (op) {
+ NCCLCHECK(getOpIndex(op, state, &poolIndex, &opIndex));
+ if (op->state & OP_SEEN) {
+ WARN("List loop at element %d-%d", poolIndex, opIndex);
+ }
+ op->state |= OP_SEEN;
+ op = op->next;
+ }
+#endif
- struct ncclProxyArgs* free = state->pool;
- while (free) {
- if (free->idle & OP_SEEN) {
- WARN("Free list loop at element %ld", OP_INDEX(free));
- }
- free->idle |= OP_SEEN;
- free = free->next;
- }
-
- struct ncclProxyPool* p = state->pools;
- int i = 0;
- while (p) {
- for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
- if ((p->elems[e].idle & OP_SEEN) == 0) {
- WARN("Element %d of pool %d has been lost", e, i);
- struct ncclProxyArgs* free = state->pool;
- printf("Free list ");
- while (free) {
- printf("--> %ld ", OP_INDEX(free));
- free = free->next;
- }
+ struct ncclProxyPool* pool = state->pools;
+ poolIndex = 0;
+ while (pool) {
+ struct ncclProxyArgs* elem = pool->elems;
+ for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++, elem++) {
+ if ((elem->state & OP_SEEN) == 0) {
+ printf("Elem %d-%d is not in any list:\n", poolIndex, e);
+ NCCLCHECK(printProxyOp(elem, poolIndex, e));
printf("\n");
- return ncclInternalError;
+ } else {
+ elem->state -= OP_SEEN;
}
- p->elems[e].idle -= OP_SEEN;
}
- p = p->next;
- i++;
+ pool = pool->next;
+ poolIndex++;
}
-#endif
return ncclSuccess;
}
-static ncclResult_t ProxyAppend(struct ncclProxyState* state, struct ncclProxyArgs* args) {
- struct ncclProxyArgs* proxyAppend = *args->proxyAppendPtr;
- int shared = args->subs[0].connector->conn.shared;
- if (proxyAppend) {
- if (shared && proxyAppend->opCount == args->opCount) {
- if ((proxyAppend->sliceSteps != args->sliceSteps) ||
- (proxyAppend->chunkSteps != args->chunkSteps) ||
- (proxyAppend->protocol != args->protocol) ||
- (proxyAppend->dtype != args->dtype) ||
- (proxyAppend->redOp != args->redOp)) {
- WARN("Proxy append mismatch");
- return ncclInternalError;
- }
- if (proxyAppend->nsubs >= NCCL_PROXY_MAX_SUBS) {
- WARN("Proxy append out of bound");
- return ncclInternalError;
- }
- memcpy(proxyAppend->subs+proxyAppend->nsubs, args->subs, sizeof(struct ncclProxySubArgs));
- proxyAppend->nsubs++;
- args->next = proxyAppend->next;
- // Free args as we merged them
- args->next = state->poolFreed;
- state->poolFreed = args;
- DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as group with %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend));
+static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyArgs* args, int subIndex) {
+ struct ncclProxySubArgs* sub = args->subs+subIndex;
+ if (subIndex >= NCCL_PROXY_MAX_SUBS) {
+ WARN("Proxy append out of bounds");
+ return ncclInternalError;
+ }
+
+ //memset(sub, 0, sizeof(struct ncclProxySubArgs));
+ sub->connection = op->connection;
+ sub->channelId = op->channelId;
+ sub->nsteps = op->nsteps;
+ sub->nbytes = op->nbytes;
+ sub->peer = op->root;
+ args->nsubs = subIndex+1;
+ if (subIndex) {
+ if ((args->sliceSteps != op->sliceSteps) ||
+ (args->chunkSteps != op->chunkSteps) ||
+ (args->protocol != op->protocol) ||
+ (args->dtype != op->dtype) ||
+ (args->redOp != op->redOp)) {
+ WARN("Proxy append mismatch");
+ return ncclInternalError;
+ }
+ if (args->state != ncclProxyOpReady) {
+ WARN("Proxy append on running operation");
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+ }
+ //memset(&args->progress, 0, sizeof(struct ncclProxyArgs)-offsetof(struct ncclProxyArgs, progress));
+ args->done = 0;
+ args->opCount = op->opCount;
+ args->sliceSteps = op->sliceSteps;
+ args->chunkSteps = op->chunkSteps;
+ args->chunkSize = op->chunkSize;
+ args->dtype = op->dtype;
+ args->redOp = op->redOp;
+ args->pattern = op->pattern;
+ args->protocol = op->protocol;
+ args->state = ncclProxyOpReady;
+ args->progress = op->connection->tcomm->proxyProgress;
+ args->proxyAppendPtr = op->connection->proxyAppendPtr;
+ return ncclSuccess;
+}
+
+static ncclResult_t ProxyAppend(struct ncclProxyProgressState* state, struct ncclProxyOp* op) {
+ struct ncclProxyConnection* connection = op->connection;
+ int shared = connection->shared;
+ struct ncclProxyArgs* args = *connection->proxyAppendPtr;
+
+ if (args) {
+ if (shared && args->opCount == op->opCount) {
+ NCCLCHECK(ncclProxyOpToArgs(op, args, args->nsubs));
+ DEBUG_PROXY_PRINT("Insert (%d/%5ld/%5ld) as group with %5ld\n", shared, args->opCount, op->opCount, OP_INDEX(args));
} else {
- proxyAppend->nextPeer = args;
- DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend));
+ struct ncclProxyArgs* prevArgs = args;
+ NCCLCHECK(allocateArgs(state, &args));
+ NCCLCHECK(ncclProxyOpToArgs(op, args, 0));
+ prevArgs->nextPeer = args;
+ DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, prevArgs->opCount, args->opCount, OP_INDEX(prevArgs));
*(args->proxyAppendPtr) = args;
}
} else {
// Nothing running for that peer. Add to the list
- if (state->ops == NULL) {
+ NCCLCHECK(allocateArgs(state, &args));
+ NCCLCHECK(ncclProxyOpToArgs(op, args, 0));
+ if (state->active == NULL) {
// Create the list
DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element\n", OP_INDEX(args), shared, args->opCount);
- state->ops = args;
+ state->active = args;
} else {
// Append element at the end of the list
- struct ncclProxyArgs* last = state->ops;
+ struct ncclProxyArgs* last = state->active;
while (last->next) last = last->next;
last->next = args;
- DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args),shared, args->opCount);
+ DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args), shared, args->opCount);
}
*(args->proxyAppendPtr) = args;
}
return ncclSuccess;
}
-static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args, int connIndex) {
+ncclResult_t ncclProxyPost(struct ncclProxyOpsPool* pool, int nextOps, int nextOpsEnd) {
+ pthread_mutex_lock(&pool->mutex);
+ if (pool->nextOps == -1) {
+ pool->nextOps = nextOps;
+ pthread_cond_signal(&pool->cond);
+ } else {
+ pool->ops[pool->nextOpsEnd].next = nextOps;
+ }
+ pool->nextOpsEnd = nextOpsEnd;
+ pthread_mutex_unlock(&pool->mutex);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, struct ncclProxyOp* proxyOp) {
+ struct ncclProxyOps* proxyOps = proxyConn->comm->proxyState.proxyOps;
+ if (proxyOps == NULL) return ncclInternalError;
+ proxyOps += proxyConn->localRank;
+ struct ncclProxyOpsPool* pool = proxyOps->pool;
+
+ TIME_START(0);
+ int opIndex = proxyOps->freeOp;
+ struct ncclProxyOp* op;
+ if (opIndex != -1) {
+ op = pool->ops+opIndex;
+ proxyOps->freeOp = op->next;
+ } else {
+ int freeOp;
+ while ((freeOp = pool->freeOps[comm->localRank]) == -1) sched_yield();
+ int freeOpNew;
+ while ((freeOpNew = __sync_val_compare_and_swap(pool->freeOps+comm->localRank, freeOp, -1)) != freeOp) freeOp = freeOpNew;
+ opIndex = freeOp;
+ op = pool->ops+opIndex;
+ proxyOps->freeOp = op->next;
+ }
+ if (op->next != -1) __builtin_prefetch(pool->ops+op->next); // Prefetch next free op
+ memcpy(op, proxyOp, sizeof(struct ncclProxyOp));
+ op->next = -1;
+ op->connection = proxyConn->connection;
+ if (proxyOps->nextOps == -1) {
+ proxyOps->nextOps = proxyOps->nextOpsEnd = opIndex;
+ } else {
+ pool->ops[proxyOps->nextOpsEnd].next = opIndex;
+ proxyOps->nextOpsEnd = opIndex;
+ }
+ if (++proxyOps->count == MAX_OPS_PER_PEER) {
+ // Post what we have so far to free some ops in the pool
+    // Do not post the last operations, as more may be coming with the same opCount; posting
+    // them in different batches would break proxyArgs aggregation with subs.
+ uint64_t lastOpCount = pool->ops[proxyOps->nextOpsEnd].opCount;
+ int lastOp = -1;
+ int toSend = 0;
+ int ops = 0;
+ for (int op= proxyOps->nextOps; op != proxyOps->nextOpsEnd; op=pool->ops[op].next) {
+ ops++;
+ if (pool->ops[op].opCount != lastOpCount) {
+ lastOp = op;
+ toSend = ops;
+ }
+ }
+ if (lastOp == -1) {
+ WARN("Unable to post incomplete proxy op chain %d..%d (opCount %ld)\n", proxyOps->nextOps, proxyOps->nextOpsEnd, lastOpCount);
+ return ncclInternalError;
+ }
+ // Cut chain at lastOp
+ int nextOps = proxyOps->nextOps;
+ proxyOps->nextOps = pool->ops[lastOp].next;
+ pool->ops[lastOp].next = -1;
+ NCCLCHECK(ncclProxyPost(proxyOps->pool, nextOps, lastOp));
+ proxyOps->count -= toSend;
+ }
+ TIME_STOP(0);
+ return ncclSuccess;
+}
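
The free-op grab at the top of ncclLocalOpAppend() is a lock-free handoff: the consumer takes the whole per-rank list in one shot by swapping -1 into the list head, while the progress thread keeps pushing freed ops concurrently. A standalone sketch of that pattern (single consumer per list, as in the code above; takeFreeList is an illustrative name):

#include <sched.h>

/* Pop an entire index-linked free list atomically. Indices are used instead
 * of pointers because the pool lives in shared memory that may be mapped at
 * different addresses in each process; -1 means "empty". */
static int takeFreeList(volatile int* head) {
  int old;
  while ((old = *head) == -1) sched_yield();           // wait for the producer to free something
  int seen;
  while ((seen = __sync_val_compare_and_swap(head, old, -1)) != old)
    old = seen;                                        // head changed under us; retry with the new value
  return old;                                          // caller now owns the whole chain
}
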
+
+static ncclResult_t SaveProxy(struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex) {
if (peer < 0) return ncclSuccess;
- struct ncclChannel* channel = args->subs[0].channel;
struct ncclPeer* peerComm = channel->peers+peer;
struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex;
if (connector->transportComm == NULL) {
- WARN("Rank %d has no transport for %s peer %d on channel %d", connector->comm->rank,
- type == proxyRecv ? "recv" : "send", peer, channel->id);
+ WARN("Rank %d has no transport for %s peer %d on channel %d/%d", connector->comm->rank,
+ type == proxyRecv ? "recv" : "send", peer, channel->id, connIndex);
return ncclInternalError;
}
- if (connector->transportComm->proxy == NULL) return ncclSuccess;
-
- struct ncclProxyState* state = &connector->comm->proxyState;
- struct ncclProxyArgs* op;
- NCCLCHECK(allocateArgs(connector->comm, &op));
- memcpy(op, args, sizeof(struct ncclProxyArgs));
- op->subs[0].connector = connector;
- op->progress = connector->transportComm->proxy;
- op->state = ncclProxyOpReady;
- op->proxyAppendPtr = connector->proxyAppendPtr;
+ if (connector->transportComm->proxyProgress == NULL) return ncclSuccess;
- if (state->nextOps == NULL) state->nextOps = op;
- else state->nextOpsEnd->next = op;
- state->nextOpsEnd = op;
+ NCCLCHECK(ncclLocalOpAppend(connector->comm, &connector->proxyConn, op));
return ncclSuccess;
}
-ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks) {
- struct ncclChannel* channel = args->subs[0].channel;
- int pattern = args->pattern;
+ncclResult_t ncclProxySaveColl(struct ncclComm* comm, struct ncclProxyOp* op, int nranks) {
+ struct ncclChannel* channel = comm->channels+op->channelId;
+ int pattern = op->pattern;
if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) {
struct ncclRing* ring = &channel->ring;
- if (NeedProxy(proxyRecv, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxyRecv, ring->prev, args, 0));
- if (NeedProxy(proxySend, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxySend, ring->next, args, 0));
+ if (NeedProxy(proxyRecv, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxyRecv, ring->prev, op, 0));
+ if (NeedProxy(proxySend, pattern, op->root, ring, nranks)) NCCLCHECK(SaveProxy(channel, proxySend, ring->next, op, 0));
}
if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) {
// Tree up
struct ncclTree* tree = &channel->tree;
- for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxyRecv, tree->down[i], args, 0));
- NCCLCHECK(SaveProxy(proxySend, tree->up, args, 0));
+ for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxyRecv, tree->down[i], op, 0));
+ NCCLCHECK(SaveProxy(channel, proxySend, tree->up, op, 0));
}
if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) {
// Tree down
struct ncclTree* tree = &channel->tree;
- for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxySend, tree->down[i], args, 0));
- NCCLCHECK(SaveProxy(proxyRecv, tree->up, args, 0));
+ for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(channel, proxySend, tree->down[i], op, 0));
+ NCCLCHECK(SaveProxy(channel, proxyRecv, tree->up, op, 0));
}
if (pattern == ncclPatternCollTreeUpDown) {
// CollTree up
- NCCLCHECK(SaveProxy(proxySend, channel->collTree.out, args, 1)); // For CollTree up, we are using push
+ NCCLCHECK(SaveProxy(channel, proxySend, channel->collTree.out, op, 1)); // For CollTree up, we are using push
// CollTree down
- NCCLCHECK(SaveProxy(proxyRecv, channel->collTree.out, args, 0));
+ NCCLCHECK(SaveProxy(channel, proxyRecv, channel->collTree.out, op, 0));
}
return ncclSuccess;
}
-ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args) {
- memset(args, 0, sizeof(struct ncclProxyArgs));
- int channelId = info->channelId;
- args->nsubs = 1;
- struct ncclProxySubArgs* sub = args->subs;
+NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0);
+ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* op) {
+ memset(op, 0, sizeof(struct ncclProxyOp));
+ int channelId = info->channelId;
struct ncclChannel* channel = info->comm->channels+channelId;
- sub->channel = channel;
- args->sliceSteps = 1;
- args->chunkSteps = 1;
- args->protocol = NCCL_PROTO_SIMPLE;
- args->dtype = info->datatype;
- sub->delta = info->delta;
- sub->recvbytes = info->recvbytes;
- sub->sendbytes = info->sendbytes;
+ op->channelId = channelId;
+ op->sliceSteps = 1;
+ op->chunkSteps = 1;
+ op->protocol = NCCL_PROTO_SIMPLE;
+ op->dtype = info->datatype;
int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR;
- info->recvChunkSize = stepSize;
- info->sendChunkSize = stepSize;
+ info->chunkSize = stepSize;
+ op->root = info->root;
+ op->nbytes = info->count;
+ struct ncclPeer* peer = channel->peers + op->root;
- if (info->delta > 0 && info->recvbytes >= 0) {
- int peerrecv = (info->comm->nRanks+info->comm->rank-info->delta)%info->comm->nRanks;
- if (channel->peers[peerrecv].recv[0].transportComm && channel->peers[peerrecv].recv[0].transportComm->proxy) {
+ if (info->coll == ncclFuncSend) {
+ op->pattern = ncclPatternSend;
+ if (op->root != info->comm->rank && peer->send[1].transportComm && peer->send[1].transportComm->proxyProgress) {
// Tune chunk size for the network
- if (info->recvbytes < stepSize) info->recvChunkSize /= 4;
- else if (info->recvbytes < 8*stepSize) info->recvChunkSize /= 2;
+ if (info->count < stepSize) info->chunkSize /= 4;
+ else if (info->count < 8*stepSize) info->chunkSize /= 2;
}
- sub->recvChunkSize = info->recvChunkSize;
- }
- if (info->delta > 0 && info->sendbytes >= 0) {
- int peersend = (info->comm->rank+info->delta)%info->comm->nRanks;
- if (channel->peers[peersend].send[0].transportComm && channel->peers[peersend].send[0].transportComm->proxy) {
+ } else if (info->coll == ncclFuncRecv) {
+ op->pattern = ncclPatternRecv;
+ if (op->root != info->comm->rank && peer->recv[1].transportComm && peer->recv[1].transportComm->proxyProgress) {
// Tune chunk size for the network
- if (info->sendbytes < stepSize) info->sendChunkSize /= 4;
- else if (info->sendbytes < 8*stepSize) info->sendChunkSize /= 2;
+ if (info->count < stepSize) info->chunkSize /= 4;
+ else if (info->count < 8*stepSize) info->chunkSize /= 2;
}
- sub->sendChunkSize = info->sendChunkSize;
+ } else {
+ WARN("P2p operation is neither send or recv");
+ return ncclInternalError;
}
+ if (ncclParamChunkSize() != 0) {
+ info->chunkSize = ncclParamChunkSize();
+ }
+ op->chunkSize = info->chunkSize;
return ncclSuccess;
}
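
The P2P chunk-size heuristic above splits small messages into more steps so sends and receives can still pipeline, and NCCL_PARAM(ChunkSize, "CHUNK_SIZE", 0) lets the NCCL_CHUNK_SIZE environment variable override it. The heuristic as a standalone sketch (tuneP2pChunkSize is an illustrative name):

/* stepSize = buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS / SENDRECV_SLICEFACTOR */
static int tuneP2pChunkSize(long nbytes, int stepSize) {
  int chunkSize = stepSize;
  if (nbytes < stepSize) chunkSize /= 4;        // tiny message: split into more, smaller steps
  else if (nbytes < 8*stepSize) chunkSize /= 2; // small message: halve the chunk for more overlap
  return chunkSize;
}
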
-ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args) {
- struct ncclProxySubArgs* sub = args->subs;
- struct ncclChannel* channel = sub->channel;
- args->opCount = channel->workFifoTail-1;
- args->commOpCount = comm->opCount;
- const ssize_t recvbytesOrig = sub->recvbytes;
- const ssize_t sendbytesOrig = sub->sendbytes;
- if (sub->delta > 0 && recvbytesOrig >= ssize_t(0)) {
- int peerrecv = (comm->nRanks+comm->rank-sub->delta)%comm->nRanks;
- sub->recvbytes = recvbytesOrig;
- sub->sendbytes = 0;
- sub->nsteps = DIVUP(sub->recvbytes, sub->recvChunkSize);
- if (sub->nsteps == 0) sub->nsteps = 1;
- NCCLCHECK(SaveProxy(proxyRecv, peerrecv, args, 0));
- }
- if (sub->delta > 0 && sendbytesOrig >= ssize_t(0)) {
- int peersend = (comm->rank+sub->delta)%comm->nRanks;
- sub->sendbytes = sendbytesOrig;
- sub->recvbytes = 0;
- sub->nsteps = DIVUP(sub->sendbytes, sub->sendChunkSize);
- if (sub->nsteps == 0) sub->nsteps = 1;
- NCCLCHECK(SaveProxy(proxySend, peersend, args, 0));
+ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyOp* op) {
+ struct ncclChannel* channel = comm->channels+op->channelId;
+ op->opCount = channel->workFifoTail-1;
+ if (op->root == comm->rank) return ncclSuccess;
+ if (op->pattern == ncclPatternRecv) {
+ op->nsteps = DIVUP(op->nbytes, op->chunkSize);
+ if (op->nsteps == 0) op->nsteps = 1;
+ NCCLCHECK(SaveProxy(channel, proxyRecv, op->root, op, 1));
+ } else if (op->pattern == ncclPatternSend) {
+ op->nsteps = DIVUP(op->nbytes, op->chunkSize);
+ if (op->nsteps == 0) op->nsteps = 1;
+ NCCLCHECK(SaveProxy(channel, proxySend, op->root, op, 1));
}
- // Reset proxy args for potentially multiple cuda graph launches
- // It is safe as long as SaveProxy copies contents of args to op
- sub->recvbytes = recvbytesOrig;
- sub->sendbytes = sendbytesOrig;
return ncclSuccess;
}
-static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) {
+static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) {
struct ncclProxyArgs* freeOp = *opPtr;
- DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(freeOp->next));
struct ncclProxyArgs* next = freeOp->next;
+ DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(next));
*opPtr = next;
if (freeOp->nextPeer) {
// replace op by nextPeer
@@ -324,7 +464,7 @@ static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs*
if (*prevOpPtr) {
(*prevOpPtr)->next = nextPeer;
} else {
- state->ops = nextPeer;
+ state->active = nextPeer;
}
nextPeer->next = next;
*(prevOpPtr) = nextPeer;
@@ -333,25 +473,31 @@ static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs*
if (*prevOpPtr) {
(*prevOpPtr)->next = next;
} else {
- state->ops = next;
+ state->active = next;
}
}
- freeOp->next = state->poolFreed;
- state->poolFreed = freeOp;
- DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
+ freeOp->next = state->pool;
+ state->pool = freeOp;
+ DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr));
+#ifdef DEBUG_PROXY
NCCLCHECK(dumpProxyState(state));
+#endif
return ncclSuccess;
}
-static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyArgs** opsPtr, int* idle, struct ncclComm* comm) {
+static ncclResult_t progressOps(struct ncclComm* comm, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) {
struct ncclProxyArgs* prevOp = NULL;
- struct ncclProxyArgs* op = *opsPtr;
+ struct ncclProxyArgs* op = opStart;
while (op) {
if (op->state == ncclProxyOpNone) return ncclInternalError;
- NCCLCHECK(op->progress(op));
+ TIME_START(0); TIME_START(1);
+ NCCLCHECK(op->progress(comm, op));
+ if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); }
*idle &= op->idle;
if (op->state == ncclProxyOpNone) {
+ TIME_START(2);
NCCLCHECK(removeOp(state, &op, &prevOp));
+ TIME_STOP(2);
} else {
prevOp = op;
op = op->next;
@@ -360,197 +506,607 @@ static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyAr
return ncclSuccess;
}
-ncclResult_t ncclProxyAppendPosted(struct ncclProxyState* state) {
- // Return any freed element first
- if (state->poolFreed) {
- struct ncclProxyArgs* end = state->poolFreed;
- while (end->next) end = end->next;
- pthread_mutex_lock(&state->poolMutex);
- end->next = state->poolReturned;
- state->poolReturned = state->poolFreed;
- pthread_mutex_unlock(&state->poolMutex);
- state->poolFreed = NULL;
- }
+static ncclResult_t ncclProxyGetPostedOps(struct ncclComm* comm, int* added) {
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ if (state->opsPool == NULL) return ncclInternalError;
+ struct ncclProxyOpsPool* pool = state->opsPool;
- // Then wait until we have new work to do
- pthread_mutex_lock(&state->opsMutex);
- while (state->postedOps == NULL) {
- if (state->stop) return ncclSuccess;
- pthread_cond_wait(&state->cond, &state->opsMutex);
- }
+ struct ncclProxyArgs profArgs; // Only used for profiling purposes
+ if (state->nextOps != -1) goto process_nextops;
- // Sort operations as we append them : collectives and
- // receives first, then sends.
+ // If we have ops to progress, no need to block waiting for something to arrive or even wait for the lock
+ // to be available. Exit, continue progress, and come back later.
+ if (state->active != NULL && (pool->nextOps == -1 || pthread_mutex_trylock(&pool->mutex) != 0)) return ncclSuccess;
- struct ncclProxyArgs* next, *prev = NULL, *op = state->postedOps;
- int commOpCount = op->commOpCount;
- while (op && op->commOpCount == commOpCount) {
- next = op->next;
- if (op->subs[0].sendbytes) {
- if (prev) prev->next = next;
- else state->postedOps = next;
- op->next = NULL;
- NCCLCHECK(ProxyAppend(state, op));
- } else prev = op;
- op = next;
- }
- op = state->postedOps;
- while (op && op->commOpCount == commOpCount) {
- next = op->next;
- op->next = NULL;
- NCCLCHECK(ProxyAppend(state, op));
- op = next;
+ if (state->active == NULL) {
+ pthread_mutex_lock(&pool->mutex);
+ while (pool->nextOps == -1 && !state->stop) {
+ struct ncclProxyArgs profArgs; // Only used for profiling purposes
+ ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileSleep);
+ pthread_cond_wait(&pool->cond, &pool->mutex);
+ ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileWakeup);
+ }
+ if (state->stop) { // We might have been woken up to stop.
+ pthread_mutex_unlock(&pool->mutex);
+ return ncclSuccess;
+ }
}
- state->postedOps = op;
- if (op == NULL) state->postedOpsEnd = NULL;
- NCCLCHECK(dumpProxyState(state));
- pthread_mutex_unlock(&state->opsMutex);
- if (state->poolFreed) {
- struct ncclProxyArgs* end = state->poolFreed;
- while (end->next) end = end->next;
- pthread_mutex_lock(&state->poolMutex);
- end->next = state->poolReturned;
- state->poolReturned = state->poolFreed;
- pthread_mutex_unlock(&state->poolMutex);
- state->poolFreed = NULL;
+ state->nextOps = pool->nextOps;
+ pool->nextOps = pool->nextOpsEnd = -1;
+ pthread_mutex_unlock(&pool->mutex);
+ if (state->nextOps == -1) return ncclInternalError;
+
+process_nextops:
+ ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppend);
+ TIME_START(2);
+ int freeOp[NCCL_MAX_LOCAL_RANKS];
+ int freeOpEnd[NCCL_MAX_LOCAL_RANKS];
+ for (int i=0; i<comm->localRanks; i++) freeOp[i] = -1;
+
+ for (int opIndex = state->nextOps; opIndex != -1;) {
+ struct ncclProxyOp* peerOp = pool->ops+opIndex;
+ int peer = opIndex / MAX_OPS_PER_PEER;
+ if (peerOp->connection == NULL) return ncclInternalError;
+ if (peerOp->next != -1) __builtin_prefetch(pool->ops+peerOp->next);
+ NCCLCHECK(ProxyAppend(state, peerOp));
+ (*added)++;
+ int lastOpIndex = opIndex;
+ opIndex = peerOp->next;
+ // Return op to peer pool
+ if (freeOp[peer] == -1) {
+ freeOpEnd[peer] = lastOpIndex;
+ } else {
+ peerOp->next = freeOp[peer];
+ }
+ freeOp[peer] = lastOpIndex;
+ state->nextOps = opIndex;
}
+ for (int i=0; i<comm->localRanks; i++) {
+ if (freeOp[i] == -1) continue;
+ int newFree = freeOp[i];
+ int oldFree = pool->freeOps[i];
+ pool->ops[freeOpEnd[i]].next = oldFree;
+ if (oldFree == -1) {
+ // Nothing for the main thread to consume, we can set it.
+ pool->freeOps[i] = newFree;
+ } else {
+      // The main thread may recycle free ops at any time, so replace the freeOps value atomically and check that it worked.
+ int swap = __sync_val_compare_and_swap(pool->freeOps+i, oldFree, newFree);
+ if (swap != oldFree) {
+ if (swap != -1) return ncclInternalError;
+ // Ops were recycled while we were trying to swap, just set the value directly now.
+ pool->ops[freeOpEnd[i]].next = -1;
+ pool->freeOps[i] = newFree;
+ }
+ }
+ }
+ profArgs.opCount = *added;
+ ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileAppendEnd);
+ TIME_STOP(2);
return ncclSuccess;
}
+#include <signal.h>
+static ncclProxyProgressState* ncclLastProxyState;
+void ncclDumpProxyState(int signal) {
+ dumpProxyState(ncclLastProxyState);
+}
-void* persistentThread(void *comm_) {
+void* ncclProxyProgress(void *comm_) {
struct ncclComm* comm = (struct ncclComm*)comm_;
- struct ncclProxyState* state = &comm->proxyState;
- char threadName[16];
- sprintf(threadName, "NCCLproxy %5d", comm->rank);
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ state->nextOps = -1;
+ signal(SIGUSR1, ncclDumpProxyState);
+ ncclLastProxyState = state;
+ char threadName[NCCL_THREAD_NAMELEN];
+ snprintf(threadName, NCCL_THREAD_NAMELEN, "NCCL Progress%2d", comm->cudaDev);
nvtxNameOsThreadA(syscall(SYS_gettid), threadName);
- struct ncclProxyArgs** opsPtr = &state->ops;
- while (1) {
- if (*comm->abortFlag) {
- return NULL;
- }
-
- while (*opsPtr == NULL) {
- if (state->stop) {
- // No more commands to process and proxy has been requested to stop
- return NULL;
- }
- ncclResult_t ret = ncclProxyAppendPosted(state);
- if (ret != ncclSuccess) {
- comm->fatalError = ret;
- INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
- return NULL;
- }
- }
+ int lastIdle = 0;
+ struct ncclProxyArgs profArgs; // Only used for profiling purposes
+ while (state->stop == 0 && *comm->abortFlag == 0) {
int idle = 1;
- ncclResult_t ret = progressOps(state, opsPtr, &idle, comm);
+ ncclResult_t ret = progressOps(comm, state, state->active, &idle);
if (ret != ncclSuccess) {
comm->fatalError = ret;
INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
return NULL;
}
+ if (lastIdle == 0 && idle == 1) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileIdle);
+ if (lastIdle == 1 && idle == 0) ncclProfilingRecord(&profArgs, 0, 0, ncclProxyProfileActive);
if (idle) {
- sched_yield(); // No request progressed. Let others run.
+ int added = 0;
+ TIME_START(3);
+ ret = ncclProxyGetPostedOps(comm, &added);
+ if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); }
+ if (ret != ncclSuccess) {
+ comm->fatalError = ret;
+ INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret);
+ }
+ if (added == 0) {
+ sched_yield(); // No request progressed. Let others run.
+ }
}
+ lastIdle = idle;
}
+ return NULL;
}
ncclResult_t ncclProxyStart(struct ncclComm* comm) {
- struct ncclProxyState* state = &comm->proxyState;
- if (state->nextOps == NULL) return ncclSuccess;
- pthread_mutex_lock(&state->opsMutex);
- if (state->postedOps) state->postedOpsEnd->next = state->nextOps;
- else state->postedOps = state->nextOps;
- state->postedOpsEnd = state->nextOpsEnd;
- state->nextOps = state->nextOpsEnd = NULL;
- pthread_cond_signal(&state->cond);
- pthread_mutex_unlock(&state->opsMutex);
+ struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps;
+ if (proxyOps == NULL) return ncclSuccess;
+ TIME_START(1);
+ for (int r=0; r<comm->localRanks; r++) {
+ struct ncclProxyOps* ops = proxyOps+r;
+ if (ops->pool == NULL || ops->nextOps == -1) continue;
+ NCCLCHECK(ncclProxyPost(ops->pool, ops->nextOps, ops->nextOpsEnd));
+ ops->nextOps = ops->nextOpsEnd = -1;
+ ops->count = 0;
+ }
comm->opCount++;
+ TIME_STOP(1);
return ncclSuccess;
}
-ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr) {
- struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
- if (state->size == 0) {
- int p2pnChannels = 1;
- while (p2pnChannels < comm->nChannels) p2pnChannels *= 2;
- int p2pSize = 2*p2pnChannels*NCCL_MAX_WORK_ELEMENTS*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR;
- int collNetSize = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE];
- state->size = std::max(p2pSize, collNetSize);
+ncclResult_t ncclProxyProgressCreate(struct ncclComm* comm) {
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ if (!state->thread) {
+ pthread_create(&state->thread, NULL, ncclProxyProgress, comm);
+ ncclSetThreadName(state->thread, "NCCL Progress%2d", comm->cudaDev);
}
+ return ncclSuccess;
+}
- *size = state->size;
+ncclResult_t ncclProxyProgressDestroy(struct ncclComm* comm) {
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
- if (cuda && state->cudaBuff == NULL) {
- NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size));
- } else if (state->hostBuff == NULL) {
- NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size));
+ // Request the proxy to stop and then wake it
+ if (state->opsPool) {
+ pthread_mutex_lock(&state->opsPool->mutex);
+ state->stop = true;
+ pthread_cond_signal(&state->opsPool->cond);
+ pthread_mutex_unlock(&state->opsPool->mutex);
+ pthread_join(state->thread, NULL);
+ }
+
+ // Free off any memory allocated for the proxy arg pools
+ while (state->pools != NULL) {
+ struct ncclProxyPool *next = state->pools->next;
+ free(state->pools);
+ state->pools = next;
}
- *ptr = cuda ? state->cudaBuff : state->hostBuff;
+
+ ncclProfilingDump();
+ TIME_PRINT("Proxy");
return ncclSuccess;
}
-ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr) {
- struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
- // Use different pools for separate send/recv.
- char* buff = cuda ? state->cudaBuff : state->hostBuff;
- int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR);
- int globalSlot = (((type*comm->p2pnChannels+channel)*NCCL_STEPS)+slot)*NCCL_MAX_WORK_ELEMENTS+index;
- *ptr = buff + slotSize * globalSlot;
+struct ncclProxyAsyncOp {
+ int type;
+ struct ncclProxyConnection* connection;
+ int reqSize, respSize;
+ char *reqBuff, *respBuff;
+};
+
+struct ncclProxyLocalPeer {
+ struct ncclSocket sock;
+ int localRank;
+ struct ncclProxyAsyncOp asyncOps;
+};
+
+#define NCCL_PROXY_CONN_POOL_SIZE_POW2 7
+#define NCCL_PROXY_CONN_POOL_SIZE (1<<(NCCL_PROXY_CONN_POOL_SIZE_POW2))
+#define NCCL_PROXY_CONN_POOL_MASK ((NCCL_PROXY_CONN_POOL_SIZE)-1)
+struct ncclProxyConnectionPool {
+ struct ncclProxyConnection** pools;
+ int banks;
+ int offset;
+ struct ncclProxyAsyncOp* ops;
+};
+
+static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) {
+ if (pool->offset == NCCL_PROXY_CONN_POOL_SIZE) {
+ NCCLCHECK(ncclRealloc(&pool->pools, pool->banks, pool->banks+1));
+ NCCLCHECK(ncclCalloc(pool->pools+pool->banks, NCCL_PROXY_CONN_POOL_SIZE));
+ pool->banks++;
+ pool->offset = 0;
+ }
+ *id = ((pool->banks-1) << NCCL_PROXY_CONN_POOL_SIZE_POW2) + pool->offset;
+ pool->offset++;
return ncclSuccess;
}
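
The connection pool above hands out small integer ids rather than pointers: the bank index lives in the high bits and the slot within the bank in the low NCCL_PROXY_CONN_POOL_SIZE_POW2 bits, which is what ncclProxyGetConnection() decodes below. The layout as a pair of illustrative helpers:

static inline int connIdEncode(int bank, int offset) {
  return (bank << NCCL_PROXY_CONN_POOL_SIZE_POW2) + offset;  // offset < NCCL_PROXY_CONN_POOL_SIZE
}
static inline void connIdDecode(int id, int* bank, int* offset) {
  *bank = id >> NCCL_PROXY_CONN_POOL_SIZE_POW2;
  *offset = id & NCCL_PROXY_CONN_POOL_MASK;
}
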
-ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr) {
- struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
- // Use different pools for different channels.
- char* buff = cuda ? state->cudaBuff : state->hostBuff;
- int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
- int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel;
- *ptr = buff + slotSize * globalSlot;
+
+static ncclResult_t ncclProxyGetConnection(struct ncclProxyConnectionPool* pool, int id, struct ncclProxyConnection** conn) {
+ int bank = id>>NCCL_PROXY_CONN_POOL_SIZE_POW2;
+ int offset = id&NCCL_PROXY_CONN_POOL_MASK;
+ if ((pool->pools == NULL) || (bank > pool->banks) || (pool->pools[bank] == NULL)) return ncclInternalError;
+ *conn = pool->pools[bank]+offset;
return ncclSuccess;
}
-ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm) {
- struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs;
- CUDACHECK(cudaFree(state->cudaBuff));
- NCCLCHECK(ncclCudaHostFree(state->hostBuff));
+static ncclResult_t proxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+ if (connection->send) {
+ NCCLCHECK(ncclTransports[connection->transport].send.proxyFree(connection, comm));
+ } else {
+ NCCLCHECK(ncclTransports[connection->transport].recv.proxyFree(connection, comm));
+ }
return ncclSuccess;
}
-ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
- if (!comm->proxyThread) {
- comm->proxyState.cond = PTHREAD_COND_INITIALIZER;
- comm->proxyState.opsMutex = PTHREAD_MUTEX_INITIALIZER;
- comm->proxyState.poolMutex = PTHREAD_MUTEX_INITIALIZER;
- comm->proxyState.ops = NULL;
- pthread_create(&comm->proxyThread, NULL, persistentThread, comm);
+static ncclResult_t ncclProxyFreeConnections(struct ncclProxyConnectionPool* pool, struct ncclComm* comm) {
+ for (int b=0; b<pool->banks; b++) {
+ int max = b == pool->banks-1 ? pool->offset : NCCL_PROXY_CONN_POOL_SIZE;
+ for (int i=0; i<max; i++) {
+ NCCLCHECK(proxyFree(pool->pools[b]+i, comm));
+ }
+ free(pool->pools[b]);
}
+ free(pool->pools);
return ncclSuccess;
}
-ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
- struct ncclProxyState* state = &comm->proxyState;
+#include "transport.h"
- // Request the proxy to stop and then wake it
- pthread_mutex_lock(&state->opsMutex);
- state->stop = true;
- pthread_cond_signal(&state->cond);
- pthread_mutex_unlock(&state->opsMutex);
- if (comm->proxyThread) pthread_join(comm->proxyThread, NULL);
+ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int rank, struct ncclProxyConnector* proxyConn) {
+  // Keep one connection per local rank
+ proxyConn->connection = NULL;
+ proxyConn->rank = rank;
+ if (comm->proxyState.peerSocks == NULL) {
+ NCCLCHECK(ncclCalloc(&comm->proxyState.peerSocks, comm->localRanks));
+ NCCLCHECK(ncclCalloc(&comm->proxyState.proxyOps, comm->localRanks));
+ NCCLCHECK(ncclCalloc(&comm->proxyState.sharedDevMems, comm->localRanks));
+ for (int r=0; r<comm->localRanks; r++) {
+ comm->proxyState.peerSocks[r].fd = -1;
+ comm->proxyState.peerSocks[r].abortFlag = comm->abortFlag;
+ }
+ }
+ NCCLCHECK(ncclTopoGetLocalRank(comm->topo, rank, &proxyConn->localRank));
+ struct ncclSocket* sock = comm->proxyState.peerSocks+proxyConn->localRank;
+ if (sock->fd == -1) {
+ memcpy(&sock->addr, comm->proxyState.peerAddresses+rank, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketConnect(sock));
+ }
+ int type = ncclProxyMsgInit;
+ NCCLCHECK(ncclSocketSend(sock, &type, sizeof(int)));
+ NCCLCHECK(ncclSocketSend(sock, &transport, sizeof(int)));
+ NCCLCHECK(ncclSocketSend(sock, &send, sizeof(int)));
+ NCCLCHECK(ncclSocketSend(sock, &comm->localRank, sizeof(int)));
+ NCCLCHECK(ncclSocketRecv(sock, &proxyConn->connection, sizeof(void*)));
+ struct ncclTransportComm* tcomm = send ? &ncclTransports[transport].send : &ncclTransports[transport].recv;
+ // If we need proxy progress, map progress ops
+ if (tcomm->proxyProgress) {
+ char poolPath[] = "/dev/shm/nccl-XXXXXX";
+ NCCLCHECK(ncclSocketRecv(sock, poolPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1));
+ struct ncclProxyOps* proxyOps = comm->proxyState.proxyOps+proxyConn->localRank;
+ if (proxyOps->pool == NULL) {
+ NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, 0));
+ proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1;
+ }
+ }
+ INFO(NCCL_NET, "Connection to proxy localRank %d -> connection %p", proxyConn->localRank, proxyConn->connection);
+ proxyConn->comm = comm;
+ return ncclSuccess;
+}
- // Free off any memory allocated for the proxy arg pools
- pthread_mutex_lock(&state->poolMutex);
- struct ncclProxyState* proxyState = &comm->proxyState;
- while (proxyState->pools != NULL) {
- struct ncclProxyPool *next = proxyState->pools->next;
- free(proxyState->pools);
- proxyState->pools = next;
+const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop" };
+ncclResult_t ncclProxyCall(struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize) {
+ if (proxyConn->comm->proxyState.peerSocks == NULL) return ncclInternalError;
+ struct ncclSocket* sock = proxyConn->comm->proxyState.peerSocks+proxyConn->localRank;
+ if (sock->fd == -1) return ncclInternalError;
+ ncclResult_t ret;
+
+ NCCLCHECKGOTO(ncclSocketSend(sock, &type, sizeof(int)), ret, error);
+ NCCLCHECKGOTO(ncclSocketSend(sock, &proxyConn->connection, sizeof(void*)), ret, error);
+ NCCLCHECKGOTO(ncclSocketSend(sock, &reqSize, sizeof(int)), ret, error);
+ NCCLCHECKGOTO(ncclSocketSend(sock, &respSize, sizeof(int)), ret, error);
+ if (reqSize) NCCLCHECKGOTO(ncclSocketSend(sock, reqBuff, reqSize), ret, error);
+ if (respSize) NCCLCHECKGOTO(ncclSocketRecv(sock, respBuff, respSize), ret, error);
+ return ncclSuccess;
+error:
+ WARN("Proxy Call to rank %d failed (%s)", proxyConn->comm->localRankToRank[proxyConn->localRank], ncclProxyMsgTypeStr[type]);
+ return ret;
+}
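
ncclProxyCall() frames every request the same way: message type, connection handle, request size, response size, then the payloads. A hypothetical caller, assuming an ncclProxyMsgSetup enum value matching "Setup" in the string table above; the request/response structs are illustrative, not NCCL types:

struct exSetupReq  { int channelId; int bufferSize; };
struct exSetupResp { void* remoteBuffer; };

static ncclResult_t exCallSetup(struct ncclProxyConnector* proxyConn, int channelId) {
  struct exSetupReq req = { channelId, 0 };
  struct exSetupResp resp;
  NCCLCHECK(ncclProxyCall(proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &resp, sizeof(resp)));
  return ncclSuccess;
}
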
+
+static ncclResult_t proxyProgressInit(struct ncclComm* comm) {
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ if (state->opsPool == NULL) {
+ int size = sizeof(struct ncclProxyOpsPool);
+ struct ncclProxyOpsPool* pool = NULL;
+
+ char shmPath[sizeof("/dev/shm/nccl-XXXXXX")];
+ shmPath[0] = '\0';
+ NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, 1));
+
+ // Init pool
+ pool->nextOps = -1;
+
+ // The service thread may be launched already but localRanks may not be set yet.
+ while (comm->localRanks == 0) sched_yield();
+
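+ // Carve the pool into per-peer free lists: local rank r owns ops [r*MAX_OPS_PER_PEER,
+ // (r+1)*MAX_OPS_PER_PEER), chained through the next field and terminated by -1.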
+ for (int r=0; r<comm->localRanks; r++) {
+ pool->freeOps[r] = r*MAX_OPS_PER_PEER;
+ for (int i=0; i<MAX_OPS_PER_PEER-1; i++) pool->ops[r*MAX_OPS_PER_PEER+i].next = r*MAX_OPS_PER_PEER+i+1;
+ pool->ops[(r+1)*MAX_OPS_PER_PEER-1].next = -1;
+ }
+
+ // Setup mutex/cond to work inter-process
+ pthread_mutexattr_t mutexAttr;
+ pthread_mutexattr_init(&mutexAttr);
+ pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED);
+ pthread_mutex_init(&pool->mutex, &mutexAttr);
+ pthread_condattr_t condAttr;
+ pthread_condattr_init(&condAttr);
+ pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED);
+ pthread_cond_init(&pool->cond, &condAttr);
+ state->opsPool = pool;
+
+ memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1);
+
+ // All ops structures are created, we can start the progress thread
+ NCCLCHECK(ncclProxyProgressCreate(comm));
+ }
+ return ncclSuccess;
+}
+
+static void proxyOpsFree(struct ncclComm* comm) {
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ if (ncclShmClose(state->opsPool, NULL, sizeof(struct ncclProxyOpsPool)) != ncclSuccess) {
+ WARN("[Service thread] shm close failed");
+ }
+}
+
+ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm) {
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ if (state->opsPool == NULL) return ncclSuccess;
+
+ char shmPath[] = "/dev/shm/nccl-XXXXXX";
+ memcpy(shmPath+sizeof("/dev/shm/nccl-")-1, state->opsPoolShmSuffix, sizeof("XXXXXX")-1);
+ if (ncclShmUnlink(shmPath) != ncclSuccess) {
+ WARN("[Service thread] shm unlink failed");
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t proxyConnInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) {
+ struct ncclSocket* sock = &peer->sock;
+ char buf[SOCKET_NAME_MAXLEN+1];
+ buf[SOCKET_NAME_MAXLEN] = '\0';
+ int id;
+ struct ncclProxyConnection* connection;
+ NCCLCHECK(ncclProxyNewConnection(connectionPool, &id));
+ NCCLCHECK(ncclProxyGetConnection(connectionPool, id, &connection));
+ connection->sock = sock;
+ NCCLCHECK(ncclSocketRecv(sock, &connection->transport, sizeof(int)));
+ NCCLCHECK(ncclSocketRecv(sock, &connection->send, sizeof(int)));
+ NCCLCHECK(ncclSocketRecv(sock, &peer->localRank, sizeof(int)));
+ connection->localRank = peer->localRank;
+ NCCLCHECK(ncclSocketSend(sock, &connection, sizeof(void*)));
+ connection->tcomm = connection->send ? &ncclTransports[connection->transport].send : &ncclTransports[connection->transport].recv;
+ // If we need proxy progress, let's allocate ops and start the thread
+ if (connection->tcomm->proxyProgress) {
+ NCCLCHECK(proxyProgressInit(comm));
+ struct ncclProxyProgressState* state = &comm->proxyState.progressState;
+ NCCLCHECK(ncclSocketSend(sock, state->opsPoolShmSuffix, sizeof("XXXXXX")-1));
+ }
+ buf[SOCKET_NAME_MAXLEN] = '\0';
+ INFO(NCCL_NET, "New proxy %s connection %d from %s, transport %d", connection->send ? "send":"recv", id, ncclSocketToString(&sock->addr, buf), connection->transport);
+ return ncclSuccess;
+}
+
+static ncclResult_t proxyConnSharedInit(struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm) {
+ struct ncclSocket* sock = &peer->sock;
+ struct ncclProxyConnection* connection;
+ NCCLCHECK(ncclSocketRecv(sock, &connection, sizeof(void*)));
+ int reqSize, respSize;
+ NCCLCHECK(ncclSocketRecv(sock, &reqSize, sizeof(int)));
+ NCCLCHECK(ncclSocketRecv(sock, &respSize, sizeof(int)));
+ if (reqSize != sizeof(int) || respSize != 0) return ncclInternalError;
+ int nChannels;
+ NCCLCHECK(ncclSocketRecv(sock, &nChannels, sizeof(int)));
+ if (connection->tcomm->proxySharedInit) NCCLCHECK(connection->tcomm->proxySharedInit(connection, comm, nChannels));
+ return ncclSuccess;
+}
+
+static ncclResult_t proxyProgressAsync(struct ncclProxyAsyncOp* op, struct ncclComm* comm, int* asyncOpCount) {
+ int done = 1;
+ if (op->type == ncclProxyMsgSetup) {
+ NCCLCHECK(op->connection->tcomm->proxySetup(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
+ } else if (op->type == ncclProxyMsgConnect) {
+ NCCLCHECK(op->connection->tcomm->proxyConnect(op->connection, comm, op->reqBuff, op->reqSize, op->respBuff, op->respSize, &done));
+ } else return ncclInternalError;
+ if (done) {
+ if (op->respSize) NCCLCHECK(ncclSocketSend(op->connection->sock, op->respBuff, op->respSize));
+ if (op->reqBuff) free(op->reqBuff);
+ if (op->respBuff) free(op->respBuff);
+ op->reqBuff = NULL;
+ op->respBuff = NULL;
+ op->type = 0;
+ (*asyncOpCount)--;
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t proxyConnSetupConnect(int type, struct ncclProxyLocalPeer* peer, struct ncclProxyConnectionPool* connectionPool, struct ncclComm* comm, int* asyncOpCount) {
+ struct ncclSocket* sock = &peer->sock;
+ struct ncclProxyAsyncOp* asyncOp = &peer->asyncOps;
+ asyncOp->type = type;
+ NCCLCHECK(ncclSocketRecv(sock, &asyncOp->connection, sizeof(void*)));
+
+ NCCLCHECK(ncclSocketRecv(sock, &asyncOp->reqSize, sizeof(int)));
+ NCCLCHECK(ncclSocketRecv(sock, &asyncOp->respSize, sizeof(int)));
+ if (asyncOp->reqSize) {
+ NCCLCHECK(ncclCalloc(&asyncOp->reqBuff, asyncOp->reqSize));
+ NCCLCHECK(ncclSocketRecv(sock, asyncOp->reqBuff, asyncOp->reqSize));
+ }
+ if (asyncOp->respSize) NCCLCHECK(ncclCalloc(&asyncOp->respBuff, asyncOp->respSize));
+ (*asyncOpCount)++;
+ NCCLCHECK(proxyProgressAsync(asyncOp, comm, asyncOpCount));
+ return ncclSuccess;
+}
+
+#include <poll.h>
+
+void* ncclProxyService(void* _args) {
+ struct ncclComm* comm = (struct ncclComm *) _args;
+ if (cudaSetDevice(comm->cudaDev) != cudaSuccess) {
+ WARN("[Proxy Service] Failed to set CUDA device %d", comm->cudaDev);
+ }
+ if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity);
+
+ // Prepare poll descriptor
+ struct ncclProxyConnectionPool connectionPool;
+ connectionPool.pools = NULL;
+ connectionPool.banks = 0;
+ connectionPool.offset = NCCL_PROXY_CONN_POOL_SIZE;
+
+ struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1];
+ struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS];
+ for (int s=0; s<NCCL_MAX_LOCAL_RANKS; s++) {
+ peers[s].sock.fd = pollfds[s].fd = -1;
+ peers[s].sock.abortFlag = NULL;
+ peers[s].sock.asyncFlag = 0;
+ pollfds[s].events = POLLHUP|POLLIN;
+ peers[s].asyncOps.type = 0;
+ }
+ pollfds[NCCL_MAX_LOCAL_RANKS].fd = comm->proxyState.listenSock->fd;
+ pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN;
+
+ int maxnpeers = 0;
+ int npeers = 0;
+ int stop = 0;
+ int asyncOpCount = 0;
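+ // stop == 0: keep serving requests; stop == 1 (Stop received): serve until all peers
+ // have disconnected; stop == 2 (Abort received): exit the loop immediately.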
+ while (stop == 0 || (stop == 1 && npeers > 0)) {
+ if (poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : -1) < 0) {
+ WARN("[Proxy Service] Poll failed: %s", strerror(errno));
+ return NULL;
+ }
+ if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) {
+ int s = 0;
+ while (s < NCCL_MAX_LOCAL_RANKS && peers[s].sock.fd != -1) s++;
+ if (s == NCCL_MAX_LOCAL_RANKS) {
+ WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS);
+ return NULL;
+ }
+ if (maxnpeers < s+1) maxnpeers = s+1;
+ struct ncclSocket* sock = &peers[s].sock;
+ if (ncclSocketAccept(sock, comm->proxyState.listenSock) != ncclSuccess) {
+ WARN("[Service thread] Accept failed %s", strerror(errno));
+ } else {
+ pollfds[s].fd = sock->fd;
+ npeers++;
+ peers[s].localRank = -1;
+ }
+ }
+ for (int s=0; s<maxnpeers; s++) {
+ struct ncclProxyLocalPeer* peer = peers+s;
+ struct ncclSocket* sock = &peer->sock;
+ struct ncclProxyAsyncOp* op = &peer->asyncOps;
+ int closeConn = 0;
+ int type = 0;
+ ncclResult_t res = ncclSuccess;
+ if (op->type != 0) {
+ res = proxyProgressAsync(op, comm, &asyncOpCount);
+ type = op->type;
+ if (res != ncclSuccess) op->type = 0;
+ } else if (pollfds[s].revents & POLLIN) {
+ int closed;
+ if (ncclSocketTryRecv(sock, &type, sizeof(int), &closed) != ncclSuccess) {
+ WARN("[Service thread] Could not receive type from localRank %d", peer->localRank);
+ closeConn = 1;
+ } else if (closed) {
+ INFO(NCCL_INIT|NCCL_NET, "[Service thread] Connection closed by localRank %d", peer->localRank);
+ closeConn = 1;
+ } else {
+ if (type == ncclProxyMsgAbort) {
+ stop = 2;
+ closeConn = 1;
+ } else if (type == ncclProxyMsgStop) {
+ stop = 1;
+ closeConn = 1;
+ } else if (type == ncclProxyMsgClose) {
+ closeConn = 1;
+ } else if (type == ncclProxyMsgInit) {
+ res = proxyConnInit(peers+s, &connectionPool, comm);
+ } else if (type == ncclProxyMsgSharedInit) {
+ res = proxyConnSharedInit(peers+s, &connectionPool, comm);
+ } else if (type == ncclProxyMsgSetup || type == ncclProxyMsgConnect) {
+ res = proxyConnSetupConnect(type, peers+s, &connectionPool, comm, &asyncOpCount);
+ } else {
+ WARN("[Service thread] Unknown command %d from localRank %d\n", type, peer->localRank);
+ closeConn = 1;
+ }
+ }
+ } else if (pollfds[s].revents & POLLHUP) {
+ closeConn = 1;
+ }
+ if (res != ncclSuccess) {
+ WARN("[Proxy Service %d] Failed to execute operation %s from rank %d, retcode %d", comm->rank, ncclProxyMsgTypeStr[type], comm->localRankToRank[peer->localRank], res);
+ closeConn = 1;
+ }
+ if (closeConn) {
+ close(sock->fd);
+ sock->fd = pollfds[s].fd = -1;
+ npeers--;
+ }
+ }
+ }
+ // Wait for all operations to complete and stop progress thread before freeing any resource
+ if (ncclProxyProgressDestroy(comm) != ncclSuccess) {
+ WARN("[Proxy Service] proxyDestroy failed");
}
- pthread_mutex_unlock(&state->poolMutex);
+ for (int s=0; s<maxnpeers; s++) {
+ if (peers[s].sock.fd != -1) close(peers[s].sock.fd);
+ }
+ ncclProxyFreeConnections(&connectionPool, comm);
+ close(comm->proxyState.listenSock->fd);
+ free(comm->proxyState.listenSock);
+ proxyOpsFree(comm);
+ return NULL;
+}
+
+ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses) {
+ comm->proxyState.listenSock = sock;
+ comm->proxyState.peerAddresses = peerAddresses;
+ return ncclSuccess;
+}
- NCCLCHECK(ncclProxySharedBuffersDestroy(comm));
+ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
+ pthread_create(&comm->proxyState.thread, NULL, ncclProxyService, comm);
+ ncclSetThreadName(comm->proxyState.thread, "NCCL Service %2d", comm->cudaDev);
+ return ncclSuccess;
+}
+ncclResult_t ncclProxyDestroy(struct ncclComm* comm) {
+ struct ncclProxyState* state = &comm->proxyState;
+ if (state->peerAddresses) {
+ struct ncclSocket sock;
+ sock.abortFlag = NULL;
+ sock.asyncFlag = 0;
+ memcpy(&sock.addr, comm->proxyState.peerAddresses+comm->rank, sizeof(union ncclSocketAddress));
+ NCCLCHECK(ncclSocketConnect(&sock));
+ int type = (*comm->abortFlag) ? ncclProxyMsgAbort : ncclProxyMsgStop;
+ NCCLCHECK(ncclSocketSend(&sock, &type, sizeof(int)));
+ close(sock.fd);
+ free(state->peerAddresses);
+ }
+ if (state->peerSocks) {
+ for (int i=0; i<comm->localRanks; i++) {
+ if (state->peerSocks[i].fd != -1) {
+ if (state->proxyOps[i].pool) {
+ NCCLCHECK(ncclShmClose(state->proxyOps[i].pool, NULL, sizeof(struct ncclProxyOpsPool)));
+ }
+ if (state->sharedDevMems[i]) {
+ CUDACHECK(cudaIpcCloseMemHandle(state->sharedDevMems[i]));
+ }
+ int type = ncclProxyMsgClose;
+ if (*comm->abortFlag == 0) NCCLCHECK(ncclSocketSend(state->peerSocks+i, &type, sizeof(int)));
+ close(state->peerSocks[i].fd);
+ }
+ }
+ free(state->peerSocks);
+ free(state->proxyOps);
+ free(state->sharedDevMems);
+ }
return ncclSuccess;
}
diff --git a/src/transport.cc b/src/transport.cc
index 2cb5538..7ce5f2e 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,15 +7,19 @@
#include "comm.h"
#include "info.h"
#include "bootstrap.h"
+#define ENABLE_TIMER 0
+#include "timer.h"
extern struct ncclTransport p2pTransport;
extern struct ncclTransport shmTransport;
extern struct ncclTransport netTransport;
+extern struct ncclTransport collNetTransport;
struct ncclTransport ncclTransports[NTRANSPORTS] = {
p2pTransport,
shmTransport,
netTransport,
+ collNetTransport
};
template <int type>
@@ -82,12 +86,15 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
struct ncclConnect* recvData = data;
int sendChannels = 0, recvChannels = 0;
int type;
+ TIME_START(0);
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1<<c)) {
NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex, &type));
if (type > highestType) highestType = type;
}
}
+ TIME_STOP(0);
+ TIME_START(1);
struct ncclConnect* sendData = recvData+recvChannels;
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1<<c)) {
@@ -95,7 +102,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
if (type > highestType) highestType = type;
}
}
+ TIME_STOP(1);
+ TIME_START(2);
if (sendPeer == recvPeer) {
if (recvChannels+sendChannels) {
NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
@@ -109,7 +118,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
}
+ TIME_STOP(2);
+ TIME_START(3);
for (int c=0; c<MAXCHANNELS; c++) {
if (sendMask & (1<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex;
@@ -118,6 +129,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream));
}
}
+ TIME_STOP(3);
+ TIME_START(4);
for (int c=0; c<MAXCHANNELS; c++) {
if (recvMask & (1<<c)) {
struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex;
@@ -126,11 +139,13 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
CUDACHECK(cudaMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), cudaMemcpyHostToDevice, transportSetupStream));
}
}
+ TIME_STOP(4);
comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0;
}
CUDACHECK(cudaStreamSynchronize(transportSetupStream));
CUDACHECK(cudaStreamDestroy(transportSetupStream));
if (highestTransportType != NULL) *highestTransportType = highestType;
+ TIME_PRINT("P2P Setup/Connect");
return ncclSuccess;
}
@@ -225,9 +240,9 @@ cleanup:
ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) {
// AllGather collNet setup results
- int allGatherFailures[NCCL_MAX_INTRA_RANKS] = {0};
- allGatherFailures[comm->intraNodeRank] = collNetSetupFail;
- NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->intraNodeGlobalRanks, comm->intraNodeRank, comm->localRanks, allGatherFailures, sizeof(int)));
+ int allGatherFailures[NCCL_MAX_LOCAL_RANKS] = {0};
+ allGatherFailures[comm->localRank] = collNetSetupFail;
+ NCCLCHECK(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, allGatherFailures, sizeof(int)));
for (int i=0; i<comm->localRanks; i++) {
if (allGatherFailures[i] != 0) {
collNetSetupFail = 1;
@@ -235,7 +250,7 @@ ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFa
}
}
if (collNetSetupFail) {
- if (comm->intraNodeRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
+ if (comm->localRank == 0) WARN("Cannot initialize CollNet, using point-to-point network instead");
return ncclSystemError;
}
return ncclSuccess;
@@ -248,12 +263,12 @@ ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm) {
struct ncclPeer* peer = channel->peers+comm->nRanks;
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* send = peer->send + b;
- if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send->transportResources));
+ if (send->transportResources && send->transportComm) NCCLCHECK(send->transportComm->free(send));
send->transportResources = NULL; // avoid double free
}
for (int b=0; b<NCCL_MAX_CONNS; b++) {
struct ncclConnector* recv = peer->recv + b;
- if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv->transportResources));
+ if (recv->transportResources && recv->transportComm) NCCLCHECK(recv->transportComm->free(recv));
recv->transportResources = NULL; // avoid double free
}
}
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
index 4c0e76d..26f875f 100644
--- a/src/transport/coll_net.cc
+++ b/src/transport/coll_net.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,11 +7,15 @@
#include "comm.h"
#include "coll_net.h"
#include "graph.h"
+#include "proxy.h"
+#include "gdrwrap.h"
-#define COLLNET_GROUP_NSUBS 8
-#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS)
+int64_t ncclParamGdrCopySyncEnable();
+int64_t ncclParamGdrCopyFlushEnable();
struct collNetRecvConnectInfo {
+ int rank;
+ int nranks;
collNetHandle_t collNetHandle;
};
@@ -20,128 +24,279 @@ struct collNetSendConnectInfo {
void* reqFifo;
};
+#define COLLNET_GROUP_NSUBS 8
+#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS)
+
+#define NCCL_NET_MAP_HOSTMEM 0
+#define NCCL_NET_MAP_DEVMEM 1
+#define NCCL_NET_MAP_SHARED_HOSTMEM 2
+#define NCCL_NET_MAP_SHARED_DEVMEM 3
+#define NCCL_NET_MAP_GDCMEM 4
+#define NCCL_NET_MAP_MEMS 5
+
+#define NCCL_NET_MAP_MASK_DEVMEM 0x40000000
+#define NCCL_NET_MAP_MASK_SHARED 0x80000000
+#define NCCL_NET_MAP_MASK_USED 0x20000000
+#define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff
+
+#define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \
+ ((mapStruct)->offsets.offsetName >> 30)
+
+#define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \
+ (((mapStruct)->offsets.offsetName >> 29) == 0)
+
+#define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \
+ (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? NULL : \
+ (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET))
+
+#define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \
+ (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0)
+
+#define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { \
+ int bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; \
+ if ((shared) == 0) { \
+ if (dev) { \
+ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; \
+ (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += memSize; \
+ } else { \
+ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; \
+ (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += memSize; \
+ } \
+ } else { \
+ (mapStruct)->offsets.offsetName = bank; \
+ } \
+} while (0);
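+// Example: NCCL_NET_MAP_ADD_POINTER(map, /*shared*/0, /*dev*/1, size, recvMem) stores
+// USED|DEVMEM plus the current devmem bank size in offsets.recvMem and grows that bank;
+// NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem) later adds the low 29 offset bits to
+// mems[NCCL_NET_MAP_DEVMEM].cpuPtr. An offset never added this way stays 0 and reads as NULL.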
+
+struct connectMapMem{
+ char* gpuPtr;
+ char* cpuPtr;
+ int size;
+};
+
+struct connectMap {
+ int shared;
+ // First 3 bits of offsets determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem.
+ struct connectMapMem mems[NCCL_NET_MAP_MEMS];
+ // Offsets. 3 MSBs indicate mem bank; an offset whose 3 MSBs are all zero was never added and is NULL.
+ struct {
+ uint32_t sendMem;
+ uint32_t recvMem;
+ uint32_t buffs[NCCL_NUM_PROTOCOLS];
+ } offsets;
+};
+
struct reqSlot {
volatile void* recvBuff;
volatile int size;
};
-struct collNetSendResources {
- struct ncclComm* comm;
+struct sendResources {
+ struct connectMap map;
void* collNetComm;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
+
+ int rank;
+ int nranks;
int netDev;
int useGdr;
+ uint64_t* gdcSync;
+ void* gdrDesc;
void* sendMhandles[NCCL_NUM_PROTOCOLS];
void* recvMhandles[NCCL_NUM_PROTOCOLS];
- struct ncclRecvMem* devRecvMem;
uint64_t step;
- uint64_t llLastCleaning;
struct reqSlot (*reqFifo)[NCCL_STEPS];
int collNetRank;
};
-struct collNetRecvResources {
- struct ncclComm* comm;
+struct recvResources {
+ struct connectMap map;
void* collNetComm;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
+
+ int rank;
+ int nranks;
int netDev;
int useGdr;
+ uint64_t* gdcSync;
+ uint64_t* gdcFlush;
+ void* gdrDesc;
void* mhandles[NCCL_NUM_PROTOCOLS];
- struct ncclRecvMem* devRecvMem;
uint64_t step;
- uint64_t llLastCleaning;
struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS];
int collNetRank;
};
-struct collNetSharedResources {
- void* collNetListenComms[MAXCHANNELS];
- void* collNetComms[MAXCHANNELS];
- int collNetCommRefCount[MAXCHANNELS];
-};
-
/* Determine if we can communicate with the peer */
-ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
*ret = 1;
return ncclSuccess;
}
-ncclResult_t collNetSharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) {
- struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources;
- if (resources == NULL) {
- NCCLCHECK(ncclCalloc(&resources, 1));
- comm->proxyState.sharedBuffs.collNetResources = resources;
+struct setupReq {
+ int netDev;
+ int useGdr;
+};
+
+
+/* Setup send connector, and return connect information for others in the coll
+ * communicator to connect to me */
+static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
+ struct setupReq req;
+
+ int proxyRank;
+ NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
+ NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
+ send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+
+ NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &send->proxyConn.localRank));
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 1, myInfo->rank, &send->proxyConn));
+ NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
+
+ INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev,
+ req.useGdr ? "/GDRDMA" : "");
+ return ncclSuccess;
+}
+
+static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
+ struct setupReq req;
+
+ int proxyRank;
+ NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &req.netDev, &proxyRank));
+ NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
+ recv->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
+
+ NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &recv->proxyConn.localRank));
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn));
+ struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
+ NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t)));
+
+ INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), req.netDev,
+ req.useGdr ? "/GDRDMA" : "");
+ return ncclSuccess;
+}
+
+static ncclResult_t collNetDumpMap(struct connectMap* map) {
+ printf("Dump map\n");
+ struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM;
+ printf("Mem 0: Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+ mem = map->mems+NCCL_NET_MAP_DEVMEM;
+ printf("Mem 1: Vid mem CPU (%x B) %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+ mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM;
+ printf("Mem 2: Shared Host mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+ mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM;
+ printf("Mem 3: Shared Vid (%x B) mem CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+ printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
+ map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+ NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET,
+ NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem));
+ printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
+ map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+ NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET,
+ NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem));
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ printf("Proto %d -> Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p,
+ map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+ NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET,
+ NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]));
}
- if (resources->collNetComms[netDev] == NULL)
- NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev));
+ printf("End of dump\n");
return ncclSuccess;
}
-/* Setup send connector, and return connect information for others in the coll communicator to connect to me */
-ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
- struct collNetSendResources* resources;
- NCCLCHECK(ncclCalloc(&resources, 1));
- send->transportResources = resources;
- send->conn.shared = 1;
- resources->comm = comm;
+struct collNetConnectArgs {
+ int rank;
+ int nranks;
+ struct ncclConnect* connectInfos;
+};
+
+static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
+ // We're on the same process as the proxy. We can pass a pointer to a struct.
+ struct collNetConnectArgs args = { rank, nranks, connectInfos };
+ struct connectMap* map;
+ NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
+
+ //NCCLCHECK(collNetDumpMap(map));
+
+ struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
+ void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
+ send->conn.head = gdcMem ? (uint64_t*)gdcMem : &sendMem->head;
+
+ struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
+ send->conn.tail = &recvMem->tail;
+ send->conn.sizesFifo = recvMem->sizesFifo;
+ for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
+ send->conn.offsFifo = recvMem->offsFifo;
+
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
+ send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
+ return ncclSuccess;
+}
- NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev));
- NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
+static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
+ // We're on the same process as the proxy. We can pass a pointer to a struct.
+ struct collNetConnectArgs args = { rank, nranks, connectInfos };
+ struct connectMap* map;
+ NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(struct collNetConnectArgs), &map, sizeof(struct connectMap*)));
- send->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev+1;
+ //NCCLCHECK(collNetDumpMap(map));
- NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
+ struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
+ recv->conn.head = &sendMem->head;
- int recvSize = offsetof(struct ncclRecvMem, buff);
- // Simple uses shared buffers and we don't support LL128
- recvSize += send->comm->buffSizes[NCCL_PROTO_LL];
+ struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
+ void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
+ recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail;
+ recv->conn.offsFifo = recvMem->offsFifo;
- if (resources->useGdr) {
- NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
}
- NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
+ return ncclSuccess;
+}
- INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
- resources->useGdr ? "/GDRDMA" : "");
+static ncclResult_t sendFree(struct ncclConnector* send) {
return ncclSuccess;
}
-/* Setup recv connector */
-ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
- struct collNetRecvResources* resources;
- NCCLCHECK(ncclCalloc(&resources, 1));
- recv->transportResources = resources;
- recv->conn.shared = 1;
- resources->comm = comm;
+static ncclResult_t recvFree(struct ncclConnector* recv) {
+ return ncclSuccess;
+}
- NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev));
- NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
+static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ struct setupReq* req = (struct setupReq*)reqBuff;
+ if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
- recv->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev;
+ struct sendResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ connection->transportResources = resources;
+ connection->shared = 1;
- NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
+ resources->netDev = req->netDev;
+ resources->useGdr = req->useGdr;
+ return ncclSuccess;
+}
- int recvSize = offsetof(struct ncclRecvMem, buff);
- // Simple uses shared buffers and we don't support LL128
- recvSize += recv->comm->buffSizes[NCCL_PROTO_LL];
+struct sharedResources {
+ void* collNetListenComms[MAXCHANNELS];
+ void* collNetComms[MAXCHANNELS];
+ int commRefCount[NCCL_MAX_NETDEVS];
+};
- if (resources->useGdr) {
- NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize));
+ncclResult_t sharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) {
+ struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
+ if (resources == NULL) {
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ comm->proxyState.progressState.collNet.resources = resources;
}
- NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize));
-
- INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev,
- resources->useGdr ? "/GDRDMA" : "");
- struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
-
- NCCLCHECK(collNetSharedListen(comm, resources->netDev, &info->collNetHandle));
+ if (resources->collNetComms[netDev] == NULL)
+ NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev));
return ncclSuccess;
}
-ncclResult_t collNetSharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) {
- struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources;
+static ncclResult_t sharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) {
+ struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
if (resources->collNetComms[netDev] == NULL) {
// Connect to coll comm
collNetHandle_t** handlePtrs = NULL;
@@ -159,152 +314,234 @@ ncclResult_t collNetSharedConnect(struct ncclComm* comm, int netDev, struct nccl
NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev]));
}
*collNetComm = resources->collNetComms[netDev];
- resources->collNetCommRefCount[netDev]++;
+ resources->commRefCount[netDev]++;
return ncclSuccess;
}
-ncclResult_t collNetSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
- // Setup device pointers
- struct collNetSendResources* resources = (struct collNetSendResources*)send->transportResources;
- struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
-
- // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
- send->conn.buffs[NCCL_PROTO_LL] = resources->recvMem->buff;
- send->conn.buffs[NCCL_PROTO_LL128] = send->conn.buffs[NCCL_PROTO_SIMPLE] = NULL;
- send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
-
- // Head/Tail/Opcount/Fifos are always on host
- send->conn.tail = &resources->recvMem->tail;
- send->conn.sizesFifo = resources->recvMem->sizesFifo;
- send->conn.ptrsFifo = resources->recvMem->ptrsFifo;
- send->conn.head = &resources->sendMem->head;
- resources->sendMem->head = -NCCL_STEPS; // Don't give any credit yet when sharing buffers
- for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
+static ncclResult_t sharedFree(struct ncclComm* comm, int netDev) {
+ struct sharedResources* resources = (struct sharedResources*)comm->proxyState.progressState.collNet.resources;
+ resources->commRefCount[netDev]--;
+ if (resources->commRefCount[netDev] == 0) {
+ NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev]));
+ }
+ for (int n=0; n<NCCL_MAX_NETDEVS; n++) if (resources->commRefCount[n]) return ncclSuccess;
+ comm->proxyState.progressState.collNet.resources = NULL;
+ free(resources);
+ return ncclSuccess;
+}
+
+static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, char** gpuPtr, char** cpuPtr, int* size) {
+ struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet;
+ if (state->size == 0) {
+ state->size = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE];
+ }
+
+ *size = state->size;
+
+ if (cuda && state->cudaBuff == NULL) {
+ NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size));
+ }
+ if (!cuda && state->hostBuff == NULL) {
+ NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size));
+ }
+ *gpuPtr = *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
+ return ncclSuccess;
+}
+
+static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int type, int slot, int channel, int* offset) {
+ // Use different pools for different channels and also separate send/recv.
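+ // Slots are indexed as [send(0)/recv(1)][step slot][channel]: 2*NCCL_STEPS*nChannels slots of
+ // buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS bytes each, matching the 2*nChannels*buffSizes[NCCL_PROTO_SIMPLE]
+ // bytes allocated by sharedBuffersInit.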
+ int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
+ int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel;
+ *offset = slotSize * globalSlot;
+ return ncclSuccess;
+}
+
+static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm) {
+ struct ncclProxySharedCollNet* state = &comm->proxyState.progressState.collNet;
+ if (state->size == 0) return ncclSuccess;
+ CUDACHECK(cudaFree(state->cudaBuff));
+ NCCLCHECK(ncclCudaHostFree(state->hostBuff));
+ // This will be called multiple times, with multiple channels and send/recv. Make sure we only do it once.
+ state->size = 0;
+ return ncclSuccess;
+}
+
+static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ struct setupReq* req = (struct setupReq*)reqBuff;
+ if (reqSize != sizeof (struct setupReq)) return ncclInternalError;
+
+ struct recvResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ connection->transportResources = resources;
+ connection->shared = 1;
+
+ resources->netDev = req->netDev;
+ resources->useGdr = req->useGdr;
+
+ collNetHandle_t* netHandle = (collNetHandle_t*) respBuff;
+ if (respSize != sizeof(collNetHandle_t)) return ncclInternalError;
+
+ NCCLCHECK(sharedListen(comm, req->netDev, netHandle));
+ return ncclSuccess;
+}
+
+static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
+ struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
+ struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
+
+ struct sendResources* resources = (struct sendResources*)(connection->transportResources);
// Get info from recv side
- resources->collNetRank = rank;
+ resources->collNetRank = args->rank;
resources->reqFifo = (struct reqSlot (*)[NCCL_STEPS])(info->reqFifo);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
resources->recvMhandles[p] = info->mhandles[p];
- NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm));
+ NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
+ connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev;
+
+ struct connectMap* map = &resources->map;
+
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
+
+ NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+ map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
+ if (ncclGdrCopy && ncclParamGdrCopySyncEnable()) {
+ uint64_t *cpuPtr, *gpuPtr;
+ NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc));
+
+ resources->gdcSync = cpuPtr;
+ struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
+ gdcMem->cpuPtr = (char*)cpuPtr;
+ gdcMem->gpuPtr = (char*)gpuPtr;
+ gdcMem->size = sizeof(uint64_t); // sendMem->head
+ }
+
+ resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
+ resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
+ // Don't give credits yet in shared mode.
+ resources->sendMem->head = -NCCL_STEPS;
- int size;
- char* ptr;
// Allocate & Register shared buffers for the Simple protocol
- NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, &size, &ptr));
- NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
+ int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
+ struct connectMapMem* mapMem = map->mems+bank;
+ NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
+ NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
+
+ NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->sendMhandles[NCCL_PROTO_SIMPLE]));
- // Allocate & Register shared buffers for the LL protocol
- NCCLCHECK(ncclProxySharedBuffersInit(send->comm, 0, &size, &ptr));
- NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
- NCCL_PTR_HOST,
- &resources->sendMhandles[NCCL_PROTO_LL]));
+ if (respSize != sizeof(struct connectMap*)) { WARN("sendProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
+ *((struct connectMap**)respBuff) = &resources->map;
return ncclSuccess;
}
-ncclResult_t collNetRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
- // Setup device pointers
- struct collNetRecvResources* resources = (struct collNetRecvResources*)recv->transportResources;
- struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank);
- resources->collNetRank = rank;
+static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; }
+ struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff;
- // Intermediate buffering on GPU for GPU Direct RDMA
- struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem;
- int offset = 0;
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- recv->conn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset;
- offset += recv->comm->buffSizes[p];
- }
- recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
+ struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+ struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(args->connectInfos+args->rank);
+ resources->collNetRank = args->rank;
- // Head/Tail/Opcount are always on host
- recv->conn.tail = &resources->recvMem->tail;
- recv->conn.ptrsFifo = resources->recvMem->ptrsFifo;
- recv->conn.head = &resources->sendMem->head;
+ NCCLCHECK(sharedConnect(comm, resources->netDev, args->connectInfos, args->nranks, args->rank, &resources->collNetComm));
+ connection->proxyAppendPtr = comm->proxyState.progressState.collNet.proxyAppend+2*resources->netDev+1;
- NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm));
+ struct connectMap* map = &resources->map;
- int size;
- char* ptr;
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
+
+ NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+ map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
+ if (ncclGdrCopy) {
+ uint64_t *cpuPtr, *gpuPtr;
+ NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc));
+
+ if (ncclParamGdrCopySyncEnable()) {
+ resources->gdcSync = cpuPtr;
+ struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
+ gdcMem->cpuPtr = (char*)cpuPtr;
+ gdcMem->gpuPtr = (char*)gpuPtr;
+ gdcMem->size = sizeof(uint64_t);
+ }
+ if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1;
+ }
+
+ resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
+ resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
// Allocate & Register shared buffers for the Simple protocol
- NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, &size, &ptr));
- NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
+ int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
+ struct connectMapMem* mapMem = map->mems+bank;
+ NCCLCHECK(sharedBuffersInit(comm, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size));
+ NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
+
+ NCCLCHECK(collNetRegMr(resources->collNetComm, mapMem->cpuPtr, mapMem->size,
resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST,
&resources->mhandles[NCCL_PROTO_SIMPLE]));
- // Allocate & Register shared buffers for the LL protocol
- NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, 0, &size, &ptr));
- NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size,
- NCCL_PTR_HOST,
- &resources->mhandles[NCCL_PROTO_LL]));
-
// Pass info to send side
info->reqFifo = resources->reqFifo;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
info->mhandles[p] = resources->mhandles[p];
+ if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; }
+ *((struct connectMap**)respBuff) = &resources->map;
return ncclSuccess;
}
-ncclResult_t collNetSharedFree(struct ncclComm* comm, int netDev) {
- struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources;
- resources->collNetCommRefCount[netDev]--;
- if (resources->collNetCommRefCount[netDev] == 0) {
- NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev]));
+static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+ struct sendResources* resources = (struct sendResources*)(connection->transportResources);
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ if (resources->sendMhandles[p]) {
+ NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[p]));
+ }
}
- for (int c=0; c<MAXCHANNELS; c++) if (resources->collNetCommRefCount[c]) return ncclSuccess;
- comm->proxyState.sharedBuffs.collNetResources = NULL;
- free(resources);
+ struct connectMapMem* mems = resources->map.mems;
+ NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
+ CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+ if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
+ NCCLCHECK(sharedBuffersDestroy(comm));
+ NCCLCHECK(sharedFree(comm, resources->netDev));
+ free(connection->transportResources);
return ncclSuccess;
}
-ncclResult_t collNetSendFree(void* sendTransportResources) {
- struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources;
- NCCLCHECK(ncclCudaHostFree(resources->sendMem));
- NCCLCHECK(ncclCudaHostFree(resources->recvMem));
- if (resources->collNetComm) {
- NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_LL]));
- NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_SIMPLE]));
+static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+ struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ if (resources->mhandles[p]) {
+ NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[p]));
+ }
}
- if (resources->useGdr) CUDACHECK(cudaFree(resources->devRecvMem));
-
- NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev));
- free(resources);
+ struct connectMapMem* mems = resources->map.mems;
+ NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
+ CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+ if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
+ NCCLCHECK(sharedBuffersDestroy(comm));
+ NCCLCHECK(sharedFree(comm, resources->netDev));
+ free(connection->transportResources);
return ncclSuccess;
}
-ncclResult_t collNetRecvFree(void* recvTransportResources) {
- struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources;
- NCCLCHECK(ncclCudaHostFree(resources->sendMem));
- NCCLCHECK(ncclCudaHostFree(resources->recvMem));
- if (resources->collNetComm) {
- NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_LL]));
- NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_SIMPLE]));
- }
- if (resources->useGdr) CUDACHECK(cudaFree(resources->devRecvMem));
-
- NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev));
- free(resources);
- return ncclSuccess;
-}
#define LAST_OF_GROUP(s) \
(s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1)
-ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
- if (args->protocol == NCCL_PROTO_LL128) {
- WARN("CollNet does not support LL128");
+static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
+ if (args->protocol != NCCL_PROTO_SIMPLE) {
+ WARN("CollNet does not support LL/LL128");
return ncclInternalError;
}
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
- struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources);
+ struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
// Round to next multiple of sliceSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->received = sub->transmitted = sub->done = 0;
@@ -319,23 +556,21 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
int perGroupSteps = NCCL_STEPS / nGroups;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
- struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources);
+ struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
void* sendMhandle = resources->sendMhandles[p];
void* recvMhandle = resources->recvMhandles[p];
- int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
auto reqFifo = resources->reqFifo;
if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
- if (p == NCCL_PROTO_SIMPLE) {
- char* ptr;
- int sharedBuffSlot = sub->posted%NCCL_STEPS;
- NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, resources->useGdr, 0, sharedBuffSlot, 0, &ptr));
- resources->recvMem->ptrsFifo[buffSlot] = ptr + s*args->chunkSize;
- __sync_synchronize();
- }
- volatile uint64_t* sendHead = &resources->sendMem->head;
+ int sharedBuffSlot = sub->posted%NCCL_STEPS;
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset));
+ resources->recvMem->offsFifo[buffSlot] = offset + s*args->chunkSize;
+ __sync_synchronize();
+ volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
sub->posted += args->sliceSteps;
*sendHead = sub->base + sub->posted - NCCL_STEPS;
+ if (resources->gdcSync) wc_store_fence(); // Flush out WC write
}
// Enforce sync between operations of the same group.
bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->received == sub->received)) || (s && (sub-1)->received > sub->received));
@@ -344,30 +579,15 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
int sharedBuffSlot = sub->received%NCCL_STEPS;
volatile int* sizesFifo = resources->recvMem->sizesFifo;
volatile uint64_t* recvTail = &resources->recvMem->tail;
- if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)) || p == NCCL_PROTO_LL)) {
+ char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]);
+ if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)))) {
// We have something to receive, let's check whether data is ready.
- int size = sizesFifo[buffSlot];
int ready = 1;
if (s == 0) {
- NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 0, sharedBuffSlot, 0, &args->sharedBuff[sharedBuffSlot]));
- args->sharedSize[sharedBuffSlot] = p == NCCL_PROTO_SIMPLE ? args->chunkSize : size/2;
- }
- if (p == NCCL_PROTO_LL) {
- char* localBuff = sub->connector->conn.buffs[p];
- uint32_t flag = NCCL_LL_FLAG(sub->base + sub->received + 1);
- int nFifoLines = size / sizeof(union ncclLLFifoLine);
- union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
- // Pack data into the shared buffer
- uint32_t* sendBuff = (uint32_t*)(args->sharedBuff[sharedBuffSlot]+args->sharedSize[sharedBuffSlot]*s);
- for (int i=0; i<nFifoLines; i++) {
- volatile uint32_t *f1 = &lines[i].flag1;
- volatile uint32_t *d1 = &lines[i].data1;
- volatile uint32_t *f2 = &lines[i].flag2;
- volatile uint32_t *d2 = &lines[i].data2;
- if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
- sendBuff[2*i] = d1[0];
- sendBuff[2*i+1] = d2[0];
- }
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, 0, sharedBuffSlot, 0, &offset));
+ args->sharedBuff[sharedBuffSlot] = localBuff + offset;
+ args->sharedSize[sharedBuffSlot] = args->chunkSize;
}
if (ready) {
sizesFifo[buffSlot] = -1;
@@ -426,15 +646,15 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
return ncclSuccess;
}
-ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
- if (args->protocol == NCCL_PROTO_LL128) {
- WARN("CollNet does not support LL128");
+static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
+ if (args->protocol != NCCL_PROTO_SIMPLE) {
+ WARN("CollNet does not support LL/LL128");
return ncclInternalError;
}
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
- struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources);
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
// Round to next multiple of sliceSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0;
@@ -449,19 +669,20 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
int perGroupSteps = NCCL_STEPS / nGroups;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
- struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources);
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
void* mhandle = resources->mhandles[p];
- int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
auto reqFifo = resources->reqFifo;
+ char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
+
// Enforce sync between operations of the same group.
if (LAST_OF_GROUP(s) && (sub->posted < sub->done + perGroupSteps) && (sub->posted < sub->nsteps)) {
int group = s / COLLNET_GROUP_NSUBS;
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
- char* ptr;
int sharedBuffSlot = sub->posted%NCCL_STEPS;
int startChannel = group*COLLNET_GROUP_NSUBS;
- NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 1, sharedBuffSlot, startChannel, &ptr));
- reqFifo[group][buffSlot].recvBuff = ptr;
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
+ reqFifo[group][buffSlot].recvBuff = localBuff + offset;
TRACE(NCCL_NET, "recvProxy [%d/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff);
sub->posted += args->sliceSteps;
args->idle = 0;
@@ -476,11 +697,24 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
int totalSize = args->sharedSize[sharedBuffSlot]*(s-group*COLLNET_GROUP_NSUBS+1);
TRACE(NCCL_NET, "recvProxy [%d/%d/%d] received, size %d", sub->received, group, buffSlot, totalSize);
sub->received += args->sliceSteps;
- if (reqFifo[group][buffSlot].size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) {
- int startChannel = group*COLLNET_GROUP_NSUBS;
- char* groupRecvAddress;
- NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, 1, 1, sharedBuffSlot, startChannel, &groupRecvAddress));
- NCCLCHECK(collNetIflush(resources->collNetComm, groupRecvAddress, totalSize, mhandle, sub->requests+buffSlot));
+ sub->requests[buffSlot] = NULL;
+ if (reqFifo[group][buffSlot].size > 0 && resources->useGdr) {
+ // GDRCOPY support
+ if (resources->gdcFlush) {
+#if defined (__x86_64__)
+ // Force a PCI-E read from GPU memory
+ asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax");
+#else
+ WARN("NET: GDR Flush only supported on x86_64");
+ return ncclInternalError;
+#endif
+ sub->requests[buffSlot] = NULL;
+ } else {
+ int startChannel = group*COLLNET_GROUP_NSUBS;
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
+ NCCLCHECK(collNetIflush(resources->collNetComm, localBuff + offset, totalSize, mhandle, sub->requests+buffSlot));
+ }
} else {
for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps;
}
@@ -506,27 +740,14 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS;
int sharedBuffSlot = sub->transmitted%NCCL_STEPS;
int startChannel = group*COLLNET_GROUP_NSUBS;
- char* groupRecvAddress;
- NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 1, sharedBuffSlot, startChannel, &groupRecvAddress));
- char* ptr = groupRecvAddress + (s%COLLNET_GROUP_NSUBS)*args->sharedSize[sharedBuffSlot];
- if (p == NCCL_PROTO_SIMPLE) {
- volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
- ptrsFifo[buffSlot] = ptr;
- __sync_synchronize();
- resources->recvMem->tail = sub->base + sub->flushed;
- }
- if (p == NCCL_PROTO_LL) { // ll
- // re-attach flag
- char* localBuff = sub->connector->conn.buffs[p];
- uint32_t flag = NCCL_LL_FLAG(sub->base + sub->transmitted + 1);
- union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize);
- uint32_t* recvData = (uint32_t*)ptr;
- int nFifoLines = DIVUP(args->sharedSize[sharedBuffSlot], 2*sizeof(uint32_t));
- for (int i=0; i<nFifoLines; i++) {
- lines[i].v[0] = ((uint64_t)flag << 32) + recvData[2*i];
- lines[i].v[1] = ((uint64_t)flag << 32) + recvData[2*i+1];
- }
- }
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, 1, sharedBuffSlot, startChannel, &offset));
+ volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
+ offsFifo[buffSlot] = offset;
+ __sync_synchronize();
+ volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
+ *recvTail = sub->base + sub->flushed;
+ if (resources->gdcSync) wc_store_fence(); // Flush out WC write
sub->transmitted += args->sliceSteps;
args->idle = 0;
continue;
@@ -551,7 +772,7 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
struct ncclTransport collNetTransport = {
"COL",
- collNetCanConnect,
- { collNetSendSetup, collNetSendConnect, collNetSendFree, collNetSendProxy },
- { collNetRecvSetup, collNetRecvConnect, collNetRecvFree, collNetRecvProxy }
+ canConnect,
+ { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
+ { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
};
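Each transport row now carries eight entries instead of the previous four: the rank-side setup/connect/free calls, then an optional shared proxy initializer (NULL here for collnet; net.cc below defines a proxySharedInit hook of that shape), then the proxy-side setup/connect/free/progress handlers that run in the proxy service. A hedged sketch of such a split function table; the type and field names below are illustrative, not the actual ncclTransportComm declaration:

    // Illustrative sketch of a rank-side / proxy-side split function table.
    typedef int result_t;  // stand-in for ncclResult_t

    struct transportComm {
      // Rank-side: runs in the process that owns the GPU.
      result_t (*setup)(void);
      result_t (*connect)(void);
      result_t (*free)(void);
      // Proxy-side: runs in the proxy service, possibly in another process.
      result_t (*proxySharedInit)(void);   // optional; NULL in the collnet table above
      result_t (*proxySetup)(void);
      result_t (*proxyConnect)(void);
      result_t (*proxyFree)(void);
      result_t (*proxyProgress)(void);
    };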
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 5abc32d..56f0315 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,51 +7,125 @@
#include "comm.h"
#include "net.h"
#include "graph.h"
+#include "proxy.h"
#include "collectives.h"
#include "gdrwrap.h"
+#include "shm.h"
+#include "profiler.h"
-struct netConnectInfo {
- ncclNetHandle_t netHandle;
+static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large");
+
+#define NCCL_NET_MAP_HOSTMEM 0
+#define NCCL_NET_MAP_DEVMEM 1
+#define NCCL_NET_MAP_SHARED_HOSTMEM 2
+#define NCCL_NET_MAP_SHARED_DEVMEM 3
+#define NCCL_NET_MAP_GDCMEM 4
+#define NCCL_NET_MAP_MEMS 5
+
+#define NCCL_NET_MAP_MASK_DEVMEM 0x40000000
+#define NCCL_NET_MAP_MASK_SHARED 0x80000000
+#define NCCL_NET_MAP_MASK_USED 0x20000000
+#define NCCL_NET_MAP_MASK_OFFSET 0x1fffffff
+
+#define NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName) \
+ ((mapStruct)->offsets.offsetName >> 30)
+
+#define NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) \
+ (((mapStruct)->offsets.offsetName >> 29) == 0)
+
+#define NCCL_NET_MAP_GET_POINTER(mapStruct, cpuOrGpu, offsetName) \
+ (NCCL_NET_MAP_OFFSET_NULL(mapStruct, offsetName) ? NULL : \
+ (mapStruct)->mems[NCCL_NET_MAP_OFFSET_BANK(mapStruct, offsetName)].cpuOrGpu##Ptr + ((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_OFFSET))
+
+#define NCCL_NET_MAP_DEV_MEM(mapStruct, offsetName) \
+ (((mapStruct)->offsets.offsetName & NCCL_NET_MAP_MASK_DEVMEM) != 0)
+
+#define NCCL_NET_MAP_ADD_POINTER(mapStruct, shared, dev, memSize, offsetName) do { \
+ int bank = NCCL_NET_MAP_MASK_USED + (dev)*NCCL_NET_MAP_MASK_DEVMEM + (shared)*NCCL_NET_MAP_MASK_SHARED; \
+ if ((shared) == 0) { \
+ if (dev) { \
+ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size; \
+ (mapStruct)->mems[NCCL_NET_MAP_DEVMEM].size += memSize; \
+ } else { \
+ (mapStruct)->offsets.offsetName = bank + (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size; \
+ (mapStruct)->mems[NCCL_NET_MAP_HOSTMEM].size += memSize; \
+ } \
+ } else { \
+ (mapStruct)->offsets.offsetName = bank; \
+ } \
+} while (0);
+
+struct connectMapMem{
+ char* gpuPtr;
+ char* cpuPtr;
+ int size;
+ union {
+ char shmPath[PATH_MAX];
+ cudaIpcMemHandle_t ipc;
+ };
};
-#define LOC_HOSTMEM 0
-#define LOC_DEVMEM 1
-#define LOC_COUNT 2
+struct connectMap {
+ int sameProcess;
+ int shared;
+ int cudaDev;
+ // First 3 bits of offsets determine the mem bank. 001 is host mem, 011 is dev mem, 101 is shared host mem and 111 is shared dev mem.
+ struct connectMapMem mems[NCCL_NET_MAP_MEMS];
+  // Offsets. 3 MSBs indicate the mem bank and used flag (see above); 000 (used bit unset) indicates NULL.
+ struct {
+ uint32_t sendMem;
+ uint32_t recvMem;
+ uint32_t buffs[NCCL_NUM_PROTOCOLS];
+ } offsets;
+};
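The NCCL_NET_MAP_* masks and macros above pack a memory bank and a byte offset into each 32-bit field of connectMap::offsets: bits 31-30 select one of the four banks (host, dev, shared host, shared dev), bit 29 marks the entry as used, and the low 29 bits hold the offset inside that bank. A small standalone sketch of the decoding, reusing only the mask values shown above (everything else is illustrative):

    #include <cstdint>
    #include <cstdio>

    // Same bit layout as the NCCL_NET_MAP_MASK_* values above.
    static const uint32_t MASK_SHARED = 0x80000000u;  // bit 31
    static const uint32_t MASK_DEVMEM = 0x40000000u;  // bit 30
    static const uint32_t MASK_USED   = 0x20000000u;  // bit 29
    static const uint32_t MASK_OFFSET = 0x1fffffffu;  // bits 0-28

    static void decode(uint32_t off) {
      if ((off >> 29) == 0) {        // used bit (and everything above it) clear -> NULL
        printf("0x%08x -> NULL\n", off);
        return;
      }
      int bank = off >> 30;          // 0 host, 1 dev, 2 shared host, 3 shared dev
      printf("0x%08x -> bank %d, offset 0x%x\n", off, bank, off & MASK_OFFSET);
    }

    int main() {
      decode(0);                                      // never added -> NULL
      decode(MASK_USED | 0x100);                      // host mem, offset 0x100
      decode(MASK_USED | MASK_DEVMEM | 0x2000);       // dev mem, offset 0x2000
      decode(MASK_USED | MASK_SHARED | MASK_DEVMEM);  // shared dev mem, offset 0
      return 0;
    }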
-struct netSendResources {
+struct sendResources {
+ struct connectMap map;
void* netSendComm;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
+
+ int rank;
+ int localRank;
+ int remoteRank;
int netDev;
int useGdr;
+ int maxRecvs;
+ uint64_t* gdcSync;
+ void* gdrDesc;
int shared;
- char* buffers[LOC_COUNT];
- int buffSizes[LOC_COUNT];
- void* mhandles[LOC_COUNT];
- void** mhandlesProto[NCCL_NUM_PROTOCOLS];
+ int channelId;
+ int connIndex;
+ char* buffers[NCCL_NUM_PROTOCOLS];
+ int buffSizes[NCCL_NUM_PROTOCOLS];
+ void* mhandles[NCCL_NUM_PROTOCOLS];
uint64_t step;
uint64_t llLastCleaning;
};
-struct netRecvResources {
+struct recvResources {
+ struct connectMap map;
void* netListenComm;
void* netRecvComm;
struct ncclSendMem* sendMem;
struct ncclRecvMem* recvMem;
- // GDRCOPY support
- void* gdrMemDesc;
- struct ncclRecvMem* devRecvMem;
- void* gdrFlushDesc;
- int* devFlushMem;
-
+ int rank;
+ int localRank;
+ int remoteRank;
+ int proxyRank;
int netDev;
int useGdr;
+ int maxRecvs;
+ uint64_t* gdcSync;
+ uint64_t* gdcFlush;
+ void* gdrDesc;
int shared;
- char* buffers[LOC_COUNT];
- int buffSizes[LOC_COUNT];
- void* mhandles[LOC_COUNT];
- void** mhandlesProto[NCCL_NUM_PROTOCOLS];
+ int channelId;
+ int connIndex;
+ char* buffers[NCCL_NUM_PROTOCOLS];
+ int buffSizes[NCCL_NUM_PROTOCOLS];
+ void* mhandles[NCCL_NUM_PROTOCOLS];
uint64_t step;
uint64_t llLastCleaning;
};
@@ -59,7 +133,7 @@ struct netRecvResources {
NCCL_PARAM(NetDisableIntra, "NET_DISABLE_INTRA", -2);
/* Determine if two peers can communicate with NET */
-ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
// Same host?
if (info1->hostHash == info2->hostHash) {
// User disabled NET for intra-node?
@@ -73,274 +147,670 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
}
NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
+NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1);
+
+struct setupReq {
+ int rank;
+ int localRank;
+ int remoteRank;
+ int shared;
+ int netDev;
+ int useGdr;
+ int channelId;
+ int connIndex;
+};
/* Determine if we will use this transport for this peer and return connect
* information for this peer */
-ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
- struct netSendResources* resources;
- NCCLCHECK(ncclCalloc(&resources, 1));
- send->transportResources = resources;
- send->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
- send->proxyAppendPtr = send->conn.shared ? comm->proxyState.sharedBuffs.proxyAppend+2*channelId+1 : &send->proxyAppend;
+static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) {
+ struct setupReq req;
- // Send/Receive: Round-robin NICs based on the receiver's CUDA device
- int nicRR = comm->peerInfo[peerInfo->rank].cudaDev;
- NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev));
- NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
+ send->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
+ req.channelId = channelId;
+ req.connIndex = connIndex;
- NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
- NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
+ int proxyRank;
+ NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &req.netDev, &proxyRank));
+ NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 1, &req.useGdr));
+ send->conn.direct |= req.useGdr ? NCCL_DIRECT_NIC : 0;
- send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
- send->conn.tail = &resources->recvMem->tail;
- send->conn.sizesFifo = resources->recvMem->sizesFifo;
- // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
- send->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
- send->conn.head = &resources->sendMem->head;
- resources->sendMem->head = resources->shared ? -NCCL_STEPS : 0; // Don't give any credit yet when sharing buffers
- for (int i=0; i<NCCL_STEPS; i++) send->conn.sizesFifo[i] = -1;
-
- if (resources->shared == 0) {
- int protoLoc[NCCL_NUM_PROTOCOLS];
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- protoLoc[p] = p != NCCL_PROTO_LL && resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
- }
- int buffSizes[NCCL_NUM_PROTOCOLS];
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- buffSizes[p] = send->comm->buffSizes[p];
- resources->buffSizes[protoLoc[p]] += buffSizes[p];
- }
-
- if (resources->buffSizes[LOC_DEVMEM]) {
- NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM]));
- }
- if (resources->buffSizes[LOC_HOSTMEM]) {
- NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
- }
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn));
+ req.rank = myInfo->rank;
+ NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
+ req.remoteRank = peerInfo->rank;
+ NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0));
- int offsets[LOC_COUNT];
- offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
- send->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
- offsets[protoLoc[p]] += buffSizes[p];
- }
+ if (proxyRank == myInfo->rank) {
+ INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
+ req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
+ } else {
+ INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), req.netDev,
+ proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
}
-
- INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), resources->netDev,
- resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
+ *((int*)connectInfo) = proxyRank;
return ncclSuccess;
}
// GDRCOPY support: TAIL_ENABLE When enabled locates the RX proxy tail in CUDA memory
-NCCL_PARAM(GdrCopyTailEnable, "GDRCOPY_TAIL_ENABLE", 1);
+NCCL_PARAM(GdrCopySyncEnable, "GDRCOPY_SYNC_ENABLE", 1);
// GDRCOPY support: FLUSH_ENABLE When enabled uses a PCI-E read to flush GDRDMA buffers
NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0);
-ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
- struct netRecvResources* resources;
- NCCLCHECK(ncclCalloc(&resources, 1));
- recv->transportResources = resources;
- recv->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1;
- recv->proxyAppendPtr = recv->conn.shared ? comm->proxyState.sharedBuffs.proxyAppend+2*channelId : &recv->proxyAppend;
-
- // Send/Receive: Round-robin NICs based on the receiver's CUDA device
- int nicRR = comm->cudaDev;
- NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev));
- NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
-
- NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1));
- NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1));
-
- // GDRCOPY tail support
- if (ncclGdrCopy != NULL && ncclParamGdrCopyTailEnable() == 1) {
- struct ncclRecvMem* devCudaPtr;
- NCCLCHECK(ncclGdrCudaCalloc(&resources->devRecvMem, &devCudaPtr, 1, &resources->gdrMemDesc));
- // The GDR mapped VA doesn't work on the SMs
- recv->conn.tail = &((struct ncclRecvMem*)devCudaPtr)->tail;
- } else {
- recv->conn.tail = &resources->recvMem->tail;
+/* Setup recv connector */
+static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) {
+ struct setupReq req;
+
+ recv->conn.shared = req.shared = graph ? 0 : ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : 1;
+ req.channelId = channelId;
+ req.connIndex = connIndex;
+
+ // Use myInfo->rank as the receiver uses its own NIC
+ int proxyRank;
+ NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &req.netDev, &proxyRank));
+ NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, req.netDev, 0, &req.useGdr));
+
+ // We don't support PXN on receive yet
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn));
+
+ req.rank = myInfo->rank;
+ NCCLCHECK(ncclTopoGetLocalRank(comm->topo, myInfo->rank, &req.localRank));
+ req.remoteRank = peerInfo->rank;
+ NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t)));
+
+ INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), req.netDev,
+ req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : "");
+ return ncclSuccess;
+}
+
+static ncclResult_t netMapShm(struct connectMapMem* mem) {
+ NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, 0));
+ NCCLCHECK(ncclShmUnlink(mem->shmPath));
+ return ncclSuccess;
+}
+static ncclResult_t netCreateShm(struct connectMapMem* mem) {
+ mem->shmPath[0] = '\0'; // Let ncclShmOpen create a tmp file
+ NCCLCHECK(ncclShmOpen(mem->shmPath, mem->size, (void**)&mem->cpuPtr, NULL, 1));
+ return ncclSuccess;
+}
+
+static ncclResult_t netDumpMap(struct connectMap* map) {
+ printf("Dump map same process %d shared %d\n", map->sameProcess, map->shared);
+ struct connectMapMem *mem = map->mems+NCCL_NET_MAP_HOSTMEM;
+ printf("Mem 0: Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
+ mem = map->mems+NCCL_NET_MAP_DEVMEM;
+ printf("Mem 1: Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+ mem = map->mems+NCCL_NET_MAP_SHARED_HOSTMEM;
+ printf("Mem 2: Shared Host mem %s (%x B) CPU %p GPU %p\n", mem->shmPath, mem->size, mem->cpuPtr, mem->gpuPtr);
+ mem = map->mems+NCCL_NET_MAP_SHARED_DEVMEM;
+ printf("Mem 3: Shared Vid mem (%x B) CPU %p GPU %p\n", mem->size, mem->cpuPtr, mem->gpuPtr);
+ printf("SendMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
+ map->offsets.sendMem & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+ NCCL_NET_MAP_OFFSET_BANK(map, sendMem), map->offsets.sendMem & NCCL_NET_MAP_MASK_OFFSET,
+ NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem), NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem));
+ printf("RecvMem -> Used %d Bank %d Offset %x, cpu %p gpu %p\n",
+ map->offsets.recvMem & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+ NCCL_NET_MAP_OFFSET_BANK(map, recvMem), map->offsets.recvMem & NCCL_NET_MAP_MASK_OFFSET,
+ NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem), NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem));
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ printf("Proto %d -> Used %d Bank %d Offset %x, cpu %p, gpu %p\n", p,
+ map->offsets.buffs[p] & NCCL_NET_MAP_MASK_USED ? 1 : 0,
+ NCCL_NET_MAP_OFFSET_BANK(map, buffs[p]), map->offsets.buffs[p] & NCCL_NET_MAP_MASK_OFFSET,
+ NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]), NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]));
}
+ printf("End of dump\n");
+ return ncclSuccess;
+}
- // GDRCOPY flush support
-#if defined (__x86_64__)
- if (ncclGdrCopy != NULL && ncclParamGdrCopyFlushEnable() == 1) {
- int* cudaPtr;
- NCCLCHECK(ncclGdrCudaCalloc(&resources->devFlushMem, &cudaPtr, 1, &resources->gdrFlushDesc));
+static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
+ // Setup device pointers
+ struct connectMap* map;
+ NCCLCHECK(ncclCalloc(&map, 1));
+ send->transportResources = map;
+ NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(ncclNetHandle_t), map, sizeof(struct connectMap)));
+
+ if (map->sameProcess) {
+ if (map->cudaDev != comm->cudaDev) {
+ // Enable P2P access
+ cudaError_t err = cudaDeviceEnablePeerAccess(map->cudaDev, 0);
+ if (err == cudaErrorPeerAccessAlreadyEnabled) {
+ cudaGetLastError();
+ } else if (err != cudaSuccess) {
+ WARN("failed to peer with device %d: %d %s", map->cudaDev, err, cudaGetErrorString(err));
+ return ncclInternalError;
+ }
+ }
+ } else {
+ NCCLCHECK(netMapShm(map->mems+NCCL_NET_MAP_HOSTMEM));
+ if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
+ CUDACHECK(cudaIpcOpenMemHandle((void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess));
+ map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = NULL;
+ }
+ if (map->mems[NCCL_NET_MAP_SHARED_DEVMEM].size) {
+ void** sharedDevMemPtr = comm->proxyState.sharedDevMems+send->proxyConn.localRank;
+ if (*sharedDevMemPtr == NULL) {
+ CUDACHECK(cudaIpcOpenMemHandle(sharedDevMemPtr, map->mems[NCCL_NET_MAP_SHARED_DEVMEM].ipc, cudaIpcMemLazyEnablePeerAccess));
+ }
+ map->mems[NCCL_NET_MAP_SHARED_DEVMEM].gpuPtr = (char*)(*sharedDevMemPtr);
+ map->mems[NCCL_NET_MAP_SHARED_DEVMEM].cpuPtr = NULL;
+ }
}
-#endif
+ //NCCLCHECK(netDumpMap(map));
- recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
+ struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
+ void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
+ send->conn.head = gdcMem ? (uint64_t*)gdcMem : &sendMem->head;
+
+ struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
+ send->conn.tail = &recvMem->tail;
+ send->conn.sizesFifo = recvMem->sizesFifo;
// Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
- recv->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL;
- recv->conn.head = &resources->sendMem->head;
+ send->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL;
- if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree not for p2p
- int protoLoc[NCCL_NUM_PROTOCOLS];
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- protoLoc[p] = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
- }
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
+ send->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
+ return ncclSuccess;
+}
- int buffSizes[NCCL_NUM_PROTOCOLS];
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- buffSizes[p] = recv->comm->buffSizes[p];
- resources->buffSizes[protoLoc[p]] += buffSizes[p];
- }
+/* Connect to this peer */
+static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
+ struct connectMap* map;
+ NCCLCHECK(ncclCalloc(&map, 1));
+ recv->transportResources = map;
+ NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgConnect, connectInfo, sizeof(int), map, sizeof(struct connectMap)));
+ //NCCLCHECK(netDumpMap(map));
- if (resources->buffSizes[LOC_DEVMEM]) {
- NCCLCHECK(ncclCudaCalloc(resources->buffers+LOC_DEVMEM, resources->buffSizes[LOC_DEVMEM]));
- }
- if (resources->buffSizes[LOC_HOSTMEM]) {
- NCCLCHECK(ncclCudaHostCalloc(resources->buffers+LOC_HOSTMEM, resources->buffSizes[LOC_HOSTMEM]));
+ struct ncclSendMem *sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, sendMem);
+ recv->conn.head = &sendMem->head;
+
+ struct ncclRecvMem *recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, gpu, recvMem);
+ void* gdcMem = map->mems[NCCL_NET_MAP_GDCMEM].gpuPtr;
+ recv->conn.tail = gdcMem ? (uint64_t*)gdcMem : &recvMem->tail;
+ recv->conn.sizesFifo = recvMem->sizesFifo;
+ // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree
+ recv->conn.offsFifo = map->shared ? recvMem->offsFifo : NULL;
+
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++)
+ recv->conn.buffs[p] = NCCL_NET_MAP_GET_POINTER(map, gpu, buffs[p]);
+ return ncclSuccess;
+}
+
+static ncclResult_t sendFree(struct ncclConnector* send) {
+ struct connectMap* map = (struct connectMap*)(send->transportResources);
+ if (map->sameProcess == 0) {
+ NCCLCHECK(ncclShmClose(map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+ if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
+ CUDACHECK(cudaIpcCloseMemHandle(map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
}
+ }
+ return ncclSuccess;
+}
- int offsets[LOC_COUNT];
- offsets[LOC_HOSTMEM] = offsets[LOC_DEVMEM] = 0;
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- resources->mhandlesProto[p] = resources->mhandles+protoLoc[p];
- recv->conn.buffs[p] = resources->buffers[protoLoc[p]] + offsets[protoLoc[p]];
- offsets[protoLoc[p]] += buffSizes[p];
+static ncclResult_t recvFree(struct ncclConnector* recv) {
+ return ncclSuccess;
+}
+
+#define NCCL_SHARED_STEPS 16
+static ncclResult_t sharedBuffersInit(struct ncclComm* comm, int cuda, int localRank, int type, int sameProcess,
+ int nChannels, char** gpuPtr, char** cpuPtr, int* size, cudaIpcMemHandle_t* ipc) {
+ if (cuda == 0 && sameProcess == 0) {
+ WARN("PXN should not use host buffers for data");
+ return ncclInternalError;
+ }
+ struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
+ if (progressState->localPeers == NULL) {
+ NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
+ }
+ struct ncclProxyPeer** localPeers = progressState->localPeers;
+ if (localPeers[localRank] == NULL) {
+ NCCLCHECK(ncclCalloc(localPeers+localRank, 1));
+ }
+ struct ncclProxyPeer* peer = localPeers[localRank];
+ struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
+ state->refcount++;
+ if (state->size == 0) {
+ state->size = nChannels*(NCCL_SHARED_STEPS/NCCL_STEPS)*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR;
+ }
+
+ if (size) *size = state->size;
+
+ if (cuda && state->cudaBuff == NULL) {
+ NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size));
+ if (sameProcess == 0) {
+ CUDACHECK(cudaIpcGetMemHandle(&state->ipc, state->cudaBuff));
}
}
+ if (!cuda && state->hostBuff == NULL) {
+ NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size));
+ }
+ if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
+ if (sameProcess) {
+ if (gpuPtr) *gpuPtr = *cpuPtr;
+ } else {
+ if (gpuPtr) *gpuPtr = NULL;
+ if (ipc) memcpy(ipc, &state->ipc, sizeof(cudaIpcMemHandle_t));
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t sharedBuffersGet(struct ncclComm* comm, int channel, int slot, int* offset) {
+ // Use different pools for different channels and also separate send/recv.
+ int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR);
+ int globalSlot = (channel*NCCL_SHARED_STEPS)+slot;
+ *offset = slotSize * globalSlot;
+ return ncclSuccess;
+}
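sharedBuffersInit() sizes one pool per local peer and direction as nChannels * (NCCL_SHARED_STEPS/NCCL_STEPS) * buffSizes[NCCL_PROTO_SIMPLE] / SENDRECV_SLICEFACTOR, and sharedBuffersGet() then addresses it as NCCL_SHARED_STEPS fixed-size slots per channel. A worked sketch of that arithmetic with assumed values (NCCL_STEPS, SENDRECV_SLICEFACTOR and the Simple-protocol buffer size below are illustrative, not necessarily the library defaults):

    #include <cstdio>

    static const int kSteps       = 8;         // assumed NCCL_STEPS
    static const int kSharedSteps = 16;        // NCCL_SHARED_STEPS above
    static const int kSliceFactor = 4;         // assumed SENDRECV_SLICEFACTOR
    static const int kSimpleBuff  = 4 << 20;   // assumed Simple-protocol buffer size (4 MiB)

    int main() {
      int nChannels = 2;
      // Pool size per peer/direction, as computed in sharedBuffersInit():
      int poolSize = nChannels * (kSharedSteps / kSteps) * kSimpleBuff / kSliceFactor;
      // Slot size and per-(channel, slot) offset, as computed in sharedBuffersGet():
      int slotSize = kSimpleBuff / (kSteps * kSliceFactor);
      printf("pool %d bytes, slot %d bytes\n", poolSize, slotSize);
      for (int channel = 0; channel < nChannels; channel++)
        for (int slot = 0; slot < kSharedSteps; slot += kSharedSteps - 1)
          printf("channel %d slot %2d -> offset %d\n",
                 channel, slot, slotSize * (channel * kSharedSteps + slot));
      return 0;
    }

With these numbers every channel owns 16 slots of 128 KiB, i.e. 2 MiB per channel, which matches the pool size computed above.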
+
+static ncclResult_t sharedBuffersDestroy(struct ncclComm* comm, int localRank, int type) {
+ if (comm->proxyState.progressState.localPeers == NULL) NCCLCHECK(ncclInternalError);
+ struct ncclProxyPeer* peer = comm->proxyState.progressState.localPeers[localRank];
+  if (peer == NULL) NCCLCHECK(ncclInternalError);
+ struct ncclProxySharedP2p* state = type == 0 ? &peer->send : &peer->recv;
+ if (state->size == 0) NCCLCHECK(ncclInternalError);
+ state->refcount--;
+ if (state->refcount == 0) {
+ if (state->cudaBuff) CUDACHECK(cudaFree(state->cudaBuff));
+ if (state->hostBuff) NCCLCHECK(ncclCudaHostFree(state->hostBuff));
+ }
+ if (peer->send.refcount || peer->recv.refcount) return ncclSuccess;
+ free(peer);
+ comm->proxyState.progressState.localPeers[localRank] = NULL;
+ for (int r=0; r<comm->localRanks; r++) {
+ if (comm->proxyState.progressState.localPeers[r]) return ncclSuccess;
+ }
+ // All peers are freed, free array
+ free(comm->proxyState.progressState.localPeers);
+ comm->proxyState.progressState.localPeers = NULL;
+ return ncclSuccess;
+}
- INFO(NCCL_INIT|NCCL_NET,"Channel %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), resources->netDev,
- resources->useGdr ? "/GDRDMA" : "", resources->shared ? "/Shared" : "");
- struct netConnectInfo* info = (struct netConnectInfo*) connectInfo;
- NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm));
+static ncclResult_t proxySharedInit(struct ncclProxyConnection* connection, struct ncclComm* comm, int nChannels) {
+ int rank = comm->localRankToRank[connection->localRank];
+ int sameProcess = comm->peerInfo[rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
+ NCCLCHECK(sharedBuffersInit(comm, 1, connection->localRank, 0, sameProcess, nChannels, NULL, NULL, NULL, NULL));
+ return ncclSuccess;
+}
+static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ struct setupReq* req = (struct setupReq*) reqBuff;
+ if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
+
+ struct sendResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ connection->transportResources = resources;
+
+ resources->rank = req->rank;
+ resources->localRank = req->localRank;
+ resources->remoteRank = req->remoteRank;
+ resources->netDev = req->netDev;
+ resources->shared = connection->shared = req->shared;
+ resources->useGdr = req->useGdr;
+ resources->channelId = req->channelId;
+ resources->connIndex = req->connIndex;
+ ncclNetProperties_t props;
+ NCCLCHECK(ncclNetGetProperties(req->netDev, &props));
+ resources->maxRecvs = props.maxRecvs;
+
+ // We don't return any data
+ if (respSize != 0) return ncclInternalError;
+ *done = 1;
return ncclSuccess;
}
-ncclResult_t netSendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
- // Setup device pointers
- struct netSendResources* resources = (struct netSendResources*)send->transportResources;
- struct netConnectInfo* info = (struct netConnectInfo*)connectInfo;
+static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ struct setupReq* req = (struct setupReq*) reqBuff;
+ if (reqSize != sizeof(struct setupReq)) return ncclInternalError;
+
+ struct recvResources* resources;
+ NCCLCHECK(ncclCalloc(&resources, 1));
+ connection->transportResources = resources;
+
+ resources->rank = req->rank;
+ resources->localRank = req->localRank;
+ resources->remoteRank = req->remoteRank;
+ resources->netDev = req->netDev;
+ resources->shared = connection->shared = req->shared;
+ resources->useGdr = req->useGdr;
+ resources->channelId = req->channelId;
+ resources->connIndex = req->connIndex;
+ ncclNetProperties_t props;
+ NCCLCHECK(ncclNetGetProperties(req->netDev, &props));
+ resources->maxRecvs = props.maxRecvs;
- // Connect to remote peer
- NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm));
+ if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
+ NCCLCHECK(ncclNetListen(req->netDev, respBuff, &resources->netListenComm));
+ *done = 1;
+ return ncclSuccess;
+}
+
+static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ struct sendResources* resources = (struct sendResources*)(connection->transportResources);
+ if (reqSize != sizeof(ncclNetHandle_t)) return ncclInternalError;
if (resources->shared) {
+ // Shared buffers
+ struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
+ if (progressState->localPeers == NULL) {
+ NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
+ }
+ struct ncclProxyPeer** localPeers = progressState->localPeers;
+ if (localPeers[resources->localRank] == NULL) {
+ NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1));
+ }
+ connection->proxyAppendPtr = localPeers[resources->localRank]->send.proxyAppend+resources->channelId;
+
+ if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
+ // Connect or reuse connection for a netdev/remote rank.
+ if (progressState->netComms[resources->netDev] == NULL) {
+ NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
+ }
+ struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->remoteRank;
+ if (comms->sendComm[resources->channelId] == NULL) NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, comms->sendComm+resources->channelId));
+ resources->netSendComm = comms->sendComm[resources->channelId];
+ if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
+ } else {
+ NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
+ }
+ } else {
+ // Connect to remote peer
+ NCCLCHECK(ncclNetConnect(resources->netDev, reqBuff, &resources->netSendComm));
+ connection->proxyAppendPtr = &connection->proxyAppend;
+ }
+ if (resources->netSendComm == NULL) {
+ *done = 0;
+ return ncclSuccess;
+ }
+ *done = 1;
+
+ // Create structures
+ struct connectMap* map = &resources->map;
+ map->sameProcess =
+ comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
+ map->shared = resources->shared;
+ CUDACHECK(cudaGetDevice(&map->cudaDev));
+
+ if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      NCCL_NET_MAP_ADD_POINTER(map, 0, p != NCCL_PROTO_LL && resources->useGdr, comm->buffSizes[p], buffs[p]);
+ resources->buffSizes[p] = comm->buffSizes[p];
+ }
+ } else {
// Get shared buffers
- int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
- NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
- resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
+ int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
+ struct connectMapMem* mapMem = map->mems+bank;
+ NCCLCHECK(sharedBuffersInit(
+ comm, resources->useGdr, resources->localRank, 0, map->sameProcess, comm->p2pnChannels,
+ &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, &mapMem->ipc));
+ resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
+ NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
}
- if (resources->buffSizes[LOC_DEVMEM]) {
- NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
+
+ if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
+ if (resources->shared == 0) {
+ if (!map->sameProcess) {
+ ALIGN_SIZE(map->mems[NCCL_NET_MAP_DEVMEM].size, CUDA_IPC_MIN);
+ }
+ NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
+ map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
+ }
+ if (!map->sameProcess) {
+ CUDACHECK(cudaIpcGetMemHandle(&map->mems[NCCL_NET_MAP_DEVMEM].ipc, map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
+ }
+ }
+ if (map->sameProcess) {
+ NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+ map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
+ } else {
+ NCCLCHECK(netCreateShm(map->mems+NCCL_NET_MAP_HOSTMEM));
}
- if (resources->buffSizes[LOC_HOSTMEM]) {
- NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
+ if (ncclGdrCopy && map->sameProcess && ncclParamGdrCopySyncEnable()) {
+ uint64_t *cpuPtr, *gpuPtr;
+ NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 1, &resources->gdrDesc));
+
+ resources->gdcSync = cpuPtr;
+ struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
+ gdcMem->cpuPtr = (char*)cpuPtr;
+ gdcMem->gpuPtr = (char*)gpuPtr;
+ gdcMem->size = sizeof(uint64_t); // sendMem->head
+ }
+
+ resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
+ resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
+
+ // Don't give credits yet in shared mode.
+ resources->sendMem->head = map->shared ? -NCCL_STEPS : 0;
+ for (int i=0; i<NCCL_STEPS; i++) resources->recvMem->sizesFifo[i] = -1;
+
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
+ if (resources->buffers[p]) {
+ NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+ }
}
+
+ //NCCLCHECK(netDumpMap(map));
+ if (respSize != sizeof(struct connectMap)) return ncclInternalError;
+ memcpy(respBuff, map, sizeof(struct connectMap));
return ncclSuccess;
}
-/* Connect to this peer */
-ncclResult_t netRecvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
- // Setup device pointers
- struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
+static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ if (reqSize != sizeof(int)) return ncclInternalError;
+ struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+ resources->proxyRank = *(int*)reqBuff;
// Finish connection establishment from remote peer
- NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
+ if (resources->shared) {
+ // Shared buffers
+ struct ncclProxyProgressState* progressState = &comm->proxyState.progressState;
+ if (progressState->localPeers == NULL) {
+ NCCLCHECK(ncclCalloc(&progressState->localPeers, comm->localRanks));
+ }
+ struct ncclProxyPeer** localPeers = progressState->localPeers;
+ if (localPeers[resources->localRank] == NULL) {
+ NCCLCHECK(ncclCalloc(localPeers+resources->localRank, 1));
+ }
+ connection->proxyAppendPtr = localPeers[resources->localRank]->recv.proxyAppend+resources->channelId;
+
+ if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
+ // Connect or reuse connection for a netdev/remote rank.
+ if (progressState->netComms[resources->netDev] == NULL) {
+ NCCLCHECK(ncclCalloc(progressState->netComms+resources->netDev, comm->nRanks));
+ }
+ struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev]+resources->proxyRank;
+ if (comms->recvComm[resources->channelId] == NULL) NCCLCHECK(ncclNetAccept(resources->netListenComm, comms->recvComm+resources->channelId));
+ resources->netRecvComm = comms->recvComm[resources->channelId];
+ if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++;
+ } else {
+ NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
+ }
+ } else {
+ // Connect to remote peer
+ NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm));
+ connection->proxyAppendPtr = &connection->proxyAppend;
+ }
+ if (resources->netRecvComm == NULL) {
+ *done = 0;
+ return ncclSuccess;
+ }
+ *done = 1;
NCCLCHECK(ncclNetCloseListen(resources->netListenComm));
- if (resources->shared) {
+ // Create structures
+ struct connectMap* map = &resources->map;
+ map->sameProcess =
+ comm->peerInfo[resources->rank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0;
+ if (map->sameProcess == 0) return ncclInternalError; // We don't support remote proxy for recv
+ map->shared = resources->shared;
+
+ if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, comm->buffSizes[p], buffs[p]);
+ resources->buffSizes[p] = comm->buffSizes[p];
+ }
+ } else {
// Get shared buffers
- int loc = resources->useGdr ? LOC_DEVMEM : LOC_HOSTMEM;
- NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, resources->buffSizes+loc, resources->buffers+loc));
- resources->mhandlesProto[NCCL_PROTO_SIMPLE] = resources->mhandles+loc;
+ int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM;
+ struct connectMapMem* mapMem = map->mems+bank;
+ NCCLCHECK(sharedBuffersInit(
+ comm, resources->useGdr, resources->localRank, 1, 1, comm->p2pnChannels,
+ &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL));
+ resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
+ NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
}
- if (resources->buffSizes[LOC_DEVMEM]) {
- NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_DEVMEM], resources->buffSizes[LOC_DEVMEM], NCCL_PTR_CUDA, &resources->mhandles[LOC_DEVMEM]));
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
+
+ if (map->mems[NCCL_NET_MAP_DEVMEM].size) {
+ if (resources->shared == 0) {
+ NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size));
+ map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
+ }
+ }
+ NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
+ map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
+ if (ncclGdrCopy && map->sameProcess) {
+ uint64_t *cpuPtr, *gpuPtr;
+ NCCLCHECK(ncclGdrCudaCalloc(&cpuPtr, &gpuPtr, 2, &resources->gdrDesc));
+
+ if (ncclParamGdrCopySyncEnable()) {
+ resources->gdcSync = cpuPtr;
+ struct connectMapMem* gdcMem = map->mems+NCCL_NET_MAP_GDCMEM;
+ gdcMem->cpuPtr = (char*)cpuPtr;
+ gdcMem->gpuPtr = (char*)gpuPtr;
+ gdcMem->size = sizeof(uint64_t);
+ }
+ if (ncclParamGdrCopyFlushEnable()) resources->gdcFlush = cpuPtr + 1;
}
- if (resources->buffSizes[LOC_HOSTMEM]) {
- NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[LOC_HOSTMEM], resources->buffSizes[LOC_HOSTMEM], NCCL_PTR_HOST, &resources->mhandles[LOC_HOSTMEM]));
+
+ resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem);
+ resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem);
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ resources->buffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]);
+ if (resources->buffers[p]) {
+ NCCLCHECK(ncclNetRegMr(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[p]));
+ }
}
+
+ //NCCLCHECK(netDumpMap(map));
+ if (respSize != sizeof(struct connectMap)) return ncclInternalError;
+ memcpy(respBuff, map, sizeof(struct connectMap));
return ncclSuccess;
}
-ncclResult_t netSendFree(void* transportResources) {
- struct netSendResources* resources = (struct netSendResources*)transportResources;
- NCCLCHECK(ncclCudaHostFree(resources->sendMem));
- NCCLCHECK(ncclCudaHostFree(resources->recvMem));
- for (int l=0; l<LOC_COUNT; l++) {
- if (resources->buffers[l])
- NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[l]));
+static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+ struct sendResources* resources = (struct sendResources*)(connection->transportResources);
+ if (resources == NULL) { // NVB Preconnect
+ NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 0));
+ return ncclSuccess;
+ }
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ if (resources->buffers[p]) {
+ NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandles[p]));
+ }
}
- if (resources->shared == 0) {
- NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
- CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM]));
+ struct connectMapMem* mems = resources->map.mems;
+ if (resources->map.sameProcess) {
+ NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
+ } else {
+ NCCLCHECK(ncclShmClose(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, NULL, mems[NCCL_NET_MAP_HOSTMEM].size));
+ }
+ CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+ if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
+ if (resources->shared) {
+ NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 0));
+ if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
+ struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->remoteRank;
+ comms->sendRefCount[resources->channelId]--;
+ if (comms->sendRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseSend(comms->sendComm[resources->channelId]));
+ } else {
+ NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
+ }
+ } else {
+ NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
}
- NCCLCHECK(ncclNetCloseSend(resources->netSendComm));
free(resources);
return ncclSuccess;
}
-ncclResult_t netRecvFree(void* transportResources) {
- struct netRecvResources* resources = (struct netRecvResources*)transportResources;
- // GDRCOPY support
- if (resources->gdrFlushDesc) {
- NCCLCHECK(ncclGdrCudaFree(resources->gdrFlushDesc));
- }
- // GDRCOPY support
- if (resources->gdrMemDesc) {
- NCCLCHECK(ncclGdrCudaFree(resources->gdrMemDesc));
+static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+ struct recvResources* resources = (struct recvResources*)(connection->transportResources);
+ if (resources == NULL) { // NVB Preconnect
+ NCCLCHECK(sharedBuffersDestroy(comm, connection->localRank, 1));
+ return ncclSuccess;
}
- NCCLCHECK(ncclCudaHostFree(resources->sendMem));
- NCCLCHECK(ncclCudaHostFree(resources->recvMem));
- for (int l=0; l<LOC_COUNT; l++) {
- if (resources->buffers[l])
- NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[l]));
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ if (resources->buffers[p]) {
+ NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandles[p]));
+ }
}
- if (resources->shared == 0) {
- NCCLCHECK(ncclCudaHostFree(resources->buffers[LOC_HOSTMEM]));
- CUDACHECK(cudaFree(resources->buffers[LOC_DEVMEM]));
+ struct connectMapMem* mems = resources->map.mems;
+ NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
+ CUDACHECK(cudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
+ if (mems[NCCL_NET_MAP_GDCMEM].cpuPtr) NCCLCHECK(ncclGdrCudaFree(resources->gdrDesc));
+ if (resources->shared) {
+ NCCLCHECK(sharedBuffersDestroy(comm, resources->localRank, 1));
+ if (resources->maxRecvs > 1 && ncclParamNetSharedComms()) {
+ struct ncclSharedNetComms* comms = comm->proxyState.progressState.netComms[resources->netDev]+resources->proxyRank;
+ comms->recvRefCount[resources->channelId]--;
+ if (comms->recvRefCount[resources->channelId] == 0) NCCLCHECK(ncclNetCloseRecv(comms->recvComm[resources->channelId]));
+ } else {
+ NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
+ }
+ } else {
+ NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
}
- NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm));
free(resources);
return ncclSuccess;
}
static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps");
-ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
+static ncclResult_t sendProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
- struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources);
+ struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
// Round to next multiple of sliceSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->transmitted = sub->done = 0;
+ for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
}
args->state = ncclProxyOpProgress;
}
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
+ int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs);
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
if (sub->done == sub->nsteps) continue;
- struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources);
- void* mhandle = *(resources->mhandlesProto[p]);
- int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
- char* localBuff = sub->connector->conn.buffs[p];
+ struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources);
+ void* mhandle = resources->mhandles[p];
+ int stepSize = resources->buffSizes[p] / NCCL_STEPS;
+ char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
int buffSize = stepSize*args->sliceSteps;
- if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
- if (sub->sendbytes < buffSize) buffSize = sub->sendbytes;
+ if (sub->nbytes < buffSize) buffSize = sub->nbytes;
// Post buffers to the GPU
- if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) {
+ if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) {
int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
if (resources->shared) {
- char* ptr;
- int sharedBuffSlot = sub->posted%NCCL_STEPS;
- NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 0, sub->channel->id, sharedBuffSlot, s, &ptr));
- resources->recvMem->ptrsFifo[buffSlot] = ptr;
+ int sharedBuffSlot = sub->posted%maxDepth;
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s, &offset));
+ resources->recvMem->offsFifo[buffSlot] = offset;
__sync_synchronize();
- volatile uint64_t* sendHead = &resources->sendMem->head;
+ volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
sub->posted += args->sliceSteps;
*sendHead = sub->base + sub->posted - NCCL_STEPS;
+ if (resources->gdcSync) wc_store_fence(); // Flush out WC write
} else sub->posted += args->sliceSteps;
+ for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) {
+ ncclProfilingRecord(args, s, step, ncclProxyProfileSendGPUWait);
+ }
args->idle = 0;
continue;
}
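In shared mode the head counter doubles as the credit mechanism: sendProxyConnect() above initialises sendMem->head to -NCCL_STEPS, so the GPU starts with zero credit, and the proxy only advances head to base + posted - NCCL_STEPS once it has published an offset in offsFifo for those steps. A small sketch of that accounting, assuming the usual window of NCCL_STEPS outstanding steps and a sliceSteps of 2 (illustrative values):

    #include <cstdint>
    #include <cstdio>

    static const int64_t kSteps = 8;  // assumed NCCL_STEPS

    int main() {
      // Shared mode: start with no credit, as in sendProxyConnect() above.
      int64_t base = 0, posted = 0, head = -kSteps;
      for (int iter = 0; iter < 3; iter++) {
        posted += 2;                    // proxy assigns offsets for sliceSteps more steps
        head = base + posted - kSteps;  // grant credit only for steps that now have an offset
        // Assuming the GPU may produce steps in [head, head + kSteps):
        printf("posted %lld -> GPU may produce steps < %lld\n",
               (long long)posted, (long long)(head + kSteps));
      }
      return 0;
    }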
@@ -352,7 +822,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) {
// We have something to receive, let's check if it's completely ready.
int size = sizesFifo[buffSlot];
- char* buff = resources->shared ? (char*)resources->recvMem->ptrsFifo[buffSlot] : localBuff+buffSlot*stepSize;
+ char* buff = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
int ready = 1;
if (p == NCCL_PROTO_LL128) {
ready = resources->useGdr;
@@ -379,13 +849,14 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
}
if (ready) {
// Data is ready, try to send.
- NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, sub->requests+buffSlot));
+ NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, resources->rank, mhandle, sub->requests+buffSlot));
if (sub->requests[buffSlot] != NULL) {
- TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
+ TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]);
sizesFifo[buffSlot] = -1;
// Make sure size is reset to zero before we update the head.
__sync_synchronize();
sub->transmitted += args->sliceSteps;
+ for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileSendWait);
args->idle = 0;
continue;
}
@@ -400,9 +871,12 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
if (done) {
TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]);
sub->done += args->sliceSteps;
+ for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileEnd);
if (resources->shared == 0) {
- resources->sendMem->head = sub->base + sub->done;
+ volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head;
+ *sendHead = sub->base + sub->done;
+ if (resources->gdcSync) wc_store_fence(); // Flush out WC write
}
args->idle = 0;
if (sub->done == sub->nsteps) {
@@ -419,111 +893,203 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
return ncclSuccess;
}
-ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
+static ncclResult_t recvProxyProgress(struct ncclComm* comm, struct ncclProxyArgs* args) {
if (args->state == ncclProxyOpReady) {
+ // Initialize subs and group them by same recvComm.
+ void* recvComm;
+ int groupSize = 0;
+ int maxRecvs = 1;
for (int s=0; s<args->nsubs; s++) {
struct ncclProxySubArgs* sub = args->subs+s;
- struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources);
+ if (groupSize == maxRecvs) {
+ groupSize = 0;
+ } else if (s>0) { // Find next sub with the same recvComm
+ int next;
+ for (next=s; next<args->nsubs; next++) {
+ struct recvResources* nextRes = (struct recvResources*) (args->subs[next].connection->transportResources);
+ if (nextRes->netRecvComm == recvComm) break;
+ }
+ if (next == args->nsubs) { // Not found
+ groupSize = 0;
+      } else if (s != next) { // We found a sub later with the same recvComm; swap subs
+ struct ncclProxySubArgs temp;
+ memcpy(&temp, sub, sizeof(struct ncclProxySubArgs));
+ memcpy(sub, args->subs+next, sizeof(struct ncclProxySubArgs));
+ memcpy(args->subs+next, &temp, sizeof(struct ncclProxySubArgs));
+ }
+ }
+ groupSize++;
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ maxRecvs = resources->maxRecvs;
+ recvComm = resources->netRecvComm;
// Round to next multiple of sliceSteps
sub->base = ROUNDUP(resources->step, args->chunkSteps);
sub->posted = sub->received = sub->transmitted = sub->done = 0;
+ for (int i=0; i<groupSize; i++) sub[-i].groupSize = groupSize;
+ for (uint64_t step=0; step<sub->nsteps; step++) ncclProfilingRecord(args, s, step, ncclProxyProfileBegin);
}
args->state = ncclProxyOpProgress;
}
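The initialisation pass above reorders and groups consecutive subs that share the same netRecvComm, up to maxRecvs per group, so that a single ncclNetIrecv() call further down can post one receive per sub in the group. A simplified sketch of the same grouping idea, using an assumed minimal sub representation:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Illustrative stand-in for a proxy sub: only what the grouping needs.
    struct Sub { int id; void* recvComm; int groupSize; };

    // Group consecutive subs by recvComm, up to maxRecvs per group, swapping
    // matching subs forward as the loop in recvProxyProgress() above does.
    static void groupByComm(std::vector<Sub>& subs, int maxRecvs) {
      int groupSize = 0;
      void* comm = nullptr;
      for (size_t s = 0; s < subs.size(); s++) {
        if (groupSize == maxRecvs) {
          groupSize = 0;
        } else if (s > 0) {
          size_t next = s;
          while (next < subs.size() && subs[next].recvComm != comm) next++;
          if (next == subs.size()) groupSize = 0;             // no later sub on this comm
          else if (next != s) std::swap(subs[s], subs[next]); // pull it next to the group
        }
        groupSize++;
        comm = subs[s].recvComm;
        for (int i = 0; i < groupSize; i++) subs[s - i].groupSize = groupSize;
      }
    }

    int main() {
      void *a = (void*)0x1, *b = (void*)0x2;
      std::vector<Sub> subs = {{0, a, 0}, {1, b, 0}, {2, a, 0}, {3, b, 0}};
      groupByComm(subs, 2);
      for (const Sub& s : subs)
        printf("sub %d comm %p groupSize %d\n", s.id, s.recvComm, s.groupSize);
      return 0;
    }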
args->idle = 1;
if (args->state == ncclProxyOpProgress) {
int p = args->protocol;
- for (int s=0; s<args->nsubs; s++) {
- struct ncclProxySubArgs* sub = args->subs+s;
- if (sub->done == sub->nsteps) continue;
- struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources);
- void* mhandle = *(resources->mhandlesProto[p]);
- int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS;
- char* localBuff = sub->connector->conn.buffs[p];
- int buffSize = stepSize*args->sliceSteps;
- if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR;
- if (sub->recvbytes < buffSize) buffSize = sub->recvbytes;
+ int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs);
+ for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) {
+ struct ncclProxySubArgs* subGroup = args->subs+s;
+ int subCount = 0;
+ void* ptrs[NCCL_PROXY_MAX_SUBS];
+ int sizes[NCCL_PROXY_MAX_SUBS];
+ int tags[NCCL_PROXY_MAX_SUBS];
+ void* mhandles[NCCL_PROXY_MAX_SUBS];
- if ((sub->posted < sub->done + NCCL_STEPS) && (sub->posted < sub->nsteps)) {
- int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
- char* ptr;
- if (resources->shared) {
- int sharedBuffSlot = sub->posted%NCCL_STEPS;
- NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 1, sub->channel->id, sharedBuffSlot, s, &ptr));
- volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
- ptrsFifo[buffSlot] = ptr;
- } else {
- ptr = localBuff+buffSlot*stepSize;
+ for (int i=0; i<subGroup->groupSize; i++) {
+ struct ncclProxySubArgs* sub = subGroup + i;
+ if (sub->posted < sub->nsteps) {
+ if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; }
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ int stepSize = resources->buffSizes[p] / NCCL_STEPS;
+ char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
+ int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
+ if (resources->shared) {
+ int sharedBuffSlot = sub->posted%maxDepth;
+ int offset;
+ NCCLCHECK(sharedBuffersGet(comm, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset));
+ volatile int* offsFifo = (volatile int*)resources->recvMem->offsFifo;
+ offsFifo[buffSlot] = offset;
+ ptrs[subCount] = localBuff+offset;
+ } else {
+ ptrs[subCount] = localBuff+buffSlot*stepSize;
+ }
+ sizes[subCount] = stepSize*args->sliceSteps;
+ if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes;
+ tags[subCount] = resources->remoteRank;
+ mhandles[subCount] = resources->mhandles[p];
+ subCount++;
}
- NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, sub->requests+buffSlot));
- if (sub->requests[buffSlot] != NULL) {
- TRACE(NCCL_NET, "recvProxy [%ld/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]);
- sub->posted += args->sliceSteps;
+ }
+ if (subCount) {
+ uint64_t step = subGroup->posted;
+ struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
+ void** requestPtr = subGroup->requests+(step%NCCL_STEPS);
+ NCCLCHECK(ncclNetIrecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr));
+ if (*requestPtr) {
+ for (int i=0; i<subGroup->groupSize; i++) {
+ struct ncclProxySubArgs* sub = subGroup+i;
+ sub->posted += args->sliceSteps;
+ for (uint64_t step=sub->posted-args->sliceSteps; step<sub->posted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvWait);
+ }
args->idle = 0;
- continue;
}
}
- if (sub->posted > sub->received) {
- int buffSlot = (sub->base+sub->received)%NCCL_STEPS;
- int done, size;
- NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, &size));
- if (done) {
- sub->received += args->sliceSteps;
- if (size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) {
- // Don't pass data to the GPU yet, flush first.
+ }
+ if (args->idle == 0) return ncclSuccess;
+ for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) {
+ struct ncclProxySubArgs* subGroup = args->subs+s;
+ if (subGroup->posted > subGroup->received) {
+ uint64_t step = subGroup->received;
+ int done;
+ void* ptrs[NCCL_PROXY_MAX_SUBS];
+ int sizes[NCCL_PROXY_MAX_SUBS];
+ void* mhandles[NCCL_PROXY_MAX_SUBS];
+ for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) sizes[i] = 0;
+ NCCLCHECK(ncclNetTest(subGroup->requests[step%NCCL_STEPS], &done, sizes));
+ if (done) {
+ int useGdr = 0;
+ int totalSize = 0;
+ for (int i=0; i<NCCL_PROXY_MAX_SUBS; i++) totalSize += sizes[i];
+ for (int i=0; i<subGroup->groupSize; i++) {
+ struct ncclProxySubArgs* sub = subGroup + i;
+ sub->received += args->sliceSteps;
+ for (uint64_t step=sub->received-args->sliceSteps; step<sub->received; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvFlushWait);
+ if (step < sub->nsteps) {
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ if (resources->useGdr) useGdr = 1;
+ }
+ }
+ subGroup->requests[step%NCCL_STEPS] = NULL;
+ if (totalSize > 0 && p == NCCL_PROTO_SIMPLE && useGdr) {
// GDRCOPY support
- if (resources->devFlushMem) {
+ struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
+ if (resources->gdcFlush) {
#if defined (__x86_64__)
// Force a PCI-E read from GPU memory
- asm volatile ("mov (%0), %%eax" :: "l"(resources->devFlushMem) : "%eax");
+ asm volatile ("mov (%0), %%eax" :: "l"(resources->gdcFlush) : "%eax");
#else
WARN("NET: GDR Flush only supported on x86_64");
return ncclInternalError;
#endif
- sub->requests[buffSlot] = NULL;
} else {
- volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo;
- char* ptr = resources->shared ? (char*)(ptrsFifo[buffSlot]) : localBuff+buffSlot*stepSize;
- NCCLCHECK(ncclNetIflush(resources->netRecvComm, ptr, size, mhandle, sub->requests+buffSlot));
+ int subCount = 0;
+ for (int i=0; i<subGroup->groupSize; i++) {
+ struct ncclProxySubArgs* sub = subGroup + i;
+ if (step < sub->nsteps) {
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ int stepSize = resources->buffSizes[p] / NCCL_STEPS;
+ char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]);
+ int buffSlot = (sub->base+sub->posted)%NCCL_STEPS;
+ ptrs[subCount] = resources->shared ? localBuff+resources->recvMem->offsFifo[buffSlot] : localBuff+buffSlot*stepSize;
+ mhandles[subCount] = resources->mhandles[p];
+ subCount++;
+ }
+ }
+ struct recvResources* resources = (struct recvResources*) (subGroup->connection->transportResources);
+ NCCLCHECK(ncclNetIflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
}
- } else {
- sub->requests[buffSlot] = NULL;
}
args->idle = 0;
- continue;
}
}
- if (sub->received > sub->transmitted) {
- // Progress flush operations
- int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS;
+ }
+ if (args->idle == 0) return ncclSuccess;
+
+ for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) {
+ struct ncclProxySubArgs* subGroup = args->subs+s;
+ if (subGroup->received > subGroup->transmitted) {
+ uint64_t step = subGroup->transmitted;
int done = 1;
- if (sub->requests[buffSlot]) NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL));
+ void* request = subGroup->requests[step%NCCL_STEPS];
+ if (request) NCCLCHECK(ncclNetTest(request, &done, NULL));
if (done) {
- sub->transmitted += args->sliceSteps;
- __sync_synchronize();
- if (resources->devRecvMem) {
- // GDRCOPY support: Write updated tail directly to the device memory
- resources->devRecvMem->tail = sub->base + sub->transmitted;
- wc_store_fence(); // Flush out WC write
- } else {
- resources->recvMem->tail = sub->base + sub->transmitted;
+ for (int i=0; i<subGroup->groupSize; i++) {
+ struct ncclProxySubArgs* sub = subGroup + i;
+ sub->transmitted += args->sliceSteps;
+ for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait);
+ if (step < sub->nsteps) {
+ __sync_synchronize();
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail;
+ *recvTail = sub->base + sub->transmitted;
+ if (resources->gdcSync) wc_store_fence(); // Flush out WC write
+ }
}
args->idle = 0;
- continue;
}
}
- if (sub->transmitted > sub->done) {
- volatile uint64_t* sendHead = &resources->sendMem->head;
- uint64_t done = *sendHead;
- while (done > sub->base + sub->done &&
- // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
- sub->transmitted > sub->done) {
- sub->done += args->sliceSteps;
- args->idle = 0;
- if (sub->done == sub->nsteps) {
- resources->step = sub->base + sub->nsteps;
- args->done++;
+ }
+ if (args->idle == 0) return ncclSuccess;
+
+ for (int s=0; s<args->nsubs; s+=args->subs[s].groupSize) {
+ struct ncclProxySubArgs* subGroup = args->subs+s;
+ for (int i=0; i<subGroup->groupSize; i++) {
+ struct ncclProxySubArgs* sub = subGroup + i;
+ if (sub->done == sub->nsteps) continue;
+ if (sub->transmitted > sub->done) {
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ volatile uint64_t* sendHead = &resources->sendMem->head;
+ uint64_t done = *sendHead;
+ while (done > sub->base + sub->done &&
+ // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted.
+ sub->transmitted > sub->done) {
+ sub->done += args->sliceSteps;
+ for (uint64_t step=sub->done-args->sliceSteps; step<sub->done; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileEnd);
+ args->idle = 0;
+ if (sub->done == sub->nsteps) {
+ struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources);
+ resources->step = sub->base + sub->nsteps;
+ args->done++;
+ break;
+ }
}
}
}
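
The rewritten receive proxy above drives each group of sub-operations through four counters (posted, received, transmitted, done) and returns as soon as any stage makes progress, with the posting depth capped at min(NCCL_STEPS, NCCL_SHARED_STEPS/nsubs) when shared buffers are in use. Below is a minimal illustrative sketch of that pipeline, not code from this diff; the names Sub and advanceOnce are hypothetical.

// Illustrative sketch only (not part of the diff): a simplified model of the
// four-stage progression used by recvProxyProgress above.
#include <cstdint>

struct Sub {
  uint64_t posted = 0, received = 0, transmitted = 0, done = 0;
  uint64_t nsteps = 0;   // total steps this sub-operation must complete
};

// Advance one sub by sliceSteps wherever the ordering constraints allow it:
// posted <= done + maxDepth, received <= posted, transmitted <= received,
// done <= transmitted. Returns true if any stage moved.
static bool advanceOnce(Sub& s, uint64_t sliceSteps, uint64_t maxDepth) {
  if (s.posted < s.nsteps && s.posted < s.done + maxDepth) {
    s.posted += sliceSteps;       // network receive posted
  } else if (s.received < s.posted) {
    s.received += sliceSteps;     // data arrived; flush issued if GDR is in use
  } else if (s.transmitted < s.received) {
    s.transmitted += sliceSteps;  // flush complete; tail advanced for the GPU
  } else if (s.done < s.transmitted) {
    s.done += sliceSteps;         // GPU signalled consumption via the head pointer
  } else {
    return false;                 // nothing to do; the proxy stays idle
  }
  return true;
}
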
@@ -537,7 +1103,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
struct ncclTransport netTransport = {
"NET",
- netCanConnect,
- { netSendSetup, netSendConnect, netSendFree, netSendProxy },
- { netRecvSetup, netRecvConnect, netRecvFree, netRecvProxy }
+ canConnect,
+ { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress },
+ { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress }
};
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index db27eae..4edff0f 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -20,26 +20,44 @@
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
+#define ENABLE_TIMER 0
+#include "timer.h"
#include "ibvwrap.h"
#define USE_RDMA_WRITE 1
#define MAXNAMESIZE 64
static char ncclIbIfName[MAX_IF_NAME_SIZE+1];
-static union socketAddress ncclIbIfAddr;
+static union ncclSocketAddress ncclIbIfAddr;
+
+struct ncclIbMr {
+ uintptr_t addr;
+ int pages;
+ int refs;
+ ibv_mr *mr;
+};
+
+struct ncclIbMrCache {
+ struct ncclIbMr *slots;
+ int capacity, population;
+};
static int ncclNIbDevs = -1;
struct ncclIbDev {
+ pthread_mutex_t lock;
int device;
uint64_t guid;
uint8_t port;
uint8_t link;
int speed;
ibv_context* context;
+ int pdRefs;
+ ibv_pd* pd;
char devName[MAXNAMESIZE];
char* pciPath;
int realPort;
int maxQp;
+ struct ncclIbMrCache mrCache;
};
#define MAX_IB_PORT 15
@@ -52,6 +70,7 @@ struct userIbDev {
struct ncclIbDev ncclIbDevs[MAX_IB_DEVS];
struct userIbDev userIbDevs[MAX_IB_DEVS];
pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER;
+static int ncclIbRelaxedOrderingEnabled = 0;
NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", 0);
NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14);
@@ -61,6 +80,7 @@ NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0);
NCCL_PARAM(IbSl, "IB_SL", 0);
NCCL_PARAM(IbTc, "IB_TC", 0);
NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
+NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2);
pthread_t ncclIbAsyncThread;
static void* ncclIbAsyncThreadMain(void* args) {
@@ -114,17 +134,28 @@ static int ncclIbSpeed(int speed) {
return ibvSpeeds[firstBitSet(speed, sizeof(ibvSpeeds)/sizeof(int)-1)];
}
+// Determine whether RELAXED_ORDERING is enabled and possible
+static int ncclIbRelaxedOrderingCapable(void) {
+ int roMode = ncclParamIbPciRelaxedOrdering();
+ ncclResult_t r = ncclInternalError;
+ if (roMode == 1 || roMode == 2) {
+ // Query IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
+ r = wrap_ibv_reg_mr_iova2(NULL, NULL, NULL, 0, 0, 0);
+ }
+ return r == ncclInternalError ? 0 : 1;
+}
+
ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
+ if (ncclParamIbDisable()) return ncclInternalError;
static int shownIbHcaEnv = 0;
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
- if (ncclParamIbDisable()) return ncclInternalError;
if (ncclNIbDevs == -1) {
pthread_mutex_lock(&ncclIbLock);
wrap_ibv_fork_init();
if (ncclNIbDevs == -1) {
ncclNIbDevs = 0;
- if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
+ if (ncclFindInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) {
WARN("NET/IB : No IP interface found.");
return ncclInternalError;
}
@@ -175,18 +206,26 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
}
TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
+ pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL);
ncclIbDevs[ncclNIbDevs].device = d;
ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid;
ncclIbDevs[ncclNIbDevs].port = port;
ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width);
ncclIbDevs[ncclNIbDevs].context = context;
+ ncclIbDevs[ncclNIbDevs].pdRefs = 0;
+ ncclIbDevs[ncclNIbDevs].pd = NULL;
strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort));
ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp;
+ ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0;
+ ncclIbDevs[ncclNIbDevs].mrCache.population = 0;
+ ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL;
+
+ pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
+ ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
ncclNIbDevs++;
nPorts++;
- pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
}
if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; }
}
@@ -197,13 +236,16 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
} else {
char line[1024];
line[0] = '\0';
+ // Determine whether RELAXED_ORDERING is enabled and possible
+ ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable();
for (int d=0; d<ncclNIbDevs; d++) {
snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName,
ncclIbDevs[d].port, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
}
line[1023] = '\0';
char addrline[SOCKET_NAME_MAXLEN+1];
- INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s ; OOB %s:%s", line, ncclIbIfName, socketToString(&ncclIbIfAddr, addrline));
+ INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "",
+ ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline));
}
pthread_mutex_unlock(&ncclIbLock);
}
@@ -231,11 +273,13 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
return ncclSuccess;
}
-static ncclResult_t GetSocketAddr(union socketAddress* addr) {
+static ncclResult_t GetSocketAddr(union ncclSocketAddress* addr) {
memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
return ncclSuccess;
}
+#define NCCL_NET_IB_MAX_RECVS 8
+
ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
props->name = ncclIbDevs[dev].devName;
props->pciPath = ncclIbDevs[dev].pciPath;
@@ -247,18 +291,23 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
props->ptrSupport |= NCCL_PTR_CUDA;
}
props->speed = ncclIbDevs[dev].speed;
+ props->latency = 0; // Not set
props->port = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort;
props->maxComms = ncclIbDevs[dev].maxQp;
+ props->maxRecvs = NCCL_NET_IB_MAX_RECVS;
return ncclSuccess;
}
-#define MAX_REQUESTS NCCL_NET_MAX_REQUESTS
+// We need to support NCCL_NET_MAX_REQUESTS for each concurrent receive
+#define MAX_REQUESTS (NCCL_NET_MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS)
+static_assert(MAX_REQUESTS <= 256, "request id are encoded in wr_id and we need up to 8 requests ids per completion");
#define NCCL_IB_MAX_QPS 128
struct ncclIbQpInfo {
uint32_t lid;
uint8_t ib_port;
+ uint8_t link_layer;
uint32_t qpn[NCCL_IB_MAX_QPS];
// For RoCE
@@ -271,46 +320,83 @@ struct ncclIbQpInfo {
uint64_t fifoAddr;
};
+enum ncclIbCommState {
+ ncclIbCommStateStart = 0,
+ ncclIbCommStateConnect = 1,
+ ncclIbCommStateAccept = 3,
+ ncclIbCommStateSend = 4,
+ ncclIbCommStateRecv = 5,
+ ncclIbCommStateConnected = 6,
+};
+
+struct ncclIbCommStage {
+ enum ncclIbCommState state;
+ int offset;
+ void* buffer;
+ void* comm;
+};
+
struct ncclIbHandle {
- union socketAddress connectAddr;
+ union ncclSocketAddress connectAddr; // Filled by the target
+ struct ncclIbCommStage stage; // Used by the other side when connecting
};
+#define NCCL_NET_IB_REQ_UNUSED 0
+#define NCCL_NET_IB_REQ_SEND 1
+#define NCCL_NET_IB_REQ_RECV 2
+#define NCCL_NET_IB_REQ_FLUSH 3
+
struct ncclIbRequest {
- int used;
- int type;
struct ncclIbVerbs* verbs;
+ int type;
int events;
- int size;
- union socketAddress *addr;
+ union ncclSocketAddress *addr;
+ int nreqs;
+ union {
+ struct {
+ int size;
+ void* data;
+ uint32_t lkey;
+ int offset;
+ } send;
+ struct {
+ int sizes[NCCL_NET_IB_MAX_RECVS];
+ } recv;
+ };
};
struct ncclIbVerbs {
- struct ibv_pd* pd;
+ int dev;
+ struct ibv_pd* pd; // duplicate of ncclIbDevs[dev].pd
struct ibv_cq* cq;
- uint64_t pad[2];
+ uint64_t pad[1];
struct ncclIbRequest reqs[MAX_REQUESTS];
};
struct ncclIbListenComm {
int dev;
- int fd;
+ struct ncclSocket sock;
+ struct ncclIbCommStage stage;
};
struct ncclIbSendFifo {
uint64_t addr;
int size;
- uint32_t seq;
uint32_t rkey;
- uint32_t ready;
- uint64_t pad[1]; // Pad FIFO element size to be 32-bytes
+ uint32_t nreqs;
+ uint32_t tag;
+ uint64_t idx;
};
struct ncclIbSendComm {
struct ncclIbVerbs verbs;
- struct ncclIbSendFifo fifo[MAX_REQUESTS];
- uint32_t fifoHead;
- int fd;
- union socketAddress addr;
+ struct ncclIbSendFifo fifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+ uint64_t fifoHead;
+ struct ncclIbRequest* fifoReqs[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+ struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS+1];
+ struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
+ struct ncclSocket sock;
+
int ready;
struct ibv_qp* qps[NCCL_IB_MAX_QPS];
int nqps;
@@ -331,10 +417,10 @@ struct ncclIbGpuFlush {
};
struct ncclIbRemFifo {
- struct ncclIbSendFifo elems[MAX_REQUESTS];
+ struct ncclIbSendFifo elems[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS];
+ uint64_t fifoTail;
uint64_t addr;
uint32_t rkey;
- uint32_t tail;
uint32_t flags;
struct ibv_mr* mr;
struct ibv_sge sge;
@@ -343,8 +429,7 @@ struct ncclIbRemFifo {
struct ncclIbRecvComm {
struct ncclIbVerbs verbs;
struct ncclIbRemFifo remFifo;
- int fd;
- union socketAddress addr;
+ struct ncclSocket sock;
int ready;
struct ibv_qp* qps[NCCL_IB_MAX_QPS];
int nqps;
@@ -354,17 +439,39 @@ static_assert((offsetof(struct ncclIbRecvComm, remFifo) % 32) == 0, "ncclIbSendC
NCCL_PARAM(IbQpsPerConn, "IB_QPS_PER_CONNECTION", 1);
-ncclResult_t ncclIbInitVerbs(ibv_context* ctx, struct ncclIbVerbs* verbs) {
- NCCLCHECK(wrap_ibv_alloc_pd(&verbs->pd, ctx));
+ncclResult_t ncclIbInitVerbs(int dev, struct ibv_context* ctx, struct ncclIbVerbs* verbs) {
+ verbs->dev = dev;
+
+ pthread_mutex_lock(&ncclIbDevs[dev].lock);
+ if (0 == ncclIbDevs[dev].pdRefs++) {
+ ncclResult_t res;
+ NCCLCHECKGOTO(wrap_ibv_alloc_pd(&ncclIbDevs[dev].pd, ctx), res, failure);
+ if (0) {
+ failure:
+ pthread_mutex_unlock(&ncclIbDevs[dev].lock);
+ return res;
+ }
+ }
+ verbs->pd = ncclIbDevs[dev].pd;
+ pthread_mutex_unlock(&ncclIbDevs[dev].lock);
+
// Recv requests can generate 2 completions (one for the post FIFO, one for the Recv).
NCCLCHECK(wrap_ibv_create_cq(&verbs->cq, ctx, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), NULL, NULL, 0));
return ncclSuccess;
}
ncclResult_t ncclIbDestroyVerbs(struct ncclIbVerbs* verbs) {
+ ncclResult_t res;
NCCLCHECK(wrap_ibv_destroy_cq(verbs->cq));
- NCCLCHECK(wrap_ibv_dealloc_pd(verbs->pd));
- return ncclSuccess;
+
+ pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
+ if (0 == --ncclIbDevs[verbs->dev].pdRefs) {
+ NCCLCHECKGOTO(wrap_ibv_dealloc_pd(ncclIbDevs[verbs->dev].pd), res, returning);
+ }
+ res = ncclSuccess;
+returning:
+ pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock);
+ return res;
}
ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int access_flags, struct ibv_qp** qp) {
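
ncclIbInitVerbs and ncclIbDestroyVerbs above now share one protection domain per device, allocating it for the first user and freeing it when the last reference is dropped, all under the per-device lock. A minimal sketch of that refcounting pattern follows; DevState, acquirePd and releasePd are hypothetical stand-ins, not NCCL functions.

// Illustrative sketch only (not part of the diff): per-device refcounted
// sharing of a single protection-domain-like resource.
#include <pthread.h>

struct DevState {
  pthread_mutex_t lock;   // one lock per device, initialized at init time
  int pdRefs;             // number of live verbs contexts sharing the PD
  void* pd;               // stands in for the shared ibv_pd*
};

// First caller allocates the protection domain; later callers reuse it.
static void* acquirePd(DevState* dev, void* (*allocPd)()) {
  pthread_mutex_lock(&dev->lock);
  if (dev->pdRefs++ == 0) dev->pd = allocPd();
  void* pd = dev->pd;
  pthread_mutex_unlock(&dev->lock);
  return pd;
}

// Last caller to release the PD frees it.
static void releasePd(DevState* dev, void (*freePd)(void*)) {
  pthread_mutex_lock(&dev->lock);
  if (--dev->pdRefs == 0) { freePd(dev->pd); dev->pd = nullptr; }
  pthread_mutex_unlock(&dev->lock);
}
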
@@ -390,7 +497,7 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce
return ncclSuccess;
}
-ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) {
+ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) {
struct ibv_qp_attr qpAttr;
memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
qpAttr.qp_state = IBV_QPS_RTR;
@@ -399,7 +506,7 @@ ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) {
qpAttr.rq_psn = 0;
qpAttr.max_dest_rd_atomic = 1;
qpAttr.min_rnr_timer = 12;
- if (info->lid == 0) {
+ if (info->link_layer == IBV_LINK_LAYER_ETHERNET) {
qpAttr.ah_attr.is_global = 1;
qpAttr.ah_attr.grh.dgid.global.subnet_prefix = info->spn;
qpAttr.ah_attr.grh.dgid.global.interface_id = info->iid;
@@ -418,7 +525,7 @@ ncclResult_t ncclIbRtrQp(ibv_qp* qp, uint32_t qpn, struct ncclIbQpInfo* info) {
return ncclSuccess;
}
-ncclResult_t ncclIbRtsQp(ibv_qp* qp) {
+ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) {
struct ibv_qp_attr qpAttr;
memset(&qpAttr, 0, sizeof(struct ibv_qp_attr));
qpAttr.qp_state = IBV_QPS_RTS;
@@ -431,33 +538,56 @@ ncclResult_t ncclIbRtsQp(ibv_qp* qp) {
return ncclSuccess;
}
-
ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) {
struct ncclIbListenComm* comm;
NCCLCHECK(ncclCalloc(&comm, 1));
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
static_assert(sizeof(struct ncclIbHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclIbHandle size too large");
+ memset(handle, 0, sizeof(struct ncclIbHandle));
comm->dev = dev;
- NCCLCHECK(GetSocketAddr(&(handle->connectAddr)));
- NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
+ NCCLCHECK(GetSocketAddr(&comm->sock.addr));
+ NCCLCHECK(ncclSocketListen(&comm->sock));
+ memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
*listenComm = comm;
return ncclSuccess;
}
ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
- struct ncclIbSendComm* comm;
- NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
-
struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle;
- NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr));
- *sendComm = comm;
+ enum ncclSocketState conState;
+ struct ncclIbCommStage* stage = &handle->stage;
+ struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm;
+ *sendComm = NULL;
+
+ if (stage->state == ncclIbCommStateConnect) goto ib_connect_check;
+ if (stage->state == ncclIbCommStateSend) goto ib_send;
+ if (stage->state != ncclIbCommStateStart) {
+ WARN("Error: trying to connect already connected sendComm");
+ return ncclInternalError;
+ }
- comm->addr = handle->connectAddr;
+ NCCLCHECK(ncclIbMalloc((void**)&comm, sizeof(struct ncclIbSendComm)));
+ NCCLCHECK(ncclSocketInit(&comm->sock, &handle->connectAddr, NULL, 1));
+ stage->comm = comm;
+ stage->state = ncclIbCommStateConnect;
+ NCCLCHECK(ncclSocketConnect(&comm->sock));
+
+ib_connect_check:
+ /* since ncclSocketConnect is async, we must check if connection is complete */
+ NCCLCHECK(ncclGetSocketState(&comm->sock, &conState));
+ if (conState == ncclSocketConnecting) {
+ /* expect user to call again */
+ return ncclSuccess;
+ } else if (conState == ncclSocketError) {
+ return ncclSystemError;
+ }
// IB Setup
- ibv_context* ctx = ncclIbDevs[dev].context;
- NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs));
- uint8_t ib_port = ncclIbDevs[dev].port;
+ struct ibv_context* ctx;
+ ctx = ncclIbDevs[dev].context;
+ NCCLCHECK(ncclIbInitVerbs(dev, ctx, &comm->verbs));
+ uint8_t ib_port;
+ ib_port = ncclIbDevs[dev].port;
comm->nqps = ncclParamIbQpsPerConn();
for (int q=0; q<comm->nqps; q++) {
NCCLCHECK(ncclIbCreateQp(ib_port, &comm->verbs, IBV_ACCESS_REMOTE_WRITE, comm->qps+q));
@@ -472,13 +602,14 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
qpInfo.mtu = portAttr.active_mtu;
// Prepare my fifo
- NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
+ NCCLCHECK(wrap_ibv_reg_mr(&comm->fifoMr, comm->verbs.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
qpInfo.fifoRkey = comm->fifoMr->rkey;
qpInfo.fifoAddr = (uint64_t)comm->fifo;
// RoCE support
qpInfo.lid = portAttr.lid;
- if (qpInfo.lid) { // IB
+ qpInfo.link_layer = portAttr.link_layer;
+ if (qpInfo.link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB
for (int q=0; q<comm->nqps; q++)
INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, qpInfo.lid);
} else { // RoCE
@@ -490,7 +621,19 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) {
INFO(NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn[q], qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid);
}
- NCCLCHECK(socketSend(comm->fd, &comm->addr, &qpInfo, sizeof(qpInfo)));
+ stage->state = ncclIbCommStateSend;
+ stage->offset = 0;
+ NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(qpInfo)));
+ memcpy(stage->buffer, &qpInfo, sizeof(qpInfo));
+
+ib_send:
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->sock, stage->buffer, sizeof(qpInfo), &stage->offset));
+ if (stage->offset != sizeof(qpInfo))
+ return ncclSuccess;
+
+ free(stage->buffer);
+ stage->state = ncclIbCommStateConnected;
+ *sendComm = comm;
return ncclSuccess;
}
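
ncclIbConnect above (and ncclIbAccept below) are now resumable: each call does as much non-blocking work as the socket allows, records its position in the stage struct stored in the handle, and returns with a NULL comm so the caller can simply call again. The sketch below illustrates that pattern in isolation; ConnHandle, tryConnect and trySend are hypothetical names, not NCCL APIs.

// Illustrative sketch only (not part of the diff): a resumable, non-blocking
// connect-then-handshake state machine.
#include <cstddef>

enum ConnStage { StageStart, StageConnect, StageSend, StageDone };

struct ConnHandle {
  ConnStage stage = StageStart;
  size_t offset = 0;   // handshake bytes already sent
};

// tryConnect() returns true once the socket has finished connecting; trySend()
// returns how many additional bytes the socket accepted without blocking.
// Returns true only when the connection and the handshake are both complete.
bool resumeConnect(ConnHandle* h, size_t msgSize,
                   bool (*tryConnect)(), size_t (*trySend)(size_t offset)) {
  if (h->stage == StageStart) h->stage = StageConnect;
  if (h->stage == StageConnect) {
    if (!tryConnect()) return false;    // still connecting; the caller retries later
    h->stage = StageSend;
  }
  if (h->stage == StageSend) {
    h->offset += trySend(h->offset);    // push as much of the handshake as possible
    if (h->offset < msgSize) return false;
    h->stage = StageDone;
  }
  return h->stage == StageDone;
}
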
@@ -498,24 +641,53 @@ NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0);
ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
struct ncclIbListenComm* lComm = (struct ncclIbListenComm*)listenComm;
- struct ncclIbRecvComm* rComm;
+ struct ncclIbCommStage* stage = &lComm->stage;
+ struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm;
+ *recvComm = NULL;
+
+ if (stage->state == ncclIbCommStateAccept) goto ib_accept;
+ if (stage->state == ncclIbCommStateRecv) goto ib_recv;
+ if (stage->state == ncclIbCommStateSend) goto ib_send;
+ if (stage->state != ncclIbCommStateStart) {
+ WARN("Listencomm in unknown state %d\n", stage->state);
+ return ncclInternalError;
+ }
+
NCCLCHECK(ncclIbMalloc((void**)&rComm, sizeof(struct ncclIbRecvComm)));
+ stage->comm = rComm;
+ stage->state = ncclIbCommStateAccept;
+ lComm->sock.asyncFlag = 1;
+ rComm->sock.asyncFlag = 1;
+
+ib_accept:
+ NCCLCHECK(ncclSocketAccept(&rComm->sock, &lComm->sock));
+ if (rComm->sock.fd == -1)
+ return ncclSuccess;
- socklen_t socklen = sizeof(union socketAddress);
- SYSCHECKVAL(accept(lComm->fd, &rComm->addr.sa, &socklen), "accept", rComm->fd);
struct ncclIbQpInfo remQpInfo;
- NCCLCHECK(socketRecv(rComm->fd, &rComm->addr, &remQpInfo, sizeof(remQpInfo)));
+ stage->state = ncclIbCommStateRecv;
+ stage->offset = 0;
+ NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remQpInfo)));
+ib_recv:
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->sock, stage->buffer, sizeof(remQpInfo), &stage->offset));
+ if (stage->offset != sizeof(remQpInfo))
+ return ncclSuccess;
+
+ /* copy back the received info */
+ memcpy(&remQpInfo, stage->buffer, sizeof(struct ncclIbQpInfo));
// IB setup
- ibv_context* ctx = ncclIbDevs[lComm->dev].context;
- uint8_t ib_port = ncclIbDevs[lComm->dev].port;
+ struct ibv_context* ctx;
+ uint8_t ib_port;
+ ctx = ncclIbDevs[lComm->dev].context;
+ ib_port = ncclIbDevs[lComm->dev].port;
struct ibv_port_attr portAttr;
NCCLCHECK(wrap_ibv_query_port(ctx, ib_port, &portAttr));
union ibv_gid gid;
NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid));
// QP Creation
- NCCLCHECK(ncclIbInitVerbs(ctx, &rComm->verbs));
+ NCCLCHECK(ncclIbInitVerbs(lComm->dev, ctx, &rComm->verbs));
rComm->nqps = ncclParamIbQpsPerConn();
for (int q=0; q<rComm->nqps; q++) {
NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_REMOTE_WRITE, rComm->qps+q));
@@ -534,8 +706,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
// Retain remote fifo info and prepare my RDMA ops
rComm->remFifo.rkey = remQpInfo.fifoRkey;
rComm->remFifo.addr = remQpInfo.fifoAddr;
- NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
- rComm->remFifo.sge.length = sizeof(struct ncclIbSendFifo);
+ NCCLCHECK(wrap_ibv_reg_mr(&rComm->remFifo.mr, rComm->verbs.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ));
rComm->remFifo.sge.lkey = rComm->remFifo.mr->lkey;
if (ncclParamIbUseInline()) rComm->remFifo.flags = IBV_SEND_INLINE;
@@ -549,6 +720,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
NCCLCHECK(ncclIbCreateQp(ib_port, &rComm->verbs, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ, &rComm->gpuFlush.qp));
struct ncclIbQpInfo localQpInfo;
localQpInfo.lid=portAttr.lid;
+ localQpInfo.link_layer=portAttr.link_layer;
localQpInfo.ib_port=ib_port;
localQpInfo.spn=gid.global.subnet_prefix;
localQpInfo.iid=gid.global.interface_id;
@@ -560,26 +732,39 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm) {
// Fill Handle
struct ncclIbQpInfo qpInfo;
qpInfo.lid=portAttr.lid;
+ qpInfo.link_layer=portAttr.link_layer;
qpInfo.ib_port=ib_port;
for (int q=0; q<rComm->nqps; q++) qpInfo.qpn[q]=rComm->qps[q]->qp_num;
qpInfo.spn=gid.global.subnet_prefix;
qpInfo.iid=gid.global.interface_id;
qpInfo.mtu=remQpInfo.mtu;
- NCCLCHECK(socketSend(rComm->fd, &rComm->addr, &qpInfo, sizeof(qpInfo)));
+ stage->state = ncclIbCommStateSend;
+ stage->offset = 0;
+ if (stage->buffer) free(stage->buffer);
+ NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(struct ncclIbQpInfo)));
+ memcpy(stage->buffer, &qpInfo, sizeof(struct ncclIbQpInfo));
+ib_send:
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->sock, stage->buffer, sizeof(struct ncclIbQpInfo), &stage->offset));
+ if (stage->offset < sizeof(struct ncclIbQpInfo)) return ncclSuccess;
+
+ free(stage->buffer);
*recvComm = rComm;
+
+ /* reset lComm stage */
+ stage->state = ncclIbCommStateStart;
+ stage->offset = 0;
+ stage->comm = NULL;
+ stage->buffer = NULL;
return ncclSuccess;
}
ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest** req) {
for (int i=0; i<MAX_REQUESTS; i++) {
struct ncclIbRequest* r = verbs->reqs+i;
- if (r->used == 0) {
- r->used = 1;
- r->type = 0;
+ if (r->type == NCCL_NET_IB_REQ_UNUSED) {
r->verbs = verbs;
r->events = 1;
- r->size = -1;
r->addr = NULL;
*req = r;
return ncclSuccess;
@@ -590,7 +775,7 @@ ncclResult_t ncclIbGetRequest(struct ncclIbVerbs* verbs, struct ncclIbRequest**
return ncclInternalError;
}
ncclResult_t ncclIbFreeRequest(struct ncclIbRequest* r) {
- r->used = 0;
+ r->type = NCCL_NET_IB_REQ_UNUSED;
return ncclSuccess;
}
@@ -599,9 +784,9 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
// Do not block on this receive, return if not ready.
int bytes = 0;
- NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &remQpInfo, sizeof(remQpInfo), &bytes));
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
if (bytes == 0) return ncclSuccess; // Try again later
- NCCLCHECK(socketWait(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &remQpInfo, sizeof(remQpInfo), &bytes));
+ NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &remQpInfo, sizeof(remQpInfo), &bytes));
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
@@ -610,7 +795,7 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
}
comm->ready = 1;
// Block until this is done. It *should* not block indefinitely.
- NCCLCHECK(socketSend(comm->fd, &comm->addr, &comm->ready, sizeof(int)));
+ NCCLCHECK(ncclSocketSend(&comm->sock, &comm->ready, sizeof(int)));
return ncclSuccess;
}
@@ -618,39 +803,170 @@ ncclResult_t ncclSendCheck(struct ncclIbSendComm* comm) {
ncclResult_t ncclRecvCheck(struct ncclIbRecvComm* comm) {
// Do not block on this receive, return if not ready.
int bytes = 0;
- NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &comm->ready, sizeof(int), &bytes));
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
if (bytes == 0) return ncclSuccess; // Try again later
- NCCLCHECK(socketWait(NCCL_SOCKET_RECV, comm->fd, &comm->addr, &comm->ready, sizeof(int), &bytes));
+ NCCLCHECK(ncclSocketWait(NCCL_SOCKET_RECV, &comm->sock, &comm->ready, sizeof(int), &bytes));
return ncclSuccess;
}
ncclResult_t ncclIbTest(void* request, int* done, int* size);
-#define REG_ALIGN (4096)
-
ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) {
static_assert(offsetof(struct ncclIbSendComm, verbs) == offsetof(struct ncclIbRecvComm, verbs), "Send and recv comms must have verbs at the same offset");
- struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
- uint64_t addr = (uint64_t)data;
assert(size > 0);
- // Deregister / register
- uint64_t regAddr = addr & (~(REG_ALIGN-1));
- uint64_t regSize = addr+size - regAddr;
- regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN;
- struct ibv_mr* mr;
- NCCLCHECK(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ));
- *mhandle = (void*)mr;
- TRACE(NCCL_INIT,"regAddr %lx size %ld rkey %x", regAddr, regSize, mr->rkey);
- return ncclSuccess;
+ static __thread uintptr_t pageSize = 0;
+ if (pageSize == 0) pageSize = sysconf(_SC_PAGESIZE);
+
+ struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
+ struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
+ uintptr_t addr = (uintptr_t)data & -pageSize;
+ int pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize;
+ ncclResult_t res;
+ pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
+ for (int slot=0; /*true*/; slot++) {
+ if (slot == cache->population) { // didn't find in cache
+ if (cache->population == cache->capacity) { // must grow cache
+ cache->capacity = cache->capacity < 32 ? 32 : 2*cache->capacity;
+ NCCLCHECKGOTO(ncclRealloc(&cache->slots, cache->population, cache->capacity), res, returning);
+ }
+ // Deregister / register
+ struct ibv_mr* mr;
+ unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ;
+ if (ncclIbRelaxedOrderingEnabled) {
+ // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
+ NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, verbs->pd, (void*)addr, pages*pageSize, (uintptr_t)addr, flags|IBV_ACCESS_RELAXED_ORDERING), res, returning);
+ }
+ else {
+ NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)addr, pages*pageSize, flags), res, returning);
+ }
+ TRACE(NCCL_INIT,"regAddr %llx size %lld rkey %x", (unsigned long long)addr, (long long)pages*pageSize, mr->rkey);
+ cache->population += 1;
+ cache->slots[slot].addr = addr;
+ cache->slots[slot].pages = pages;
+ cache->slots[slot].refs = 1;
+ cache->slots[slot].mr = mr;
+ *mhandle = (void*)mr;
+ res = ncclSuccess;
+ goto returning;
+ }
+ else if (cache->slots[slot].addr == addr && cache->slots[slot].pages == pages) {
+ cache->slots[slot].refs += 1;
+ *mhandle = (void*)cache->slots[slot].mr;
+ res = ncclSuccess;
+ goto returning;
+ }
+ }
+returning:
+ pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock);
+ return res;
}
ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) {
- NCCLCHECK(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle));
+ struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm;
+ struct ncclIbMrCache* cache = &ncclIbDevs[verbs->dev].mrCache;
+ ncclResult_t res;
+ pthread_mutex_lock(&ncclIbDevs[verbs->dev].lock);
+ for (int i=0; i < cache->population; i++) {
+ if (mhandle == cache->slots[i].mr) {
+ if (0 == --cache->slots[i].refs) {
+ memmove(&cache->slots[i], &cache->slots[--cache->population], sizeof(struct ncclIbMr));
+ if (cache->population == 0) {
+ free(cache->slots);
+ cache->slots = NULL;
+ cache->capacity = 0;
+ }
+ NCCLCHECKGOTO(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle), res, returning);
+ }
+ res = ncclSuccess;
+ goto returning;
+ }
+ }
+ WARN("NET/IB: could not find mr %p inside cache of %d entries", mhandle, cache->population);
+ res = ncclInternalError;
+returning:
+ pthread_mutex_unlock(&ncclIbDevs[verbs->dev].lock);
+ return res;
+}
+
+ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) {
+ struct ncclIbRequest** reqs = comm->fifoReqs[slot];
+ volatile struct ncclIbSendFifo* slots = comm->fifo[slot];
+ int nreqs = slots[0].nreqs;
+ if (nreqs > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
+
+ uint64_t wr_id = 0ULL;
+
+ for (int r=0; r<nreqs; r++) {
+ struct ibv_send_wr* wr = comm->wrs+r;
+ memset(wr, 0, sizeof(struct ibv_send_wr));
+
+ struct ibv_sge* sge = comm->sges+r;
+ sge->addr=(uintptr_t)reqs[r]->send.data;
+ sge->lkey=reqs[r]->send.lkey;
+
+ wr->opcode = IBV_WR_RDMA_WRITE;
+ wr->send_flags = 0;
+ wr->wr.rdma.remote_addr = slots[r].addr;
+ wr->wr.rdma.rkey = slots[r].rkey;
+ wr->next = wr+1;
+ wr_id += (reqs[r] - comm->verbs.reqs) << (r*8);
+ }
+
+ // Write size as immediate data. In the case of multi-send, only write
+ // 0 or 1 as size to indicate whether there was data sent or received.
+ uint64_t immData = 0;
+ if (nreqs == 1) {
+ immData = reqs[0]->send.size;
+ } else {
+ uint8_t* multiImmData = (uint8_t*)&immData;
+ for (int r=0; r<nreqs; r++) {
+ multiImmData[r] = reqs[r]->send.size ? 1 : 0;
+ }
+ }
+
+ struct ibv_send_wr* lastWr = comm->wrs+nreqs-1;
+ if (nreqs > 1 || reqs[0]->send.size > ncclParamIbArThreshold()) {
+ // When using adaptive routing, send the bulk of the data first as an
+ // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
+ // completion.
+ lastWr++;
+ memset(lastWr, 0, sizeof(struct ibv_send_wr));
+ }
+ lastWr->wr_id = wr_id;
+ lastWr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+ lastWr->imm_data = immData;
+ lastWr->next = NULL;
+ lastWr->send_flags = IBV_SEND_SIGNALED;
+
+ for (int q=0; q<comm->nqps; q++) {
+ for (int r=0; r<nreqs; r++) {
+ int chunkSize = std::max(8, DIVUP(reqs[r]->send.size, comm->nqps));
+ int length = std::min(reqs[r]->send.size-reqs[r]->send.offset, chunkSize);
+ if (length <= 0) {
+ comm->wrs[r].sg_list = NULL;
+ comm->wrs[r].num_sge = 0;
+ } else {
+ comm->sges[r].length = length;
+ comm->wrs[r].sg_list = comm->sges+r;
+ comm->wrs[r].num_sge = 1;
+ }
+ }
+ struct ibv_send_wr* bad_wr;
+ NCCLCHECK(wrap_ibv_post_send(comm->qps[q], comm->wrs, &bad_wr));
+
+ for (int r=0; r<nreqs; r++) {
+ int chunkSize = std::max(8, DIVUP(reqs[r]->send.size, comm->nqps));
+ reqs[r]->send.offset += chunkSize;
+ comm->sges[r].addr += chunkSize;
+ comm->wrs[r].wr.rdma.remote_addr += chunkSize;
+ }
+ }
+
return ncclSuccess;
}
-ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm));
if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
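
ncclIbRegMr and ncclIbDeregMr above replace the old register-on-every-call path with a per-device cache of page-aligned registrations, keyed by (page-rounded address, page count) and reference counted so repeated registrations of the same range reuse one ibv_mr. The sketch below shows the lookup and release logic under simplified assumptions; MrCache, Handle, doRegister and doDeregister are hypothetical names.

// Illustrative sketch only (not part of the diff): a page-granular, refcounted
// registration cache in the spirit of ncclIbMrCache.
#include <cstdint>
#include <cstddef>
#include <vector>
#include <unistd.h>

struct Handle;  // opaque stand-in for an ibv_mr-like registration handle

struct MrEntry { uintptr_t addr; int pages; int refs; Handle* mr; };

struct MrCache {
  std::vector<MrEntry> slots;

  // Register (or reuse) a page-aligned registration covering [data, data+size).
  Handle* regMr(void* data, size_t size, Handle* (*doRegister)(uintptr_t addr, size_t len)) {
    uintptr_t pageSize = (uintptr_t)sysconf(_SC_PAGESIZE);
    uintptr_t addr = (uintptr_t)data & -pageSize;                        // round down to a page
    int pages = (int)(((uintptr_t)data + size - addr + pageSize - 1) / pageSize);
    for (MrEntry& e : slots)
      if (e.addr == addr && e.pages == pages) { e.refs++; return e.mr; } // cache hit
    Handle* mr = doRegister(addr, (size_t)pages * pageSize);             // cache miss: register
    slots.push_back(MrEntry{addr, pages, 1, mr});
    return mr;
  }

  // Drop one reference; deregister and remove the entry when it reaches zero.
  void deregMr(Handle* mr, void (*doDeregister)(Handle*)) {
    for (size_t i = 0; i < slots.size(); i++) {
      if (slots[i].mr != mr) continue;
      if (--slots[i].refs == 0) {
        doDeregister(mr);
        slots[i] = slots.back();  // swap-remove, like the memmove in the real cache
        slots.pop_back();
      }
      return;
    }
  }
};
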
@@ -658,108 +974,84 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
struct ibv_mr* mr = (struct ibv_mr*)mhandle;
// Wait for the receiver to have posted the corresponding receive
- volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS);
- volatile uint32_t * readyPtr = &slot->ready;
- if (*readyPtr == 0) { *request = NULL; return ncclSuccess; }
-
- struct ncclIbRequest* req;
- NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
- req->size = size;
- req->addr = &comm->addr;
+ int nreqs = 0;
+ volatile struct ncclIbSendFifo* slots;
+
+ int slot = (comm->fifoHead)%MAX_REQUESTS;
+ struct ncclIbRequest** reqs = comm->fifoReqs[slot];
+ slots = comm->fifo[slot];
+ int idx = comm->fifoHead+1;
+ if (slots[0].idx != idx) { *request = NULL; return ncclSuccess; }
+ nreqs = slots[0].nreqs;
+ // Wait until all data has arrived
+ for (int r=1; r<nreqs; r++) while(slots[r].idx != idx);
+ __sync_synchronize(); // order the nreqsPtr load against tag/rkey/addr loads below
+ for (int r=0; r<nreqs; r++) {
+ if (reqs[r] != NULL || slots[r].tag != tag) continue;
+
+ // Sanity checks to catch user collective call count/size mismatches
+ // plus any potential programming errors
+ if (size > slots[r].size || slots[r].size < 0 || slots[r].addr == 0 || slots[r].rkey == 0) {
+ char line[SOCKET_NAME_MAXLEN+1];
+ WARN("NET/IB : req %d/%d tag %x peer %s collective mismatch error local size %d remote %d addr %lx rkey %x",
+ r, nreqs, tag, ncclSocketToString(&comm->sock.addr, line), size, slots[r].size, slots[r].addr, slots[r].rkey);
+ return ncclInternalError;
+ }
+ struct ncclIbRequest* req;
+ NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
+ req->type = NCCL_NET_IB_REQ_SEND;
+ req->addr = &comm->sock.addr;
+ req->verbs = &comm->verbs;
+ req->nreqs = nreqs;
+ req->send.size = size;
+ req->send.data = data;
+ req->send.lkey = mr->lkey;
+ req->send.offset = 0;
+ req->addr = &comm->sock.addr;
+ req->events = comm->nqps;
+ *request = reqs[r] = req;
+
+ // If this is a multi-recv, send only when all requests have matched.
+ for (int r=0; r<nreqs; r++) {
+ if (reqs[r] == NULL) return ncclSuccess;
+ }
- struct ibv_send_wr wr[2];
- memset(&wr[0], 0, sizeof(wr[0]));
- wr[0].wr_id = (uint64_t)req;
+ TIME_START(0);
+ NCCLCHECK(ncclIbMultiSend(comm, slot));
- struct ibv_sge sge;
- sge.addr=(uintptr_t)data; sge.lkey=mr->lkey;
-
-#if USE_RDMA_WRITE == 0
- wr[0].opcode = IBV_WR_SEND;
- wr[0].send_flags = IBV_SEND_SIGNALED;
-#else
- __sync_synchronize(); // order the readyPtr load against rkey load below
- // Sanity checks to catch user collective call count/size mismatches
- // plus any potential programming errors
- if (size > slot->size || slot->size < 0 || slot->addr == 0 || slot->rkey == 0 || slot->seq != comm->fifoHead) {
- char line[SOCKET_NAME_MAXLEN+1];
- WARN("NET/IB : peer %s collective mismatch error local size %d remote %d addr %lx rkey %x seq %x/%x",
- socketToString(req->addr, line), size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead);
- return ncclInternalError;
+ // Clear slots[0]->nreqs, as well as other fields to help debugging and sanity checks
+ memset((void*)slots, 0, sizeof(struct ncclIbSendFifo));
+ memset(reqs, 0, NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbRequest*));
+ comm->fifoHead++;
+ TIME_STOP(0);
+ return ncclSuccess;
}
- wr[0].opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
- wr[0].send_flags = IBV_SEND_SIGNALED;
- wr[0].wr.rdma.remote_addr = slot->addr;
- wr[0].wr.rdma.rkey = slot->rkey;
- wr[0].imm_data = size; // Send the message size via imm_data
- __sync_synchronize();
-#endif
- // We must clear slot->ready, but reset other fields to aid
- // debugging and sanity checks
- slot->ready = 0;
- slot->addr = 0ULL;
- slot->rkey = slot->size = slot->seq = 0;
- comm->fifoHead++;
-
-
-#if USE_RDMA_WRITE
- // When using adaptive routing, send the bulk of the data first as an
- // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
- // completion.
- if (size > ncclParamIbArThreshold()) {
- memset(&wr[1], 0, sizeof(wr[1]));
- memcpy(&wr[1], &wr[0], sizeof(wr[0]));
- wr[1].sg_list = NULL;
- wr[1].num_sge = 0;
- wr[0].next = &wr[1];
-
- wr[0].opcode = IBV_WR_RDMA_WRITE;
- wr[1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
-
- wr[0].send_flags = 0;
- wr[1].send_flags = IBV_SEND_SIGNALED;
- }
-#endif
-
- int chunkSize = std::max(8, DIVUP(size, comm->nqps));
-
- int offset = 0;
- for (int q=0; q<comm->nqps; q++) {
- int length = std::min(size-offset, chunkSize);
- if (length <= 0) {
- wr[0].sg_list = NULL;
- wr[0].num_sge = 0;
- } else {
- sge.length = length;
- wr[0].sg_list = &sge;
- wr[0].num_sge = 1;
- }
- struct ibv_send_wr* bad_wr;
- NCCLCHECK(wrap_ibv_post_send(comm->qps[q], wr, &bad_wr));
- offset += chunkSize;
- sge.addr += chunkSize;
- wr[0].wr.rdma.remote_addr += chunkSize;
- }
- req->events = comm->nqps;
- *request = req;
+ *request = NULL;
return ncclSuccess;
}
-ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t addr, int size, struct ncclIbRequest* req) {
+ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) {
struct ibv_send_wr wr;
memset(&wr, 0, sizeof(wr));
- int slot = comm->remFifo.tail%MAX_REQUESTS;
- struct ncclIbSendFifo* localElem = comm->remFifo.elems + slot;
- localElem->addr = addr;
- localElem->rkey = rkey;
- localElem->ready = 1;
- localElem->size = size; // Sanity/Debugging
- localElem->seq = comm->remFifo.tail; // Sanity/Debugging
- wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*sizeof(struct ncclIbSendFifo);
+ int slot = comm->remFifo.fifoTail%MAX_REQUESTS;
+ struct ncclIbSendFifo* localElem = comm->remFifo.elems[slot];
+
+ for (int i=0; i<n; i++) {
+ localElem[i].addr = (uint64_t)data[i];
+ struct ibv_mr* mr = (struct ibv_mr*)mhandles[i];
+ localElem[i].rkey = mr->rkey;
+ localElem[i].nreqs = n;
+ localElem[i].size = sizes[i]; // Sanity/Debugging
+ localElem[i].tag = tags[i];
+ localElem[i].idx = comm->remFifo.fifoTail+1;
+ }
+
+ wr.wr.rdma.remote_addr = comm->remFifo.addr + slot*NCCL_NET_IB_MAX_RECVS*sizeof(struct ncclIbSendFifo);
wr.wr.rdma.rkey = comm->remFifo.rkey;
comm->remFifo.sge.addr = (uint64_t)localElem;
+ comm->remFifo.sge.length = n*sizeof(struct ncclIbSendFifo);
wr.sg_list = &comm->remFifo.sge;
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_WRITE;
@@ -788,92 +1080,107 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t
//
if (slot == 0) {
wr.send_flags |= IBV_SEND_SIGNALED;
- wr.wr_id = (uint64_t)req;
+ wr.wr_id = req - comm->verbs.reqs;
req->events++;
}
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(comm->qps[0], &wr, &bad_wr));
- comm->remFifo.tail++;
+ comm->remFifo.fifoTail++;
return ncclSuccess;
}
-ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm));
if (comm->ready == 0) { *request = NULL; return ncclSuccess; }
-
- struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+ if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError;
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
- req->size = size;
- req->addr = &comm->addr;
+ req->type = NCCL_NET_IB_REQ_RECV;
+ req->addr = &comm->sock.addr;
+ req->nreqs = n;
+ for (int i=0; i<n; i++) req->recv.sizes[i] = 0;
struct ibv_recv_wr wr;
memset(&wr, 0, sizeof(wr));
- wr.wr_id = (uint64_t)req;
+ wr.wr_id = req - comm->verbs.reqs;
wr.sg_list = NULL;
wr.num_sge = 0;
+ TIME_START(1);
for (int q=0; q<comm->nqps; q++) {
struct ibv_qp* qp = comm->qps[q];
struct ibv_recv_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_recv(qp, &wr, &bad_wr));
}
+ TIME_STOP(1);
req->events = comm->nqps;
*request = req;
// Post to FIFO to notify sender
- NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size, req));
+ TIME_START(2);
+ NCCLCHECK(ncclIbPostFifo(comm, n, data, sizes, tags, mhandles, req));
+ TIME_STOP(2);
return ncclSuccess;
}
-ncclResult_t ncclIbIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
- if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess;
+ int last = -1;
+ for (int i=0; i<n; i++) if (sizes[i]) last = i;
+ if (comm->gpuFlush.enabled == 0 || last == -1) return ncclSuccess;
+ // Only flush once using the last non-zero receive
struct ncclIbRequest* req;
NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req));
- req->addr = &comm->addr;
- struct ibv_mr* mr = (struct ibv_mr*)mhandle;
+ req->type = NCCL_NET_IB_REQ_FLUSH;
+ req->addr = &comm->sock.addr;
+ struct ibv_mr* mr = (struct ibv_mr*)mhandles[last];
struct ibv_send_wr wr;
memset(&wr, 0, sizeof(wr));
- wr.wr_id = (uint64_t)req;
+ wr.wr_id = req - comm->verbs.reqs;
- wr.wr.rdma.remote_addr = (uint64_t)data;
+ wr.wr.rdma.remote_addr = (uint64_t)data[last];
wr.wr.rdma.rkey = mr->rkey;
wr.sg_list = &comm->gpuFlush.sge;
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_READ;
wr.send_flags = IBV_SEND_SIGNALED;
+ TIME_START(4);
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(comm->gpuFlush.qp, &wr, &bad_wr));
+ TIME_STOP(4);
*request = req;
return ncclSuccess;
}
-ncclResult_t ncclIbTest(void* request, int* done, int* size) {
+ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
struct ncclIbRequest *r = (struct ncclIbRequest*)request;
*done = 0;
while (1) {
if (r->events == 0) {
*done = 1;
- if (size) *size = r->size;
+ if (sizes && r->type == NCCL_NET_IB_REQ_RECV) {
+ for (int i=0; i<r->nreqs; i++) sizes[i] = r->recv.sizes[i];
+ }
NCCLCHECK(ncclIbFreeRequest(r));
return ncclSuccess;
}
int wrDone = 0;
struct ibv_wc wcs[4];
+ TIME_START(3);
NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone));
+ if (wrDone == 0) { TIME_CANCEL(3); } else { TIME_STOP(3); }
if (wrDone == 0) return ncclSuccess;
for (int w=0; w<wrDone; w++) {
@@ -881,23 +1188,31 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
if (wc->status != IBV_WC_SUCCESS) {
char line[SOCKET_NAME_MAXLEN+1];
WARN("NET/IB : Got completion from peer %s with error %d, opcode %d, len %d, vendor err %d",
- socketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
+ ncclSocketToString(r->addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err);
return ncclSystemError;
}
- struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc->wr_id;
- if (doneReq) {
- if (wc->opcode == IBV_WC_RECV) {
- doneReq->size = wc->byte_len;
-#if USE_RDMA_WRITE
- } else if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
- if (doneReq->size == -1)
- doneReq->size = wc->imm_data;
- else
- doneReq->size += wc->imm_data;
-#endif
+ struct ncclIbRequest* req = r->verbs->reqs+(wc->wr_id & 0xff);
+ if (req->type == NCCL_NET_IB_REQ_SEND) {
+ for (int i=0; i<req->nreqs; i++) {
+ struct ncclIbRequest* sendReq = r->verbs->reqs+((wc->wr_id >> (i*8)) & 0xff);
+ if ((sendReq->events <= 0)) return ncclInternalError;
+ sendReq->events--;
+ }
+ } else {
+ if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
+ if (req->type != NCCL_NET_IB_REQ_RECV) return ncclInternalError;
+ if (req->nreqs > 1) {
+ // In the case of a multi recv, we only set sizes to 0 or 1.
+ uint8_t* sizes = (uint8_t*)&wc->imm_data;
+ for (int i=0; i<req->nreqs; i++) {
+ req->recv.sizes[i] |= sizes[i];
+ }
+ } else {
+ req->recv.sizes[0] += wc->imm_data;
+ }
}
- doneReq->events--;
+ req->events--;
}
}
}
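
As the static_assert on MAX_REQUESTS earlier in this diff notes, request ids are encoded in wr_id eight bits apiece, so one signalled send completion can retire up to NCCL_NET_IB_MAX_RECVS grouped requests; ncclIbMultiSend packs the indices and ncclIbTest above unpacks them. A minimal sketch of that encoding, with hypothetical names:

// Illustrative sketch only (not part of the diff): packing up to eight 8-bit
// request indices into a 64-bit wr_id and decoding them on completion.
#include <cstdint>

// Pack: the index of request r occupies bits [8*r, 8*r+8); indices must be < 256,
// which a MAX_REQUESTS <= 256 bound guarantees.
uint64_t packWrId(const int* reqIndex, int nreqs) {
  uint64_t wrId = 0;
  for (int r = 0; r < nreqs; r++) wrId += (uint64_t)reqIndex[r] << (r * 8);
  return wrId;
}

// Unpack on a send completion: recover each request index and retire one event.
void onSendCompletion(uint64_t wrId, int nreqs, int* events /* per-request counters */) {
  for (int r = 0; r < nreqs; r++) {
    int idx = (int)((wrId >> (r * 8)) & 0xff);
    events[idx]--;   // one QP-level completion consumed for this request
  }
}
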
@@ -906,20 +1221,21 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) {
ncclResult_t ncclIbCloseSend(void* sendComm) {
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
if (comm) {
- close(comm->fd);
+ close(comm->sock.fd);
for (int q=0; q<comm->nqps; q++)
if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr));
NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs));
free(comm);
}
+ TIME_PRINT("IB");
return ncclSuccess;
}
ncclResult_t ncclIbCloseRecv(void* recvComm) {
struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm;
if (comm) {
- close(comm->fd);
+ close(comm->sock.fd);
for (int q=0; q<comm->nqps; q++)
if (comm->qps[q] != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qps[q]));
if (comm->gpuFlush.enabled) {
@@ -936,7 +1252,7 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) {
ncclResult_t ncclIbCloseListen(void* listenComm) {
struct ncclIbListenComm* comm = (struct ncclIbListenComm*)listenComm;
if (comm) {
- close(comm->fd);
+ close(comm->sock.fd);
free(comm);
}
return ncclSuccess;
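
Taken together, the net_ib.cc changes above implement a receiver-driven, tagged protocol: ncclIbIrecv advertises up to NCCL_NET_IB_MAX_RECVS buffers (address, rkey, size, tag, idx) by RDMA-writing a FIFO slot into the sender, and ncclIbIsend only fires the grouped RDMA writes once every advertised entry in that slot has been matched by a local send with the same tag. The sketch below models the sender-side matching under simplified assumptions; FifoElem, SendReq and postMultiSend are hypothetical names.

// Illustrative sketch only (not part of the diff): matching local sends against
// a receiver-advertised FIFO slot by tag, firing only when the slot is complete.
#include <cstdint>

constexpr int kMaxRecvs = 8;   // mirrors NCCL_NET_IB_MAX_RECVS

struct FifoElem { uint64_t addr; int size; uint32_t rkey; uint32_t nreqs; uint32_t tag; uint64_t idx; };
struct SendReq  { const void* data; int size; };

// Try to match one local send (req, tag) against the receiver-advertised slot.
// Returns true once every entry in the slot is matched and the grouped send fires.
bool trySend(volatile FifoElem* slot, SendReq* matched[], uint64_t expectedIdx,
             uint32_t tag, SendReq* req, void (*postMultiSend)(SendReq* const*, int)) {
  if (slot[0].idx != expectedIdx) return false;            // receiver has not posted yet
  int nreqs = (int)slot[0].nreqs;
  for (int r = 1; r < nreqs; r++)
    while (slot[r].idx != expectedIdx) { /* wait for the rest of the slot to land */ }
  for (int r = 0; r < nreqs; r++) {
    if (matched[r] != nullptr || slot[r].tag != tag) continue;
    matched[r] = req;                                      // claim the entry with our tag
    break;
  }
  for (int r = 0; r < nreqs; r++)
    if (matched[r] == nullptr) return false;               // wait for further Isend calls
  postMultiSend(matched, nreqs);                           // all matched: issue the RDMA writes
  return true;
}
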
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
index c045a8f..d92c46f 100644
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -19,7 +19,7 @@
/* Init functions */
static int ncclNetIfs = -1;
struct ncclSocketDev {
- union socketAddress addr;
+ union ncclSocketAddress addr;
char devName[MAX_IF_NAME_SIZE];
char* pciPath;
};
@@ -40,8 +40,8 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
pthread_mutex_lock(&ncclSocketLock);
if (ncclNetIfs == -1) {
char names[MAX_IF_NAME_SIZE*MAX_IFS];
- union socketAddress addrs[MAX_IFS];
- ncclNetIfs = findInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS);
+ union ncclSocketAddress addrs[MAX_IFS];
+ ncclNetIfs = ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS);
if (ncclNetIfs <= 0) {
WARN("NET/Socket : no interface found");
return ncclInternalError;
@@ -53,10 +53,10 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
addrline[SOCKET_NAME_MAXLEN] = '\0';
for (int i=0; i<ncclNetIfs; i++) {
strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
- memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union socketAddress));
+ memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath));
snprintf(line+strlen(line), MAX_LINE_LEN-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
- socketToString(&addrs[i], addrline));
+ ncclSocketToString(&addrs[i], addrline));
}
line[MAX_LINE_LEN] = '\0';
INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
@@ -97,12 +97,14 @@ ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) {
props->guid = dev;
props->ptrSupport = NCCL_PTR_HOST;
NCCLCHECK(ncclSocketGetSpeed(props->name, &props->speed));
+ props->latency = 0; // Not set
props->port = 0;
props->maxComms = 65536;
+ props->maxRecvs = 1;
return ncclSuccess;
}
-ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
+ncclResult_t GetSocketAddr(int dev, union ncclSocketAddress* addr) {
if (dev >= ncclNetIfs) return ncclInternalError;
memcpy(addr, &ncclSocketDevs[dev].addr, sizeof(*addr));
return ncclSuccess;
@@ -118,18 +120,33 @@ ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2);
NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2);
+enum ncclSocketCommState {
+ ncclSocketCommStateStart = 0,
+ ncclSocketCommStateConnect = 1,
+ ncclSocketCommStateAccept = 3,
+ ncclSocketCommStateSend = 4,
+ ncclSocketCommStateRecv = 5,
+};
+
+struct ncclSocketCommStage {
+ enum ncclSocketCommState state;
+ uint8_t iteration;
+ struct ncclSocket* sock;
+ struct ncclSocketComm* comm;
+};
+
struct ncclSocketHandle {
- union socketAddress connectAddr;
+ union ncclSocketAddress connectAddr;
int nSocks;
int nThreads;
+ struct ncclSocketCommStage stage;
};
struct ncclSocketTask {
int op;
void* data;
int size;
- int fd;
- union socketAddress *addr;
+ struct ncclSocket* sock;
int offset;
int used;
ncclResult_t result;
@@ -139,8 +156,7 @@ struct ncclSocketRequest {
int op;
void* data;
int size;
- int ctrlFd;
- union socketAddress *addr;
+ struct ncclSocket* ctrlSock;
int offset;
int used;
struct ncclSocketComm* comm;
@@ -154,29 +170,30 @@ struct ncclSocketTaskQueue {
struct ncclSocketTask* tasks;
};
-enum threadState {start, stop};
-
struct ncclSocketThreadResources {
struct ncclSocketTaskQueue threadTaskQueue;
- enum threadState state;
+ int stop;
struct ncclSocketComm* comm;
pthread_mutex_t threadLock;
pthread_cond_t threadCond;
};
struct ncclSocketListenComm {
- int fd;
+ struct ncclSocket sock;
+ struct ncclSocketCommStage stage;
int nSocks;
int nThreads;
+ int dev;
};
struct ncclSocketComm {
- int ctrlFd;
- union socketAddress addr;
- int fds[MAX_SOCKETS];
+ struct ncclSocket ctrlSock;
+ struct ncclSocket socks[MAX_SOCKETS];
+ int dev;
+ int cudaDev;
int nSocks;
int nThreads;
- int nextFd;
+ int nextSock;
struct ncclSocketRequest requests[MAX_REQUESTS];
pthread_t helperThread[MAX_THREADS];
struct ncclSocketThreadResources threadResources[MAX_THREADS];
@@ -185,7 +202,6 @@ struct ncclSocketComm {
void* persistentSocketThread(void *args_) {
struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_;
struct ncclSocketComm* comm = resource->comm;
- volatile enum threadState* state = &resource->state;
struct ncclSocketTaskQueue* myQueue = &resource->threadTaskQueue;
int nSocksPerThread = comm->nSocks / comm->nThreads;
while (1) {
@@ -198,7 +214,7 @@ void* persistentSocketThread(void *args_) {
for (int j=0; j<nSocksPerThread; j++) {
struct ncclSocketTask* r = myQueue->tasks+i+j;
if (r != NULL && r->used == 1 && r->offset < r->size) {
- r->result = socketProgress(r->op, r->fd, r->addr, r->data, r->size, &r->offset);
+ r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset);
if (r->result != ncclSuccess) {
WARN("NET/Socket : socket progress error");
return NULL;
@@ -211,12 +227,12 @@ void* persistentSocketThread(void *args_) {
}
if (idle) {
pthread_mutex_lock(&resource->threadLock);
- while (mark == myQueue->next && *state != stop) { // no new tasks, wait
+ while (mark == myQueue->next && resource->stop == 0) { // no new tasks, wait
pthread_cond_wait(&resource->threadCond, &resource->threadLock);
}
pthread_mutex_unlock(&resource->threadLock);
}
- if (*state == stop) return NULL;
+ if (resource->stop) return NULL;
}
}
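
The socket transport's helper threads above now sleep on a condition variable and exit on a plain stop flag instead of the removed threadState enum. A minimal sketch of that wait-for-work-or-stop loop, with hypothetical names:

// Illustrative sketch only (not part of the diff): the idle/stop pattern used
// by persistentSocketThread above.
#include <pthread.h>

struct ThreadRes {
  pthread_mutex_t lock;   // initialized with pthread_mutex_init by the owner
  pthread_cond_t cond;    // initialized with pthread_cond_init by the owner
  int next;               // producer bumps this under the lock when it enqueues a task
  int stop;               // producer sets this under the lock to ask the thread to exit
};

void* helperMain(void* arg) {
  ThreadRes* res = (ThreadRes*)arg;
  int mark = 0;                                   // last queue position we have seen
  while (true) {
    pthread_mutex_lock(&res->lock);
    while (mark == res->next && res->stop == 0)   // no new work: sleep on the condvar
      pthread_cond_wait(&res->cond, &res->lock);
    int stop = res->stop;
    mark = res->next;
    pthread_mutex_unlock(&res->lock);
    if (stop) return nullptr;
    // ... progress the tasks between the old and new mark here ...
  }
}
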
@@ -271,17 +287,17 @@ end:
ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) {
NCCLCHECK(ncclCalloc(comm, 1));
- (*comm)->fd = -1;
+ (*comm)->sock.fd = -1;
return ncclSuccess;
}
ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) {
NCCLCHECK(ncclCalloc(comm, 1));
- (*comm)->ctrlFd = -1;
+ (*comm)->ctrlSock.fd = -1;
for (int i=0; i < MAX_SOCKETS; i++) {
- (*comm)->fds[i] = -1;
+ (*comm)->socks[i].fd = -1;
}
- (*comm)->nextFd = 0;
+ (*comm)->nextSock = 0;
return ncclSuccess;
}
@@ -290,14 +306,18 @@ ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) {
return ncclInternalError;
}
struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
- static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
+ memset(handle, 0, sizeof(struct ncclSocketHandle));
+ static_assert(sizeof(struct ncclSocketHandle) <= NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large");
struct ncclSocketListenComm* comm;
NCCLCHECK(ncclSocketNewListenComm(&comm));
- NCCLCHECK(GetSocketAddr(dev, &handle->connectAddr));
- NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr));
+ NCCLCHECK(GetSocketAddr(dev, &comm->sock.addr));
+ NCCLCHECK(ncclSocketListen(&comm->sock));
+ memcpy(&handle->connectAddr, &comm->sock.addr, sizeof(union ncclSocketAddress));
NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads));
handle->nSocks = comm->nSocks;
handle->nThreads = comm->nThreads;
+ comm->sock.asyncFlag = 1;
+ comm->dev = dev;
*listenComm = comm;
return ncclSuccess;
}
@@ -306,38 +326,99 @@ ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) {
if (dev < 0) { // data transfer socket is based on specified dev
return ncclInternalError;
}
- struct ncclSocketComm* comm;
- NCCLCHECK(ncclSocketNewComm(&comm));
+
+ enum ncclSocketState conState;
struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle;
+ struct ncclSocketCommStage* stage = &handle->stage;
+ struct ncclSocketComm* comm = stage->comm;
+ uint8_t i = stage->iteration;
+ struct ncclSocket* sock = stage->sock;
+ *sendComm = NULL;
+
+ if (stage->state == ncclSocketCommStateConnect) goto socket_connect_check;
+ if (stage->state == ncclSocketCommStateSend) goto socket_send;
+
+ NCCLCHECK(ncclSocketNewComm(&comm));
+ stage->comm = comm;
comm->nSocks = handle->nSocks;
comm->nThreads = handle->nThreads;
- for (int i=0; i<comm->nSocks+1; i++) {
- int tmpFd, offset=0;
- NCCLCHECK(connectAddress(&tmpFd, &handle->connectAddr));
- NCCLCHECK(socketWait(NCCL_SOCKET_SEND, tmpFd, &handle->connectAddr, &i, sizeof(int), &offset));
- if (i == comm->nSocks) comm->ctrlFd = tmpFd;
- else comm->fds[i] = tmpFd;
+ comm->dev = dev;
+ CUDACHECK(cudaGetDevice(&comm->cudaDev));
+ for (; i<comm->nSocks+1; i++) {
+ sock = i == comm->nSocks ? &comm->ctrlSock : comm->socks+i;
+ NCCLCHECK(ncclSocketInit(sock, &handle->connectAddr, NULL, 1));
+
+ stage->sock = sock;
+ stage->state = ncclSocketCommStateConnect;
+ stage->iteration = i;
+ NCCLCHECK(ncclSocketConnect(sock));
+
+socket_connect_check:
+ NCCLCHECK(ncclGetSocketState(sock, &conState));
+ if (conState == ncclSocketConnecting) {
+ /* expect user to call again */
+ return ncclSuccess;
+ } else if (conState == ncclSocketError) {
+ return ncclSystemError;
+ }
+ stage->state = ncclSocketCommStateSend;
+
+socket_send:
+ int done = 0;
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &i, sizeof(uint8_t), &done));
+ if (done == 0) return ncclSuccess;
}
*sendComm = comm;
- comm->addr = handle->connectAddr;
return ncclSuccess;
}
ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) {
struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm;
- struct ncclSocketComm* rComm;
+ struct ncclSocketCommStage* stage = &lComm->stage;
+ struct ncclSocketComm* rComm = stage->comm;
+ uint8_t i = stage->iteration;
+ struct ncclSocket* sock = stage->sock;
+
+ *recvComm = NULL;
+ if (stage->state == ncclSocketCommStateAccept) goto socket_accept;
+ if (stage->state == ncclSocketCommStateRecv) goto socket_recv;
+
NCCLCHECK(ncclSocketNewComm(&rComm));
+ stage->comm = rComm;
rComm->nSocks = lComm->nSocks;
rComm->nThreads = lComm->nThreads;
- for (int i=0; i<rComm->nSocks+1; i++) {
- int tmpFd, sendSockIdx, offset=0;
- socklen_t socklen = sizeof(union socketAddress);
- SYSCHECKVAL(accept(lComm->fd, &rComm->addr.sa, &socklen), "accept", tmpFd);
- NCCLCHECK(socketWait(NCCL_SOCKET_RECV, tmpFd, &rComm->addr, &sendSockIdx, sizeof(int), &offset));
- if (sendSockIdx == rComm->nSocks) rComm->ctrlFd = tmpFd;
- else rComm->fds[sendSockIdx] = tmpFd;
+ rComm->dev = lComm->dev;
+ CUDACHECK(cudaGetDevice(&rComm->cudaDev));
+ lComm->sock.asyncFlag = 1;
+ for (; i<rComm->nSocks+1; i++) {
+ uint8_t sendSockIdx;
+ ncclCalloc(&sock, 1);
+ NCCLCHECK(ncclSocketInit(sock, NULL, NULL, 1));
+ stage->sock = sock;
+ stage->state = ncclSocketCommStateAccept;
+ stage->iteration = i;
+socket_accept:
+ NCCLCHECK(ncclSocketAccept(sock, &lComm->sock));
+ if (sock->fd == -1) return ncclSuccess;
+
+ stage->state = ncclSocketCommStateRecv;
+socket_recv:
+ int done = 0;
+ NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &sendSockIdx, sizeof(uint8_t), &done));
+ if (done == 0) return ncclSuccess;
+
+ if (sendSockIdx == rComm->nSocks) memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket));
+ else memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket));
+
+ free(sock);
}
*recvComm = rComm;
+
+ /* reset lComm state */
+ stage->state = ncclSocketCommStateStart;
+ stage->iteration = 0;
+ stage->sock = NULL;
+ stage->comm = NULL;
return ncclSuccess;
}
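ncclSocketConnect and ncclSocketAccept are now re-entrant: the ncclSocketCommStage saved in the handle (or in the listen comm) records which socket is in flight and whether the index byte has been exchanged, so a call that would block returns ncclSuccess with the output comm left NULL, and the next call resumes from the saved state via the socket_connect_check / socket_send (or socket_accept / socket_recv) labels. A minimal sketch of that resumable-step pattern, with illustrative names rather than the NCCL API:

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the resumable handshake: each call advances as far as it can
 * without blocking, stores where it stopped, and signals "call me again" by
 * returning before the output comm is published. tryConnect() pretends every
 * socket needs one retry; names are illustrative, not the NCCL API. */
enum stage { STAGE_START, STAGE_CONNECT, STAGE_SEND };

struct connectStage {
  enum stage state;
  int iteration;     /* which socket (0..nSocks) is in flight */
  int nSocks;
};

static bool tryConnect(int i) {
  static int attempts[8];
  return ++attempts[i] >= 2;     /* first attempt "would block", second succeeds */
}
static bool trySendIndex(int i) { (void)i; return true; }

/* Returns true once every socket is connected and has sent its index byte. */
static bool connectStep(struct connectStage* s) {
  for (; s->iteration < s->nSocks + 1; s->iteration++) {
    if (s->state != STAGE_SEND) {
      s->state = STAGE_CONNECT;
      if (!tryConnect(s->iteration)) return false;   /* resume here next call */
      s->state = STAGE_SEND;
    }
    if (!trySendIndex(s->iteration)) return false;   /* resume here next call */
    s->state = STAGE_START;
  }
  return true;
}

int main(void) {
  struct connectStage s = { STAGE_START, 0, 4 };
  int calls = 1;
  while (!connectStep(&s)) calls++;
  printf("handshake finished after %d calls\n", calls);
  return 0;
}

On the accept side the same idea applies: each accepted socket first delivers its one-byte index, is filed into ctrlSock when the index equals nSocks and into socks[index] otherwise, and once the last socket lands the stage is reset to ncclSocketCommStateStart so the listen comm is ready for the next accept.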
@@ -348,8 +429,7 @@ ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* dat
r->op = op;
r->data = data;
r->size = size;
- r->ctrlFd = comm->ctrlFd;
- r->addr = &comm->addr;
+ r->ctrlSock = &comm->ctrlSock;
r->used = 1;
r->comm = comm;
r->nSubs = 0;
@@ -362,7 +442,7 @@ ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* dat
}
ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) {
- int tid = comm->nextFd % comm->nThreads;
+ int tid = comm->nextSock % comm->nThreads;
struct ncclSocketThreadResources* res = comm->threadResources+tid;
struct ncclSocketTaskQueue* queue = &res->threadTaskQueue;
// create helper threads and prepare per-thread task queue
@@ -377,22 +457,21 @@ ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data,
pthread_mutex_init(&res->threadLock, NULL);
pthread_cond_init(&res->threadCond, NULL);
pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res);
+ ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 'S' : 'R', comm->dev, tid, comm->cudaDev);
}
struct ncclSocketTask* r = queue->tasks+queue->next;
if (r->used == 0) {
r->op = op;
r->data = data;
r->size = size;
- r->fd = comm->fds[comm->nextFd];
- r->addr = &comm->addr;
+ r->sock = comm->socks+comm->nextSock;
r->offset = 0;
r->result = ncclSuccess;
- comm->nextFd = (comm->nextFd + 1) % comm->nSocks;
+ comm->nextSock = (comm->nextSock + 1) % comm->nSocks;
r->used = 1;
*req = r;
pthread_mutex_lock(&res->threadLock);
queue->next = (queue->next+1)%queue->len;
- res->state = start;
pthread_cond_signal(&res->threadCond);
pthread_mutex_unlock(&res->threadLock);
return ncclSuccess;
@@ -411,17 +490,17 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
if (r->used == 1) { /* try to send/recv size */
int data = r->size;
int offset = 0;
- NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->addr, &data, sizeof(int), &offset));
+ NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, &data, sizeof(int), &offset));
if (offset == 0) return ncclSuccess; /* Not ready -- retry later */
// Not sure we could ever receive less than 4 bytes, but just in case ...
- if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->ctrlFd, r->addr, &data, sizeof(int), &offset));
+ if (offset < sizeof(int)) NCCLCHECK(ncclSocketWait(r->op, r->ctrlSock, &data, sizeof(int), &offset));
// Check size is less or equal to the size provided by the user
if (r->op == NCCL_SOCKET_RECV && data > r->size) {
char line[SOCKET_NAME_MAXLEN+1];
- WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", socketToString(r->addr, line), data, r->size);
+ WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d", ncclSocketToString(&r->ctrlSock->addr, line), data, r->size);
return ncclInternalError;
}
r->size = data;
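The test path keeps the same wire protocol as before: the 4-byte message size travels over the control socket first, using the offset-based progress helper so a partially transferred int is completed with a blocking wait, and an oversized incoming size is reported as truncation. A small sketch of offset-based partial progress on a non-blocking fd, written as a hypothetical stand-alone helper rather than the NCCL socket layer:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Sketch: advance *offset by whatever the kernel accepts right now, return 0
 * on progress-or-would-block and -1 on a real error, and let the caller keep
 * calling until offset == size. Hypothetical helper, illustrative only. */
static int progressSend(int fd, const char* data, int size, int* offset) {
  while (*offset < size) {
    ssize_t n = write(fd, data + *offset, size - *offset);
    if (n > 0) { *offset += (int)n; continue; }
    if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) return 0; /* retry later */
    return -1;
  }
  return 0;
}

int main(void) {
  int fds[2];
  static char buf[1 << 20], tmp[1 << 16];
  if (pipe(fds)) return 1;
  fcntl(fds[0], F_SETFL, O_NONBLOCK);
  fcntl(fds[1], F_SETFL, O_NONBLOCK);
  int size = sizeof(buf), offset = 0;
  while (offset < size) {
    if (progressSend(fds[1], buf, size, &offset)) return 1;
    while (read(fds[0], tmp, sizeof(tmp)) > 0) {}   /* drain so the writer can continue */
  }
  printf("sent %d bytes across partial writes\n", offset);
  return 0;
}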
@@ -459,7 +538,7 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) {
}
} else { // progress request using main thread
if (r->offset < r->size) {
- NCCLCHECK(socketProgress(r->op, r->ctrlFd, r->addr, r->data, r->size, &r->offset));
+ NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset));
}
if (r->offset == r->size) {
if (size) *size = r->size;
@@ -476,19 +555,20 @@ ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void**
}
ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; }
-ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) {
struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm;
NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclSocketRequest**)request));
return ncclSuccess;
}
-ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) {
struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm;
- NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data, size, (struct ncclSocketRequest**)request));
+ if (n != 1) return ncclInternalError;
+ NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclSocketRequest**)request));
return ncclSuccess;
}
-ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandle, void** request) {
+ncclResult_t ncclSocketIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
// We don't support CUDA pointers, so we don't need a flush operation
return ncclInternalError;
}
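The send/recv entry points move to the newer net interface: ncclSocketIsend now takes (and ignores) a tag, and ncclSocketIrecv receives a batch of n buffers but only supports n == 1, since this transport keeps one dedicated connection per peer pair. A hedged sketch of how such a single-message transport can sit behind the batched signature, with hypothetical helper names:

#include <stdio.h>

/* Sketch: the batched receive takes n buffers (with tags), but a transport
 * with one in-flight message per connection maps the single-buffer case onto
 * its old path and rejects anything else. Only the n == 1 guard mirrors the
 * code above; everything else is illustrative. */
static int recvOne(void* comm, void* data, int size, void** request) {
  (void)comm; (void)data; (void)request;
  printf("posted one receive of %d bytes\n", size);
  return 0;
}

static int recvBatch(void* comm, int n, void** data, int* sizes, int* tags,
                     void** mhandles, void** request) {
  (void)tags; (void)mhandles;          /* unused: one connection per peer pair */
  if (n != 1) return -1;               /* grouped receives not supported here */
  return recvOne(comm, data[0], sizes[0], request);
}

int main(void) {
  char buf[64]; void* bufs[1] = { buf }; int sizes[1] = { (int)sizeof(buf) };
  void* req = 0;
  return recvBatch(0, 1, bufs, sizes, 0, 0, &req);
}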
@@ -496,7 +576,7 @@ ncclResult_t ncclSocketIflush(void* recvComm, void* data, int size, void* mhandl
ncclResult_t ncclSocketCloseListen(void* opaqueComm) {
struct ncclSocketListenComm* comm = (struct ncclSocketListenComm*)opaqueComm;
if (comm) {
- if (comm->fd != -1) close(comm->fd);
+ if (comm->sock.fd != -1) close(comm->sock.fd);
free(comm);
}
return ncclSuccess;
@@ -509,16 +589,16 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
struct ncclSocketThreadResources* res = comm->threadResources+i;
if (comm->helperThread[i]) {
pthread_mutex_lock(&res->threadLock);
- res->state = stop;
+ res->stop = 1;
pthread_cond_signal(&res->threadCond);
pthread_mutex_unlock(&res->threadLock);
pthread_join(comm->helperThread[i], NULL);
}
free(res->threadTaskQueue.tasks);
}
- if (comm->ctrlFd != -1) close(comm->ctrlFd);
+ if (comm->ctrlSock.fd != -1) close(comm->ctrlSock.fd);
for (int i=0; i<comm->nSocks; i++) {
- if (comm->fds[i] != -1) close(comm->fds[i]);
+ if (comm->socks[i].fd != -1) close(comm->socks[i].fd);
}
free(comm);
}
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index ca59f3b..e71e157 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,31 +7,29 @@
#include "comm.h"
#include "graph.h"
#include "utils.h"
-#include "bootstrap.h"
+
+struct ncclP2pBuff {
+ void* directPtr;
+ cudaIpcMemHandle_t devIpc;
+};
struct p2pConnectInfo {
int rank;
int read;
- void* directPtr;
- cudaIpcMemHandle_t devIpc;
+ struct ncclP2pBuff p2pBuff;
};
+static_assert(sizeof(p2pConnectInfo) <= CONNECT_SIZE, "P2P Connect info is too large");
struct p2pSendResources {
struct ncclSendMem* devMem;
- void* ipcPtr;
- int remoteId;
- int memRank;
- void* remIpcPtr;
- void* bootstrap;
+ void* sendMemIpc;
+ void* recvMemIpc;
};
struct p2pRecvResources {
struct ncclRecvMem* devMem;
- void* ipcPtr;
- int remoteId;
- int memRank;
- void* remIpcPtr;
- void* bootstrap;
+ void* sendMemIpc;
+ void* recvMemIpc;
};
#include <sys/types.h>
@@ -90,17 +88,23 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
return ncclSuccess;
}
- // Check that legacy IPC support is available
if (p2p != 0) {
+ // Cached result of the legacyIPC detection
+ static int legacyIPC = -1;
+ if (legacyIPC >= 0) {
+ *ret = legacyIPC;
+ return ncclSuccess;
+ }
+ // Check that legacy IPC support is available (WSL WAR)
char *dummy;
cudaIpcMemHandle_t ipc;
NCCLCHECK(ncclCudaCalloc(&dummy, CUDA_IPC_MIN));
if (cudaIpcGetMemHandle(&ipc, dummy) != cudaSuccess) {
- INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported on dev %d(=%lx)",
- cudaDev1, info1->busId);
+ INFO(NCCL_INIT|NCCL_P2P,"Legacy IPC not supported");
*ret = 0;
}
CUDACHECK(cudaFree(dummy));
+ legacyIPC = *ret;
return ncclSuccess;
}
@@ -120,6 +124,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
} while (0)
+
// Setting this to non zero causes P2P to use Reads rather than Writes
NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
@@ -134,7 +139,7 @@ static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo*
return ncclSuccess;
}
-static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct p2pConnectInfo* p2pInfo, void** devMem, void** ipcPtr) {
+static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclP2pBuff* p2pBuff, void** devMem, void** ipcPtr) {
if (myInfo->pidHash == peerInfo->pidHash) {
if (peerInfo->cudaDev != myInfo->cudaDev) {
// Enable P2P access
@@ -147,10 +152,10 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee
return ncclInternalError;
}
}
- *devMem = p2pInfo->directPtr;
+ *devMem = p2pBuff->directPtr;
*ipcPtr = NULL;
} else {
- CUDACHECK(cudaIpcOpenMemHandle(devMem, p2pInfo->devIpc, cudaIpcMemLazyEnablePeerAccess));
+ CUDACHECK(cudaIpcOpenMemHandle(devMem, p2pBuff->devIpc, cudaIpcMemLazyEnablePeerAccess));
*ipcPtr = *devMem;
}
return ncclSuccess;
@@ -165,44 +170,40 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
int useRead, intermediateRank;
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
- struct p2pConnectInfo info;
- // For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
- info.read = (connIndex == 0) ? useRead : 0;
- const char* useReadStr = info.read ? "/read" : "";
+ static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
+ struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+ info->read = useRead;
+ // For CollNet, use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
+ if (graph && connIndex == 1) info->read = 0;
+ const char* useReadStr = info->read ? "/read" : "";
int sendSize = sizeof(struct ncclSendMem);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
- if (info.read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
+ if (info->read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE];
ALIGN_SIZE(sendSize, CUDA_IPC_MIN);
- resources->remoteId = -1;
- resources->bootstrap = comm->bootstrap;
if (intermediateRank == -1) {
- NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, sendSize));
- info.rank = myInfo->rank;
+ info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash) {
- send->conn.direct |= info.read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+ send->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
} else {
- send->conn.direct |= info.read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
- CUDACHECK(cudaIpcGetMemHandle(&info.devIpc, info.directPtr));
+ send->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
INFO(NCCL_INIT|NCCL_P2P,"Channel %02d : %d[%lx] -> %d[%lx] via P2P/IPC%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr);
}
} else {
- NCCLCHECK(bootstrapRemAlloc(sendSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
- info.rank = intermediateRank;
+ info->rank = intermediateRank;
INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/indirect/%d[%lx]%s",
channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, intermediateRank,
comm->peerInfo[intermediateRank].busId, useReadStr);
}
- resources->memRank = info.rank;
- NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 1, info->rank, &send->proxyConn));
+ NCCLCHECK(ncclProxyCall(&send->proxyConn, ncclProxyMsgSetup, &sendSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
- static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
- memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
+ NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->sendMemIpc));
return ncclSuccess;
}
@@ -215,36 +216,32 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
int useRead, intermediateRank;
NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank));
- struct p2pConnectInfo info;
- // For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
- info.read = (connIndex == 0) ? useRead : 0;
+ static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
+ struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
+ info->read = useRead;
+ // For CollNet, use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0)
+ if (graph && connIndex == 1) info->read = 0;
- int recvSize = offsetof(struct ncclRecvMem, buff);
+ int recvSize = sizeof(struct ncclRecvMem);
// For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure
- for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info.read && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) if (!(info->read && p == NCCL_PROTO_SIMPLE)) recvSize += recv->comm->buffSizes[p];
ALIGN_SIZE(recvSize, CUDA_IPC_MIN);
- resources->remoteId = -1;
- resources->bootstrap = comm->bootstrap;
if (intermediateRank == -1) {
- NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, recvSize));
- info.rank = myInfo->rank;
+ info->rank = myInfo->rank;
if (myInfo->pidHash == peerInfo->pidHash) {
- recv->conn.direct |= info.read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
+ recv->conn.direct |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE;
} else {
- recv->conn.direct |= info.read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
- CUDACHECK(cudaIpcGetMemHandle(&info.devIpc, info.directPtr));
+ recv->conn.direct |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE;
}
} else {
- NCCLCHECK(bootstrapRemAlloc(recvSize, intermediateRank, resources->bootstrap, &resources->remoteId, &info.devIpc, &info.directPtr));
- info.rank = intermediateRank;
+ info->rank = intermediateRank;
}
- resources->memRank = info.rank;
- NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info.rank, &info, (void**)&resources->devMem, &resources->ipcPtr));
+ NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_P2P, 0, info->rank, &recv->proxyConn));
+ NCCLCHECK(ncclProxyCall(&recv->proxyConn, ncclProxyMsgSetup, &recvSize, sizeof(int), &info->p2pBuff, sizeof(struct ncclP2pBuff)));
- static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big");
- memcpy(connectInfo, &info, sizeof(struct p2pConnectInfo));
+ NCCLCHECK(p2pMap(myInfo, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&resources->devMem, &resources->recvMemIpc));
return ncclSuccess;
}
@@ -254,16 +251,16 @@ static ncclResult_t p2pSendConnect(struct ncclComm* comm, struct ncclConnect* co
struct ncclRecvMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
- NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->remIpcPtr));
+ NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->recvMemIpc));
- int offset = 0;
+ char* buff = (char*)(remDevMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (info->read && p == NCCL_PROTO_SIMPLE) {
/* For P2P Read the SIMPLE buffer is local (ncclSendMem) */
- send->conn.buffs[p] = resources->devMem->buff;
+ send->conn.buffs[p] = (char*)(resources->devMem+1);
} else {
- send->conn.buffs[p] = remDevMem->buff + offset;
- offset += send->comm->buffSizes[p];
+ send->conn.buffs[p] = buff;
+ buff += send->comm->buffSizes[p];
}
}
send->conn.tail = &remDevMem->tail;
@@ -279,16 +276,16 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
struct ncclSendMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
- NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, info, (void**)&remDevMem, &resources->remIpcPtr));
+ NCCLCHECK(p2pMap(comm->peerInfo+rank, comm->peerInfo+info->rank, &info->p2pBuff, (void**)&remDevMem, &resources->sendMemIpc));
- int offset = 0;
+ char* buff = (char*)(resources->devMem+1);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (info->read && p == NCCL_PROTO_SIMPLE) {
/* For P2P Read the SIMPLE buffer is remote (ncclSendMem) */
- recv->conn.buffs[p] = remDevMem->buff;
+ recv->conn.buffs[p] = (char*)(remDevMem+1);
} else {
- recv->conn.buffs[p] = resources->devMem->buff + offset;
- offset += recv->comm->buffSizes[p];
+ recv->conn.buffs[p] = buff;
+ buff += recv->comm->buffSizes[p];
}
}
recv->conn.tail = &resources->devMem->tail;
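With the trailing buff[] member gone from ncclSendMem/ncclRecvMem, the protocol buffers now start immediately after the header struct, so (remDevMem+1) cast to char* is the first payload byte and each protocol's region is carved out by advancing that pointer by buffSizes[p]. A small sketch of that header-plus-payload layout, with illustrative types:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Sketch: one allocation holds a header struct followed by per-protocol
 * buffers; (hdr+1) points at the first payload byte. Illustrative only. */
struct header { uint64_t head, tail; };

int main(void) {
  int buffSizes[3] = { 256, 512, 1024 };
  size_t total = sizeof(struct header);
  for (int p = 0; p < 3; p++) total += buffSizes[p];

  struct header* hdr = (struct header*)calloc(1, total);
  char* buff = (char*)(hdr + 1);           /* first byte after the header */
  char* buffs[3];
  for (int p = 0; p < 3; p++) { buffs[p] = buff; buff += buffSizes[p]; }

  for (int p = 0; p < 3; p++)
    printf("protocol %d buffer at offset %ld\n", p, (long)(buffs[p] - (char*)hdr));
  free(hdr);
  return 0;
}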
@@ -298,39 +295,49 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
return ncclSuccess;
}
-ncclResult_t p2pSendFree(void* resources) {
- struct p2pSendResources* sendRes = (struct p2pSendResources*)resources;
- if (sendRes->ipcPtr)
- CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr));
- if (sendRes->remIpcPtr)
- CUDACHECK(cudaIpcCloseMemHandle(sendRes->remIpcPtr));
- if (sendRes->remoteId != -1) {
- NCCLCHECK(bootstrapRemFree(sendRes->remoteId, sendRes->memRank, sendRes->bootstrap));
- sendRes->devMem = NULL;
- }
- CUDACHECK(cudaFree(sendRes->devMem));
- free(sendRes);
+ncclResult_t p2pSendFree(struct ncclConnector* send) {
+ struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
+ if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
+ if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
+ free(resources);
return ncclSuccess;
}
-ncclResult_t p2pRecvFree(void* resources) {
- struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources;
- if (recvRes->ipcPtr)
- CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr));
- if (recvRes->remIpcPtr)
- CUDACHECK(cudaIpcCloseMemHandle(recvRes->remIpcPtr));
- if (recvRes->remoteId != -1) {
- NCCLCHECK(bootstrapRemFree(recvRes->remoteId, recvRes->memRank, recvRes->bootstrap));
- recvRes->devMem = NULL;
+ncclResult_t p2pRecvFree(struct ncclConnector* recv) {
+ struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
+ if (resources->sendMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->sendMemIpc));
+ if (resources->recvMemIpc) CUDACHECK(cudaIpcCloseMemHandle(resources->recvMemIpc));
+ free(resources);
+ return ncclSuccess;
+}
+
+static ncclResult_t p2pProxySetup(struct ncclProxyConnection* connection, struct ncclComm* comm, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) {
+ if (reqSize != sizeof(int)) return ncclInternalError;
+ int size = *((int*)reqBuff);
+ if (respSize != sizeof(struct ncclP2pBuff)) return ncclInternalError;
+ struct ncclP2pBuff* p2pBuff = (struct ncclP2pBuff*)respBuff;
+ NCCLCHECK(ncclCudaCalloc((char**)&p2pBuff->directPtr, size));
+ connection->transportResources = p2pBuff->directPtr;
+ cudaError_t res = cudaIpcGetMemHandle(&p2pBuff->devIpc, p2pBuff->directPtr);
+ if (res != cudaSuccess) {
+ WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
+ cudaFree(p2pBuff->directPtr);
+ free(p2pBuff);
+ CUDACHECK(res);
}
- CUDACHECK(cudaFree(recvRes->devMem));
- free(recvRes);
+ *done = 1;
+ return ncclSuccess;
+}
+
+static ncclResult_t p2pProxyFree(struct ncclProxyConnection* connection, struct ncclComm* comm) {
+ // Do not check return code as CUDA may have already shut down
+ cudaFree(connection->transportResources);
return ncclSuccess;
}
struct ncclTransport p2pTransport = {
"P2P",
p2pCanConnect,
- { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL },
- { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL }
+ { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL },
+ { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL, p2pProxySetup, NULL, p2pProxyFree, NULL }
};
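Buffer allocation moves out of the setup path (and away from bootstrapRemAlloc for indirect ranks) into the proxy of whichever rank owns the memory: setup just sends the requested size through ncclProxyCall(ncclProxyMsgSetup, ...) and gets back an ncclP2pBuff holding the device pointer and its CUDA IPC handle, which either side can then map with p2pMap. A host-only sketch of that size-checked request/response shape, with stand-ins for the CUDA allocation and IPC handle rather than the real calls:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Sketch of the size-checked request/response convention: the requester sends
 * an int (buffer size) and expects a small fixed-size descriptor back.
 * Hypothetical host-only stand-ins replace ncclCudaCalloc/cudaIpcGetMemHandle. */
struct fakeIpcHandle { unsigned long cookie; };
struct fakeP2pBuff { void* directPtr; struct fakeIpcHandle ipc; };

static int proxySetup(void* reqBuff, int reqSize, void* respBuff, int respSize) {
  if (reqSize != sizeof(int)) return -1;                 /* malformed request */
  if (respSize != sizeof(struct fakeP2pBuff)) return -1; /* malformed response slot */
  int size = *(int*)reqBuff;
  struct fakeP2pBuff* buff = (struct fakeP2pBuff*)respBuff;
  buff->directPtr = calloc(1, size);                     /* stands in for ncclCudaCalloc */
  buff->ipc.cookie = (unsigned long)(uintptr_t)buff->directPtr; /* stands in for the IPC handle */
  return buff->directPtr ? 0 : -1;
}

int main(void) {
  int req = 1 << 20;
  struct fakeP2pBuff resp;
  if (proxySetup(&req, sizeof(req), &resp, sizeof(resp))) return 1;
  printf("proxy allocated %d bytes at %p\n", req, resp.directPtr);
  free(resp.directPtr);
  return 0;
}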
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index 98e25a9..974a2ab 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,12 +8,10 @@
#include "shm.h"
struct shmConnectInfo {
- uint64_t pidHash;
- int id;
- int sendRank;
- int recvRank;
+ char shmName[7];
int shmSize;
};
+static_assert(sizeof(shmConnectInfo) <= CONNECT_SIZE, "SHM Connect info is too large");
struct shmSendResources {
int remShmSize;
@@ -62,21 +60,17 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
NCCLCHECK(ncclCalloc(&resources, 1));
send->transportResources = resources;
- struct shmConnectInfo info;
- info.id = channelId;
- info.pidHash = myInfo->pidHash;
- info.sendRank = myInfo->rank;
- info.recvRank = peerInfo->rank;
+ static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
+ struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
- char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
- info.shmSize = resources->shmSize = sizeof(struct ncclSendMem);
- TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
- NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+ char shmPath[PATH_MAX];
+ shmPath[0] = '\0';
+ info->shmSize = resources->shmSize = sizeof(struct ncclSendMem);
+ NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+ TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
+ memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
INFO(NCCL_INIT|NCCL_SHM,"Channel %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId);
- static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big");
- memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
return ncclSuccess;
}
@@ -85,22 +79,18 @@ ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st
NCCLCHECK(ncclCalloc(&resources, 1));
recv->transportResources = resources;
- struct shmConnectInfo info;
- info.id = channelId;
- info.pidHash = myInfo->pidHash;
- info.sendRank = peerInfo->rank;
- info.recvRank = myInfo->rank;
+ static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Info is too big");
+ struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
- char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
- int shmSize = offsetof(struct ncclRecvMem, buff);
+ char shmPath[PATH_MAX];
+ shmPath[0] = '\0';
+ int shmSize = sizeof(struct ncclRecvMem);
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) shmSize += recv->comm->buffSizes[p];
- info.shmSize = resources->shmSize = shmSize;
- TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize);
- NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+ info->shmSize = resources->shmSize = shmSize;
+ NCCLCHECK(ncclShmOpen(shmPath, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1));
+ TRACE(NCCL_SHM,"Opened shmName %s shmSize %d", shmPath, info->shmSize);
+ memcpy(info->shmName, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof(info->shmName));
- static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big");
- memcpy(connectInfo, &info, sizeof(struct shmConnectInfo));
return ncclSuccess;
}
@@ -110,18 +100,18 @@ ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* connectIn
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
- char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
+ char shmPath[PATH_MAX];
+ sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
resources->remShmSize = info->shmSize;
- TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
- NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+ TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
+ NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
// Remove the file to ensure proper clean-up
- NCCLCHECK(shmUnlink(shmName));
+ NCCLCHECK(ncclShmUnlink(shmPath));
send->transportResources = resources;
int offset = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- send->conn.buffs[p] = resources->devRemHostMem->buff + offset;
+ send->conn.buffs[p] = (char*)(resources->devRemHostMem+1) + offset;
offset += send->comm->buffSizes[p];
}
send->conn.tail = &resources->devRemHostMem->tail;
@@ -135,35 +125,35 @@ ncclResult_t shmRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
- char shmName[MAX_SHM_NAME_LEN];
- sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
+ char shmPath[PATH_MAX];
+ sprintf(shmPath, "/dev/shm/nccl-%s", info->shmName);
resources->remShmSize = info->shmSize;
- TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize);
- NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
- NCCLCHECK(shmUnlink(shmName));
+ TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmPath, info->shmSize);
+ NCCLCHECK(ncclShmOpen(shmPath, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0));
+ NCCLCHECK(ncclShmUnlink(shmPath));
recv->conn.head = &resources->devRemHostMem->head;
int offset = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- recv->conn.buffs[p] = resources->devHostMem->buff + offset;
+ recv->conn.buffs[p] = (char*)(resources->devHostMem+1) + offset;
offset += recv->comm->buffSizes[p];
}
recv->conn.tail = &resources->devHostMem->tail;
return ncclSuccess;
}
-ncclResult_t shmSendFree(void* transportResources) {
- struct shmSendResources* resources = (struct shmSendResources*)transportResources;
- NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
- NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
+ncclResult_t shmSendFree(struct ncclConnector* send) {
+ struct shmRecvResources* resources = (struct shmRecvResources*)send->transportResources;
+ NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
+ NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
free(resources);
return ncclSuccess;
}
-ncclResult_t shmRecvFree(void* transportResources) {
- struct shmRecvResources* resources = (struct shmRecvResources*)transportResources;
- NCCLCHECK(shmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
- NCCLCHECK(shmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
+ncclResult_t shmRecvFree(struct ncclConnector* recv) {
+ struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
+ NCCLCHECK(ncclShmClose(resources->hostMem, resources->devHostMem, resources->shmSize));
+ NCCLCHECK(ncclShmClose(resources->remHostMem, resources->devRemHostMem, resources->remShmSize));
free(resources);
return ncclSuccess;
}
@@ -171,6 +161,6 @@ ncclResult_t shmRecvFree(void* transportResources) {
struct ncclTransport shmTransport = {
"SHM",
shmCanConnect,
- { shmSendSetup, shmSendConnect, shmSendFree, NULL },
- { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL }
+ { shmSendSetup, shmSendConnect, shmSendFree, NULL, NULL, NULL, NULL, NULL },
+ { shmRecvSetup, shmRecvConnect, shmRecvFree, NULL, NULL, NULL, NULL, NULL }
};
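The SHM connect info no longer encodes pidHash, channel and ranks; ncclShmOpen now creates the backing file itself, and only the short suffix of /dev/shm/nccl-XXXXXX plus the size is exchanged, with the peer rebuilding the path via sprintf("/dev/shm/nccl-%s", ...). The sizeof("/dev/shm/nccl-")-1 term is the prefix length (sizeof of a string literal counts the trailing NUL, hence the -1), and shmName[7] holds the six suffix characters plus a NUL. A tiny sketch of that suffix copy and rebuild, assuming the mkstemp-style six-character suffix used by the patch:

#include <stdio.h>
#include <string.h>

/* Sketch: only the random suffix of /dev/shm/nccl-XXXXXX travels in the
 * connect info; the peer rebuilds the full path from it. Hypothetical suffix
 * value, illustrative only. */
struct connectInfo { char shmName[7]; int shmSize; };

int main(void) {
  const char shmPath[] = "/dev/shm/nccl-Ab3xYz";      /* as produced by the opener */
  struct connectInfo info;
  /* sizeof("...") counts the trailing NUL, so -1 gives the prefix length (14) */
  memcpy(info.shmName, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(info.shmName));
  info.shmSize = 4096;

  char rebuilt[64];
  snprintf(rebuilt, sizeof(rebuilt), "/dev/shm/nccl-%s", info.shmName);
  printf("suffix '%s' -> path '%s' (%d bytes)\n", info.shmName, rebuilt, info.shmSize);
  return 0;
}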