github.com/marian-nmt/nccl.git
author    Sylvain Jeaugey <sjeaugey@nvidia.com>  2020-03-27 03:31:24 +0300
committer GitHub <noreply@github.com>  2020-03-27 03:31:24 +0300
commit    533e3702cf713a9ab9a634fbb8b4c380ecf381e6 (patch)
tree      9e7eb6643f4356abd1a44e61de244b05db20c946
parent    6c61492eba5c25ac6ed1bf57de23c6a689aa75cc (diff)
parent    b221128ecacf4ce1b3054172b9f30163307042c5 (diff)

Merge pull request #314 from NVIDIA/v2.6

2.6.4-1
-rw-r--r--  makefiles/common.mk                        2
-rw-r--r--  makefiles/version.mk                       4
-rw-r--r--  src/Makefile                               6
-rw-r--r--  src/channel.cc                            24
-rw-r--r--  src/collectives/device/all_gather.h        9
-rw-r--r--  src/collectives/device/all_reduce.h      118
-rw-r--r--  src/collectives/device/broadcast.h         9
-rw-r--r--  src/collectives/device/common.h            3
-rw-r--r--  src/collectives/device/functions.cu        3
-rw-r--r--  src/collectives/device/primitives.h        4
-rw-r--r--  src/collectives/device/reduce.h            9
-rw-r--r--  src/collectives/device/reduce_scatter.h    9
-rw-r--r--  src/debug.cc                               6
-rw-r--r--  src/enqueue.cc                            67
-rw-r--r--  src/graph/connect.cc                      53
-rw-r--r--  src/graph/paths.cc                       285
-rw-r--r--  src/graph/search.cc                      662
-rw-r--r--  src/graph/topo.cc                        941
-rw-r--r--  src/graph/topo.h                         146
-rw-r--r--  src/graph/tuning.cc                      101
-rw-r--r--  src/graph/xml.cc                         780
-rw-r--r--  src/graph/xml.h                          237
-rw-r--r--  src/include/align.h                       19
-rw-r--r--  src/include/alloc.h                       15
-rw-r--r--  src/include/checks.h                       4
-rw-r--r--  src/include/coll_net.h                    34
-rw-r--r--  src/include/collectives.h                  3
-rw-r--r--  src/include/comm.h                        12
-rw-r--r--  src/include/core.h                         3
-rw-r--r--  src/include/cpuset.h                       4
-rw-r--r--  src/include/debug.h                        5
-rw-r--r--  src/include/devcomm.h                     14
-rw-r--r--  src/include/graph.h                       57
-rw-r--r--  src/include/info.h                         4
-rw-r--r--  src/include/nccl_net.h                   101
-rw-r--r--  src/include/net.h                         53
-rw-r--r--  src/include/socket.h                       2
-rw-r--r--  src/include/transport.h                    4
-rw-r--r--  src/include/utils.h                        4
-rw-r--r--  src/init.cc                              396
-rw-r--r--  src/misc/utils.cc                          4
-rw-r--r--  src/transport.cc                          13
-rw-r--r--  src/transport/coll_net.cc                430
-rw-r--r--  src/transport/net.cc                      44
-rw-r--r--  src/transport/net_ib.cc                  119
-rw-r--r--  src/transport/net_socket.cc               75
-rw-r--r--  src/transport/p2p.cc                      80
-rw-r--r--  src/transport/shm.cc                       4
48 files changed, 3602 insertions(+), 1379 deletions(-)
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 2e44826..ece18c7 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
diff --git a/makefiles/version.mk b/makefiles/version.mk
index 05abbc7..883e625 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
-NCCL_MINOR := 5
-NCCL_PATCH := 7
+NCCL_MINOR := 6
+NCCL_PATCH := 4
NCCL_SUFFIX :=
PKG_REVISION := 1
diff --git a/src/Makefile b/src/Makefile
index b11de5e..db1698a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -11,9 +11,9 @@ include ../makefiles/version.mk
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
- transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
+ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
- graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc
+ graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
##### lib files
LIBNAME := libnccl.so
diff --git a/src/channel.cc b/src/channel.cc
index b053e5b..0a43e17 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -6,24 +6,32 @@
#include "channel.h"
#include "param.h"
+#include "graph.h"
-NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
+#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
+#define DEFAULT_BUFFER_SIZE_BYTES_ARM (1LL << 20) /* 1MiB */
+
+NCCL_PARAM(Buffsize, "BUFFSIZE", -2);
ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
struct ncclChannel* channel = comm->channels+channelid;
channel->id = channelid;
// Setup intermediate buffering
- channel->buffSize = ncclParamBuffsize();
+ int buffSize = ncclParamBuffsize();
+ int cpuArch, cpuVendor, cpuModel;
+ NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
+ channel->buffSize = buffSize != -2 ? buffSize :
+ cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES;
// Ring index to user rank table.
NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
// Communication structures with peers.
- NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
- NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
- for (size_t i=0; i<comm->nRanks; ++i) {
+ NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra rank is for the collnet root (i.e. the network)
+ NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1));
+ for (size_t i=0; i<comm->nRanks+1; ++i) {
channel->peers[i].send.comm = comm;
channel->peers[i].recv.comm = comm;
}
@@ -42,9 +50,13 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
CUDACHECK(cudaFree(channel->ring.devUserRanks));
// Free transport proxy resources
- for (int r=0; r<nRanks; r++) {
+ // Note: free all send resources first due to CollNet arrangement
+ for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
+ }
+ for (int r=0; r<nRanks+1; r++) {
+ struct ncclPeer* peer = channel->peers+r;
if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
}
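
The channel.cc change above keeps NCCL_BUFFSIZE authoritative when the user sets it (the -2 sentinel means "unset") and otherwise picks an architecture-dependent default. A minimal standalone sketch of that selection, assuming stand-in enum values for NCCL_TOPO_CPU_ARCH_*:

// Sketch of the NCCL_BUFFSIZE defaulting logic added to initChannel().
// A value of -2 means "not set by the user"; ARM CPUs get a smaller
// default staging buffer (1 MiB instead of 4 MiB).
#include <cstdio>

enum { CPU_ARCH_X86 = 0, CPU_ARCH_ARM = 1 };  // stand-ins for NCCL_TOPO_CPU_ARCH_*

static long long defaultBuffSize(int cpuArch, long long userParam) {
  const long long kDefault    = 1LL << 22;  // 4 MiB
  const long long kDefaultArm = 1LL << 20;  // 1 MiB
  if (userParam != -2) return userParam;    // explicit NCCL_BUFFSIZE wins
  return cpuArch == CPU_ARCH_ARM ? kDefaultArm : kDefault;
}

int main() {
  printf("%lld\n", defaultBuffSize(CPU_ARCH_X86, -2));      // 4194304
  printf("%lld\n", defaultBuffSize(CPU_ARCH_ARM, -2));      // 1048576
  printf("%lld\n", defaultBuffSize(CPU_ARCH_ARM, 1<<21));   // user override: 2097152
  return 0;
}
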
diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
index 0ad5ba9..059092c 100644
--- a/src/collectives/device/all_gather.h
+++ b/src/collectives/device/all_gather.h
@@ -69,6 +69,9 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) {
template<int UNROLL, class FUNC, typename T>
__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { }
+
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
@@ -130,6 +133,9 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { }
+
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
@@ -193,3 +199,6 @@ __device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllGatherCollNetLL128Kernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
index 2449c2b..173b5fa 100644
--- a/src/collectives/device/all_reduce.h
+++ b/src/collectives/device/all_reduce.h
@@ -106,7 +106,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
do {
struct ncclTree* tree = &channel->treeUp;
// Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
- ncclPrimitives<UNROLL, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL/2, 1, 1, T, NCCL_MAX_TREE_ARITY, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
@@ -124,7 +124,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
do {
struct ncclTree* tree = &channel->treeDn;
// Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
- ncclPrimitives<UNROLL, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+ ncclPrimitives<UNROLL/2, 1, 1, T, 1, NCCL_MAX_TREE_ARITY, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
@@ -140,6 +140,62 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
} while(0);
}
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads-WARP_SIZE;
+ const int bid = args->bid;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ const ssize_t size = args->N;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ int chunkSize = args->lastChunkSize;
+ const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nChannels*chunkSize;
+
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ }
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
+ struct ncclTree* tree = &channel->collTreeUp;
+ ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.send(thisInput+offset, nelem);
+ } else {
+ prims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ }
+
+ if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
+ struct ncclTree* tree = &channel->collTreeDn;
+ ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.send(thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.recv(thisOutput+offset, nelem);
+ } else {
+ prims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
+ }
+}
+
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
@@ -271,6 +327,61 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
} while(0);
}
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->bid;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ const ssize_t size = args->N;
+ ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nChannels*chunkSize;
+
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ }
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
+ struct ncclTree* tree = &channel->collTreeUp;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ LLprims.send(thisInput+offset, nelem);
+ } else {
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ }
+
+ if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
+ struct ncclTree* tree = &channel->collTreeDn;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ LLprims.send(thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ LLprims.recv(thisOutput+offset, nelem);
+ } else {
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
+ }
+}
+
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
@@ -408,3 +519,6 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
}
}
}
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclAllReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
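
The new CollNet kernels above split the grid in two: blocks with blockIdx.x in the first half drive the reduce (up) tree, the rest drive the broadcast (down) tree, and each rank picks its primitive from tree->up / tree->down[0]. A host-side model of that role dispatch, with a simplified Tree struct standing in for ncclTree (a sketch of the conditions only, not the device code):

// Sketch of the role dispatch inside ncclAllReduceCollNetKernel: the rank
// whose up link is cut receives+reduces into the output, a leaf
// (down[0] == -1) only sends, middle ranks reduce-and-forward; the
// broadcast half mirrors this with recv/recvCopySend.
#include <cstdio>

struct Tree { int up; int down0; };  // simplified stand-in for ncclTree

static const char* upRole(const Tree& t) {
  if (t.up == -1)    return "recvReduceCopy (root)";
  if (t.down0 == -1) return "send (leaf)";
  return "recvReduceSend (middle)";
}

static const char* downRole(const Tree& t) {
  if (t.up == -1)    return "send (root)";
  if (t.down0 == -1) return "recv (leaf)";
  return "recvCopySend (middle)";
}

int main() {
  Tree chain[3] = { {-1, 1}, {0, 2}, {1, -1} };  // toy 3-rank intra-node chain
  for (int r = 0; r < 3; r++)
    printf("rank %d: up-half %-28s down-half %s\n", r, upRole(chain[r]), downRole(chain[r]));
  return 0;
}
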
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
index de8b989..5146682 100644
--- a/src/collectives/device/broadcast.h
+++ b/src/collectives/device/broadcast.h
@@ -54,6 +54,9 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) {
template<int UNROLL, class FUNC, typename T>
__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) { }
+
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
@@ -101,6 +104,9 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) { }
+
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
@@ -148,3 +154,6 @@ __device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclBroadcastCollNetLL128Kernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index 46eb9f5..6e06369 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -102,7 +102,8 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \
#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
- IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING)
+ IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) \
+ IMPL_COLL4(coll##CollNet, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_COLLNET)
#if NCCL_TYPE == 0
#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu
index 034fe96..d10f11e 100644
--- a/src/collectives/device/functions.cu
+++ b/src/collectives/device/functions.cu
@@ -17,7 +17,8 @@ __device__ volatile uint64_t* ncclShmem;
#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Tree, op, dtype), \
- NCCL_FUNC5(coll##Ring, op, dtype)
+ NCCL_FUNC5(coll##Ring, op, dtype), \
+ NCCL_FUNC5(coll##CollNet, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h
index b624359..c1067bf 100644
--- a/src/collectives/device/primitives.h
+++ b/src/collectives/device/primitives.h
@@ -227,7 +227,7 @@ class ncclPrimitives {
recvStep[i] = conn->step;
recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS);
recvDirectBuff[i] = NULL;
- if (directBuff && conn->direct) {
+ if (directBuff && (conn->direct & NCCL_DIRECT_GPU)) {
recvDirectBuff[i] = directBuff;
if (tid == 0) *conn->ptrExchange = directBuff;
}
@@ -254,7 +254,7 @@ class ncclPrimitives {
sendStep[i] = conn->step;
sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS);
sendDirectBuff[i] = NULL;
- if (directBuff && conn->direct) {
+ if (directBuff && (conn->direct & NCCL_DIRECT_GPU)) {
void* volatile* ptr = conn->ptrExchange;
while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL);
barrier();
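
The primitives.h change above turns conn->direct from an effective boolean into a bitmask, so the code tests the specific NCCL_DIRECT_GPU bit rather than truthiness. A minimal sketch of why that matters once the field carries more than one capability (the flag values here are assumptions for illustration):

// Sketch: testing a capability bit instead of a plain boolean, as in
// `conn->direct & NCCL_DIRECT_GPU` above.
#include <cstdio>

enum : unsigned {
  DIRECT_GPU = 0x01,  // stand-in for NCCL_DIRECT_GPU
  DIRECT_NIC = 0x02,  // hypothetical second capability sharing the field
};

int main() {
  unsigned direct = DIRECT_NIC;                               // non-zero, but no GPU-direct bit
  printf("truthy test:  %d\n", direct ? 1 : 0);               // 1 -- would wrongly enable it
  printf("bitmask test: %d\n", (direct & DIRECT_GPU) ? 1 : 0); // 0 -- correct
  return 0;
}
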
diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h
index 0680abe..e36613f 100644
--- a/src/collectives/device/reduce.h
+++ b/src/collectives/device/reduce.h
@@ -50,6 +50,9 @@ __device__ void ncclReduceRingKernel(struct CollectiveArgs* args) {
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { }
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclReduceCollNetKernel(struct CollectiveArgs* args) { }
+
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
@@ -94,6 +97,9 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { }
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceCollNetLLKernel(struct CollectiveArgs* args) { }
+
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
@@ -138,3 +144,6 @@ __device__ void ncclReduceRingLL128Kernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h
index 1985148..0b0ae81 100644
--- a/src/collectives/device/reduce_scatter.h
+++ b/src/collectives/device/reduce_scatter.h
@@ -64,6 +64,9 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) {
template<int UNROLL, class FUNC, typename T>
__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { }
+template<int UNROLL, class FUNC, typename T>
+__device__ void ncclReduceScatterCollNetKernel(struct CollectiveArgs* args) { }
+
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
@@ -122,6 +125,9 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { }
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceScatterCollNetLLKernel(struct CollectiveArgs* args) { }
+
#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
@@ -182,3 +188,6 @@ __device__ void ncclReduceScatterRingLL128Kernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__device__ void ncclReduceScatterTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__device__ void ncclReduceScatterCollNetLL128Kernel(struct CollectiveArgs* args) { }
diff --git a/src/debug.cc b/src/debug.cc
index 03a77ae..b2fc03c 100644
--- a/src/debug.cc
+++ b/src/debug.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -107,7 +107,6 @@ void ncclDebugInit() {
if (debugFn[0] != '\0') {
FILE *file = fopen(debugFn, "w");
if (file != NULL) {
- INFO(NCCL_ALL,"DEBUG file is '%s'", debugFn);
ncclDebugFile = file;
}
}
@@ -125,7 +124,7 @@ void ncclDebugInit() {
*/
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
if (ncclDebugLevel == -1) ncclDebugInit();
- if (ncclDebugNoWarn == 1 && level == NCCL_LOG_WARN) level = NCCL_LOG_INFO;
+ if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
char hostname[1024];
getHostName(hostname, 1024, '.');
@@ -135,7 +134,6 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
char buffer[1024];
size_t len = 0;
pthread_mutex_lock(&ncclDebugLock);
- if (ncclDebugNoWarn && ncclDebugLevel == NCCL_LOG_WARN) printf("WARN -> INFO\n");
if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN)
len = snprintf(buffer, sizeof(buffer),
"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line);
diff --git a/src/enqueue.cc b/src/enqueue.cc
index 2239865..92f3467 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -1,11 +1,12 @@
/*************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "argcheck.h"
+#include "coll_net.h"
// Only generate inline kernels for LL
#define NCCL_FUNC5(coll, op, dtype) \
@@ -15,7 +16,8 @@
#define NCCL_FUNC4(coll, op, dtype) \
(void*)NCCL_FUNC5(coll##Tree, op, dtype), \
- (void*)NCCL_FUNC5(coll##Ring, op, dtype)
+ (void*)NCCL_FUNC5(coll##Ring, op, dtype), \
+ (void*)NCCL_FUNC5(coll##CollNet, op, dtype)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
@@ -227,28 +229,23 @@ ncclResult_t ncclEnqueueEvents(ncclComm_t comm) {
/* Enqueueing system : computation of kernel and proxy operations parameters */
/*****************************************************************************/
-// Trees are not perfectly sticking to the model for medium sizes. Applying a static correction
-// factor is not ideal but works quite well. Powers of two, 64 B to 1 GB.
-static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
- { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .5, .5, .6, .7, .8, .9, .9, 1.0, 1.0, 1.0 },
- { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .8, .7, .7, .7, .6, .6, .7, .7, .8, .8, .9, .9, 1.0 },
- { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .5, .5, .6, .6, .7, .8, .9 }
-};
-
static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
struct ncclComm* comm = info->comm;
- float minTime = 3600000.0; // Hopefully no operation will take an hour to complete.
+ float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
// Find algorithm / protocol.
info->algorithm = -1;
info->protocol = -1;
- for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ int nAlgos = NCCL_NUM_ALGORITHMS;
+ // Check collNet support
+ int collNetTypeSupport = 0;
+ if (info->comm->collNetSupport)
+ NCCLCHECK(collNetReduceSupport(info->datatype, info->op, &collNetTypeSupport));
+ if (collNetTypeSupport != 1) nAlgos--;
+ for (int a=0; a<nAlgos; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- float bw = comm->bandwidths[info->coll][a][p];
- if (bw == 0) continue;
- int logSize = log2i(info->nBytes>>6);
- if (a == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[p][logSize];
- float time = comm->latencies[info->coll][a][p] + (info->nBytes) / (1000 * bw);
- if (time < minTime) {
+ float time;
+ NCCLCHECK(ncclTopoGetAlgoTime(info, a, p, &time));
+ if (time >= 0 && time < minTime) {
info->algorithm = a;
info->protocol = p;
minTime = time;
@@ -259,14 +256,14 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
WARN("Error : no algorithm/protocol available");
return ncclInternalError;
}
- //if (comm->rank == 0) INFO(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %d", info->nBytes, info->algorithm, info->protocol, minTime);
+ //if (comm->rank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime);
- int nc = comm->nChannels;
- int nt = comm->maxThreads[info->protocol];
+ int nc = (info->algorithm == NCCL_ALGO_COLLNET) ? comm->nChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down
+ int nt = comm->maxThreads[info->algorithm][info->protocol];
int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol];
while (info->nBytes < nc*nt*threadThreshold) {
- if (nc >= 2) nc--;
+ if (info->algorithm != NCCL_ALGO_COLLNET && nc >= 2) nc--;
else if ((nt % 128) == 0) nt/=2;
else break;
}
@@ -286,7 +283,7 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
case ncclCollAllGather:
info->pattern = ncclPatternRing; break;
case ncclCollAllReduce:
- info->pattern = info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
+ info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
default:
WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
return ncclInternalError;
@@ -301,6 +298,8 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
case ncclPatternTreeUpDown:
case ncclPatternPipelineFrom:
case ncclPatternPipelineTo:
+ case ncclPatternCollTreeUp:
+ case ncclPatternCollTreeDown:
info->nstepsPerLoop = info-> nchunksPerLoop = 1; break;
case ncclPatternRing:
info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
@@ -345,6 +344,13 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
}
// Use lastChunkSize as chunkSize
coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
+ } else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) {
+ // Optimize chunkSize / nSteps
+ while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*16 && chunkSize > 131072) chunkSize /= 2;
+ while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth*4 && chunkSize > 65536) chunkSize /= 2;
+ while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTreeUp.depth && chunkSize > 32768) chunkSize /= 2;
+ // Use lastChunkSize as chunkSize
+ coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype);
} else if (info->protocol == NCCL_PROTO_LL) {
int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t);
const ssize_t loopSize = info->nChannels*info->nchunksPerLoop*(ssize_t)sliceSize;
@@ -369,6 +375,8 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
proxyArgs->chunkSteps = chunkSteps;
proxyArgs->protocol = info->protocol;
proxyArgs->opCount = info->comm->opCount;
+ proxyArgs->dtype = info->datatype;
+ proxyArgs->redOp = info->op;
TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p",
coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads,
nLoops, proxyArgs->nsteps, info->comm);
@@ -395,8 +403,11 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
WARN("Error : mixing different streams within a group call is not supported.");
return ncclInvalidUsage;
}
- for (int bid=0; bid<coll.args.nChannels; bid++) {
- struct ncclChannel* channel = info->comm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels);
+
+ int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
+ for (int bid=0; bid<coll.args.nChannels*nSubChannels; bid++) {
+ int channelId = info->comm->myParams->gridDim.x % info->comm->nChannels;
+ struct ncclChannel* channel = info->comm->channels+channelId;
if (channel->collCount == NCCL_MAX_OPS) {
WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS);
@@ -405,6 +416,10 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
// Proxy
proxyArgs.channel = channel;
+ // Adjust pattern for CollNet based on channel index
+ if (nSubChannels == 2) {
+ info->pattern = (channelId < info->comm->nChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown;
+ }
NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
info->comm->myParams->gridDim.x++;
@@ -416,7 +431,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) {
memcpy(c, &coll, sizeof(struct ncclColl));
- c->args.bid = bid;
+ c->args.bid = bid % coll.args.nChannels;
c->active = 1;
opIndex = (opIndex+1)%NCCL_MAX_OPS;
c->nextIndex = opIndex;
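
getAlgoInfo() above now delegates the cost model to ncclTopoGetAlgoTime() and simply minimizes predicted time over (algorithm, protocol) pairs, dropping CollNet from the search when collNetReduceSupport() rejects the datatype/op; note minTime starts at 3600000000.0, an hour in microseconds. A compact sketch of that selection loop, with a toy time table as the assumption:

// Sketch of the (algorithm, protocol) selection in getAlgoInfo(): pick the
// pair with the lowest predicted time; a negative time means "unavailable",
// and the CollNet algorithm (last in the enum) is skipped when unsupported.
#include <cstdio>

enum { ALGO_TREE, ALGO_RING, ALGO_COLLNET, NUM_ALGOS };
enum { PROTO_LL, PROTO_LL128, PROTO_SIMPLE, NUM_PROTOS };

int main() {
  // Toy predicted times in microseconds; -1 marks an unavailable combination.
  float time[NUM_ALGOS][NUM_PROTOS] = {
    { 50, 40, 35 },   // tree
    { 60, 45, 30 },   // ring
    { -1, -1, 25 },   // collnet (SIMPLE only)
  };
  int collNetTypeSupport = 0;              // as if collNetReduceSupport() said no
  int nAlgos = NUM_ALGOS - (collNetTypeSupport != 1 ? 1 : 0);

  int bestA = -1, bestP = -1;
  float minTime = 3600000000.0f;           // an hour, in microseconds
  for (int a = 0; a < nAlgos; a++)
    for (int p = 0; p < NUM_PROTOS; p++)
      if (time[a][p] >= 0 && time[a][p] < minTime) { bestA = a; bestP = p; minTime = time[a][p]; }

  printf("algo %d proto %d time %.0f\n", bestA, bestP, minTime);  // ring/SIMPLE here
  return 0;
}
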
diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index af481d2..dd9f9f0 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -14,7 +14,7 @@
/******************************************************************/
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
- struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
+ struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoRanks* topoRanks) {
int rank = comm->rank;
int localRanks = comm->localRanks;
@@ -27,9 +27,14 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1;
channel->treeDn.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1;
+ channel->collTreeUp.up = -1;
+ for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeUp.down[i] = -1;
+ channel->collTreeDn.up = -1;
+ for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTreeDn.down[i] = -1;
int* ringIntra = ringGraph->intra+c*localRanks;
int* treeIntra = treeGraph->intra+c*localRanks;
+ int* collNetIntra = collNetGraph->intra+c*localRanks;
for (int i=0; i<localRanks; i++) {
if (ringIntra[i] == rank) {
@@ -57,6 +62,16 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
channel->treeUp.down[0] = sym ? channel->treeDn.down[0] : channel->treeDn.up ;
channel->treeUp.up = sym ? channel->treeDn.up : channel->treeDn.down[0];
}
+ if (collNetIntra[i] == rank) {
+ int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
+
+ // CollTrees are always symmetric, i.e.
+ // up/down go in reverse directions
+ channel->collTreeDn.up = collNetIntra[prev];
+ channel->collTreeDn.down[0] = collNetIntra[next];
+ channel->collTreeUp.down[0] = channel->collTreeDn.down[0];
+ channel->collTreeUp.up = channel->collTreeDn.up;
+ }
}
topoRanks->ringPrev[c] = channel->ring.prev;
topoRanks->ringNext[c] = channel->ring.next;
@@ -174,6 +189,40 @@ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* tr
return ncclSuccess;
}
+ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank) {
+ int nranks = comm->nRanks;
+ int depth = nranks/comm->nNodes;
+ int sendIndex = collNetGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern
+ int sendEndIndex = (sendIndex+comm->localRanks-1)%comm->localRanks;
+ for (int c=0; c<comm->nChannels/2; c++) {
+ struct ncclChannel* channel = comm->channels+c;
+ // Set root of collTree to id nranks
+ if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master
+ channel->collTreeUp.up = channel->collTreeDn.up = nranks;
+ }
+ if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
+ channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
+ }
+ channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
+ INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTreeUp.up, channel->collTreeUp.down[0]);
+ }
+ int recvIndex = 0; // recv GPU index is always 0
+ int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks;
+ for (int c=0; c<comm->nChannels/2; c++) {
+ struct ncclChannel* channel = comm->channels+comm->nChannels/2+c;
+ // Set root of collTree to id nranks
+ if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master
+ channel->collTreeUp.up = channel->collTreeDn.up = nranks;
+ }
+ if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
+ channel->collTreeUp.down[0] = channel->collTreeDn.down[0] = -1;
+ }
+ channel->collTreeUp.depth = channel->collTreeDn.depth = depth;
+ INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTreeDn.up, channel->collTreeDn.down[0]);
+ }
+ return ncclSuccess;
+}
+
// Legacy naming
NCCL_PARAM(MinNrings, "MIN_NRINGS", -2);
NCCL_PARAM(MaxNrings, "MAX_NRINGS", -2);
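
ncclTopoConnectCollNet() above roots each intra-node chain at a "master" GPU whose up pointer is the virtual rank nranks (the network peer the extra devPeers slot serves) and cuts the chain at its last GPU (down = -1). A simplified host-side sketch of that wiring for one channel; the patch additionally picks the head via sendIndex/recvIndex and wraps indices with modulo arithmetic, which is elided here:

// Sketch of the CollNet chain wiring: rank order comes from the graph's
// intra list, the head (master) points up at virtual rank nranks (the
// network), and the tail's down link is cut (-1).
#include <cstdio>

int main() {
  const int localRanks = 4, nranks = 8;
  int intra[localRanks] = { 2, 0, 3, 1 };   // toy intra-node GPU ordering
  for (int i = 0; i < localRanks; i++) {
    int rank = intra[i];
    int up   = (i == 0)            ? nranks : intra[i-1];  // master -> network
    int down = (i == localRanks-1) ? -1     : intra[i+1];  // tail has no child
    printf("rank %d: up %d down %d\n", rank, up, down);
  }
  return 0;
}
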
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index eba1964..0872ae7 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -42,7 +42,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
basePath->count = 0;
basePath->width = LOC_WIDTH;
- basePath->type = LINK_LOC;
+ basePath->type = PATH_LOC;
while (nodeList.count) {
nextNodeList.count = 0;
@@ -58,7 +58,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
}
struct ncclTopoLinkList* remPath;
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
- int width = std::min(path->width, link->width);
+ float width = std::min(path->width, link->width);
if (remPath->width < width) {
// Find reverse link
for (int l=0; l<remNode->nlinks; l++) {
@@ -68,8 +68,8 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
}
}
if (remPath->list[0] == NULL) {
- WARN("Failed to find reverse path from remNode id %d type %d nlinks %d to node id %d type %d",
- remNode->id, remNode->type, remNode->nlinks, node->id, node->type);
+ WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
+ remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
return ncclInternalError;
}
// Copy the rest of the path
@@ -77,9 +77,17 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
remPath->count = path->count + 1;
remPath->width = width;
- // Consider the path is QPI when going through the CPU
- // Also don't consider LINK_NET as we only care about the NIC->GPU path.
- int type = remNode->type == CPU ? LINK_QPI : link->type == LINK_NET ? 0 : link->type;
+ // Start with path type = link type. PATH and LINK types are supposed to match.
+ // Don't consider LINK_NET as we only care about the NIC->GPU path.
+ int type = link->type == LINK_NET ? 0 : link->type;
+ // Differentiate between one and multiple PCI switches
+ if (type == PATH_PIX && (node->type == PCI || link->remNode->type == PCI) && remPath->count > 3) type = PATH_PXB;
+ // Consider a path going through the CPU as PATH_PHB
+ if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
+ // Ignore Power CPU in an NVLink path
+ if (path->type == PATH_NVL && type == PATH_SYS && link->remNode->type == CPU &&
+ link->remNode->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) type = 0;
+
remPath->type = std::max(path->type, type);
// Add to the list for the next iteration if not already in the list
@@ -117,9 +125,9 @@ static void printNodePaths(struct ncclTopoSystem* system, struct ncclTopoNode* n
sprintf(line+offset, "--%s->%s/%lX", topoLinkTypeStr[link->type], topoNodeTypeStr[remNode->type], remNode->id);
offset = strlen(line);
}
- INFO(NCCL_GRAPH, "%s (%d)", line, node->paths[t][n].width);
+ INFO(NCCL_GRAPH, "%s (%f)", line, node->paths[t][n].width);
#else
- sprintf(line+offset, "%s/%lX (%d/%d/%d) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, node->paths[t][n].type);
+ sprintf(line+offset, "%s/%lX (%d/%f/%s) ", topoNodeTypeStr[t], system->nodes[t].nodes[n].id, node->paths[t][n].count, node->paths[t][n].width, topoPathTypeStr[node->paths[t][n].type]);
offset = strlen(line);
#endif
}
@@ -171,7 +179,7 @@ static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int
// Update path characteristics
srcNode->paths[t2][i2].count = l;
- srcNode->paths[t2][i2].type = LINK_QPI;
+ srcNode->paths[t2][i2].type = std::max(srcNode->paths[CPU][c].type, cpuNode->paths[t2][i2].type);
srcNode->paths[t2][i2].width = std::min(srcNode->paths[CPU][c].width, cpuNode->paths[t2][i2].width);
return ncclSuccess;
}
@@ -194,6 +202,127 @@ static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType)
}
}
+static const int levelsOldToNew[] = { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS };
+ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelEnv) {
+ if (*level == -1) {
+ int l = -1;
+ if (disableEnv) {
+ char* str = getenv(disableEnv);
+ if (str) {
+ int disable = strtol(str, NULL, 0);
+ if (disable == 1) l = 0;
+ }
+ }
+ if (l == -1) {
+ char* str = getenv(levelEnv);
+ if (str) {
+ for (int i=0; i<PATH_NET; i++) {
+ if (strcmp(str, topoPathTypeStr[i]) == 0) {
+ l = i;
+ break;
+ }
+ }
+ // Old style numbering
+ if (l == -1 && str[0] >= '0' && str[0] <= '9') {
+ int oldLevel = strtol(str, NULL, 0);
+ const int maxOldLevel = sizeof(levelsOldToNew)/sizeof(int) - 1;
+ if (oldLevel > maxOldLevel) oldLevel = maxOldLevel;
+ l = levelsOldToNew[oldLevel];
+ }
+ }
+ }
+ if (l >= 0) INFO(NCCL_GRAPH, "%s set from environment to %s", levelEnv, topoPathTypeStr[l]);
+ *level = l >= 0 ? l : -2;
+ }
+ return ncclSuccess;
+}
+
+int ncclTopoUserP2pLevel = -1;
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p) {
+ *p2p = 0;
+
+ // Get GPUs from topology
+ int g1, g2;
+ NCCLCHECK(ncclTopoIdToIndex(system, GPU, id1, &g1));
+ struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1;
+ if (ncclTopoIdToIndex(system, GPU, id2, &g2) == ncclInternalError) {
+ // GPU not found, we can't use p2p.
+ return ncclSuccess;
+ }
+ struct ncclTopoLinkList* path = gpu1->paths[GPU]+g2;
+
+ // In general, use P2P whenever we can.
+ int p2pLevel = PATH_SYS;
+
+ // Don't use P2P through ARM CPUs
+ int arch, vendor, model;
+ NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
+ if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
+ if (arch == NCCL_TOPO_CPU_ARCH_X86 &&
+ vendor == NCCL_TOPO_CPU_VENDOR_INTEL &&
+ model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
+
+ // User override
+ NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL"));
+ if (ncclTopoUserP2pLevel != -2) p2pLevel = ncclTopoUserP2pLevel;
+
+ // Compute the PCI distance and compare with the p2pLevel.
+ if (path->type <= p2pLevel) *p2p = 1;
+
+ return ncclSuccess;
+}
+
+NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
+int ncclTopoUserGdrLevel = -1;
+
+ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int netDev, int read, int* useGdr) {
+ *useGdr = 0;
+
+ // Get GPU and NET
+ int n, g;
+ NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n));
+ struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+ NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+
+ // Check that both the NIC and GPUs support it
+ if (net->net.gdrSupport == 0) return ncclSuccess;
+ if (gpu->gpu.gdrSupport == 0) return ncclSuccess;
+
+ if (read) { // For reads (sends) only enable under certain conditions
+ int gdrReadParam = ncclParamNetGdrRead();
+ if (gdrReadParam == 0) return ncclSuccess;
+ if (gdrReadParam < 0) {
+ int nvlink = 0;
+ // Since we don't know whether there are other communicators,
+ // it's better to keep things local if we have a single GPU.
+ if (system->nodes[GPU].count == 1) nvlink = 1;
+ for (int i=0; i<system->nodes[GPU].count; i++) {
+ if (i == g) continue;
+ if (gpu->paths[GPU][i].type == PATH_NVL) {
+ nvlink = 1;
+ break;
+ }
+ }
+ if (!nvlink) return ncclSuccess;
+ }
+ }
+
+ // Check if we are close enough that it makes sense to enable GDR
+ int netGdrLevel = PATH_PXB;
+ NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL"));
+ if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel;
+ int distance = gpu->paths[NET][n].type;
+ if (distance > netGdrLevel) {
+ INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
+ return ncclSuccess;
+ }
+
+ *useGdr = 1;
+ INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d <= %d), read %d", busId, netDev, distance, netGdrLevel, read);
+ return ncclSuccess;
+}
+
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* peerInfos) {
// Precompute paths between GPUs/NICs.
@@ -210,26 +339,29 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
// Compute paths to GPU g
NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system));
- if (peerInfos == NULL) continue;
- // Update paths from GPUs p to GPU g when we can't or don't want to use P2P or even SHM
- struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].rank;
+ // Update path when we don't want to / can't use GPU Direct P2P
for (int p=0; p<system->nodes[GPU].count; p++) {
- if (p == g) continue;
- struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].rank;
int p2p;
- NCCLCHECK(ncclTransports[TRANSPORT_P2P].canConnect(&p2p, system, NULL, srcInfo, dstInfo));
+ NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p));
if (p2p == 0) {
- int shm;
- NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
- if (shm == 1) {
- // We cannot use GPU Direct, so we need all traffic to go through a CPU
- int cpu;
- NCCLCHECK(getLocalCpu(system, g, &cpu));
- NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
- } else {
- // We cannot communicate with that peer.
- system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
- }
+ // Divert all traffic through the CPU
+ int cpu;
+ NCCLCHECK(getLocalCpu(system, g, &cpu));
+ NCCLCHECK(addCpuStep(system, cpu, GPU, p, GPU, g));
+ }
+ }
+
+ if (peerInfos == NULL) continue;
+ // Remove GPUs we can't talk to because of containers.
+ struct ncclPeerInfo* dstInfo = peerInfos+system->nodes[GPU].nodes[g].gpu.rank;
+ for (int p=0; p<system->nodes[GPU].count; p++) {
+ if (p == g) continue;
+ struct ncclPeerInfo* srcInfo = peerInfos+system->nodes[GPU].nodes[p].gpu.rank;
+ int shm;
+ NCCLCHECK(ncclTransports[TRANSPORT_SHM].canConnect(&shm, system, NULL, srcInfo, dstInfo));
+ if (shm == 0) {
+ // Mark this peer as inaccessible. We'll trim it later.
+ system->nodes[GPU].nodes[p].paths[GPU][g].count = 0;
}
}
}
@@ -239,11 +371,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;
NCCLCHECK(ncclTopoSetPaths(netNode, system));
- if (peerInfos == NULL) continue;
for (int g=0; g<system->nodes[GPU].count; g++) {
- if ((peerInfos[system->nodes[GPU].nodes[g].rank].gdrSupport & (1 << n)) == 0) {
- // We cannot use GPU Direct RDMA, so we need all NIC<->GPU paths
- // to go through a CPU
+ // Update path when we don't want to / can't use GPU Direct RDMA.
+ int gdr;
+ NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
+ if (gdr == 0) {
+ // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
NCCLCHECK(getLocalCpu(system, g, &localCpu));
NCCLCHECK(addCpuStep(system, localCpu, NET, n, GPU, g));
@@ -251,7 +384,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
}
}
}
-
return ncclSuccess;
}
@@ -270,7 +402,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
domains[g] = std::min(domains[g], domains[p]);
}
}
- if (gpu->rank == comm->rank) myDomain = domains[g];
+ if (gpu->gpu.rank == comm->rank) myDomain = domains[g];
}
int ngpus = system->nodes[GPU].count;
@@ -288,98 +420,19 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
free(ids);
return ncclInternalError;
}
-
- // Remove GPUs I can't access (even indirectly) from my view of the node
- for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
- for (int n=0; n<system->nodes[t].count; n++) {
- struct ncclTopoNode* node = system->nodes[t].nodes+n;
- if (node == gpu) continue;
- for (int l=0; l<node->nlinks; l++) {
- while (l<node->nlinks && node->links[l].remNode == gpu) {
- if (l<node->nlinks-1)
- memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
- node->nlinks--;
- }
- if (l<node->nlinks && node->links[l].remNode->type == GPU && node->links[l].remNode >= gpu) {
- node->links[l].remNode--;
- }
- }
- }
- }
- if (g != system->nodes[GPU].count-1)
- memmove(gpu, gpu+1, (system->nodes[GPU].count-g-1)*sizeof(struct ncclTopoNode));
- system->nodes[GPU].count--;
+ NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
}
comm->localRanks = system->nodes[GPU].count;
if (system->nodes[GPU].count == comm->nRanks) {
- // Trim network
- ncclTopoRemovePathType(system, NET);
- system->nodes[NET].count = 0;
- for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
- for (int n=0; n<system->nodes[t].count; n++) {
- struct ncclTopoNode* node = system->nodes[t].nodes+n;
- for (int l=0; l<node->nlinks; l++) {
- struct ncclTopoLink* link = &(node->links[l]);
- if (link->remNode->type == NET) {
- // Remove the link
- for (int i=l; i<(node->nlinks-1); i++) {
- memcpy(&(node->links[i]), &(node->links[i+1]), sizeof(ncclTopoLink));
- }
- node->nlinks--;
- l--; // revisit the same value of "l" for the next iteration, since we edited the list in the middle of the loop
- }
- }
- }
- }
+ for (int n=system->nodes[NET].count-1; n>=0; n--)
+ NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
}
free(domains);
free(ids);
return ncclSuccess;
}
-static ncclResult_t getGpuSpeed(struct ncclTopoNode* node, int* speed) {
- int nvlSpeed = 0;
- int nvlPeers = 0;
- int pciSpeed = 0;
- for (int l=0; l<node->nlinks; l++) {
- if (node->links[l].type == LINK_NVL) nvlSpeed += node->links[l].width;
- if (node->links[l].remNode->type == GPU) nvlPeers++; else nvlPeers = 2;
- if (node->links[l].type == LINK_PCI) pciSpeed = node->links[l].width;
- }
- *speed = std::min(*speed, std::max(nvlSpeed, pciSpeed));
- return ncclSuccess;
-}
-
-ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system) {
- // Compute max speed to try to accelerate the search.
- system->maxSpeed = LOC_WIDTH;
-
- for (int g=0; g<system->nodes[GPU].count; g++) {
- NCCLCHECK(getGpuSpeed(system->nodes[GPU].nodes+g, &system->maxSpeed));
- }
- if (system->nodes[NET].count) {
- // Try to assign one NIC per GPU
- int netMaxSpeed = 0;
- int netMaxSpeedCount = 0;
- for (int n=0; n<system->nodes[NET].count; n++) {
- int maxSpeed = 0;
- struct ncclTopoNode* net = system->nodes[NET].nodes+n;
- for (int g=0; g<system->nodes[GPU].count; g++) {
- maxSpeed = std::max(maxSpeed, net->paths[GPU][g].width);
- }
- if (maxSpeed > netMaxSpeed) {
- netMaxSpeed = maxSpeed;
- netMaxSpeedCount = 1;
- } else if (maxSpeed == netMaxSpeed) {
- netMaxSpeedCount++;
- }
- }
- system->maxSpeed = std::min(system->maxSpeed, netMaxSpeedCount*NET_WIDTH);
- }
- return ncclSuccess;
-}
-
void ncclTopoFree(struct ncclTopoSystem* system) {
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
free(system);
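
The new ncclGetLevel() above accepts either the named path types or the legacy numeric levels, clamping old numbers and mapping them through levelsOldToNew. A small standalone sketch of that parsing; the string table mirrors the patch's topoPathTypeStr usage but is assumed here:

// Sketch of ncclGetLevel(): try the new named levels first (LOC, PIX, PXB,
// PHB, SYS), then fall back to the old 0..5 numeric scheme via clamp + map.
#include <cstdio>
#include <cstdlib>
#include <cstring>

enum { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_NET };
static const char* pathStr[] = { "LOC", "PIX", "PXB", "PHB", "SYS", "NET" };
static const int levelsOldToNew[] = { PATH_LOC, PATH_PIX, PATH_PXB, PATH_PHB, PATH_SYS, PATH_SYS };

static int parseLevel(const char* str) {
  for (int i = 0; i < PATH_NET; i++)
    if (strcmp(str, pathStr[i]) == 0) return i;
  if (str[0] >= '0' && str[0] <= '9') {            // old style numbering
    int old = (int)strtol(str, NULL, 0);
    const int maxOld = (int)(sizeof(levelsOldToNew)/sizeof(int)) - 1;
    if (old > maxOld) old = maxOld;
    return levelsOldToNew[old];
  }
  return -1;                                        // unrecognized
}

int main() {
  printf("PXB -> %d\n", parseLevel("PXB"));  // 2
  printf("4   -> %d\n", parseLevel("4"));    // maps to PATH_SYS (4)
  printf("9   -> %d\n", parseLevel("9"));    // clamped, still PATH_SYS (4)
  return 0;
}
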
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 3a8b4e7..b4c3e35 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -7,29 +7,121 @@
#include "core.h"
#include "graph.h"
#include "topo.h"
+#include "xml.h"
+#include <math.h>
+
+// Initialize system->maxWidth. This is the per-channel (i.e. per-SM)
+// max speed.
+static float getMaxWidth(struct ncclTopoSystem* system, struct ncclTopoNode* gpu, int type) {
+ float nvLinkWidth = gpu->gpu.cudaCompCap > 60 ? VOLTA_NVLINK_WIDTH : PASCAL_NVLINK_WIDTH;
+ float maxWidth = 0.0;
+ for (int i=0; i<system->nodes[type].count; i++) {
+ struct ncclTopoLinkList* path = gpu->paths[type]+i;
+ float width = path->width;
+ if (path->count == 0) continue;
+ if (path->type == PATH_NVL) width = std::min(nvLinkWidth, width);
+ maxWidth = std::max(maxWidth, width);
+ }
+ return maxWidth;
+}
+ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
+ system->maxWidth = 0.0;
+ int inter = system->nodes[NET].count;
+ if (inter == 0 && system->nodes[GPU].count == 1) {
+ system->maxWidth = LOC_WIDTH;
+ return ncclSuccess;
+ }
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
+ system->maxWidth = std::max(system->maxWidth, getMaxWidth(system, gpu, inter ? NET : GPU));
+ }
+ return ncclSuccess;
+}
-static ncclResult_t ncclTopoFollowPath(struct ncclTopoGraph* graph, struct ncclTopoLinkList* path, struct ncclTopoNode** node, int width, int typeSave) {
- if (path->count == 0) return ncclSuccess;
+static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* node2, struct ncclTopoLink** revLink) {
+ for (int l=0; l<node2->nlinks; l++) {
+ struct ncclTopoLink* link = node2->links+l;
+ if (link->remNode == node1) {
+ *revLink = link;
+ return ncclSuccess;
+ }
+ }
+ WARN("Could not find rev link for %d/%d -> %d/%d\n", node1->type, node1->id, node2->type, node2->id);
+ return ncclInternalError;
+}
- *node = NULL;
- if (width > 0) {
- if (path->type > graph->type) return ncclSuccess;
- graph->type = std::max(graph->type, path->type);
- graph->nHops += path->count;
- } else {
- graph->type = typeSave;
- graph->nHops -= path->count;
+// This is unfortunately needed since manipulating floats often results in rounding errors.
+#define SUB_ROUND(a, b) (a = roundf((a-b)*1000)/1000)
+
+static ncclResult_t followPath(struct ncclTopoLinkList* path, struct ncclTopoNode* start, int maxSteps, float speed, int* steps) {
+ float pciSpeed = speed;
+ for (int step=0; step<path->count; step++) {
+ struct ncclTopoNode* node = path->list[step]->remNode;
+ if (node->type == CPU) {
+ // Account for P2P inefficiency through Intel CPU RC
+ if (path->type == PATH_PHB && start->type == GPU &&
+ node->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 &&
+ node->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+ pciSpeed = INTEL_P2P_OVERHEAD(speed);
+ }
+ }
}
- for (int i=0; i<path->count; i++) {
- if (path->list[i]->width < width) {
- // Can't follow this path, rewind and exit
- for (int j=0; j<i; j++) path->list[j]->width += width;
- return ncclSuccess;
+ struct ncclTopoNode* node = start;
+ for (int step=0; step<maxSteps; step++) {
+ struct ncclTopoLink* link = path->list[step];
+ struct ncclTopoLink* revLink = NULL;
+ float fwSpeed = link->type == LINK_PCI ? pciSpeed : speed;
+ float revSpeed = 0;
+ if (link->remNode->type == GPU && start->type != GPU) {
+ if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
+ revSpeed += fwSpeed/8;
+ }
+ if (link->remNode->type == CPU && link->type == LINK_NVL) {
+ if (revLink == NULL) NCCLCHECK(findRevLink(node, link->remNode, &revLink));
+ revSpeed += fwSpeed;
}
- path->list[i]->width -= width;
+ if (link->width < fwSpeed || (revSpeed && revLink->width < revSpeed)) { *steps = step; return ncclSuccess; }
+ SUB_ROUND(link->width, fwSpeed);
+ if (revSpeed) SUB_ROUND(revLink->width, revSpeed);
+ node = link->remNode;
}
- *node = path->list[path->count-1]->remNode;
+ *steps = maxSteps;
+ return ncclSuccess;
+}
+
+// Try to go from node type1/index1 to node type2/index2. mult indicates whether we are counting the bandwidth (1) or undoing (-1).
+static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int type1, int index1, int type2, int index2, int mult, struct ncclTopoNode** node) {
+ // First handle easy cases
+ *node = system->nodes[type2].nodes+index2;
+ if (type1 == -1) return ncclSuccess;
+ struct ncclTopoNode* node1 = system->nodes[type1].nodes+index1;
+ struct ncclTopoLinkList* path = node1->paths[type2]+index2;
+ if (path->count == 0 ) return ncclSuccess;
+
+ // Now check link type
+ *node = NULL;
+ int intra = type1 == GPU && type2 == GPU;
+ float speed = intra ? graph->speedIntra : graph->speedInter;
+ int type = intra ? graph->typeIntra : graph->typeInter;
+
+ if (mult == 1 && (path->type > type)) return ncclSuccess;
+
+ speed *= mult;
+
+ // Check there is enough bandwidth on paths.
+ int step = 0;
+ NCCLCHECK(followPath(path, node1, path->count, speed, &step));
+ if (step < path->count) goto rewind;
+
+ // Enough bandwidth : return destination node.
+ graph->nHops += mult*path->count;
+ *node = system->nodes[type2].nodes+index2;
+ return ncclSuccess;
+
+rewind:
+ // Not enough bandwidth : rewind and exit.
+ NCCLCHECK(followPath(path, node1, step, -speed, &step));
return ncclSuccess;
}
@@ -80,22 +172,42 @@ static int cmpIntraScores(struct ncclGpuScore* scores, int count) {
return 0;
}
-static ncclResult_t getNetPaths(struct ncclTopoSystem* system, const uint64_t flag, struct ncclTopoLinkList** netPaths) {
+static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* index) {
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
+ *index = g;
+ return ncclSuccess;
+ }
+ }
+ WARN("Could not find gpu rank %d\n", rank);
+ return ncclInternalError;
+}
+
+static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int* index) {
for (int n=0; n<system->nodes[NET].count; n++) {
- if (system->nodes[NET].nodes[n].used & flag) {
- *netPaths=system->nodes[NET].nodes[n].paths[GPU];
+ if (system->nodes[NET].nodes[n].id == id) {
+ *index = n;
return ncclSuccess;
}
}
+ WARN("Could not find net id %lx\n", id);
return ncclInternalError;
}
+static ncclResult_t getNetPaths(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoLinkList** netPaths) {
+ int netId = graph->inter[graph->nChannels*2];
+ int n;
+ NCCLCHECK(getNetIndex(system, netId, &n));
+ *netPaths=system->nodes[NET].nodes[n].paths[GPU];
+ return ncclSuccess;
+}
+
ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* gpu, int* next, int* countPtr, int sortNet) {
const uint64_t flag = 1ULL<<(graph->nChannels);
int ngpus = system->nodes[GPU].count;
struct ncclTopoLinkList* paths = gpu->paths[GPU];
struct ncclTopoLinkList* netPaths = NULL;
- if (sortNet) NCCLCHECK(getNetPaths(system, flag, &netPaths));
+ if (sortNet) NCCLCHECK(getNetPaths(system, graph, &netPaths));
struct ncclGpuScore scores[NCCL_TOPO_MAX_NODES];
memset(scores, 0, ngpus*sizeof(struct ncclGpuScore));
@@ -130,9 +242,13 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
return ncclSuccess;
}
-ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time);
+ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time);
-#define NCCL_SEARCH_TIMEOUT (1ULL<<20) // This should get contain all search within a second or so.
+// Try to keep all searches within about one second
+#define NCCL_SEARCH_GLOBAL_TIMEOUT (3ULL<<19)
+#define NCCL_SEARCH_TIMEOUT (1<<18)
+#define NCCL_SEARCH_TIMEOUT_TREE (1<<17)
+#define NCCL_SEARCH_TIMEOUT_SAMECHANNELS (1<<10)
#define FORCED_ORDER_PCI 1
#define FORCED_ORDER_REPLAY 2
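// Illustrative sketch, not part of the patch: how the timeout counters bound
// the search. Each recursive step decrements a shared budget; hitting 0 makes
// every branch return early, and finding an optimal solution sets the budget
// to -1 so pending branches unwind immediately. Hypothetical names:
static void searchSketch(int depth, bool optimalFound, int* time) {
  if (*time <= 0) return;   // Budget exhausted (0) or optimum already found (-1)
  (*time)--;
  if (optimalFound) { *time = -1; return; }
  if (depth > 0) for (int b = 0; b < 2; b++) searchSketch(depth-1, false, time);
}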
@@ -142,7 +258,7 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo
if (graph->nChannels == 0) return ncclInternalError;
int ngpus = system->nodes[GPU].count;
int nextRank = graph->intra[(graph->nChannels-1)*ngpus+step+1];
- for (int i=0; i<ngpus; i++) if (system->nodes[GPU].nodes[i].rank == nextRank) {
+ for (int i=0; i<ngpus; i++) if (system->nodes[GPU].nodes[i].gpu.rank == nextRank) {
*g = i;
return ncclSuccess;
}
@@ -150,44 +266,37 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo
return ncclSuccess;
}
-ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time);
+ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time);
-ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoLinkList* paths, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time, int g, int speed) {
- int typeSave = graph->type;
+ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time, int type, int index, int g) {
const uint64_t flag = 1ULL<<(graph->nChannels);
- struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
- if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, speed, typeSave));
+ struct ncclTopoNode* gpu;
+ NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, 1, &gpu));
if (gpu) {
gpu->used ^= flag;
- NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, maxSpeed, time));
+ NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, backToNet, backToFirstRank, forcedOrder, time));
gpu->used ^= flag;
- if (paths) NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &gpu, -speed, typeSave));
+ NCCLCHECK(ncclTopoFollowPath(system, graph, type, index, GPU, g, -1, &gpu));
}
return ncclSuccess;
}
ncclResult_t ncclTopoCompareGraphs(struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
- // 0. When we are trying to increase speedIntra, do not copy if the solution has less channels
- // since it would likely impact the rings algorithms too.
- if (graph->speedIntra > graph->speedInter && graph->nChannels < refGraph->nChannels) return ncclSuccess;
+ // 1. Constrain the search so Rings and Trees end up with the same nChannels
+ if (graph->nChannels < graph->minChannels) return ncclSuccess;
- // 1. Try to get better bandwidth
+ // 2. Try to get better bandwidth
if (graph->nChannels*graph->speedIntra < refGraph->nChannels*refGraph->speedIntra) return ncclSuccess;
if (graph->nChannels*graph->speedIntra > refGraph->nChannels*refGraph->speedIntra) {
*copy = 1;
return ncclSuccess;
}
- // 2. Give an advantage when all channels are the same
- if (graph->nChannels > 1 && graph->sameChannels && refGraph->sameChannels == 0) {
- *copy = 1;
- return ncclSuccess;
- }
- // 3. Less hops
- if (graph->nHops < refGraph->nHops) *copy = 1;
+ // 3. Less hops (but not at the price of going cross NICs)
+ if (graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;
return ncclSuccess;
}
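// Worked example for the comparison above (made-up numbers): a reference graph
// with 2 channels at 10 GB/s scores 2*10 = 20; a candidate with 3 channels at
// 7 GB/s scores 3*7 = 21 and gets copied even though each channel is slower.
// Hop count only breaks ties when both graphs agree on crossNic, so a shorter
// route never wins at the price of crossing NICs.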
-ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int maxSpeed, int *time) {
+ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
if ((*time) <= 0) return ncclSuccess;
(*time)--;
@@ -195,51 +304,39 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
if (step == ngpus) {
// Determine whether we found a better solution or not
int copy = 0;
- int sameChannels = graph->sameChannels;
- if (graph->nChannels > 0) {
- int* intra = graph->intra+graph->nChannels*ngpus;
- for (int g=0; g<ngpus; g++) if (intra[g] != intra[g-ngpus]) graph->sameChannels = 0;
- }
graph->nChannels++;
NCCLCHECK(ncclTopoCompareGraphs(graph, saveGraph, &copy));
if (copy) {
memcpy(saveGraph, graph, sizeof(struct ncclTopoGraph));
- if (graph->nChannels*graph->speedIntra == maxSpeed) *time = -1;
+ if (graph->nChannels == graph->maxChannels) *time = -1;
}
- if (graph->nChannels < MAXCHANNELS/2) {
- NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, maxSpeed, time));
+ if (graph->nChannels < graph->maxChannels) {
+ NCCLCHECK(ncclTopoSearchRec(system, graph, saveGraph, time));
}
graph->nChannels--;
- graph->sameChannels = sameChannels;
return ncclSuccess;
}
- graph->intra[graph->nChannels*ngpus+step] = gpu->rank;
+ graph->intra[graph->nChannels*ngpus+step] = gpu->gpu.rank;
+ int g = gpu - system->nodes[GPU].nodes;
if (step == backToNet) {
// first get back to NIC
if (system->nodes[NET].count) {
- int maxWidth = 0;
- struct ncclTopoLinkList* paths = gpu->paths[NET];
- for (int n=0; n<system->nodes[NET].count; n++) {
- if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
- maxWidth = std::max(paths[n].width, maxWidth);
- }
+ int startNetIndex;
+ NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
+ struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
for (int n=0; n<system->nodes[NET].count; n++) {
- if (graph->crossNic != 1 && (system->nodes[NET].nodes[n].id != graph->inter[graph->nChannels*2])) continue;
- if (paths[n].width == maxWidth) {
- struct ncclTopoNode* net = system->nodes[NET].nodes+n;
- int typeSave = graph->type;
- NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, graph->speedInter, typeSave));
- if (net) {
- graph->inter[graph->nChannels*2+1] = net->id;
- NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, maxSpeed, time));
- NCCLCHECK(ncclTopoFollowPath(graph, paths+n, &net, -graph->speedInter, typeSave));
- }
+ struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+ if (graph->crossNic != 1 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
+ NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
+ if (net) {
+ graph->inter[graph->nChannels*2+1] = net->id;
+ NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, -1, backToFirstRank, forcedOrder, time));
+ NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net));
}
}
}
} else if (step < system->nodes[GPU].count-1) {
// Go to next GPU
- struct ncclTopoLinkList* paths = gpu->paths[GPU];
int next[NCCL_TOPO_MAX_NODES];
int count;
if (forcedOrder == FORCED_ORDER_PCI) { // Try the PCI order
@@ -252,64 +349,59 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
NCCLCHECK(ncclTopoSearchNextGpuSort(system, graph, gpu, next, &count, backToNet == -1 ? 0 : backToNet == step+1 ? 1 : -1 ));
}
for (int i=0; i<count; i++) {
- int g = next[i];
- int nvlink = graph->nvlink;
- graph->nvlink &= paths[g].type <= LINK_NVL ? 1 : 0;
- int speed = graph->speedIntra;
- if (paths[g].type == LINK_QPI) speed = INTEL_P2P_OVERHEAD(speed);
- NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, step+1, backToNet, backToFirstRank, forcedOrder, maxSpeed, time, g, speed));
- graph->nvlink = nvlink;
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, step+1, backToNet, backToFirstRank, forcedOrder, time, GPU, g, next[i]));
}
} else if (step == backToFirstRank) {
// Find first GPU and loop back to it
- int g;
- int rank = graph->intra[graph->nChannels*ngpus];
- for (g=0; g<ngpus; g++) {
- if (system->nodes[GPU].nodes[g].rank == rank) break;
- }
- if (g == ngpus) {
- WARN("Could not find GPU with rank %d\n", rank);
- return ncclInternalError;
- }
- struct ncclTopoLinkList* paths = gpu->paths[GPU];
- struct ncclTopoNode* firstGpu = system->nodes[GPU].nodes+g;
- int typeSave = graph->type;
- NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, graph->speedIntra, typeSave));
+ int p;
+ NCCLCHECK(getGpuIndex(system, graph->intra[graph->nChannels*ngpus], &p));
+ struct ncclTopoNode* firstGpu;
+ NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, 1, &firstGpu));
if (firstGpu) {
- NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, maxSpeed, time));
- NCCLCHECK(ncclTopoFollowPath(graph, paths+g, &firstGpu, -graph->speedIntra, typeSave));
+ NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, firstGpu, step+1, backToNet, -1, forcedOrder, time));
+ NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, GPU, p, -1, &firstGpu));
}
} else {
// Next path
- NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, maxSpeed, time));
+ NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time));
}
return ncclSuccess;
}
-ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int maxSpeed, int* time) {
- const uint64_t flag = 1ULL<<(graph->nChannels);
+ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
const int speed = graph->speedInter;
for (int n=0; n<system->nodes[NET].count; n++) {
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
struct ncclTopoNode* gpu;
- if (net->used == 0) {
- graph->inter[graph->nChannels*2] = net->id;
- for (int i=0; i<system->nodes[NET].count; i++) {
- if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
+ if (graph->collNet && net->net.collSupport == 0) continue;
+ if (net->net.width < speed) continue;
+ if (net->net.maxChannels == 0) continue;
+
+ graph->inter[graph->nChannels*2] = net->id;
+ for (int i=0; i<system->nodes[NET].count; i++) {
+ if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
+ (system->nodes[NET].nodes[i].net.port == net->net.port)) {
+ system->nodes[NET].nodes[i].net.width -= speed;
}
- struct ncclTopoLinkList* paths = net->paths[GPU];
+ }
+ net->net.maxChannels--;
- // First try the PCI order to set a reference
- NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, speed));
- // Then try to replay the last channel
- if (graph->nChannels > 0) {
- int g;
- NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
- NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, g, speed));
+ // First try to replay the last channel
+ if (graph->nChannels > 0) {
+ int g;
+ NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g));
+ }
+ if (graph->nChannels == 0 || graph->sameChannels == 0) {
+ if (graph->nChannels == 0) {
+ // Always try the PCI order first to set a reference
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, NET, n, 0));
}
// Then try the most local GPUs
- int maxWidth = 0, minHops = 0xfffffff;
+ float maxWidth = 0;
+ int minHops = 0xfffffff;
+ struct ncclTopoLinkList* paths = net->paths[GPU];
for (int g=0; g<system->nodes[GPU].count; g++) {
if (paths[g].width > maxWidth) {
maxWidth = paths[g].width;
@@ -328,14 +420,19 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
gpu = system->nodes[GPU].nodes+g;
int gpuUsed = gpuPciWidth(gpu) > 0 ? 0 : 1;
if (tryGpuBidir == gpuUsed) {
- NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, paths, 0, backToNet, backToFirstRank, 0, maxSpeed, time, g, speed));
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g));
}
}
}
}
}
- for (int i=0; i<system->nodes[NET].count; i++) {
- if (system->nodes[NET].nodes[i].rank == net->rank) system->nodes[NET].nodes[i].used ^= flag;
+ }
+
+ net->net.maxChannels++;
+ for (int i=0; i<system->nodes[NET].count; i++) {
+ if ((system->nodes[NET].nodes[i].net.asic == net->net.asic) &&
+ (system->nodes[NET].nodes[i].net.port == net->net.port)) {
+ system->nodes[NET].nodes[i].net.width += speed;
}
}
}
@@ -374,126 +471,201 @@ ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, in
return ncclSuccess;
}
-ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int maxSpeed, int* time) {
+ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time) {
int backToNet, backToFirstRank;
NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank));
if (system->nodes[NET].count) {
// Start from NET
- ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, maxSpeed, time);
+ ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
} else {
- // Start from GPU 0
- NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, maxSpeed, time, 0, graph->speedIntra));
- if (graph->nChannels > 0) NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, maxSpeed, time, 0, graph->speedIntra));
- NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, NULL, 0, backToNet, backToFirstRank, 0, maxSpeed, time, 0, graph->speedIntra));
+ // Intra-node only.
+ if (graph->nChannels == 0) {
+ // Try PCI order first
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
+ } else {
+ // Also try to replay the previous channel
+ int g;
+ NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g));
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, -1, -1, g));
+ }
+ if (graph->sameChannels == 0 || graph->nChannels == 0) {
+ // Finally, try all other possibilities unless we are forced to use the same channels
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
+ }
+ }
}
return ncclSuccess;
}
-/* Parse user defined rings. Format is like :
- * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
- * Rings with a non-matching number of ranks are ignored so we can provide
- * rings for multiple cases.
- */
-#define MAX_ENV_RANKS 512
-static ncclResult_t parseGraph(const char* str, int* nChannelsRet, int ngpus, int* channels) {
- int ranks[MAX_ENV_RANKS];
- int nChannels = 0;
- int rank = 0;
- int offset = 0;
- int status = 0; // 0 : between numbers, 1 : inside number
- do {
- int digit = str[offset] - '0';
- if (digit >= 0 && digit <= 9) {
- if (status == 0) {
- ranks[rank] = digit;
- status = 1;
- } else {
- ranks[rank] = ranks[rank]*10+digit;
- }
- } else {
- if (status == 1) {
- rank++;
- if (rank == MAX_ENV_RANKS) goto end;
+/************************************/
+/* User defined graph from XML file */
+/************************************/
+
+struct kvDict kvDictLinkType[] = { { "SYS", PATH_SYS }, { "PHB", PATH_PHB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "NVL", PATH_NVL }, { "LOC", PATH_LOC }, { NULL, 0 } };
+ncclResult_t ncclTopoGetChannelFromXml(struct ncclXmlNode *xmlChannel, int c, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+ int ngpus = system->nodes[GPU].count;
+ int* inter = graph->inter+2*c;
+ int* intra = graph->intra+ngpus*c;
+ int n=0, g=0;
+ for (int s=0; s<xmlChannel->nSubs; s++) {
+ struct ncclXmlNode* sub = xmlChannel->subs[s];
+ int dev;
+ NCCLCHECK(xmlGetAttrInt(sub, "dev", &dev));
+ if (strcmp(sub->name, "net") == 0) {
+ inter[n++] = dev;
+ } else if (strcmp(sub->name, "gpu") == 0) {
+ int rank = -1;
+ for (int g=0; g<ngpus; g++) {
+ if (system->nodes[GPU].nodes[g].gpu.dev == dev) rank = system->nodes[GPU].nodes[g].gpu.rank;
}
- status = 0;
- if (str[offset] == '|' || str[offset] == '\0') {
- // Ignore if ngpus doesn't match
- if (rank != ngpus) goto newchannel;
-
- for (int r=0; r<ngpus; r++) {
- int rank = ranks[r];
- // Ignore if ranks are out of bounds
- if (rank < 0 || rank >= ngpus) goto newchannel;
- // Ignore if ranks are duplicate
- for (int i=0; i<r; i++)
- if (ranks[i] == rank) goto newchannel;
-
- channels[nChannels*ngpus+r] = rank;
- }
- nChannels++;
-newchannel:
- rank = 0;
+ if (rank == -1) {
+ WARN("XML Import Channel : dev %d not found.", dev);
+ return ncclSystemError;
}
+ intra[g++] = rank;
+ }
+ }
+ return ncclSuccess;
+}
+ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+ int id;
+ NCCLCHECK(xmlGetAttrInt(xmlGraph, "id", &id));
+ if (graph->id != id) return ncclSuccess;
+
+ int crossNic;
+ NCCLCHECK(xmlGetAttrInt(xmlGraph, "crossnic", &crossNic));
+ if (graph->crossNic == 0 && crossNic == 1) return ncclSuccess;
+ graph->crossNic = crossNic;
+
+ NCCLCHECK(xmlGetAttrInt(xmlGraph, "pattern", &graph->pattern));
+ NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels));
+ NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->speedIntra));
+ NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->speedInter));
+ const char* str;
+ NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str));
+ NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType));
+ NCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str));
+ NCCLCHECK(kvConvertToInt(str, &graph->typeInter, kvDictLinkType));
+ NCCLCHECK(xmlGetAttrInt(xmlGraph, "samechannels", &graph->sameChannels));
+ for (int s=0; s<xmlGraph->nSubs; s++) {
+ NCCLCHECK(ncclTopoGetChannelFromXml(xmlGraph->subs[s], s, system, graph));
+ }
+ return ncclSuccess;
+}
+ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
+ for (int s=0; s<xmlGraphs->nSubs; s++) {
+ NCCLCHECK(ncclTopoGetGraphFromXmlSub(xmlGraphs->subs[s], system, graph));
+ }
+ return ncclSuccess;
+}
+
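// For reference, a hypothetical NCCL_GRAPH_FILE that the importer above would
// accept; the structure is inferred from the parsing code and all attribute
// values are made up:
//
//   <graphs version="1">
//     <graph id="0" pattern="4" crossnic="0" nchannels="1"
//            speedintra="24.0" speedinter="12.0"
//            typeintra="NVL" typeinter="PHB" samechannels="1">
//       <channel>
//         <net dev="0"/>
//         <gpu dev="0"/> <gpu dev="1"/> <gpu dev="2"/> <gpu dev="3"/>
//         <net dev="1"/>
//       </channel>
//     </graph>
//   </graphs>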
+/* And the reverse : graph->xml */
+ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) {
+ struct ncclXmlNode* xmlChannel;
+ int ngpus = system->nodes[GPU].count;
+ int* inter = graph->inter+2*c;
+ int* intra = graph->intra+ngpus*c;
+ NCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel));
+ struct ncclXmlNode* node;
+ if (system->nodes[NET].count) {
+ NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
+ NCCLCHECK(xmlSetAttrInt(node, "dev", inter[0]));
+ }
+ for (int g=0; g<ngpus; g++) {
+ NCCLCHECK(xmlAddNode(xml, xmlChannel, "gpu", &node));
+ int dev = -1;
+ for (int i=0; i<ngpus; i++) {
+ if (system->nodes[GPU].nodes[i].gpu.rank == intra[g]) dev = system->nodes[GPU].nodes[i].gpu.dev;
+ }
+ if (dev == -1) {
+ WARN("XML Export Channel : rank %d not found.", intra[g]);
+ return ncclInternalError;
}
- } while (str[offset++] != 0);
-end:
- *nChannelsRet = nChannels;
+ NCCLCHECK(xmlSetAttrInt(node, "dev", dev));
+ }
+ if (system->nodes[NET].count) {
+ NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
+ NCCLCHECK(xmlSetAttrInt(node, "dev", inter[1]));
+ }
return ncclSuccess;
}
+ncclResult_t ncclTopoGetXmlFromGraph(struct ncclTopoGraph* graph, struct ncclTopoSystem* system, struct ncclXml *xml, struct ncclXmlNode* parent) {
+ struct ncclXmlNode* xmlGraph;
+ NCCLCHECK(xmlAddNode(xml, parent, "graph", &xmlGraph));
+ NCCLCHECK(xmlSetAttrInt(xmlGraph, "id", graph->id));
+ NCCLCHECK(xmlSetAttrInt(xmlGraph, "pattern", graph->pattern));
+ NCCLCHECK(xmlSetAttrInt(xmlGraph, "crossnic", graph->crossNic));
+ NCCLCHECK(xmlSetAttrInt(xmlGraph, "nchannels", graph->nChannels));
+ NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedintra", graph->speedIntra));
+ NCCLCHECK(xmlSetAttrFloat(xmlGraph, "speedinter", graph->speedInter));
+ const char* str;
+ NCCLCHECK(kvConvertToStr(graph->typeIntra, &str, kvDictLinkType));
+ NCCLCHECK(xmlSetAttr(xmlGraph, "typeintra", str));
+ NCCLCHECK(kvConvertToStr(graph->typeInter, &str, kvDictLinkType));
+ NCCLCHECK(xmlSetAttr(xmlGraph, "typeinter", str));
+ NCCLCHECK(xmlSetAttrInt(xmlGraph, "samechannels", graph->sameChannels));
+ for (int c=0; c<graph->nChannels; c++) {
+ NCCLCHECK(ncclTopoGetXmlFromChannel(graph, c, system, xml, xmlGraph));
+ }
+ return ncclSuccess;
+}
+ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml) {
+ xml->maxIndex = 0;
+ struct ncclXmlNode* xmlGraphs;
+ NCCLCHECK(xmlAddNode(xml, NULL, "graphs", &xmlGraphs));
+ NCCLCHECK(xmlSetAttrInt(xmlGraphs, "version", NCCL_GRAPH_XML_VERSION));
+ for (int g=0; g<ngraphs; g++) {
+ NCCLCHECK(ncclTopoGetXmlFromGraph(graphs[g], system, xml, xmlGraphs));
+ }
+ return ncclSuccess;
+}
+
+float speedArray[] = { 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
+#define NSPEEDS (sizeof(speedArray)/sizeof(float))
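// Illustrative sketch, not part of the patch: speedArray is scanned for the
// fastest entry that does not exceed the detected system width, then stepped
// down one entry at a time until the search finds a solution.
static float initialSpeed(float maxWidth) {
  int i = 0;
  while (speedArray[i] > maxWidth && i < NSPEEDS-1) i++;
  return speedArray[i];  // e.g. maxWidth 12.0 -> 12.0, maxWidth 11.0 -> 10.0
}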
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
graph->speedIntra = graph->speedInter = 0;
if (graph->crossNic == 2) graph->crossNic = 0;
- graph->nvlink = 0;
- graph->type = LINK_LOC;
+ graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
+ graph->typeInter = PATH_PIX;
graph->nChannels = 0;
graph->sameChannels = 1;
- char* str = getenv("NCCL_GRAPH");
+ char* str = getenv("NCCL_GRAPH_FILE");
if (str) {
- NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra));
- for (int i=0; i<graph->nChannels*ngpus; i++) {
- // Translate gpu numbers into ranks
- graph->intra[i] = system->nodes[GPU].nodes[graph->intra[i]].rank;
- }
- // TODO : let user specify NICs
- graph->inter[0] = graph->inter[1] = 0;
- graph->speedIntra = graph->speedInter = PCI_WIDTH+2;
- graph->nvlink = 0;
- if (graph->pattern == NCCL_TOPO_PATTERN_RING) {
- // Reverse the loop
- for (int c=0; c<graph->nChannels; c++) {
- for (int i=0; i<=ngpus/2; i++) {
- int tmp = graph->intra[ngpus*c+i];
- graph->intra[ngpus*c+i] = graph->intra[ngpus*c+(ngpus-i)%ngpus];
- graph->intra[ngpus*c+ngpus-i] = tmp;
- }
- }
- }
- if (graph->nChannels) return ncclSuccess;
+ struct ncclXml* xml;
+ NCCLCHECK(ncclCalloc(&xml, 1));
+ NCCLCHECK(ncclTopoGetXmlGraphFromFile(str, xml));
+ NCCLCHECK(ncclTopoGetGraphFromXml(xml->nodes, system, graph));
+ free(xml);
+ if (graph->nChannels > 0) return ncclSuccess;
}
if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
struct ncclTopoGraph tmpGraph;
memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph));
- int bestSpeed = 0;
// First try crossnic, then decrease speed and finally increase speedIntra.
- tmpGraph.speedIntra = tmpGraph.speedInter = system->maxWidth;
- int maxSpeed = system->maxSpeed;
tmpGraph.pattern = graph->pattern;
+ int pass = 1;
+ int speedIndex = 0;
+ while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
+ tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
+ int64_t globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
search:
- int time = NCCL_SEARCH_TIMEOUT;
- tmpGraph.nvlink = 1;
+ int time = tmpGraph.sameChannels ? NCCL_SEARCH_TIMEOUT_SAMECHANNELS :
+ tmpGraph.pattern == NCCL_TOPO_PATTERN_TREE ? NCCL_SEARCH_TIMEOUT_TREE : NCCL_SEARCH_TIMEOUT;
tmpGraph.nChannels = 0;
- tmpGraph.sameChannels = 1;
- NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, maxSpeed, &time));
+ globalTimeout -= time;
+
+ NCCLCHECK(ncclTopoSearchRec(system, &tmpGraph, graph, &time));
#if 0
- printf("Pattern %d, crossNic %d, Speed %d/%d, type %d -> nChannels %dx%d/%d %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.type, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
+ printf("Pattern %d, crossNic %d, Speed %g/%g, type %d/%d, channels %d-%d sameChannels %d -> nChannels %dx%g/%g %s\n", tmpGraph.pattern, tmpGraph.crossNic, tmpGraph.speedInter, tmpGraph.speedIntra, tmpGraph.typeInter, tmpGraph.typeIntra, tmpGraph.minChannels, tmpGraph.maxChannels, tmpGraph.sameChannels, graph->nChannels, graph->speedInter, graph->speedIntra, time == 0 ? "TIMEOUT" : "");
for (int c=0; c<graph->nChannels; c++) {
printf("%2d : ", c);
for (int g=0; g<ngpus; g++) {
@@ -502,13 +674,34 @@ search:
printf("\n");
}
#endif
- if (time == -1) goto done;
- // We already have a solution and we timed out so lower speed will just timeout as well
- if (time == 0 && graph->nChannels > 0) goto done;
- if ((graph->nChannels > 0) && (bestSpeed == 0)) bestSpeed = graph->speedIntra;
+ // Optimal solution, stop here
+ if (graph->nChannels == graph->maxChannels && graph->speedInter == system->maxWidth) goto done;
+
+ if (pass == 1) {
+ // First pass, we don't have a solution yet; try other options
+
+ // Try having different channels
+ if (tmpGraph.sameChannels == 1) {
+ tmpGraph.sameChannels = 0;
+ goto search;
+ }
+ tmpGraph.sameChannels = 1;
- if (tmpGraph.speedIntra == tmpGraph.speedInter) {
- // First pass, we don't have a solution yet ; try to go slower.
+ if (time != -1) globalTimeout += time;
+ else globalTimeout = NCCL_SEARCH_GLOBAL_TIMEOUT;
+ if (globalTimeout < 0) goto done;
+
+ int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS;
+ if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
+ tmpGraph.typeIntra += 1;
+ goto search;
+ }
+ tmpGraph.typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL;
+ if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXB)) {
+ tmpGraph.typeInter += 1;
+ goto search;
+ }
+ tmpGraph.typeInter = PATH_PIX;
// Try a simpler tree
if (tmpGraph.pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP) {
@@ -521,50 +714,61 @@ search:
}
tmpGraph.pattern = graph->pattern;
- if (tmpGraph.type < LINK_QPI) {
- tmpGraph.type += 1;
- goto search;
- }
- tmpGraph.type = graph->type;
-
if (crossNic && tmpGraph.crossNic == 0) {
// Try again with crossNic if permitted
tmpGraph.crossNic = crossNic;
goto search;
}
- tmpGraph.crossNic = graph->crossNic;
+ tmpGraph.crossNic = 0;
+
+ // Decrease speed until we find a solution
+ if ((speedIndex < NSPEEDS-1) && (graph->nChannels == 0 || (speedArray[speedIndex+1]/graph->speedInter > .49))) {
+ tmpGraph.speedInter = tmpGraph.speedIntra = speedArray[++speedIndex];
+ goto search;
+ }
+ speedIndex = 0;
+ while (speedArray[speedIndex] > system->maxWidth && speedIndex < NSPEEDS-1) speedIndex++;
+ tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
- // Try to reduce speed per channel
- tmpGraph.speedIntra = tmpGraph.speedInter -= 3;
- if (tmpGraph.speedIntra >= bestSpeed/2 && tmpGraph.speedIntra >= 3) goto search;
}
done:
- // We have a solution now. See if we can increase speedIntra
- if (tmpGraph.speedIntra == tmpGraph.speedInter) {
+ // We have a solution. Start from that solution and move to pass 2.
+ if (pass == 1) {
time = -1;
memcpy(&tmpGraph, graph, sizeof(tmpGraph));
+ speedIndex = 0;
+ while (speedArray[speedIndex] > graph->speedInter && speedIndex < NSPEEDS-1) speedIndex++;
+ tmpGraph.speedIntra = tmpGraph.speedInter = speedArray[speedIndex];
+ tmpGraph.minChannels = graph->nChannels;
+ pass = 2;
}
- if (time != 0 && tmpGraph.pattern != NCCL_TOPO_PATTERN_RING && tmpGraph.speedIntra == graph->speedIntra) {
- // Try to increase the intra speed only but keeping nChannels the same
- tmpGraph.speedIntra += 3;
- maxSpeed = tmpGraph.speedIntra * graph->nChannels;
- if (tmpGraph.speedIntra <= tmpGraph.speedInter*2) goto search;
+
+ // 3. See if we can increase speedIntra for trees (2 nodes or collnet)
+ if (pass == 2) {
+ if (time != 0 && graph->pattern != NCCL_TOPO_PATTERN_RING &&
+ tmpGraph.speedIntra == graph->speedIntra && tmpGraph.speedIntra < tmpGraph.speedInter*2 &&
+ speedIndex > 0) {
+ tmpGraph.speedIntra = speedArray[--speedIndex];
+ goto search;
+ }
+ time = -1;
+ memcpy(&tmpGraph, graph, sizeof(tmpGraph));
}
- if (graph->nChannels == 0) {
+ if (graph->nChannels == 0 && graph->collNet == 0) {
WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
- for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].rank;
+ for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
graph->inter[0] = graph->inter[1] = 0;
- graph->speedIntra = graph->speedInter = 3;
- graph->nvlink = 0;
+ graph->speedIntra = graph->speedInter = 0.1;
+ graph->typeIntra = graph->typeInter = PATH_SYS;
graph->nChannels = 1;
}
return ncclSuccess;
}
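// The control flow above, condensed (paraphrase, not code from the patch):
//   pass 1: start with speedIntra == speedInter at the system width, then relax
//           one knob at a time (sameChannels, typeIntra, typeInter, simpler
//           tree pattern, crossNic, lower speed) until a solution exists or
//           the global timeout runs out;
//   pass 2: restart from the best solution with minChannels pinned to its
//           channel count and, for non-ring patterns, try raising speedIntra
//           alone while keeping nChannels constant.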
ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
- INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %d/%d, nvlink %d, type %d, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, graph->nvlink, graph->type, graph->sameChannels);
+ INFO(NCCL_GRAPH, "Pattern %d, crossNic %d, nChannels %d, speed %f/%f, type %s/%s, sameChannels %d", graph->pattern, graph->crossNic, graph->nChannels, graph->speedIntra, graph->speedInter, topoPathTypeStr[graph->typeIntra], topoPathTypeStr[graph->typeInter], graph->sameChannels);
int ngpus = system->nodes[GPU].count;
char line[1024];
@@ -588,6 +792,18 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr
return ncclSuccess;
}
+ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs) {
+ char* str = getenv("NCCL_GRAPH_DUMP_FILE");
+ if (str) {
+ struct ncclXml* xml;
+ NCCLCHECK(ncclCalloc(&xml, 1));
+ NCCLCHECK(ncclTopoGetXmlFromGraphs(ngraphs, graphs, system, xml));
+ NCCLCHECK(ncclTopoDumpXmlToFile(str, xml));
+ free(xml);
+ }
+ return ncclSuccess;
+}
+
ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* dev) {
*dev = graph->inter[(channelId%graph->nChannels)*2+dir];
return ncclSuccess;
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index a1b3209..5cd8d4e 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -10,46 +10,22 @@
#include "comm.h"
#include "nvmlwrap.h"
#include "net.h"
+#include "coll_net.h"
#include <sys/stat.h>
#include <fcntl.h>
+#include "xml.h"
+#include "cpuset.h"
#define BUSID_SIZE (sizeof("0000:00:00.0"))
#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
-const char* pathDists[] = { "PIX", "PXB", "PHB", "NODE", "SYS" };
-
const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
-const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "QPI", "NET" };
+const char* topoLinkTypeStr[] = { "LOC", "NVL", "PCI", "", "", "SYS", "NET" };
+const char* topoPathTypeStr[] = { "LOC", "NVL", "PIX", "PXB", "PHB", "SYS", "NET" };
/******************************************************************/
/******************* Graph Creation Functions *********************/
/******************************************************************/
-static int getNumaId(char *path) {
- char npath[PATH_MAX];
- snprintf(npath, PATH_MAX, "%s/numa_node", path);
- npath[PATH_MAX-1] = '\0';
-
- int numaId = -1;
- FILE *file = fopen(npath, "r");
- if (file == NULL) return -1;
- if (fscanf(file, "%d", &numaId) == EOF) { fclose(file); return -1; }
- fclose(file);
-
- return numaId;
-}
-
-static ncclResult_t getPciPath(char* busId, char** path) {
- for (int i=0; i<BUSID_SIZE; i++) busId[i] = tolower(busId[i]);
- char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
- memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
- memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
- *path = realpath(busPath, NULL);
- if (*path == NULL) {
- WARN("Could not find real path of %s", busPath);
- return ncclSystemError;
- }
- return ncclSuccess;
-}
// Get an int64 from a PCI path. For example, sys/class/pci0000:00/0000:00:02.0/0000:02:00.0/ will return 0x000002000.
ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) {
@@ -59,110 +35,43 @@ ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id)
// Find next /
while (*str != '/') str--;
str++;
- NCCLCHECK(busIdToInt64(str, id));
+ int64_t numid;
+ NCCLCHECK(busIdToInt64(str, &numid));
+ // Ignore the subdevice: subdevices share the same PCI link, so we merge them into one node.
+ numid -= numid & 0xf;
+ *id = numid;
return ncclSuccess;
}
-static ncclResult_t idToIndex(struct ncclTopoSystem* system, int64_t id, int* index) {
- *index = -1;
- for (int i=0; i<system->nodes[GPU].count; i++) {
- if (system->nodes[GPU].nodes[i].id == id) {
- *index = i;
- }
+static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) {
+ *cpu = NULL;
+ if (node->type == CPU) {
+ *cpu = node;
+ return ncclSuccess;
+ }
+ for (int l=0; l<node->nlinks; l++) {
+ if (node->links[l].type == LINK_PCI) NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu));
+ if (*cpu != NULL) return ncclSuccess;
}
return ncclSuccess;
}
-
-static ncclResult_t getPath(int64_t id, char** path) {
- char busId[] = "0000:00:00.0";
- NCCLCHECK(int64ToBusId(id, busId));
- NCCLCHECK(getPciPath(busId, path));
- return ncclSuccess;
-}
-
-ncclResult_t ncclTopoCudaPath(int cudaDev, char** path) {
- char busId[BUSID_SIZE];
- CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev));
- NCCLCHECK(getPciPath(busId, path));
- return ncclSuccess;
-}
-
-
int interCpuWidth = 0;
int cpuPciWidth = 0;
-static ncclResult_t getCpuWidths() {
- // Check if already detected
- if (interCpuWidth + cpuPciWidth) return ncclSuccess;
-
- // Defaults
- char cpu[256];
- sprintf(cpu, "Generic");
- cpuPciWidth = interCpuWidth = PCI_WIDTH;
-
-#ifdef __PPC__
- sprintf(cpu, "ppc64");
- interCpuWidth = P9_WIDTH;
-#endif
-#ifdef __x86_64__
- sprintf(cpu, "x86_64");
- union {
- struct {
- // CPUID 0 String register order
- uint32_t ebx;
- uint32_t edx;
- uint32_t ecx;
- };
- char vendor[12];
- } cpuid0;
-
- asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0));
- if (strncmp(cpuid0.vendor, "GenuineIntel", 12) == 0) sprintf(cpu, "Intel");
-
- if (strcmp(cpu, "Intel") == 0) {
- union {
- struct {
- int steppingId:4;
- int model:4;
- int familyId:4;
- int processorType:2;
- int resv0:2;
- int extModelId:4;
- int modelId:8;
- int resv1:4;
- };
- uint32_t val;
- } cpuid1;
- asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1));
- if (cpuid1.familyId == 6 && cpuid1.modelId >= 0x55) { // Skylake
- sprintf(cpu, "Intel/Skylake (or later)");
- interCpuWidth = SKL_QPI_WIDTH;
- } else {
- interCpuWidth = QPI_WIDTH;
- }
+static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* width) {
+ *width = LOC_WIDTH;
+ if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_POWER) {
+ *width = P9_WIDTH;
+ return ncclSuccess;
+ }
+ if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_ARM) {
+ *width = ARM_WIDTH;
+ return ncclSuccess;
+ }
+ if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+ *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH;
}
-#endif
- INFO(NCCL_GRAPH, "%s CPU (PCI %d, InterCpu %d)", cpu, cpuPciWidth, interCpuWidth);
- return ncclSuccess;
-}
-
-static ncclResult_t ncclTopoGetInterCpuWidth(int* width) {
- NCCLCHECK(getCpuWidths());
- *width = interCpuWidth;
- return ncclSuccess;
-}
-static ncclResult_t ncclTopoGetCpuPciP2pWidth(int* width) {
- NCCLCHECK(getCpuWidths());
- *width = cpuPciWidth;
- return ncclSuccess;
-}
-static ncclResult_t ncclTopoGetPciWidth(int* width) {
- *width = PCI_WIDTH;
- return ncclSuccess;
-}
-static ncclResult_t ncclTopoGetNetWidth(int* width) {
- *width = NET_WIDTH;
return ncclSuccess;
}
@@ -173,317 +82,101 @@ enum ncclNvLinkDeviceType {
ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea)
};
-static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) {
- char classPath[] = "/sys/bus/pci/devices/0000:00:00.0/class";
- memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1);
- char* rPath = realpath(classPath, NULL);
- int fd;
- if ((fd = open(rPath, O_RDONLY)) == -1) {
- // Could not find device. It might be because we're in a VM and
- // we don't see the whole machine. This is handled silently so
- // we don't want to print an INFO error.
- TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno));
- return ncclSystemError;
- }
- free(rPath);
- char pciClass[9];
- strncpy(pciClass, "0x000000", 9);
- int len;
- SYSCHECKVAL(read(fd, pciClass, 8), "read", len);
- SYSCHECK(close(fd), "close");
- if (strcmp(pciClass, "0x068000") == 0) {
- // PCI device is of type "Bridge / Other Bridge Device" (NVswitch)
- *type = ncclNvLinkDeviceSwitch;
- } else if (strcmp(pciClass, "0x068001") == 0) {
- // PCI device is of type "Bridge: IBM Device 04ea"
- *type = ncclNvLinkDeviceBridge;
- } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla)
- || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce)
- *type = ncclNvLinkDeviceGpu;
- } else {
- *type = ncclNvLinkDeviceUnknown;
+ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
+ for (int i=0; i<system->nodes[type].count; i++) {
+ if (system->nodes[type].nodes[i].id == id) {
+ *node = system->nodes[type].nodes+i;
+ return ncclSuccess;
+ }
}
return ncclSuccess;
}
-ncclResult_t ncclTopoConnectCpu(struct ncclTopoSystem* system, int numaId, struct ncclTopoNode* node, int linkType, int linkWidth) {
- struct ncclTopoNode* cpuNode = NULL;
- for (int c=0; c<system->nodes[CPU].count; c++) {
- if (system->nodes[CPU].nodes[c].id == numaId) cpuNode = system->nodes[CPU].nodes+c;
+ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
+ if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
+ WARN("Error : tried to create too many nodes of type %d\n", type);
+ return ncclInternalError;
}
- if (cpuNode == NULL) { // Create CPU
- NCCLCHECK(ncclTopoCreateNode(system, &cpuNode, CPU, numaId));
+ struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
+ system->nodes[type].count++;
+ n->type = type;
+ n->id = id;
+ if (type == GPU) {
+ // Create link to itself (used in some corner cases)
+ n->nlinks=1;
+ n->links[0].type = LINK_LOC;
+ n->links[0].remNode = n;
+ n->links[0].width = LOC_WIDTH;
+ n->gpu.dev = NCCL_TOPO_UNDEF;
+ n->gpu.rank = NCCL_TOPO_UNDEF;
+ n->gpu.cudaCompCap = NCCL_TOPO_UNDEF;
+ } else if (type == CPU) {
+ n->cpu.arch = NCCL_TOPO_UNDEF;
+ n->cpu.vendor = NCCL_TOPO_UNDEF;
+ n->cpu.model = NCCL_TOPO_UNDEF;
+ } else if (type == NET) {
+ n->net.asic = 0ULL;
+ n->net.port = NCCL_TOPO_UNDEF;
+ n->net.width = 0.0;
}
- NCCLCHECK(ncclTopoConnectNodes(node, cpuNode, linkType, linkWidth));
- NCCLCHECK(ncclTopoConnectNodes(cpuNode, node, linkType, linkWidth));
+ *node = n;
return ncclSuccess;
}
-ncclResult_t ncclTopoConnectNVLink(nvmlDevice_t* nvmlDevs, struct ncclTopoSystem* system) {
- struct ncclTopoNode* nvsNode = NULL;
-
- int minNvlinks = 6, minWidth = VOLTA_NVLINK_WIDTH;
- for (int g=0; g<system->nodes[GPU].count; g++) {
- struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
- int cudaMajor, cudaMinor;
- NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDevs[g], &cudaMajor, &cudaMinor));
- int maxNvLinks, width;
- if (cudaMajor < 6) {
- maxNvLinks = 0;
- width = 0;
- } else if (cudaMajor == 6) {
- maxNvLinks = 4;
- width = PASCAL_NVLINK_WIDTH;
- } else {
- maxNvLinks = 6;
- width = VOLTA_NVLINK_WIDTH;
- }
-
- int nvlinks = 0;
- for (int l=0; l<maxNvLinks; ++l) {
- // Check whether we can use this NVLink for P2P
- unsigned canP2P;
- if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDevs[g], l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
-
- // Make sure the Nvlink is up. The previous call should have trained the link.
- nvmlEnableState_t isActive;
- if ((wrapNvmlDeviceGetNvLinkState(nvmlDevs[g], l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
-
- // Try to figure out what's on the other side of the NVLink
- nvmlPciInfo_t remoteProc;
- if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevs[g], l, &remoteProc) != ncclSuccess) continue;
-
- // Make a lower case copy of the bus ID for calling ncclDeviceType
- // PCI system path is in lower case
- char* p = remoteProc.busId;
- char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
- lowerId[c] = tolower(p[c]);
- if (p[c] == 0) break;
- }
-
- enum ncclNvLinkDeviceType type;
- NCCLCHECK(ncclDeviceType(lowerId, &type));
- if (type == ncclNvLinkDeviceGpu) {
- int64_t remoteId;
- NCCLCHECK(busIdToInt64(lowerId, &remoteId));
- int peer;
- NCCLCHECK(idToIndex(system, remoteId, &peer));
- if (peer != -1) {
- NCCLCHECK(ncclTopoConnectNodes(gpu, system->nodes[GPU].nodes+peer, LINK_NVL, width));
- nvlinks++;
- }
- } else if (type == ncclNvLinkDeviceBridge) {
- // Nvlink between GPU and CPU (PPC)
- // Since the remote bridge does not have a valid numa_node, assume we
- // are connected to the closest CPU.
- char* path;
- NCCLCHECK(getPath(gpu->id, &path));
- int numaId = getNumaId(path);
- free(path);
- NCCLCHECK(ncclTopoConnectCpu(system, numaId, gpu, LINK_NVL, width));
- nvlinks++;
- } else { // Nvswitch
- if (type == ncclNvLinkDeviceUnknown) {
- // The NVLink is up but we couldn't find the PCI device on the other
- // side. Assume it's an NVswitch outside a VM.
- if (l == 0) INFO(NCCL_INIT, "%d/%d -> %s : Assuming NVLink is connected to NVswitch", g, l, lowerId);
+ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int index) {
+ struct ncclTopoNode* delNode = system->nodes[type].nodes+index;
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+ free(delNode->paths[t]);
+ for (int n=0; n<system->nodes[t].count; n++) {
+ struct ncclTopoNode* node = system->nodes[t].nodes+n;
+ if (node == delNode) continue;
+ for (int l=0; l<node->nlinks; l++) {
+ while (l<node->nlinks && node->links[l].remNode == delNode) {
+ memmove(node->links+l, node->links+l+1, (node->nlinks-l-1)*sizeof(struct ncclTopoLink));
+ node->nlinks--;
}
- if (nvsNode == NULL) { // Create nvswitch
- NCCLCHECK(ncclTopoCreateNode(system, &nvsNode, NVS, 0));
+ if (l<node->nlinks && node->links[l].remNode->type == type && node->links[l].remNode >= delNode) {
+ node->links[l].remNode--;
}
- NCCLCHECK(ncclTopoConnectNodes(gpu, nvsNode, LINK_NVL, VOLTA_NVLINK_WIDTH));
- NCCLCHECK(ncclTopoConnectNodes(nvsNode, gpu, LINK_NVL, VOLTA_NVLINK_WIDTH));
- nvlinks++;
}
}
- minNvlinks = std::min(minNvlinks, nvlinks);
- minWidth = std::min(minWidth, width);
}
- int pciWidth;
- NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
- system->maxSpeed = minNvlinks ? minNvlinks*minWidth : pciWidth;
- system->maxWidth = minNvlinks ? minWidth : pciWidth;
+ memmove(delNode, delNode+1, (system->nodes[type].count-index-1)*sizeof(struct ncclTopoNode));
+ system->nodes[type].count--;
return ncclSuccess;
}
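// Note on the removal above: nodes live in one contiguous array per type, so
// deleting an entry shifts every later node down by one slot. Any stored
// ncclTopoNode* that pointed past the deleted node must therefore be patched,
// which is what the 'node->links[l].remNode--' fixup does:
//   if (remNode->type == type && remNode >= delNode) remNode--;  // paraphrase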
-ncclResult_t ncclTopoCreatePciPath(struct ncclTopoSystem* system, struct ncclTopoNode* endNode, char* path) {
- struct ncclTopoNode* lastNode = endNode;
- int pciWidth;
- NCCLCHECK(ncclTopoGetPciWidth(&pciWidth));
- // Find intermediate PCI switches
- int slashCount = 0;
- int offsetRC = 0;
- while (offsetRC < strlen(path)) {
- if (path[offsetRC] == '/') slashCount++;
- if (slashCount == 4) break;
- offsetRC++;
- }
- int offset = strlen(path);
- slashCount = 0;
- while (--offset > offsetRC) {
- if (path[offset] == '/') {
- slashCount++;
- // Find if already existing
- if ((slashCount%2) == 0) {
- int64_t pciId;
- NCCLCHECK(pciPathToInt64(path, offset, offsetRC, &pciId));
- for (int p=0; p<system->nodes[PCI].count; p++) {
- if (system->nodes[PCI].nodes[p].id == pciId) {
- // Found our PCI switch. Attach and stop since the rest should already
- // be connected
- NCCLCHECK(ncclTopoConnectNodes(system->nodes[PCI].nodes+p, lastNode, LINK_PCI, pciWidth));
- NCCLCHECK(ncclTopoConnectNodes(lastNode, system->nodes[PCI].nodes+p, LINK_PCI, pciWidth));
- return ncclSuccess;
- }
- }
- struct ncclTopoNode* pciNode;
- NCCLCHECK(ncclTopoCreateNode(system, &pciNode, PCI, pciId));
- NCCLCHECK(ncclTopoConnectNodes(pciNode, lastNode, LINK_PCI, pciWidth));
- NCCLCHECK(ncclTopoConnectNodes(lastNode, pciNode, LINK_PCI, pciWidth));
- lastNode = pciNode;
- }
- }
+ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width) {
+ // Aggregate multiple links between the same nodes (e.g. NVLink) into one higher-width link
+ struct ncclTopoLink* link;
+ for (link = node->links; link->remNode; link++) {
+ if (link->remNode == remNode && link->type == type) break;
}
- // Then attach to a CPU node
- int numaId = getNumaId(path);
- int width;
- NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
- NCCLCHECK(ncclTopoConnectCpu(system, numaId, lastNode, LINK_PCI, width));
- return ncclSuccess;
-}
-
-// Try to detect if IB cards are in fact the same physical NIC, hence sharing ports.
-#include <glob.h>
-#define IB_GUID_PATH "%s/infiniband/mlx5_*/sys_image_guid"
-uint64_t getIbGuid(char* path) {
- uint64_t guid = 0ULL;
- char guidPath[PATH_MAX];
- snprintf(guidPath, PATH_MAX, IB_GUID_PATH, path);
- // PATH has a wildcard in it so use glob()
- glob_t globbuf;
- glob(guidPath, 0, NULL, &globbuf);
- if (globbuf.gl_pathc > 0)
- strncpy(guidPath, globbuf.gl_pathv[0], PATH_MAX);
- globfree(&globbuf);
- guidPath[PATH_MAX-1] = '\0';
- FILE *file = fopen(guidPath, "r");
- if (file != NULL) {
- uint64_t a, b, c, d;
- if (fscanf(file, "%04lx:%04lx:%04lx:%04lx", &a, &b, &c, &d) != EOF) {
- guid = (a << 48) + (b << 32) + (c<<16) + d;
- TRACE(NCCL_GRAPH, "Opened %s guid %lx", guidPath, guid);
- }
- fclose(file);
- }
- return guid;
-}
-
-struct netInfo {
- char* path;
- int64_t nic;
- uint64_t asic;
- int port;
- int net;
-};
-
-ncclResult_t ncclTopoComputeNetInfo(struct netInfo* netInfos, int ndev) {
- for (int n=0; n<ndev; n++) {
- struct netInfo* info = netInfos+n;
- uint64_t ibGuid;
- info->nic = n;
- info->asic = n;
- info->port = 0;
- info->net = n;
- if (info->path && (ibGuid = getIbGuid(info->path)) != 0) {
- info->asic = ibGuid;
-
- // Ignore PCI subdevice when computing the ID to merge multi-port cards
- // and make them use the same PCI link.
- char* path = strdup(info->path);
- path[strlen(path)-1]='0';
- NCCLCHECK(pciPathToInt64(path, strlen(path), 0, &info->nic));
- free(path);
-
- // Same PCI path -> different ports of the same NIC
- for (int i=0; i<n; i++) if (netInfos[i].nic == info->nic) info->port++;
-
- // Same GUID -> same network links as the other NIC
- for (int i=0; i<n; i++) if (netInfos[i].asic == info->asic && netInfos[i].port == info->port) info->net = netInfos[i].net;
- }
- INFO(NCCL_GRAPH, "%s -> %x/%lx/%d/%d", info->path, info->nic, info->asic, info->port, info->net);
+ if (link->remNode == NULL) node->nlinks++;
+ link->type = type;
+ link->remNode = remNode;
+ link->width += width;
+
+ // Sort links in BW descending order
+ struct ncclTopoLink linkSave;
+ memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
+ while (link != node->links) {
+ if ((link-1)->width >= linkSave.width) break;
+ memcpy(link, link-1, sizeof(struct ncclTopoLink));
+ link--;
}
+ memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
return ncclSuccess;
}
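// Usage sketch for the aggregation above (hypothetical widths): calling
// ncclTopoConnectNodes(gpu0, gpu1, LINK_NVL, 25.0) twice produces a single
// GPU0->GPU1 link of width 50.0 instead of two 25.0 links, which is how
// multiple NVLink connections between the same pair of nodes are modeled.
// The insertion sort then keeps node->links in descending width order so
// the widest link is always considered first.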
-ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
- for (int g=0; g<system->nodes[GPU].count; g++) {
- struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
- char* path;
- NCCLCHECK(getPath(gpu->id, &path));
- NCCLCHECK(ncclTopoCreatePciPath(system, gpu, path));
- free(path);
- }
-
- // Connect the NICs
- int netDevCount;
- NCCLCHECK(ncclNetDevices(&netDevCount));
- int netWidth;
- NCCLCHECK(ncclTopoGetNetWidth(&netWidth));
-
- struct netInfo* netInfos;
- NCCLCHECK(ncclCalloc(&netInfos, netDevCount));
-
- for (int n=0; n<netDevCount; n++) {
- ncclResult_t res = ncclNetPciPath(n, &netInfos[n].path);
- if (res != ncclSuccess) netInfos[n].path = NULL;
- }
-
- NCCLCHECK(ncclTopoComputeNetInfo(netInfos, netDevCount));
-
- for (int n=0; n<netDevCount; n++) {
- struct netInfo* info = netInfos+n;
- // Create NIC and attach it to the PCI tree
- struct ncclTopoNode* nicNode = NULL;
- for (int i=0; i<system->nodes[NIC].count; i++) {
- if (system->nodes[NIC].nodes[i].id == info->nic) {
- nicNode = system->nodes[NIC].nodes+i;
- break;
- }
- }
- if (!nicNode) {
- NCCLCHECK(ncclTopoCreateNode(system, &nicNode, NIC, info->nic));
- if (info->path) {
- // Create the PCI path
- NCCLCHECK(ncclTopoCreatePciPath(system, nicNode, info->path));
- } else {
- // This is probably a virtual NIC. Just attach it directly to CPU 0
- int width;
- NCCLCHECK(ncclTopoGetCpuPciP2pWidth(&width));
- NCCLCHECK(ncclTopoConnectCpu(system, 0, nicNode, LINK_PCI, width));
- }
- }
- free(info->path);
-
- // Create the network side
- struct ncclTopoNode* netNode;
- NCCLCHECK(ncclTopoCreateNode(system, &netNode, NET, n));
-
- // Use rank to store the net information
- netNode->rank = info->net;
-
- NCCLCHECK(ncclTopoConnectNodes(nicNode, netNode, LINK_NET, netWidth));
- NCCLCHECK(ncclTopoConnectNodes(netNode, nicNode, LINK_NET, netWidth));
- }
- free(netInfos);
-
+ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
// And connect all CPU nodes together
for (int n=0; n<system->nodes[CPU].count; n++) {
for (int p=0; p<system->nodes[CPU].count; p++) {
if (n == p) continue;
- int width;
- NCCLCHECK(ncclTopoGetInterCpuWidth(&width));
- NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_QPI, width));
+ float width;
+ NCCLCHECK(ncclTopoGetInterCpuWidth(system->nodes[CPU].nodes+n, &width));
+ NCCLCHECK(ncclTopoConnectNodes(system->nodes[CPU].nodes+n, system->nodes[CPU].nodes+p, LINK_SYS, width));
}
}
return ncclSuccess;
@@ -491,7 +184,9 @@ ncclResult_t ncclTopoConnectPCI(struct ncclTopoSystem* system) {
static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoNode* prevNode, char* line, int offset) {
if (node->type == GPU) {
- sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->rank);
+ sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
+ } else if (node->type == CPU) {
+ sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
} else {
sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
}
@@ -501,14 +196,14 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
for (int l=0; l<node->nlinks; l++) {
struct ncclTopoLink* link = node->links+l;
if (link->type == LINK_LOC) continue;
- if (link->remNode != prevNode) {
- sprintf(line+offset, "+ %s[%2d] - ", topoLinkTypeStr[link->type], link->width);
+ if (link->type != LINK_PCI || link->remNode != prevNode) {
+ sprintf(line+offset, "+ %s[%2.1f] - ", topoLinkTypeStr[link->type], link->width);
int nextOffset = strlen(line);
if (link->type == LINK_PCI) {
NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset));
} else {
if (link->remNode->type == NET) {
- sprintf(line+nextOffset, "%s/%lX (%d)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->rank);
+ sprintf(line+nextOffset, "%s/%lX (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], link->remNode->id, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.width);
} else {
sprintf(line+nextOffset, "%s/%lX", topoNodeTypeStr[link->remNode->type], link->remNode->id);
}
@@ -520,7 +215,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
}
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* s) {
- INFO(NCCL_GRAPH, "=== System : maxWidth %2d maxSpeed %2d ===", s->maxWidth, s->maxSpeed);
+ INFO(NCCL_GRAPH, "=== System : maxWidth %2.1f ===", s->maxWidth);
char line[1024];
for (int n=0; n<s->nodes[CPU].count; n++) NCCLCHECK(ncclTopoPrintRec(s->nodes[CPU].nodes+n, NULL, line, 0));
INFO(NCCL_GRAPH, "==========================================");
@@ -554,88 +249,400 @@ static ncclResult_t ncclTopoSort(struct ncclTopoNode* node, struct ncclTopoNode*
// 1. NVLinks (already the case)
// 2. PCI down
// 3. PCI up
-// 4. QPI (already the case)
+// 4. SYS (already the case)
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
for (int n=0; n<system->nodes[CPU].count; n++) NCCLCHECK(ncclTopoSort(system->nodes[CPU].nodes+n, NULL));
return ncclSuccess;
}
-ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
- struct ncclTopoSystem* s;
- NCCLCHECK(ncclCalloc(&s, 1));
- nvmlDevice_t* nvmlDevs;
- int g = 0;
- NCCLCHECK(ncclCalloc(&nvmlDevs, comm->nRanks));
- for (int r=0; r<comm->nRanks; r++) {
- if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
- // Consider the GPU as outside of our node if we can't see it through NVML.
- char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
- NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
- if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevs+g) != ncclSuccess) continue;
- g++;
- struct ncclTopoNode* gpuNode;
- NCCLCHECK(ncclTopoCreateNode(s, &gpuNode, GPU, comm->peerInfo[r].busId));
- gpuNode->rank = r;
+ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
+ int dev;
+ NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));
+
+ struct ncclTopoNode* net;
+ NCCLCHECK(ncclTopoCreateNode(system, &net, NET, dev));
+ const char* str;
+ NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
+ if (str) sscanf(str, "0x%lx", &net->net.asic);
+ else net->net.asic = dev;
+
+ ncclDebugNoWarn = NCCL_GRAPH;
+ int mbps;
+ if (xmlGetAttrInt(xmlNet, "speed", &mbps) != ncclSuccess) mbps = 0;
+ if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
+ net->net.width = mbps / 8000.0;
+ if (xmlGetAttrInt(xmlNet, "port", &net->net.port) != ncclSuccess) net->net.port = 0;
+ if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
+ if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
+ if (xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
+ ncclDebugNoWarn = 0;
+
+ NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
+ NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.width));
+ return ncclSuccess;
+}
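
The "speed" attribute is in Mb/s while link widths are tracked in GB/s, hence the division by 8000. A minimal standalone sketch of the conversion (the helper name and sample values are illustrative, not from the source):

#include <stdio.h>

/* Sketch mirroring the Mb/s -> GB/s conversion in ncclTopoAddNet above. */
static float netWidthGBs(int mbps) {
  if (mbps <= 0) mbps = 10000;  /* some NICs report speed = -1 */
  return mbps / 8000.0f;        /* 8000 Mb per GB */
}

int main(void) {
  printf("100 Gb/s NIC -> %.2f GB/s\n", netWidthGBs(100000)); /* 12.50 */
  printf("fallback     -> %.2f GB/s\n", netWidthGBs(-1));     /*  1.25 */
  return 0;
}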
+
+ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
+ for (int s=0; s<xmlNic->nSubs; s++) {
+ struct ncclXmlNode* xmlNet = xmlNic->subs[s];
+ if (strcmp(xmlNet->name, "net") != 0) continue;
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
+ if (index == -1) continue;
+ NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic));
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
+ NCCLCHECK(xmlGetAttrInt(xmlGpu, "sm", &gpu->gpu.cudaCompCap));
+ NCCLCHECK(xmlGetAttrInt(xmlGpu, "rank", &gpu->gpu.rank));
+ NCCLCHECK(xmlGetAttrInt(xmlGpu, "dev", &gpu->gpu.dev));
+ NCCLCHECK(xmlGetAttrInt(xmlGpu, "gdr", &gpu->gpu.gdrSupport));
+ // Do not go any further, nvlinks will be added in a second pass
+ return ncclSuccess;
+}
+
+struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, 0 } };
+struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { NULL, 0 } }; // x100 Mbps per lane
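
kvConvertToInt is declared elsewhere in the tree; the short keys in kvDictPciClass ("0x03" for GPU, "0x02" for NIC, next to full six-digit classes) suggest it matches the dictionary key as a prefix. A hedged, standalone sketch of such a lookup (the struct and helper names here are stand-ins, and the prefix-match behavior is an assumption):

#include <stdio.h>
#include <string.h>

struct kvDictSketch { const char* str; int value; };

/* Assumed semantics: return the value of the first entry whose key is a
 * prefix of str, or -1 if nothing matches. */
static int kvLookupSketch(const char* str, struct kvDictSketch* dict) {
  for (struct kvDictSketch* kv = dict; kv->str != NULL; kv++) {
    if (strncmp(str, kv->str, strlen(kv->str)) == 0) return kv->value;
  }
  return -1;
}

int main(void) {
  struct kvDictSketch pciClass[] = { { "0x060400", 1 }, { "0x03", 0 }, { NULL, 0 } };
  printf("%d\n", kvLookupSketch("0x030200", pciClass)); /* 0 : a GPU class  */
  printf("%d\n", kvLookupSketch("0x060400", pciClass)); /* 1 : a PCI bridge */
  return 0;
}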
+ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) {
+ const char* str;
+
+ int type;
+ NCCLCHECK(xmlGetAttrStr(xmlPci, "class", &str));
+ NCCLCHECK(kvConvertToInt(str, &type, kvDictPciClass));
+
+ int64_t busId;
+ NCCLCHECK(xmlGetAttrStr(xmlPci, "busid", &str));
+ NCCLCHECK(busIdToInt64(str, &busId));
+
+ struct ncclTopoNode* node = NULL;
+ if (type == GPU) {
+ struct ncclXmlNode* xmlGpu;
+ NCCLCHECK(xmlGetSub(xmlPci, "gpu", &xmlGpu));
+ if (xmlGpu == NULL) return ncclSuccess;
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(xmlGpu, "rank", &index));
+ if (index == -1) return ncclSuccess;
+ NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
+ NCCLCHECK(ncclTopoAddGpu(xmlGpu, system, node));
+ }
+ if (type == NIC) {
+ struct ncclXmlNode* xmlNic;
+ NCCLCHECK(xmlGetSub(xmlPci, "nic", &xmlNic));
+ if (xmlNic == NULL) return ncclSuccess;
+
+ // Ignore sub device ID and merge multi-port NICs into one PCI device.
+ busId &= 0xfffffffffffffff0;
+ struct ncclTopoNode* nicNode = NULL;
+ NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, busId));
+ if (nicNode == NULL) {
+ NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
+ node = nicNode; // Connect it to parent later on
+ }
+ NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
+ } else if (type == PCI) {
+ NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
+ for (int s=0; s<xmlPci->nSubs; s++) {
+ struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
+ NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
}
}
- NCCLCHECK(ncclTopoConnectNVLink(nvmlDevs, s));
- NCCLCHECK(ncclTopoConnectPCI(s));
+ if (node) {
+ int width, speed;
+ NCCLCHECK(xmlGetAttrInt(xmlPci, "link_width", &width));
+ NCCLCHECK(xmlGetAttrStr(xmlPci, "link_speed", &str));
+
+ // Manage cases where speed was not indicated in /sys
+ if (width == 0) width = 16;
+ if (strlen(str) == 0 || strcasecmp(str, "Unknown speed") == 0) str = "8 GT/s";
- free(nvmlDevs);
- NCCLCHECK(ncclTopoSortSystem(s));
- *system = s;
+ NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end)
+
+ NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0));
+ NCCLCHECK(ncclTopoConnectNodes(parent, node, LINK_PCI, width*speed/80.0));
+ }
return ncclSuccess;
}
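
kvDictPciGen stores per-lane rates in units of 100 Mb/s, so width*speed is the link rate in 100 Mb/s and dividing by 80 yields GB/s. A worked example under those units:

#include <stdio.h>

int main(void) {
  /* Gen3 ("8 GT/s") maps to 60 in kvDictPciGen, i.e. an effective
   * 6 Gb/s per lane. For a x16 link: */
  int width = 16, speed = 60;              /* lanes, x100 Mb/s per lane */
  printf("%.1f GB/s\n", width*speed/80.0); /* 12.0, matches PCI_WIDTH   */
  return 0;
}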
-ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink) {
- int g1, g2;
- NCCLCHECK(idToIndex(system, busId1, &g1));
- NCCLCHECK(idToIndex(system, busId2, &g2));
- *nvlink = g1 != -1 && g2 != -1 && system->nodes[GPU].nodes[g1].paths[GPU][g2].type == LINK_NVL;
+struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
+struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { NULL, 0 } };
+
+ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
+ int numaId;
+ NCCLCHECK(xmlGetAttrInt(xmlCpu, "numaid", &numaId));
+ struct ncclTopoNode* cpu;
+ NCCLCHECK(ncclTopoCreateNode(system, &cpu, CPU, numaId));
+ const char* str;
+ NCCLCHECK(xmlGetAttr(xmlCpu, "affinity", &str));
+ if (str != NULL) {
+ NCCLCHECK(ncclStrToCpuset(str, &cpu->cpu.affinity));
+ }
+
+ NCCLCHECK(xmlGetAttrStr(xmlCpu, "arch", &str));
+ NCCLCHECK(kvConvertToInt(str, &cpu->cpu.arch, kvDictCpuArch));
+ if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86) {
+ NCCLCHECK(xmlGetAttrStr(xmlCpu, "vendor", &str));
+ NCCLCHECK(kvConvertToInt(str, &cpu->cpu.vendor, kvDictCpuVendor));
+ if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
+ int familyId, modelId;
+ NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
+ NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
+      cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_INTEL_SKL : NCCL_TOPO_CPU_INTEL_BDW;
+ }
+ }
+ for (int s=0; s<xmlCpu->nSubs; s++) {
+ struct ncclXmlNode* node = xmlCpu->subs[s];
+ if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu));
+ if (strcmp(node->name, "nic") == 0) {
+ struct ncclTopoNode* nic = NULL;
+ NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0));
+ if (nic == NULL) {
+ NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, 0));
+ NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH));
+ NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH));
+ }
+ NCCLCHECK(ncclTopoAddNic(node, system, nic));
+ }
+ }
return ncclSuccess;
}
-ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink) {
- int g;
- NCCLCHECK(idToIndex(system, busId, &g));
- for (int i=0; i<system->nodes[GPU].count; i++) {
- if (i == g) continue;
- if (system->nodes[GPU].nodes[g].paths[GPU][i].type == LINK_NVL) {
- *nvlink = 1;
- return ncclSuccess;
+ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
+ if (strcmp(node->name, "nvlink") == 0) {
+ struct ncclTopoNode* gpu = NULL;
+ int64_t pBusId;
+ NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
+ NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
+ if (gpu == NULL) {
+ WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
+ return ncclInternalError;
+ }
+ int count;
+ NCCLCHECK(xmlGetAttrInt(node, "count", &count));
+ const char* targetClass;
+ NCCLCHECK(xmlGetAttrStr(node, "tclass", &targetClass));
+ int targetType;
+ NCCLCHECK(kvConvertToInt(targetClass, &targetType, kvDictPciClass));
+ struct ncclTopoNode* remote = NULL;
+ if (targetType == GPU) {
+ // NVL P2P connection to another GPU
+ const char* target;
+ NCCLCHECK(xmlGetAttrStr(node, "target", &target));
+ int64_t busId;
+ NCCLCHECK(busIdToInt64(target, &busId));
+ NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, busId));
+ } else if (targetType == CPU) {
+ // NVL connection to the local CPU
+ NCCLCHECK(findLocalCpu(gpu, &remote));
+ } else {
+ if (system->nodes[NVS].count == 0) {
+ NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0));
+ } else {
+ remote = system->nodes[NVS].nodes;
+ }
+ }
+ if (remote) {
+ int nvlSpeed = gpu->gpu.cudaCompCap == 60 ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
+ NCCLCHECK(ncclTopoConnectNodes(gpu, remote, LINK_NVL, count*nvlSpeed));
+ if (remote->type != GPU) {
+ NCCLCHECK(ncclTopoConnectNodes(remote, gpu, LINK_NVL, count*nvlSpeed));
+ }
+ }
+ } else {
+ const char* busId;
+ NCCLCHECK(xmlGetAttr(node, "busid", &busId));
+ for (int s=0; s<node->nSubs; s++) {
+ NCCLCHECK(ncclTopoAddNvLinks(node->subs[s], system, busId ? busId : parentBusId));
}
}
- *nvlink = 0;
return ncclSuccess;
}
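
Since ncclTopoConnectNodes aggregates the widths of repeated links between the same pair of nodes, an NVLink "count" of n multiplies the per-link width. A small example using the constants from topo.h below (the constants are copied from this patch, the scenario is illustrative):

#include <stdio.h>

#define PASCAL_NVLINK_WIDTH 18.0
#define VOLTA_NVLINK_WIDTH  21.0

int main(void) {
  int count = 2;          /* two NVLinks reported towards one target */
  int cudaCompCap = 70;   /* Volta */
  double nvlSpeed = (cudaCompCap == 60) ? PASCAL_NVLINK_WIDTH : VOLTA_NVLINK_WIDTH;
  printf("%.1f GB/s\n", count*nvlSpeed); /* 42.0 GB/s aggregate */
  return 0;
}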
-static int pathDistance(struct ncclTopoLinkList* links) {
- int distance = PATH_PIX;
- if (links->count > 2) distance = PATH_PXB;
- for (int l=0; l<links->count; l++) {
- // PHB if we go through 1 CPU, SYS if we go through 2 CPUs
- if (links->list[l]->remNode->type == CPU) distance = (distance == PATH_PHB) ? PATH_SYS : PATH_PHB;
+ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem) {
+ NCCLCHECK(ncclCalloc(topoSystem, 1));
+ struct ncclXmlNode* topNode;
+ NCCLCHECK(xmlFindTag(xml, "system", &topNode));
+ for (int s=0; s<topNode->nSubs; s++) {
+ struct ncclXmlNode* node = topNode->subs[s];
+ if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
+ }
+ NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
+
+ NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
+ NCCLCHECK(ncclTopoSortSystem(*topoSystem));
+
+ return ncclSuccess;
+}
+
+NCCL_PARAM(TopoDumpFileRank, "TOPO_DUMP_FILE_RANK", 0);
+
+// Only set values if not already set
+static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
}
- return distance;
+ return ncclSuccess;
+}
+static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attrName, const uint64_t value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%lx", value);
+ }
+ return ncclSuccess;
}
-ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance) {
- int g1, g2;
- NCCLCHECK(idToIndex(system, busId1, &g1));
- NCCLCHECK(idToIndex(system, busId2, &g2));
- *distance = pathDistance(system->nodes[GPU].nodes[g1].paths[GPU]+g2);
+
+ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) {
+ struct ncclXml* xml;
+ NCCLCHECK(ncclCalloc(&xml, 1));
+ char* xmlTopoFile = getenv("NCCL_TOPO_FILE");
+ if (xmlTopoFile) {
+ NCCLCHECK(ncclTopoGetXmlFromFile(xmlTopoFile, xml));
+ }
+ if (xml->maxIndex == 0) {
+ // Create top tag
+ struct ncclXmlNode* top;
+ NCCLCHECK(xmlAddNode(xml, NULL, "system", &top));
+ NCCLCHECK(xmlSetAttrInt(top, "version", NCCL_TOPO_XML_VERSION));
+ }
+
+ // Auto-detect GPUs if needed
+ for (int r=0; r<comm->nRanks; r++) {
+ if (comm->peerInfo[r].hostHash == comm->peerInfo[comm->rank].hostHash) {
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
+ struct ncclXmlNode* node;
+ NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
+ NCCLCHECK(xmlSetAttrInt(node, "rank", r));
+ NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
+ }
+ }
+  // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes,
+  // so we add collnet first so that it takes precedence.
+ int netDevCount = 0;
+ if (ncclCollNet) {
+ NCCLCHECK(collNetDevices(&netDevCount));
+ for (int n=0; n<netDevCount; n++) {
+ ncclNetProperties_t props;
+ NCCLCHECK(collNetGetProperties(n, &props));
+ struct ncclXmlNode* netNode;
+ NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
+ NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
+ NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
+ NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
+ NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
+ NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
+ NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
+ NCCLCHECK(xmlInitAttrInt(netNode, "coll", 1));
+ }
+ }
+ if (netDevCount == 0) {
+ NCCLCHECK(ncclNetDevices(&netDevCount));
+ }
+ for (int n=0; n<netDevCount; n++) {
+ ncclNetProperties_t props;
+ NCCLCHECK(ncclNetGetProperties(n, &props));
+ struct ncclXmlNode* netNode;
+ NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode));
+ NCCLCHECK(xmlSetAttrInt(netNode, "dev", n));
+ NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed));
+ NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port));
+ NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid));
+ NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms));
+ NCCLCHECK(xmlInitAttrInt(netNode, "gdr", props.ptrSupport & NCCL_PTR_CUDA ? 1 : 0));
+ }
+
+ xmlTopoFile = getenv("NCCL_TOPO_DUMP_FILE");
+ if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) {
+ NCCLCHECK(ncclTopoDumpXmlToFile(xmlTopoFile, xml));
+ }
+
+ NCCLCHECK(ncclTopoGetSystemFromXml(xml, system));
+ free(xml);
return ncclSuccess;
}
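
For reference, NCCL_TOPO_FILE pre-loads a topology into the XML tree and NCCL_TOPO_DUMP_FILE writes the merged result back out. Below is a minimal sketch of a file with the shape the loader accepts; the tag and attribute names follow the handlers and xmlGetAttr calls in this patch, while the attribute values (including the version number, which must match NCCL_TOPO_XML_VERSION) are purely illustrative:

<system version="1">
  <cpu numaid="0" affinity="ffffffff" arch="x86_64" vendor="GenuineIntel" familyid="6" modelid="85">
    <pci busid="0000:17:00.0" class="0x030000" link_speed="8 GT/s" link_width="16">
      <gpu dev="0" sm="70" rank="0" gdr="1"/>
    </pci>
    <nic>
      <net name="ib0" dev="0" speed="100000" port="1" guid="0x1" maxconn="131072" gdr="1"/>
    </nic>
  </cpu>
</system>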
-ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance) {
- int g;
- NCCLCHECK(idToIndex(system, busId, &g));
- *distance = pathDistance(system->nodes[GPU].nodes[g].paths[NET]+netDev);
+/****************************/
+/* External query functions */
+/****************************/
+
+ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model) {
+ *arch = system->nodes[CPU].nodes[0].cpu.arch;
+ *vendor = system->nodes[CPU].nodes[0].cpu.vendor;
+ *model = system->nodes[CPU].nodes[0].cpu.model;
return ncclSuccess;
}
-ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count) {
- *count = system->nodes[CPU].count;
+NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
+
+ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank) {
+ struct ncclTopoNode* cpu = NULL, *gpu = NULL;
+ for (int g=0; g<system->nodes[GPU].count; g++) {
+ if (system->nodes[GPU].nodes[g].gpu.rank == rank) {
+ gpu = system->nodes[GPU].nodes+g;
+ // Find closer CPU
+ int cpuIndex = -1, minHops = 0;
+ for (int c=0; c<system->nodes[CPU].count; c++) {
+ int nHops = system->nodes[GPU].nodes[g].paths[CPU][c].count;
+ if (cpuIndex == -1 || nHops < minHops) {
+ cpuIndex = c;
+ minHops = nHops;
+ }
+ }
+ cpu = system->nodes[CPU].nodes+cpuIndex;
+ }
+ }
+ if (cpu == NULL) {
+ WARN("Set CPU affinity : unable to find GPU/CPU for rank %d", rank);
+ return ncclInternalError;
+ }
+
+ // Query the CPU affinity set we were provided
+ cpu_set_t mask;
+ SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
+
+#ifdef ENABLE_TRACE
+ {
+ char affinityStr[sizeof(cpu_set_t)*2];
+ NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
+ TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
+ }
+#endif
+
+ // Get the affinity of the CPU close to our GPU.
+ cpu_set_t cpuMask = cpu->cpu.affinity;
+
+#ifdef ENABLE_TRACE
+ {
+ char affinityStr[sizeof(cpu_set_t)*2];
+ NCCLCHECK(ncclCpusetToStr(&cpuMask, affinityStr));
+ TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, affinityStr);
+ }
+#endif
+
+ cpu_set_t finalMask;
+ if (ncclParamIgnoreCpuAffinity())
+ // Ignore the CPU affinity set and use the GPU one instead
+ finalMask = cpuMask;
+ else
+ // Use a subset of the GPU affinity set
+ CPU_AND(&finalMask, &mask, &cpuMask);
+
+ // If there is a non empty set, use it to set affinity
+ if (CPU_COUNT(&finalMask)) {
+ char affinityStr[sizeof(cpu_set_t)*2];
+ NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
+ INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr);
+ SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
+ }
return ncclSuccess;
}
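
By default the inherited process affinity is intersected with the CPU set local to the GPU; with NCCL_IGNORE_CPU_AFFINITY=1 the GPU-local set is used wholesale. A standalone sketch of the intersection step (CPU numbers are illustrative):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void) {
  cpu_set_t mask, cpuMask, finalMask;
  CPU_ZERO(&mask); CPU_ZERO(&cpuMask);
  for (int i = 0; i < 32; i++) CPU_SET(i, &mask);    /* process affinity   */
  for (int i = 0; i < 16; i++) CPU_SET(i, &cpuMask); /* GPU-local NUMA set */
  CPU_AND(&finalMask, &mask, &cpuMask);
  printf("final set has %d CPUs\n", CPU_COUNT(&finalMask)); /* 16 */
  return 0;
}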
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 6b8a2f9..848fc03 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,22 +9,24 @@
#include "graph.h"
#include "core.h"
-
-#define LOC_WIDTH 5000
-#define PASCAL_NVLINK_WIDTH 18
-#define VOLTA_NVLINK_WIDTH 21
-#define PCI_WIDTH 12 // PCI Gen3 x16
-#define QPI_WIDTH 8
-#define SKL_QPI_WIDTH 12
-#define P9_WIDTH 32
-#define NET_WIDTH 12 // 100Gbit
-
-// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, to GPU
-// to GPU traffic consumed more PCI bandwidth.
+#include <sched.h>
+
+#define LOC_WIDTH 5000.0
+#define PASCAL_NVLINK_WIDTH 18.0
+#define VOLTA_NVLINK_WIDTH 21.0
+#define PCI_WIDTH 12.0 // PCI Gen3 x16
+#define QPI_WIDTH 6.0
+#define SKL_QPI_WIDTH 9.0
+#define P9_WIDTH 32.0
+#define ARM_WIDTH 6.0
+#define NET_WIDTH 12.0 // 100Gbit
+
+// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
+// to GPU traffic consumes more PCI bandwidth.
#define INTEL_P2P(speed) (speed*9/12)
#define INTEL_P2P_OVERHEAD(speed) (speed*12/9)
-#define NCCL_TOPO_NODE_TYPES 6
+#define NCCL_TOPO_NODE_TYPES 7
#define GPU 0
#define PCI 1
#define NVS 2
@@ -33,37 +35,72 @@
#define NET 5
extern const char* topoNodeTypeStr[];
+// We want link types and path types to match as much as possible
#define LINK_LOC 0
#define LINK_NVL 1
#define LINK_PCI 2
-#define LINK_QPI 3
-#define LINK_NET 4
+// Skipping 3 for PATH_PXB
+// Skipping 4 for PATH_PHB
+#define LINK_SYS 5
+#define LINK_NET 6
extern const char* topoLinkTypeStr[];
+#define PATH_LOC 0
+#define PATH_NVL 1
+#define PATH_PIX 2
+#define PATH_PXB 3
+#define PATH_PHB 4
+#define PATH_SYS 5
+#define PATH_NET 6
+extern const char* topoPathTypeStr[];
+
struct ncclTopoNode;
struct ncclTopoLink {
int type;
- int width;
+ float width;
struct ncclTopoNode* remNode;
};
#define NCCL_TOPO_MAX_LINKS 32
#define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES)
-#define SELECT_PATH 1
-#define SELECT_LAST 2
-
-#define NET_GDR_MASK 0x70000000
struct ncclTopoLinkList {
struct ncclTopoLink* list[NCCL_TOPO_MAX_HOPS];
int count;
- int width;
+ float width;
int type;
};
+#define NCCL_TOPO_CPU_INTEL_BDW 1
+#define NCCL_TOPO_CPU_INTEL_SKL 2
+
+#define NCCL_TOPO_UNDEF (-1)
+
struct ncclTopoNode {
int type;
int64_t id;
- int rank;
+ // Type specific data
+ union {
+ struct {
+ int dev; // NVML dev number
+ int rank;
+ int cudaCompCap;
+ int gdrSupport;
+ }gpu;
+ struct {
+ uint64_t asic;
+ int port;
+ float width;
+ int gdrSupport;
+ int collSupport;
+ int maxChannels;
+ }net;
+ struct {
+ int arch;
+ int vendor;
+ int model;
+ cpu_set_t affinity;
+ }cpu;
+ };
int nlinks;
struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
// Pre-computed paths to GPUs and NICs
@@ -79,60 +116,29 @@ struct ncclTopoNodeSet {
struct ncclTopoSystem {
struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES];
- int maxSpeed;
- int maxWidth;
- int searchInitDone;
+ float maxWidth;
};
-static ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
+ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
+ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);
+ncclResult_t ncclTopoRemoveNode(struct ncclTopoSystem* system, int type, int id);
+ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, float width);
+ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
+ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system);
+
+ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem);
+ncclResult_t ncclTopoGetGraphFromXml(struct ncclXmlNode *xmlGraphs, struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
+ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs, struct ncclTopoSystem* system, struct ncclXml *xml);
+
+static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, int64_t id, int* index) {
+ *index = -1;
for (int i=0; i<system->nodes[type].count; i++) {
if (system->nodes[type].nodes[i].id == id) {
- *node = system->nodes[type].nodes+i;
+ *index = i;
return ncclSuccess;
}
}
- if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
- WARN("Error : tried to create too many nodes of type %d\n", type);
- return ncclInternalError;
- }
- struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
- system->nodes[type].count++;
- n->type = type;
- n->id = id;
- if (type == GPU) {
- // Create link to itself (used in some corner cases)
- n->nlinks=1;
- n->links[0].type = LINK_LOC;
- n->links[0].remNode = n;
- n->links[0].width = LOC_WIDTH;
- }
- *node = n;
- return ncclSuccess;
-}
-
-static ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode* remNode, int type, int width) {
- // Aggregate links into higher width for NVLink
- struct ncclTopoLink* link;
- for (link = node->links; link->remNode; link++) {
- if (link->remNode == remNode && link->type == type) break;
- }
- if (link->remNode == NULL) node->nlinks++;
- link->type = type;
- link->remNode = remNode;
- link->width += width;
-
- // Sort links in BW descending order
- struct ncclTopoLink linkSave;
- memcpy(&linkSave, link, sizeof(struct ncclTopoLink));
- while (link != node->links) {
- if ((link-1)->width >= linkSave.width) break;
- memcpy(link, link-1, sizeof(struct ncclTopoLink));
- link--;
- }
- memcpy(link, &linkSave, sizeof(struct ncclTopoLink));
- return ncclSuccess;
+ return ncclInternalError;
}
-ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system);
-
#endif
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index 87afb2f..8a0b4cd 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -52,12 +52,12 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li
}
static const char* ncclFuncStr[] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce" };
-static const char* ncclAlgoStr[] = { "Tree", "Ring" };
+static const char* ncclAlgoStr[] = { "Tree", "Ring", "CollNet" };
static const char* ncclProtoStr[] = { "LL", "LL128", "Simple" };
// Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
-static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 3.6, 8.4 } };
+static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4, 4.4, 0 }, { 3.6, 3.6, 8.4 }, { 4.4, 4.4, 0 } };
// NVLink, PCI, Network
#define NCCL_HW_NVLINK 0
@@ -66,29 +66,32 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 4.4,
// Tree/Simple is the latency of a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network).
static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] =
{ /* NVLINK */
- { /* Tree (LL/LL128/Simple)*/ { .5, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { .4, 2.5, 5.7 } },
+ { /* Tree (LL/LL128/Simple)*/ { .5, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { .4, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { .5, 1.9, 4.0 } },
/* PCI */
- { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 } },
+ { /* Tree (LL/LL128/Simple)*/ { 1.0, 1.9, 28 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 1.0, 1.9, 5.5 } },
/* NET */
- { /* Tree (LL/LL128/Simple)*/ { 5.0, 7.5, 50 }, /* Ring (LL/LL128/Simple)*/ { .9, 2.5, 6.6 } }
+ { /* Tree (LL/LL128/Simple)*/ { 5.0, 7.5, 50 }, /* Ring (LL/LL128/Simple)*/ { .9, 2.5, 6.6 }, /* CollNet (LL/LL128/Simple)*/ { 5.0, 5.0, 10.7 } }
};
// LL128 max BW for the different collectives
static const double ll128MaxBw[NCCL_NUM_FUNCTIONS] = { 113.0, 72.0, 110.0, 91.0, 100.0 };
-ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph) {
- int simpleDefaultThreads = (treeGraph->speedIntra*treeGraph->nChannels <= 12) ? 256 : NCCL_MAX_NTHREADS;
- comm->maxThreads[NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
- comm->maxThreads[NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
- comm->maxThreads[NCCL_PROTO_LL128] = getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
-
- INFO(NCCL_INIT, "Threads per block : %d/%d/%d", comm->maxThreads[NCCL_PROTO_LL], comm->maxThreads[NCCL_PROTO_LL128], comm->maxThreads[NCCL_PROTO_SIMPLE]);
+ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) {
+ int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_MAX_NTHREADS;
+ comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
+ getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, simpleDefaultThreads);
+ comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
+ getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+ comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
+ getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS);
+ comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
+ getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
if (comm->nRanks <= 1) return ncclSuccess;
- struct ncclTopoGraph* graphs[2] = { treeGraph, ringGraph };
- int intraHw[2], hw[2];
- for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->nvlink ? NCCL_HW_NVLINK : NCCL_HW_PCI;
+ struct ncclTopoGraph* graphs[NCCL_NUM_ALGORITHMS] = { treeGraph, ringGraph, collNetGraph };
+ int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = comm->nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
@@ -97,21 +100,24 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
comm->nRanks;
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
- if (coll != ncclCollAllReduce && a == NCCL_ALGO_TREE) continue;
+ if (coll != ncclCollAllReduce && a != NCCL_ALGO_RING) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- int speed = comm->nNodes <= 2 ? graphs[a]->speedIntra : graphs[a]->speedInter;
- float busBw = graphs[a]->nChannels * speed * 1.0;
+ float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
+ float busBw = graphs[a]->nChannels * speed;
// Various model refinements
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/4.0;
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw*120.0/128.0, ll128MaxBw[coll]);
- if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, comm->nNodes > 1 ? 70.0 : 90.0);
+ if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.9, comm->nNodes > 2 ? 80.0 : 110.0);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/3.0;
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw *= 7.0/9.0;
+ if (a == NCCL_ALGO_COLLNET) busBw *= .9;
+ if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
+ if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128
// Convert bus BW to algorithm BW
- float ratio = a == NCCL_ALGO_TREE ? .5 : (1.0 * comm->nRanks) / nsteps;
+ float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * comm->nRanks) / nsteps;
comm->bandwidths[coll][a][p] = busBw * ratio;
comm->latencies[coll][a][p] = baseLat[a][p];
@@ -127,11 +133,16 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
} else {
comm->latencies[coll][a][p] += nsteps*lat;
}
- } else {
+ } else if (a == NCCL_ALGO_TREE) {
float intraLat = hwLat[intraHw[a]][a][p];
float interLat = hwLat[NCCL_HW_NET][a][p];
comm->latencies[coll][a][p] +=
2 * ((comm->nRanks/comm->nNodes-1) * intraLat + log2i(comm->nNodes) * interLat);
+ } else {
+ float intraLat = hwLat[intraHw[a]][a][p];
+ float interLat = hwLat[NCCL_HW_NET][a][p];
+ comm->latencies[coll][a][p] +=
+ 2 * (comm->nRanks/comm->nNodes-1) * intraLat + interLat;
}
}
}
@@ -140,7 +151,7 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
// Protocols/Algorithms enable/disable, and user overrides.
// All are enabled except ll128 which is enabled by default only in certain cases.
int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 };
- int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1 };
+ int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1 };
const char *protoStr = getenv("NCCL_PROTO");
if (protoStr) NCCLCHECK(parseList(protoStr, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
@@ -151,30 +162,32 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
int pEnable = protoEnable[p];
if (pEnable == 2 && p == NCCL_PROTO_LL128) {
// Enable LL128 by default only on Volta+NVLink. Other cases are not tested and may cause silent data corruption.
- pEnable = (graphs[a]->type <= LINK_PCI) && graphs[a]->nvlink && minCompCap == 70 && maxCompCap == 70 ? 1 : 0;
+ pEnable = (graphs[a]->typeInter <= LINK_PCI) && graphs[a]->typeIntra == LINK_NVL && minCompCap == 70 && maxCompCap == 70 ? 1 : 0;
}
if (pEnable == 0 || algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0;
}
if (comm->rank == 0) {
char line[1024];
- int offset = 0;
sprintf(line, "Latency/AlgBw |");
- offset = strlen(line);
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- sprintf(line+offset, " %4s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
- offset = strlen(line);
+ sprintf(line+strlen(line), " %7s/%6s |", ncclAlgoStr[a], ncclProtoStr[p]);
+ }
+ }
+ INFO(NCCL_TUNING, "%s", line);
+ sprintf(line, " Max NThreads |");
+ for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+ for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+ sprintf(line+strlen(line), " %14d |", comm->maxThreads[a][p]);
}
}
INFO(NCCL_TUNING, "%s", line);
for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
sprintf(line, "%13s |", ncclFuncStr[c]);
- offset = strlen(line);
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- sprintf(line+offset, "%7.1f/%5.1f|", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
- offset = strlen(line);
+ sprintf(line+strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
}
}
INFO(NCCL_TUNING, "%s", line);
@@ -201,12 +214,34 @@ ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCom
}
}
- INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld",
+ INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld/%ld/%ld",
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL],
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128],
comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE],
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL],
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128],
- comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]);
+ comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE],
+ comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL],
+ comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128],
+ comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE]);
+ return ncclSuccess;
+}
+
+// Trees do not stick perfectly to the model for medium sizes. Applying a static correction
+// factor is not ideal but works quite well. Entries are for power-of-two sizes, 64 B to 128 MB.
+static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = {
+ { 1.0, 1.0, 1.0, 1.0, .9, .8, .7, .7, .7, .7, .6, .5, .5, .5, .6, .7, .8, .9, .9, 1.0, 1.0, 1.0 },
+ { 1.0, 1.0, 1.0, 1.0, 1.0, .9, .8, .8, .8, .8, .7, .7, .7, .6, .6, .7, .7, .8, .8, .9, .9, 1.0 },
+ { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .5, .5, .6, .6, .7, .8, .9 }
+};
+
+ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time) {
+ float bw = info->comm->bandwidths[info->coll][algorithm][protocol];
+ if (bw == 0) {
+ *time = -1.0; return ncclSuccess;
+ }
+ int logSize = log2i(info->nBytes>>6);
+ if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize];
+ *time = info->comm->latencies[info->coll][algorithm][protocol] + (info->nBytes) / (1000 * bw);
return ncclSuccess;
}
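
With latencies in µs and bandwidths in GB/s, nBytes/(1000*bw) also comes out in µs. A worked example of the model, including the tree correction index (numbers illustrative):

#include <stdio.h>
#include <math.h>

int main(void) {
  double lat = 8.4, bw = 40.0;   /* us and GB/s, illustrative */
  double nBytes = 4.0*1024*1024; /* 4 MB */
  /* Correction index log2(nBytes>>6): log2(4MB/64) = 16 < 22, so a tree
   * would scale bw by treeCorrectionFactor[protocol][16]. */
  printf("index %d\n", (int)log2(nBytes/64));       /* 16             */
  printf("time %.1f us\n", lat + nBytes/(1000*bw)); /* 8.4 + 104.9 us */
  return 0;
}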
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
new file mode 100644
index 0000000..550cfcd
--- /dev/null
+++ b/src/graph/xml.cc
@@ -0,0 +1,780 @@
+/*************************************************************************
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include "core.h"
+#include "nvmlwrap.h"
+#include "xml.h"
+
+/*******************/
+/* XML File Parser */
+/*******************/
+
+ncclResult_t xmlGetChar(FILE* file, char* c) {
+ if (fread(c, 1, 1, file) == 0) {
+ WARN("XML Parse : Unexpected EOF");
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t xmlGetValue(FILE* file, char* value, char* last) {
+ char c;
+ NCCLCHECK(xmlGetChar(file, &c));
+ if (c != '"' && c != '\'') {
+#if INT_OK
+ int o = 0;
+ do {
+ value[o++] = c;
+ NCCLCHECK(xmlGetChar(file, &c));
+ } while (c >= '0' && c <= '9');
+ value[o] = '\0';
+ *last = c;
+ return ncclSuccess;
+#else
+ WARN("XML Parse : Expected (double) quote.");
+ return ncclInternalError;
+#endif
+ }
+ int o = 0;
+ do {
+ NCCLCHECK(xmlGetChar(file, &c));
+ value[o++] = c;
+ } while (c != '"');
+ value[o-1] = '\0';
+ NCCLCHECK(xmlGetChar(file, last));
+ return ncclSuccess;
+}
+
+ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
+ char c;
+ char* ptr = name;
+ int o = 0;
+ do {
+ NCCLCHECK(xmlGetChar(file, &c));
+ if (c == '=') {
+ ptr[o] = '\0';
+ if (value == NULL) {
+ WARN("XML Parse : Unexpected value with name %s\n", ptr);
+ return ncclInternalError;
+ }
+ return xmlGetValue(file, value, last);
+ }
+ ptr[o] = c;
+ if (o == MAX_STR_LEN-1) {
+ ptr[o] = '\0';
+ WARN("Error : name %s too long (max %d)", ptr, MAX_STR_LEN);
+ return ncclInternalError;
+ }
+ o++;
+ } while (c != ' ' && c != '>' && c != '/' && c != '\n' && c != '\r');
+ ptr[o-1] = '\0';
+ *last = c;
+ return ncclSuccess;
+}
+
+// Shift the 3-char string left by one char and append c at the end
+#define SHIFT_APPEND(s, c) do { s[0]=s[1]; s[1]=s[2]; s[2]=c; } while(0)
+ncclResult_t xmlSkipComment(FILE* file, char* start, char next) {
+ // Start from something neutral with \0 at the end.
+ char end[4] = "...";
+
+ // Inject all trailing chars from previous reads. We don't need
+ // to check for --> here because there cannot be a > in the name.
+ for (int i=0; i<strlen(start); i++) SHIFT_APPEND(end, start[i]);
+ SHIFT_APPEND(end, next);
+
+ // Stop when we find "-->"
+ while (strcmp(end, "-->") != 0) {
+    char c;
+ if (fread(&c, 1, 1, file) != 1) {
+ WARN("XML Parse error : unterminated comment");
+ return ncclInternalError;
+ }
+ SHIFT_APPEND(end, c);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
+ node->type = NODE_TYPE_NONE;
+ char c = ' ';
+ while (c == ' ' || c == '\n' || c == '\r') {
+ if (fread(&c, 1, 1, file) == 0) return ncclSuccess;
+ }
+ if (c != '<') {
+ WARN("XML Parse error : expecting '<', got '%c'", c);
+ return ncclInternalError;
+ }
+ // Read XML element name
+ NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
+
+ // Check for comments
+ if (strncmp(node->name, "!--", 3) == 0) {
+ NCCLCHECK(xmlSkipComment(file, node->name+3, c));
+ return xmlGetNode(file, node);
+ }
+
+ // Check for closing tag
+ if (node->name[0] == '\0' && c == '/') {
+ node->type = NODE_TYPE_CLOSE;
+ // Re-read the name, we got '/' in the first call
+ NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
+ if (c != '>') {
+ WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name);
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+ }
+
+ node->type = NODE_TYPE_OPEN;
+
+ // Get Attributes
+ int a = 0;
+ while (c == ' ') {
+ NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
+ if (a == MAX_ATTR_COUNT) {
+ INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT);
+      // We still need to consume attributes past the limit, which is why the array keeps one spare slot.
+ } else a++;
+ }
+ node->nAttrs = a;
+ if (c == '/') {
+ node->type = NODE_TYPE_SINGLE;
+ char str[MAX_STR_LEN];
+ NCCLCHECK(xmlGetToken(file, str, NULL, &c));
+ }
+ if (c != '>') {
+ WARN("XML Parse : expected >, got '%c'", c);
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+}
+
+typedef ncclResult_t (*xmlHandlerFunc_t)(FILE*, struct ncclXml*, struct ncclXmlNode*);
+
+struct xmlHandler {
+ const char * name;
+ xmlHandlerFunc_t func;
+};
+
+ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head, struct xmlHandler handlers[], int nHandlers) {
+ if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
+ while (1) {
+ if (xml->maxIndex == MAX_NODES) {
+ WARN("Error : XML parser is limited to 1024 nodes\n");
+ return ncclInternalError;
+ }
+ struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
+ memset(node, 0, sizeof(struct ncclXmlNode));
+ NCCLCHECK(xmlGetNode(file, node));
+ if (node->type == NODE_TYPE_NONE) {
+ if (head) {
+ WARN("XML Parse : unterminated %s", head->name);
+ return ncclInternalError;
+ } else {
+ // All done
+ return ncclSuccess;
+ }
+ }
+ if (head && node->type == NODE_TYPE_CLOSE) {
+ if (strcmp(node->name, head->name) != 0) {
+ WARN("XML Mismatch : %s / %s", head->name, node->name);
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+ }
+ int found = 0;
+ for (int h=0; h<nHandlers; h++) {
+ if (strcmp(node->name, handlers[h].name) == 0) {
+ if (head) head->subs[head->nSubs++] = node;
+ node->parent = head;
+ node->nSubs = 0;
+ xml->maxIndex++;
+ NCCLCHECK(handlers[h].func(file, xml, node));
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ if (nHandlers) INFO(NCCL_GRAPH, "Ignoring element %s", node->name);
+ NCCLCHECK(xmlLoadSub(file, xml, node, NULL, 0));
+ }
+ }
+}
+
+/**************/
+/* XML Writer */
+/**************/
+
+ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNode* node) {
+ for (int i=0; i<indent; i++) fprintf(file, " ");
+ fprintf(file, "<%s", node->name);
+
+ for (int a=0; a<node->nAttrs; a++) {
+ fprintf(file, " %s=\"%s\"", node->attrs[a].key, node->attrs[a].value);
+ }
+ if (node->nSubs == 0) {
+ fprintf(file, "/>\n");
+ } else {
+ fprintf(file, ">\n");
+ for (int s=0; s<node->nSubs; s++) {
+ NCCLCHECK(ncclTopoDumpXmlRec(indent+2, file, node->subs[s]));
+ }
+ for (int i=0; i<indent; i++) fprintf(file, " ");
+ fprintf(file, "</%s>\n", node->name);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml) {
+ FILE* file = fopen(xmlTopoFile, "w");
+ if (file == NULL) {
+ WARN("Unable to open %s, not dumping topology.", xmlTopoFile);
+ return ncclSuccess;
+ }
+ NCCLCHECK(ncclTopoDumpXmlRec(0, file, xml->nodes));
+ fclose(file);
+ return ncclSuccess;
+}
+
+/****************************************/
+/* Parser rules for our specific format */
+/****************************************/
+
+ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink } };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlLoadNic(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ struct xmlHandler handlers[] = { { "net", ncclTopoXmlLoadNet } };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlLoadPci(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic} };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 3));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlLoadCpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "nic", ncclTopoXmlLoadNic } };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlLoadSystem(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ int version;
+ NCCLCHECK(xmlGetAttrInt(head, "version", &version));
+ if (version != NCCL_TOPO_XML_VERSION) {
+ WARN("XML Topology has wrong version %d, %d needed", version, NCCL_TOPO_XML_VERSION);
+ return ncclInvalidUsage;
+ }
+ const char* name;
+ NCCLCHECK(xmlGetAttr(head, "name", &name));
+ if (name != NULL) INFO(NCCL_GRAPH, "Loading topology %s", name);
+ else INFO(NCCL_GRAPH, "Loading unnamed topology");
+
+ struct xmlHandler handlers[] = { { "cpu", ncclTopoXmlLoadCpu } };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml) {
+ FILE* file = fopen(xmlTopoFile, "r");
+ if (file == NULL) {
+ WARN("Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno));
+ return ncclSuccess;
+ }
+ struct xmlHandler handlers[] = { { "system", ncclTopoXmlLoadSystem } };
+ xml->maxIndex = 0;
+ NCCLCHECK(xmlLoadSub(file, xml, NULL, handlers, 1));
+ fclose(file);
+ return ncclSuccess;
+}
+
+/**********************/
+/* XML creation */
+/* from autodetection */
+/**********************/
+
+#define BUSID_SIZE (sizeof("0000:00:00.0"))
+#define BUSID_REDUCED_SIZE (sizeof("0000:00"))
+static void memcpylower(char* dst, const char* src, const size_t size) {
+ for (int i=0; i<size; i++) dst[i] = tolower(src[i]);
+}
+static ncclResult_t getPciPath(const char* busId, char** path) {
+ char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
+ memcpylower(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1);
+ memcpylower(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1);
+ *path = realpath(busPath, NULL);
+ if (*path == NULL) {
+ WARN("Could not find real path of %s", busPath);
+ return ncclSystemError;
+ }
+ return ncclSuccess;
+}
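
Routing the template through /sys/class/pci_bus lets realpath resolve the ../.. indirection to the device's true location under /sys/devices, including any intermediate switch levels. A sketch of the string construction only, for a hypothetical bus ID:

#include <ctype.h>
#include <stdio.h>

int main(void) {
  const char* busId = "0000:17:00.0"; /* hypothetical */
  char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0";
  for (int i = 0; i < 7; i++)   /* patch in "0000:17" */
    busPath[sizeof("/sys/class/pci_bus/")-1+i] = tolower(busId[i]);
  for (int i = 0; i < 12; i++)  /* patch in "0000:17:00.0" */
    busPath[sizeof("/sys/class/pci_bus/0000:00/../../")-1+i] = tolower(busId[i]);
  puts(busPath); /* realpath() would then resolve this to /sys/devices/... */
  return 0;
}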
+
+ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) {
+ char filePath[PATH_MAX];
+ sprintf(filePath, "%s/%s", path, fileName);
+ int offset = 0;
+ FILE* file;
+ if ((file = fopen(filePath, "r")) != NULL) {
+ while (feof(file) == 0 && ferror(file) == 0 && offset < MAX_STR_LEN) {
+ int len = fread(strValue+offset, 1, MAX_STR_LEN-offset, file);
+ offset += len;
+ }
+ fclose(file);
+ }
+ if (offset == 0) {
+ strValue[0] = '\0';
+ INFO(NCCL_GRAPH, "Topology detection : could not read %s, ignoring", filePath);
+ } else {
+ strValue[offset-1] = '\0';
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* path, const char* fileName, const char* attrName) {
+ char strValue[MAX_STR_LEN];
+ NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue));
+ if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); }
+ TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml* xml) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(cpuNode, "affinity", &index));
+ if (index == -1) {
+ const char* numaId;
+ NCCLCHECK(xmlGetAttr(cpuNode, "numaid", &numaId));
+ if (numaId == NULL) {
+ WARN("GetXmlFromCpu : could not find CPU numa ID.");
+ return ncclInternalError;
+ }
+ // Set affinity
+ char cpumaskPath[] = "/sys/devices/system/node/node0000";
+ sprintf(cpumaskPath, "/sys/devices/system/node/node%s", numaId);
+ NCCLCHECK(ncclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity"));
+ }
+
+ NCCLCHECK(xmlGetAttrIndex(cpuNode, "arch", &index));
+ if (index == -1) {
+ // Fill CPU type / vendor / model
+#if defined(__PPC__)
+ NCCLCHECK(xmlSetAttr(cpuNode, "arch", "ppc64"));
+#elif defined(__aarch64__)
+ NCCLCHECK(xmlSetAttr(cpuNode, "arch", "arm64"));
+#elif defined(__x86_64__)
+ NCCLCHECK(xmlSetAttr(cpuNode, "arch", "x86_64"));
+#endif
+ }
+
+#if defined(__x86_64__)
+ NCCLCHECK(xmlGetAttrIndex(cpuNode, "vendor", &index));
+ if (index == -1) {
+ union {
+ struct {
+ // CPUID 0 String register order
+ uint32_t ebx;
+ uint32_t edx;
+ uint32_t ecx;
+ };
+ char vendor[12];
+ } cpuid0;
+
+ asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0) : "memory");
+ char vendor[13];
+ strncpy(vendor, cpuid0.vendor, 12);
+ vendor[12] = '\0';
+ NCCLCHECK(xmlSetAttr(cpuNode, "vendor", vendor));
+ }
+
+ NCCLCHECK(xmlGetAttrIndex(cpuNode, "familyid", &index));
+ if (index == -1) {
+ union {
+ struct {
+ unsigned steppingId:4;
+ unsigned modelId:4;
+ unsigned familyId:4;
+ unsigned processorType:2;
+ unsigned resv0:2;
+ unsigned extModelId:4;
+ unsigned extFamilyId:8;
+ unsigned resv1:4;
+ };
+ uint32_t val;
+ } cpuid1;
+ asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1) : "memory");
+ int familyId = cpuid1.familyId + (cpuid1.extFamilyId << 4);
+ int modelId = cpuid1.modelId + (cpuid1.extModelId << 4);
+ NCCLCHECK(xmlSetAttrInt(cpuNode, "familyid", familyId));
+ NCCLCHECK(xmlSetAttrInt(cpuNode, "modelid", modelId));
+ }
+#endif
+ return ncclSuccess;
+}
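
As a concrete check of the bitfield decode: Skylake-SP reports CPUID leaf 1 EAX = 0x50654, which this arithmetic turns into familyId 6 and modelId 0x55, exactly the pair ncclTopoAddCpu tests for. The same computation with explicit shifts:

#include <stdio.h>

int main(void) {
  unsigned val = 0x00050654; /* CPUID.1:EAX on Skylake-SP */
  unsigned familyId = ((val >> 8) & 0xf) + (((val >> 20) & 0xff) << 4);
  unsigned modelId  = ((val >> 4) & 0xf) + (((val >> 16) & 0xf)  << 4);
  printf("family %u, model 0x%x\n", familyId, modelId); /* family 6, model 0x55 */
  return 0;
}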
+
+ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct ncclXmlNode** pciNode) {
+ NCCLCHECK(xmlFindTagKv(xml, "pci", pciNode, "busid", busId));
+ if (*pciNode == NULL) {
+ NCCLCHECK(xmlAddNode(xml, NULL, "pci", pciNode));
+ }
+ NCCLCHECK(xmlSetAttr(*pciNode, "busid", busId));
+ return ncclSuccess;
+}
+
+// Check whether a string is in BDF format or not.
+// BDF (Bus-Device-Function) is "BBBB:BB:DD.F" where B, D and F are hex digits.
+// There can be trailing chars.
+int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); }
+int checkBDFFormat(char* bdf) {
+ if (bdf[4] != ':' || bdf[7] != ':' || bdf[10] != '.') return 0;
+  if (isHex(bdf[0]) == 0 || isHex(bdf[1]) == 0 || isHex(bdf[2]) == 0 || isHex(bdf[3]) == 0 ||
+      isHex(bdf[5]) == 0 || isHex(bdf[6]) == 0 || isHex(bdf[8]) == 0 || isHex(bdf[9]) == 0 ||
+      isHex(bdf[11]) == 0) return 0;
+ return 1;
+}
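
A quick usage check of the format test (assuming isHex and checkBDFFormat exactly as above):

#include <stdio.h>

int main(void) {
  char good[] = "0000:17:00.0"; /* hypothetical but well-formed BDF */
  char bad[]  = "pci0000:17";   /* a root-complex name, not a BDF   */
  printf("%d %d\n", checkBDFFormat(good), checkBDFFormat(bad)); /* 1 0 */
  return 0;
}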
+
+ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* xml) {
+ // Fill info, then parent
+ const char* busId;
+ NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
+ char* path = NULL;
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(pciNode, "class", &index));
+ if (index == -1) {
+ if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+ NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
+ }
+ NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
+ if (index == -1) {
+ if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+ char deviceSpeedStr[MAX_STR_LEN];
+ float deviceSpeed;
+ NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_speed", deviceSpeedStr));
+ sscanf(deviceSpeedStr, "%f GT/s", &deviceSpeed);
+ char portSpeedStr[MAX_STR_LEN];
+ float portSpeed;
+ NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_speed", portSpeedStr));
+ sscanf(portSpeedStr, "%f GT/s", &portSpeed);
+ NCCLCHECK(xmlSetAttr(pciNode, "link_speed", portSpeed < deviceSpeed ? portSpeedStr : deviceSpeedStr));
+ }
+ NCCLCHECK(xmlGetAttrIndex(pciNode, "link_width", &index));
+ if (index == -1) {
+ if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+ char strValue[MAX_STR_LEN];
+ NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_width", strValue));
+ int deviceWidth = strtol(strValue, NULL, 0);
+ NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_width", strValue));
+ int portWidth = strtol(strValue, NULL, 0);
+ NCCLCHECK(xmlSetAttrInt(pciNode, "link_width", std::min(deviceWidth,portWidth)));
+ }
+ struct ncclXmlNode* parent = pciNode->parent;
+ if (parent == NULL) {
+ if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+
+ // Save that for later in case next step is a CPU
+ char numaIdStr[MAX_STR_LEN];
+ NCCLCHECK(ncclTopoGetStrFromSys(path, "numa_node", numaIdStr));
+
+ // Go up one level in the PCI tree. Rewind two "/" and follow the upper PCI
+ // switch, or stop if we reach a CPU root complex.
+ int slashCount = 0;
+ int parentOffset;
+ for (parentOffset = strlen(path)-1; parentOffset>0; parentOffset--) {
+ if (path[parentOffset] == '/') {
+ slashCount++;
+ path[parentOffset] = '\0';
+ int start = parentOffset - 1;
+ while (start>0 && path[start] != '/') start--;
+ // Check whether the parent path looks like "BBBB:BB:DD.F" or not.
+ if (checkBDFFormat(path+start+1) == 0) {
+          // This is a CPU root complex. Create a CPU tag and stop there.
+ struct ncclXmlNode* topNode;
+ NCCLCHECK(xmlFindTag(xml, "system", &topNode));
+ NCCLCHECK(xmlGetSubKv(topNode, "cpu", &parent, "numaid", numaIdStr));
+ if (parent == NULL) {
+ NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent));
+ NCCLCHECK(xmlSetAttr(parent, "numaid", numaIdStr));
+ }
+ } else if (slashCount == 2) {
+ // Continue on the upper PCI switch
+ for (int i = strlen(path)-1; i>0; i--) {
+ if (path[i] == '/') {
+ NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", path+i+1));
+ if (parent == NULL) {
+ NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
+ NCCLCHECK(xmlSetAttr(parent, "busid", path+i+1));
+ }
+ break;
+ }
+ }
+ }
+ }
+ if (parent) break;
+ }
+ pciNode->parent = parent;
+ parent->subs[parent->nSubs++] = pciNode;
+ }
+ if (strcmp(parent->name, "pci") == 0) {
+ NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
+ } else if (strcmp(parent->name, "cpu") == 0) {
+ NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml));
+ }
+ free(path);
+ return ncclSuccess;
+}
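
To make the climb concrete, here is a standalone sketch that rewinds a sysfs-style path one '/' at a time and reports each candidate parent component; the first candidates are in BDF format (PCI switch ports), and a non-BDF name such as "pci0000:10" marks the CPU root complex where the walk stops. The path is illustrative:

#include <stdio.h>
#include <string.h>

int main(void) {
  char path[] = "/sys/devices/pci0000:10/0000:10:02.0/0000:11:00.0";
  for (int off = strlen(path)-1; off > 0; off--) {
    if (path[off] == '/') {
      path[off] = '\0';                    /* truncate to the parent */
      int start = off - 1;
      while (start > 0 && path[start] != '/') start--;
      printf("candidate parent: %s\n", path+start+1);
    }
  }
  return 0;
}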
+
+ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvmlDev, struct ncclXml* xml, struct ncclXmlNode** gpuNodeRet) {
+ struct ncclXmlNode* gpuNode = NULL;
+ NCCLCHECK(xmlGetSub(pciNode, "gpu", &gpuNode));
+ if (gpuNode == NULL) NCCLCHECK(xmlAddNode(xml, pciNode, "gpu", &gpuNode));
+
+ int index = -1;
+
+ int dev = -1;
+ NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index));
+ if (index == -1) {
+ if (nvmlDev == NULL) {
+ WARN("No NVML, trying to use CUDA instead");
+ const char* busId;
+ NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
+ if (busId == NULL || cudaDeviceGetByPCIBusId(&dev, busId) != cudaSuccess) dev = -1;
+ } else {
+ NCCLCHECK(wrapNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev));
+ }
+ NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev));
+ }
+ NCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev));
+ if (dev == -1) return ncclSuccess;
+
+ NCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index));
+ if (index == -1) {
+ int cudaMajor, cudaMinor;
+ if (nvmlDev == NULL) {
+ cudaDeviceProp devProp;
+ CUDACHECK(cudaGetDeviceProperties(&devProp, dev));
+ cudaMajor = devProp.major; cudaMinor = devProp.minor;
+ } else {
+ NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor));
+ }
+ NCCLCHECK(xmlSetAttrInt(gpuNode, "sm", cudaMajor*10+cudaMinor));
+ }
+ int sm;
+ NCCLCHECK(xmlGetAttrInt(gpuNode, "sm", &sm));
+
+ struct ncclXmlNode* nvlNode = NULL;
+ NCCLCHECK(xmlGetSub(pciNode, "nvlink", &nvlNode));
+ if (nvlNode == NULL) {
+ // NVML NVLink detection
+ int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : 6;
+
+ if (maxNvLinks > 0 && nvmlDev == NULL) {
+ WARN("No NVML device handle. Skipping nvlink detection.\n");
+ maxNvLinks = 0;
+ }
+
+ for (int l=0; l<maxNvLinks; ++l) {
+ // Check whether we can use this NVLink for P2P
+ unsigned canP2P;
+ if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue;
+
+ // Make sure the Nvlink is up. The previous call should have trained the link.
+ nvmlEnableState_t isActive;
+ if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue;
+
+ // Try to figure out what's on the other side of the NVLink
+ nvmlPciInfo_t remoteProc;
+ if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue;
+
+ // Make a lower case copy of the bus ID for calling ncclDeviceType
+ // PCI system path is in lower case
+ char* p = remoteProc.busId;
+ char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) {
+ lowerId[c] = tolower(p[c]);
+ if (p[c] == 0) break;
+ }
+
+ NCCLCHECK(xmlGetSubKv(gpuNode, "nvlink", &nvlNode, "target", lowerId));
+ if (nvlNode == NULL) {
+ NCCLCHECK(xmlAddNode(xml, gpuNode, "nvlink", &nvlNode));
+ NCCLCHECK(xmlSetAttr(nvlNode, "target", lowerId));
+ NCCLCHECK(xmlSetAttrInt(nvlNode, "count", 1));
+ } else {
+ int count;
+ NCCLCHECK(xmlGetAttrInt(nvlNode, "count", &count));
+ NCCLCHECK(xmlSetAttrInt(nvlNode, "count", count+1));
+ }
+ }
+ }
+ // Fill target classes
+ for (int s=0; s<gpuNode->nSubs; s++) {
+ struct ncclXmlNode* sub = gpuNode->subs[s];
+ if (strcmp(sub->name, "nvlink") != 0) continue;
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(sub, "tclass", &index));
+ if (index == -1) {
+ const char* busId;
+ NCCLCHECK(xmlGetAttr(sub, "target", &busId));
+ char* path;
+ NCCLCHECK(getPciPath(busId, &path));
+ NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass"));
+ }
+ }
+ *gpuNodeRet = gpuNode;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode) {
+ struct ncclXmlNode* node;
+ NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node));
+ NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
+ NCCLCHECK(wrapNvmlSymbols());
+ NCCLCHECK(wrapNvmlInit());
+ nvmlDevice_t nvmlDev;
+ if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL;
+ NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode));
+ return ncclSuccess;
+}
+
+// Returns the subsystem name of a device, i.e. the last component of the
+// path that sysPath/subsystem points to.
+ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) {
+ char subSysPath[PATH_MAX];
+ sprintf(subSysPath, "%s/subsystem", sysPath);
+ char* path = realpath(subSysPath, NULL);
+ if (path == NULL) {
+ subSys[0] = '\0';
+ } else {
+ int offset;
+ for (offset = strlen(path); offset > 0 && path[offset] != '/'; offset--);
+ strcpy(subSys, path+offset+1);
+ free(path);
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode) {
+ NCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName));
+ if (*netNode != NULL) return ncclSuccess;
+
+ const char* pciSysPath = pciPath;
+ if (pciSysPath) {
+ char subSystem[PATH_MAX];
+ NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem));
+ // This is not a PCI device (virtual, usb, ...).
+ if (strcmp(subSystem, "pci") != 0) {
+ INFO(NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem);
+ pciSysPath = NULL;
+ }
+ }
+
+ struct ncclXmlNode* parent = NULL;
+ if (pciSysPath) {
+ int offset;
+ for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--);
+ char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+ strcpy(busId, pciSysPath+offset+1);
+ NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", busId));
+ if (parent == NULL) {
+ NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent));
+ NCCLCHECK(xmlSetAttr(parent, "busid", busId));
+ NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml));
+ }
+ } else {
+ // Virtual NIC, no PCI device, attach to first CPU
+ NCCLCHECK(xmlFindTag(xml, "cpu", &parent));
+ }
+
+ struct ncclXmlNode* nicNode = NULL;
+ NCCLCHECK(xmlGetSub(parent, "nic", &nicNode));
+ if (nicNode == NULL) {
+ NCCLCHECK(xmlAddNode(xml, parent, "nic", &nicNode));
+ }
+
+ // We know that this net does not exist yet (we searched for it at the
+ // beginning of this function), so we can add it.
+ NCCLCHECK(xmlAddNode(xml, nicNode, "net", netNode));
+ NCCLCHECK(xmlSetAttr(*netNode, "name", netName));
+ return ncclSuccess;
+}
+
+/**************************************************/
+/* Parser rules for the user-defined graph search */
+/**************************************************/
+
+ncclResult_t ncclTopoXmlGraphLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlGraphLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlGraphLoadChannel(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ struct xmlHandler handlers[] = { { "net", ncclTopoXmlGraphLoadNet }, { "gpu", ncclTopoXmlGraphLoadGpu } };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlGraphLoadGraph(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
+ struct xmlHandler handlers[] = { { "channel", ncclTopoXmlGraphLoadChannel } };
+ NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoXmlGraphLoadGraphs(FILE* file, struct ncclXml* xmlGraph, struct ncclXmlNode* head) {
+ int version;
+ NCCLCHECK(xmlGetAttrInt(head, "version", &version));
+ if (version != NCCL_GRAPH_XML_VERSION) {
+ WARN("XML Graph has wrong version %d, %d needed", version, NCCL_GRAPH_XML_VERSION);
+ return ncclInvalidUsage;
+ }
+ const char* name;
+ NCCLCHECK(xmlGetAttr(head, "name", &name));
+ if (name != NULL) INFO(NCCL_GRAPH, "Loading graphs for topology %s", name);
+ else INFO(NCCL_GRAPH, "Loading graphs");
+
+ struct xmlHandler handlers[] = { { "graph", ncclTopoXmlGraphLoadGraph } };
+ NCCLCHECK(xmlLoadSub(file, xmlGraph, head, handlers, 1));
+ return ncclSuccess;
+}
+
+ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml) {
+ FILE* file = fopen(xmlGraphFile, "r");
+ if (file == NULL) {
+ WARN("Could not open XML graph file %s : %s", xmlGraphFile, strerror(errno));
+ return ncclSystemError;
+ }
+ struct xmlHandler handlers[] = { { "graphs", ncclTopoXmlGraphLoadGraphs } };
+ xml->maxIndex = 0;
+ NCCLCHECK(xmlLoadSub(file, xml, NULL, handlers, 1));
+ fclose(file);
+ return ncclSuccess;
+}
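For illustration, the parser rules above accept a <graphs> root whose version attribute must equal NCCL_GRAPH_XML_VERSION, containing <graph> elements, each holding <channel> elements that in turn list <net> and <gpu> tags. A minimal sketch, with a hypothetical file name and illustrative attributes (xmlLoadSub stores whatever attributes the file provides):

// Example graph file (attribute values are illustrative):
//
//   <graphs version="1" name="example">
//     <graph>
//       <channel>
//         <net dev="0"/>
//         <gpu dev="0"/>
//       </channel>
//     </graph>
//   </graphs>
//
struct ncclXml xml;
NCCLCHECK(ncclTopoGetXmlGraphFromFile("example_graph.xml", &xml));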
diff --git a/src/graph/xml.h b/src/graph/xml.h
new file mode 100644
index 0000000..fa04527
--- /dev/null
+++ b/src/graph/xml.h
@@ -0,0 +1,237 @@
+/*************************************************************************
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef XML_H_
+#define XML_H_
+
+// A few constraints to make the implementation easy
+#define MAX_STR_LEN 256
+#define MAX_ATTR_COUNT 16
+#define MAX_SUBS 32
+#define MAX_NODES 1024
+
+#define NODE_TYPE_NONE 0
+#define NODE_TYPE_OPEN 1
+#define NODE_TYPE_CLOSE 2
+#define NODE_TYPE_SINGLE 3
+
+struct ncclXmlNode {
+ char name[MAX_STR_LEN];
+ struct {
+ char key[MAX_STR_LEN];
+ char value[MAX_STR_LEN];
+ } attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
+ int nAttrs;
+ int type;
+ struct ncclXmlNode* parent;
+ struct ncclXmlNode* subs[MAX_SUBS];
+ int nSubs;
+};
+
+struct ncclXml {
+ struct ncclXmlNode nodes[MAX_NODES];
+ int maxIndex;
+};
+
+/* File functions */
+#define NCCL_TOPO_XML_VERSION 1
+ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml);
+ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml);
+#define NCCL_GRAPH_XML_VERSION 1
+ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml);
+
+/* Auto-detect functions */
+ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode);
+ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode);
+
+/**************/
+/* XML Struct */
+/* Functions */
+/**************/
+
+static ncclResult_t xmlGetAttrIndex(struct ncclXmlNode* node, const char* attrName, int* index) {
+ *index = -1;
+ const int nAttrs = node->nAttrs;
+ for (int a=0; a<nAttrs; a++) {
+ if (strncmp(node->attrs[a].key, attrName, MAX_STR_LEN-1) == 0) {
+ *index = a;
+ return ncclSuccess;
+ }
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlGetAttr(struct ncclXmlNode* node, const char* attrName, const char** value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ *value = index == -1 ? NULL : node->attrs[index].value;
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlGetAttrStr(struct ncclXmlNode* node, const char* attrName, const char** value) {
+ NCCLCHECK(xmlGetAttr(node, attrName, value));
+ if (*value == NULL) {
+ WARN("Attribute %s of node %s not found", attrName, node->name);
+ return ncclInternalError;
+ }
+ return ncclSuccess;
+}
+static ncclResult_t xmlGetAttrInt(struct ncclXmlNode* node, const char* attrName, int* value) {
+ const char* str;
+ NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
+ *value = strtol(str, NULL, 0);
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) {
+ const char* str;
+ NCCLCHECK(xmlGetAttrStr(node, attrName, &str));
+ *value = strtof(str, NULL);
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlFindTag(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node) {
+ *node = NULL;
+ for (int i=0; i<xml->maxIndex; i++) {
+ struct ncclXmlNode* n = xml->nodes+i;
+ if (strcmp(n->name, tagName) == 0) {
+ *node = n;
+ return ncclSuccess;
+ }
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlFindTagKv(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node, const char* attrName, const char* attrValue) {
+ *node = NULL;
+ for (int i=0; i<xml->maxIndex; i++) {
+ struct ncclXmlNode* n = xml->nodes+i;
+ if (strcmp(n->name, tagName) == 0) {
+ const char* value;
+ NCCLCHECK(xmlGetAttr(n, attrName, &value));
+ if (value && strcmp(value, attrValue) == 0) {
+ *node = n;
+ return ncclSuccess;
+ }
+ }
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, const char* value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ }
+ strncpy(node->attrs[index].value, value, MAX_STR_LEN);
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName, const int value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ }
+ snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value);
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrName, const float value) {
+ int index;
+ NCCLCHECK(xmlGetAttrIndex(node, attrName, &index));
+ if (index == -1) {
+ index = node->nAttrs++;
+ strncpy(node->attrs[index].key, attrName, MAX_STR_LEN);
+ }
+ snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value);
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlGetSub(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub) {
+ *sub = NULL;
+ for (int s=0; s<node->nSubs; s++) {
+ if (strcmp(node->subs[s]->name, subName) == 0) {
+ *sub = node->subs[s];
+ return ncclSuccess;
+ }
+ }
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlGetSubKv(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub, const char* attrName, const char* attrValue) {
+ *sub = NULL;
+ for (int s=0; s<node->nSubs; s++) {
+ struct ncclXmlNode* subNode = node->subs[s];
+ if (strcmp(subNode->name, subName) == 0) {
+ const char* value;
+ NCCLCHECK(xmlGetAttr(subNode, attrName, &value));
+ if (value && strcmp(value, attrValue) == 0) {
+ *sub = node->subs[s];
+ return ncclSuccess;
+ }
+ }
+ }
+ return ncclSuccess;
+}
+static ncclResult_t xmlGetSubKvInt(struct ncclXmlNode* node, const char* subName, struct ncclXmlNode** sub, const char* attrName, const int attrValue) {
+ char strValue[10];
+ snprintf(strValue, 10, "%d", attrValue);
+ NCCLCHECK(xmlGetSubKv(node, subName, sub, attrName, strValue));
+ return ncclSuccess;
+}
+
+static ncclResult_t xmlAddNode(struct ncclXml* xml, struct ncclXmlNode* parent, const char* subName, struct ncclXmlNode** sub) {
+ if (xml->maxIndex == MAX_NODES) {
+ WARN("Error : too many XML nodes (max %d)", MAX_NODES);
+ return ncclInternalError;
+ }
+ struct ncclXmlNode* s = xml->nodes+xml->maxIndex++;
+ s->nSubs = 0;
+ s->nAttrs = 0;
+ *sub = s;
+ s->parent = parent;
+ if (parent) parent->subs[parent->nSubs++] = s;
+ strncpy(s->name, subName, MAX_STR_LEN);
+ return ncclSuccess;
+}
+
+// Dictionary for STR -> INT conversions. The dictionary carries no size
+// information, so it must end with a sentinel element whose str == NULL.
+struct kvDict {
+ const char* str;
+ int value;
+};
+
+static ncclResult_t kvConvertToInt(const char* str, int* value, struct kvDict* dict) {
+ struct kvDict* d = dict;
+ while (d->str) {
+ if (strncmp(str, d->str, strlen(d->str)) == 0) {
+ *value = d->value;
+ return ncclSuccess;
+ }
+ d++;
+ }
+ WARN("KV Convert to int : could not find value of '%s' in dictionary", str);
+ return ncclInternalError;
+}
+static ncclResult_t kvConvertToStr(int value, const char** str, struct kvDict* dict) {
+ struct kvDict* d = dict;
+ while (d->str) {
+ if (value == d->value) {
+ *str = d->str;
+ return ncclSuccess;
+ }
+ d++;
+ }
+ WARN("KV Convert to str : could not find value %d in dictionary", value);
+ return ncclInternalError;
+}
+
+#endif
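Since struct kvDict carries no length, every dictionary must end with the NULL sentinel described above; note that kvConvertToInt matches by prefix of the dictionary key (strncmp over strlen(d->str)). A minimal sketch with hypothetical contents:

// Hypothetical dictionary; the { NULL, 0 } sentinel terminates the scan.
struct kvDict exampleDict[] = { { "alpha", 0 }, { "beta", 1 }, { NULL, 0 } };

int v;
NCCLCHECK(kvConvertToInt("beta", &v, exampleDict));  // v == 1 (prefix match)
const char* s;
NCCLCHECK(kvConvertToStr(0, &s, exampleDict));       // s == "alpha"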
diff --git a/src/include/align.h b/src/include/align.h
new file mode 100644
index 0000000..1c9e7aa
--- /dev/null
+++ b/src/include/align.h
@@ -0,0 +1,19 @@
+/*************************************************************************
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ALIGN_H_
+#define NCCL_ALIGN_H_
+
+#define DIVUP(x, y) \
+ (((x)+(y)-1)/(y))
+
+#define ROUNDUP(x, y) \
+ (DIVUP((x), (y))*(y))
+
+#define ALIGN_SIZE(size, align) \
+ size = ((size + (align) - 1) / (align)) * (align);
+
+#endif
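A quick worked example of the helpers above, noting that DIVUP and ROUNDUP are pure expressions while ALIGN_SIZE mutates its argument:

// DIVUP(10, 4) == 3 (ceiling division); ROUNDUP(10, 4) == 12.
size_t bytes = 10;
ALIGN_SIZE(bytes, 4);  // bytes is now 12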
diff --git a/src/include/alloc.h b/src/include/alloc.h
index bcdbd18..27e206f 100644
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -9,6 +9,7 @@
#include "nccl.h"
#include "checks.h"
+#include "align.h"
#include <sys/mman.h>
static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) {
@@ -48,4 +49,18 @@ static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
return ncclSuccess;
}
+// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
+// allocated on separate pages as those pages will be marked DONTFORK;
+// if they were shared, that could cause a crash in a child process.
+static ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
+ size_t page_size = sysconf(_SC_PAGESIZE);
+ void* p;
+ int size_aligned = ROUNDUP(size, page_size);
+ int ret = posix_memalign(&p, page_size, size_aligned);
+ if (ret != 0) return ncclSystemError;
+ memset(p, 0, size);
+ *ptr = p;
+ return ncclSuccess;
+}
+
#endif
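A minimal usage sketch of ncclIbMalloc (the size is illustrative): keeping the allocation on its own pages means marking them MADV_DONTFORK cannot affect neighboring heap data.

void* fifo = NULL;
NCCLCHECK(ncclIbMalloc(&fifo, 1024));  // zeroed, page-aligned block
// ... may now be passed to ibv_reg_mr, which marks the pages DONTFORK ...
free(fifo);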
diff --git a/src/include/checks.h b/src/include/checks.h
index 50737b0..257e9ca 100644
--- a/src/include/checks.h
+++ b/src/include/checks.h
@@ -56,7 +56,7 @@
ncclResult_t res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
- INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
return res; \
} \
} while (0);
@@ -65,7 +65,7 @@
res = call; \
if (res != ncclSuccess) { \
/* Print the back trace*/ \
- INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
+ if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \
goto label; \
} \
} while (0);
diff --git a/src/include/coll_net.h b/src/include/coll_net.h
new file mode 100644
index 0000000..3278560
--- /dev/null
+++ b/src/include/coll_net.h
@@ -0,0 +1,34 @@
+/*************************************************************************
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef COLL_NET_H_
+#define COLL_NET_H_
+
+#include "nccl.h"
+#include "nccl_net.h"
+
+extern ncclCollNet_t* ncclCollNet;
+typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
+
+// Translation to external API
+static const char* collNetName() { return ncclCollNet->name; }
+static ncclResult_t collNetDevices(int* ndev) { NCCLCHECK(ncclCollNet->devices(ndev)); return ncclSuccess; }
+static ncclResult_t collNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
+static ncclResult_t collNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
+static ncclResult_t collNetConnect(void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
+static ncclResult_t collNetReduceSupport(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
+static ncclResult_t collNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclCollNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclCollNet->deregMr(comm, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetIallreduce(void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
+ NCCLCHECK(ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
+static ncclResult_t collNetFlush(void* collComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclCollNet->flush(collComm, data, size, mhandle)); return ncclSuccess; }
+static ncclResult_t collNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclCollNet->test(request, done, size)); return ncclSuccess; }
+static ncclResult_t collNetCloseColl(void* collComm) { NCCLCHECK(ncclCollNet->closeColl(collComm)); return ncclSuccess; }
+static ncclResult_t collNetCloseListen(void* listenComm) { NCCLCHECK(ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
+
+static int collNetSupport() { return ncclCollNet != NULL ? 1 : 0; }
+
+#endif
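A hedged sketch of how these wrappers are expected to chain together (out-of-band handle exchange and error cleanup elided; not a definitive flow):

static ncclResult_t collNetExampleFlow(void* handles[], int nranks, int rank) {
  int ndev;
  NCCLCHECK(collNetDevices(&ndev));
  if (ndev <= 0) return ncclSystemError;
  void* listenComm;
  collNetHandle_t handle;
  NCCLCHECK(collNetListen(0, handle, &listenComm));
  // handles[] would be gathered from all ranks out of band here.
  void* collComm;
  NCCLCHECK(collNetConnect(handles, nranks, rank, listenComm, &collComm));
  // collNetRegMr / collNetIallreduce / collNetTest would follow, then:
  NCCLCHECK(collNetCloseColl(collComm));
  NCCLCHECK(collNetCloseListen(listenComm));
  return ncclSuccess;
}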
diff --git a/src/include/collectives.h b/src/include/collectives.h
index 69c8e74..bd64106 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -30,7 +30,8 @@
#define DECL_COLL3(coll, op, dtype) \
DECL_COLL4(coll##Ring, op, dtype) \
- DECL_COLL4(coll##Tree, op, dtype)
+ DECL_COLL4(coll##Tree, op, dtype) \
+ DECL_COLL4(coll##CollNet, op, dtype)
#define DECL_COLL2(coll, op) \
DECL_COLL3(coll, op, i8) \
diff --git a/src/include/comm.h b/src/include/comm.h
index 7164dc0..cc87a42 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -20,8 +20,6 @@ struct cudaLaunchParams {
};
#endif
-#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
-
#define CACHE_LINE_SIZE 128
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL
@@ -91,14 +89,11 @@ struct ncclComm {
// Channels for collectives
int nChannels;
- // Only nvlink is used for inter-GPU communication
- int nvlink;
-
// Algorithm/Protocols thresholds
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
- int maxThreads[NCCL_NUM_PROTOCOLS];
+ int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
// An internal CUDA stream for NCCL kernel CGMD launches
int groupCudaStream;
@@ -136,6 +131,9 @@ struct ncclComm {
// Global proxy thread
pthread_t proxyThread;
struct ncclProxyState proxyState;
+
+ // Whether this communicator uses collNet
+ int collNetSupport;
};
#endif
diff --git a/src/include/core.h b/src/include/core.h
index 250f43b..ac5fa85 100644
--- a/src/include/core.h
+++ b/src/include/core.h
@@ -53,9 +53,10 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) {
#define NCCL_NUM_FUNCTIONS 5
typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce } ncclFunc_t;
-#define NCCL_NUM_ALGORITHMS 2 // Tree/Ring
+#define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
+#define NCCL_ALGO_COLLNET 2
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_LL 0
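With CollNet added as a third algorithm, the per-communicator tuning tables (see the comm.h hunk above) are indexed by [function][algorithm][protocol]; a hedged indexing sketch, assuming a valid ncclComm* comm:

float bw = comm->bandwidths[ncclCollAllReduce][NCCL_ALGO_COLLNET][NCCL_PROTO_LL];
int   nt = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL];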
diff --git a/src/include/cpuset.h b/src/include/cpuset.h
index 98b93de..40c1594 100644
--- a/src/include/cpuset.h
+++ b/src/include/cpuset.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -19,7 +19,7 @@ static int hexToInt(char c) {
#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
-ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) {
+ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
uint32_t cpumasks[CPU_SET_N_U32];
int m = CPU_SET_N_U32-1;
cpumasks[m] = 0;
diff --git a/src/include/debug.h b/src/include/debug.h
index 89b6e42..d88458c 100644
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -29,11 +29,6 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
// Let code temporarily downgrade WARN into INFO
extern thread_local int ncclDebugNoWarn;
-#define NOWARN(a, ret) do { \
- ncclDebugNoWarn = 1; \
- ret = a; \
- ncclDebugNoWarn = 0; \
-} while (0)
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
index 46d236b..96c69ba 100644
--- a/src/include/devcomm.h
+++ b/src/include/devcomm.h
@@ -8,19 +8,12 @@
#define NCCL_DEVICE_H_
#include "nccl.h"
+#include "align.h"
#include <stdint.h>
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
-#define DIVUP(x, y) \
- (((x)+(y)-1)/(y))
-#define ROUNDUP(x, y) \
- (DIVUP((x), (y))*(y))
-
-#define ALIGN_SIZE(size, align) \
- size = ((size + (align) - 1) / (align)) * (align);
-
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
@@ -73,6 +66,9 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
+#define NCCL_DIRECT_GPU 0x01
+#define NCCL_DIRECT_NIC 0x10
+
struct ncclConnInfo {
// Regular comm mechanism
char *buff; // Local for recv, remote for send
@@ -171,6 +167,8 @@ struct ncclChannel {
struct ncclRing ring;
struct ncclTree treeUp;
struct ncclTree treeDn;
+ struct ncclTree collTreeUp;
+ struct ncclTree collTreeDn;
int id;
int nthreads;
diff --git a/src/include/graph.h b/src/include/graph.h
index 3c8ba19..b27ea35 100644
--- a/src/include/graph.h
+++ b/src/include/graph.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -14,17 +14,6 @@
#include <ctype.h>
#include <stdio.h>
-enum ncclPathDist {
- PATH_PIX = 0,
- PATH_PXB = 1,
- PATH_PHB = 2,
- PATH_NODE = 3,
- PATH_SYS = 4,
- PATH_ARRAY_SIZE = 5
-};
-
-extern const char* pathDists[PATH_ARRAY_SIZE];
-
ncclResult_t ncclTopoCudaPath(int cudaDev, char** path);
struct ncclTopoSystem;
@@ -36,32 +25,47 @@ ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeerInfo* info);
void ncclTopoFree(struct ncclTopoSystem* system);
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
-ncclResult_t ncclTopoGetMaxSpeed(struct ncclTopoSystem* system);
// Query topology
-ncclResult_t ncclTopoGetNvlink(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* nvlink);
-ncclResult_t ncclTopoHasNvlink(struct ncclTopoSystem* system, int64_t busId, int* nvlink);
-ncclResult_t ncclTopoGpuDistance(struct ncclTopoSystem* system, int64_t busId1, int64_t busId2, int* distance);
ncclResult_t ncclTopoGetNetDev(struct ncclTopoGraph* graph, int dir, int channelId, int* net);
-ncclResult_t ncclTopoNetDistance(struct ncclTopoSystem* system, int64_t busId, int netDev, int* distance);
-ncclResult_t ncclTopoCpuCount(struct ncclTopoSystem* system, int* count);
+ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p);
+ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
+
+// Set CPU affinity
+ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
+
+#define NCCL_TOPO_CPU_ARCH_X86 1
+#define NCCL_TOPO_CPU_ARCH_POWER 2
+#define NCCL_TOPO_CPU_ARCH_ARM 3
+#define NCCL_TOPO_CPU_VENDOR_INTEL 1
+#define NCCL_TOPO_CPU_VENDOR_AMD 2
+#define NCCL_TOPO_CPU_TYPE_BDW 1
+#define NCCL_TOPO_CPU_TYPE_SKL 2
+ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
#define NCCL_TOPO_MAX_NODES 256
+// Init search. Needs to be done before calling ncclTopoCompute
+ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
+
#define NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP 1 // Split tree (send/recv from different ranks) always flowing in the same direction
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Split tree (send/recv from different ranks) flowing in both directions
#define NCCL_TOPO_PATTERN_TREE 3 // Simple tree (send/recv from same rank) flowing in both directions
#define NCCL_TOPO_PATTERN_RING 4 // Ring
struct ncclTopoGraph {
// Input / output
+ int id; // ring : 0, tree : 1, collnet : 2
int pattern;
int crossNic;
+ int collNet;
+ int minChannels;
+ int maxChannels;
// Output
int nChannels;
- int speedIntra;
- int speedInter;
- int type;
- int nvlink;
+ float speedIntra;
+ float speedInter;
+ int typeIntra;
+ int typeInter;
int sameChannels;
int nHops;
int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES];
@@ -70,6 +74,7 @@ struct ncclTopoGraph {
ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
+ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs);
struct ncclTopoRanks {
int ringRecv[MAXCHANNELS];
@@ -83,12 +88,16 @@ struct ncclTopoRanks {
};
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
- struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
+ struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
struct ncclTopoRanks** allTopoRanks, int* rings);
-ncclResult_t ncclSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph);
+ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
+
+ncclResult_t ncclTopoSetThresholds(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph);
+#include "info.h"
+ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time);
#endif
diff --git a/src/include/info.h b/src/include/info.h
index 9461759..46b9795 100644
--- a/src/include/info.h
+++ b/src/include/info.h
@@ -17,7 +17,9 @@ typedef enum {
ncclPatternPipelineTo,
ncclPatternTreeUp,
ncclPatternTreeDown,
- ncclPatternTreeUpDown
+ ncclPatternTreeUpDown,
+ ncclPatternCollTreeUp,
+ ncclPatternCollTreeDown
} ncclPattern_t;
// Used to pass NCCL call information between functions
diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h
index d6ae9f8..95dce5b 100644
--- a/src/include/nccl_net.h
+++ b/src/include/nccl_net.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,6 +8,7 @@
#define NCCL_NET_H_
#include "nccl.h"
+#include <stdint.h>
#define NCCL_NET_HANDLE_MAXSIZE 64
@@ -20,18 +21,27 @@ typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCC
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
typedef struct {
+ char* name; // Used mostly for logging.
+ char* pciPath; // Path to the PCI device in /sys.
+ uint64_t guid; // Unique identifier for the NIC chip. Important for
+ // cards with multiple PCI functions (Physical or virtual).
+ int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
+ int speed; // Port speed in Mbps.
+ int port; // Port number.
+ int maxComms; // Maximum number of comms we can create
+} ncclNetProperties_v3_t;
+
+typedef ncclNetProperties_v3_t ncclNetProperties_t;
+
+typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
- // Return the device path in /sys. NCCL will call free on this path.
- ncclResult_t (*pciPath)(int dev, char** path);
- // Return whether this device supports host pointers and/or CUDA pointers
- // as data from the current GPU. Supported types should be composed with
- // NCCL_PTR_HOST and NCCL_PTR_CUDA.
- ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
+ // Get various device properties.
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
@@ -40,15 +50,19 @@ typedef struct {
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connectHandle
ncclResult_t (*accept)(void* listenComm, void** recvComm);
- // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+ // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+ ncclResult_t (*deregMr)(void* comm, void* mhandle);
+ // Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
- ncclResult_t (*isend)(void* sendComm, void* data, int size, int type, void** request);
- // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
+ // Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
- ncclResult_t (*irecv)(void* recvComm, void* data, int size, int type, void** request);
+ ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
- ncclResult_t (*flush)(void* recvComm, void* data, int size);
+ ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
@@ -56,53 +70,52 @@ typedef struct {
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v1_t;
+} ncclNet_v3_t;
+
+typedef ncclNet_v3_t ncclNet_t;
+
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v3
typedef struct {
- // Name of the network (mainly for logs)
+ // Name of the collective network (mainly for logs)
const char* name;
- // Initialize the network.
+ // Initialize the collective network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
- // Return the number of adapters.
+ // Return the number of adapters capable of doing collective operations.
+ // If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
- // Return the device path in /sys. NCCL will call free on this path.
- ncclResult_t (*pciPath)(int dev, char** path);
- // Return whether this device supports host pointers and/or CUDA pointers
- // as data from the current GPU. Supported types should be composed with
- // NCCL_PTR_HOST and NCCL_PTR_CUDA.
- ncclResult_t (*ptrSupport)(int dev, int* supportedTypes);
+ // Get various device properties.
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v3_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
- // between ranks to create a connection.
+ // between ranks to create connections.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
- // Connect to a handle and return a sending comm object for that peer.
- ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
- // Finalize connection establishment after remote peer has called connectHandle
- ncclResult_t (*accept)(void* listenComm, void** recvComm);
- // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
- // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
- ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
- ncclResult_t (*deregMr)(void* comm, void* mhandle);
- // Asynchronous send to a peer.
- // May return request == NULL if the call cannot be performed (or would block)
- ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
- // Asynchronous recv from a peer.
- // May return request == NULL if the call cannot be performed (or would block)
- ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
+ // Create a group for collective operations. handles have been created
+ // using listen() above. rank indicates caller's rank in the collective network.
+ ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+ // Returns whether a reduction operation on a data type is supported.
+ // 1 for supported, 0 otherwise.
+ ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+ // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+ ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+ ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+ // Performs an asynchronous allreduce operation on the collective group.
+ // May return request == NULL if the call cannot be performed (or would block).
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+ ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
- ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle);
+ ncclResult_t (*flush)(void* collComm, void* data, int size, void* mhandle);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
- // Close and free send/recv comm objects
- ncclResult_t (*closeSend)(void* sendComm);
- ncclResult_t (*closeRecv)(void* recvComm);
+ // Close and free collective comm objects
+ ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
-} ncclNet_v2_t;
+} ncclCollNet_v3_t;
-typedef ncclNet_v2_t ncclNet_t;
+typedef ncclCollNet_v3_t ncclCollNet_t;
-#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v3
#endif // end include guard
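For reference, a hedged skeleton of an external v3 plugin (all example* symbols are hypothetical stubs): NCCL dlopens libnccl-net.so and resolves NCCL_PLUGIN_SYMBOL, i.e. ncclNetPlugin_v3.

#include "nccl_net.h"

static ncclResult_t exampleInit(ncclDebugLogger_t logFunction) { return ncclSuccess; }
static ncclResult_t exampleDevices(int* ndev) { *ndev = 0; return ncclSuccess; }
// getProperties, listen, connect, accept, regMr, deregMr, isend, irecv,
// flush, test and the three close functions would be implemented similarly.

ncclNet_v3_t ncclNetPlugin_v3 = {
  "example",      // name (mainly for logs)
  exampleInit,
  exampleDevices,
  // remaining members, in the declaration order of ncclNet_v3_t above,
  // point at the corresponding stubs.
};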
diff --git a/src/include/net.h b/src/include/net.h
index 3d37c8c..bc81965 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -16,7 +16,7 @@ typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
// Translation to external API
static const char* ncclNetName() { return ncclNet->name; }
static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; }
-static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; }
+static ncclResult_t ncclNetGetProperties(int dev, ncclNetProperties_t* props) { NCCLCHECK(ncclNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; }
static ncclResult_t ncclNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; }
@@ -30,33 +30,40 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS
static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; }
static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; }
+// Test whether the current GPU supports GPU Direct RDMA.
#define GPU_BUF_SIZE (2*1024*1024)
-static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) {
- int support;
- NCCLCHECK(ncclNet->ptrSupport(dev, &support));
- *supportedTypes = support & ~NCCL_PTR_CUDA;
- // The network supports GPU Direct RDMA ; verify the GPU supports it as well.
- if (support & NCCL_PTR_CUDA) {
+static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
+ int netDevs;
+ NCCLCHECK(ncclNetDevices(&netDevs));
+ *gdrSupport = 0;
+ for (int dev=0; dev<netDevs; dev++) {
+ // Find a net device which is GDR-capable
+ ncclNetProperties_t props;
+ NCCLCHECK(ncclNet->getProperties(dev, &props));
+ if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
+
+ // Allocate memory on the GPU and try to register it on the NIC.
void *lComm = NULL, *sComm = NULL, *rComm = NULL;
ncclNetHandle_t handle;
void* gpuPtr = NULL;
void* mHandle = NULL;
- ncclResult_t res;
- NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), res, cleanup);
- NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), res, cleanup);
- NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), res, cleanup);
- CUDACHECKGOTO(cudaMalloc(&gpuPtr, GPU_BUF_SIZE), res, cleanup);
- NOWARN(ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res);
- if (res != ncclSuccess) goto cleanup;
- NCCLCHECKGOTO(ncclNetDeregMr(sComm, mHandle), res, cleanup);
- NCCLCHECKGOTO(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), res, cleanup);
- NCCLCHECKGOTO(ncclNetDeregMr(rComm, mHandle), res, cleanup);
- *supportedTypes |= NCCL_PTR_CUDA;
-cleanup:
- if (gpuPtr) cudaFree(gpuPtr);
- if (rComm) ncclNetCloseRecv(rComm);
- if (sComm) ncclNetCloseSend(sComm);
- if (lComm) ncclNetCloseListen(lComm);
+ NCCLCHECK(ncclNetListen(dev, &handle, &lComm));
+ NCCLCHECK(ncclNetConnect(dev, &handle, &sComm));
+ NCCLCHECK(ncclNetAccept(lComm, &rComm));
+ CUDACHECK(cudaMalloc(&gpuPtr, GPU_BUF_SIZE));
+ ncclDebugNoWarn = NCCL_NET;
+ if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
+ NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
+ NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
+ NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
+ *gdrSupport = 1;
+ }
+ ncclDebugNoWarn = 0;
+ CUDACHECK(cudaFree(gpuPtr));
+ NCCLCHECK(ncclNetCloseRecv(rComm));
+ NCCLCHECK(ncclNetCloseSend(sComm));
+ NCCLCHECK(ncclNetCloseListen(lComm));
+ break;
}
return ncclSuccess;
}
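A short usage sketch of the new helper: gdrSupport becomes 1 only if some network device advertises NCCL_PTR_CUDA and registering GPU memory over a loopback connection actually succeeds.

int gdrSupport = 0;
NCCLCHECK(ncclGpuGdrSupport(&gdrSupport));
if (gdrSupport) INFO(NCCL_NET, "GPU Direct RDMA is available");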
diff --git a/src/include/socket.h b/src/include/socket.h
index 96bf5db..9376062 100644
--- a/src/include/socket.h
+++ b/src/include/socket.h
@@ -283,6 +283,7 @@ static ncclResult_t GetSocketAddrFromString(union socketAddress* ua, const char*
}
static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) {
+ static int shownIfName = 0;
int nIfs = 0;
// Allow user to force the INET socket family selection
int sock_family = envSocketFamily();
@@ -290,6 +291,7 @@ static int findInterfaces(char* ifNames, union socketAddress *ifAddrs, int ifNam
char* env = getenv("NCCL_SOCKET_IFNAME");
if (env && strlen(env) > 1) {
// Specified by user : find or fail
+ if (shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env);
nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
} else {
// Try to automatically pick the right one
diff --git a/src/include/transport.h b/src/include/transport.h
index 8f9bf0e..e25132f 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -53,6 +53,8 @@ struct ncclProxyArgs {
int nsteps;
uint64_t opCount;
int protocol;
+ ncclDataType_t dtype;
+ ncclRedOp_t redOp;
int state; // add component before this line -- it is left out during initialization
// Internal state
@@ -80,7 +82,7 @@ struct ncclProxyState {
struct ncclTransportComm {
ncclResult_t (*setup)(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int channelId);
- ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*);
+ ncclResult_t (*connect)(struct ncclConnect*, int nranks, int rank, struct ncclConnector*);
ncclResult_t (*free)(void*);
ncclResult_t (*proxy)(struct ncclProxyArgs*);
};
diff --git a/src/include/utils.h b/src/include/utils.h
index 266abca..86ab3a2 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -14,7 +14,7 @@ int ncclCudaCompCap();
// PCI Bus ID <-> int64 conversion functions
ncclResult_t int64ToBusId(int64_t id, char* busId);
-ncclResult_t busIdToInt64(char* busId, int64_t* id);
+ncclResult_t busIdToInt64(const char* busId, int64_t* id);
ncclResult_t getBusId(int cudaDev, int64_t *busId);
diff --git a/src/init.cc b/src/init.cc
index 627f6c7..0a02760 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,11 +11,10 @@
#include "transport.h"
#include "group.h"
#include "net.h"
+#include "coll_net.h"
#include "enqueue.h"
#include "graph.h"
#include "argcheck.h"
-#include "cpuset.h"
-#include <sched.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
@@ -43,6 +42,7 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM);
NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0);
ncclNet_t* ncclNet = NULL;
+ncclCollNet_t* ncclCollNet = NULL;
// Returns ncclInternalError if anything fails, causing that network to be ignored.
ncclResult_t initNet(ncclNet_t* net) {
@@ -53,7 +53,15 @@ ncclResult_t initNet(ncclNet_t* net) {
return ncclSuccess;
}
-ncclResult_t initNetPlugin(ncclNet_t** net) {
+ncclResult_t initCollNet(ncclCollNet_t* collnet) {
+ int ndev;
+ if (collnet->init(ncclDebugLog) != ncclSuccess) return ncclInternalError;
+ if (collnet->devices(&ndev) != ncclSuccess) return ncclInternalError;
+ if (ndev <= 0) return ncclSystemError;
+ return ncclSuccess;
+}
+
+ncclResult_t initNetPlugin(ncclNet_t** net, ncclCollNet_t** collnet) {
void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL);
if (netPluginLib == NULL) {
// dlopen does not guarantee to set errno, but dlerror only gives us a
@@ -69,13 +77,17 @@ ncclResult_t initNetPlugin(ncclNet_t** net) {
ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL));
if (extNet == NULL) {
INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol.");
- goto cleanup;
- }
- if (initNet(extNet) == ncclSuccess) {
+ } else if (initNet(extNet) == ncclSuccess) {
*net = extNet;
+ // Check for CollNet
+ ncclCollNet_t* extCollNet = (ncclCollNet_t*) dlsym(netPluginLib, STR(NCCL_COLLNET_PLUGIN_SYMBOL));
+ if (extCollNet == NULL) {
+ INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_COLLNET_PLUGIN_SYMBOL) " symbol.");
+ } else if (initCollNet(extCollNet) == ncclSuccess) {
+ *collnet = extCollNet;
+ }
return ncclSuccess;
}
-cleanup:
if (netPluginLib != NULL) dlclose(netPluginLib);
return ncclSuccess;
}
@@ -84,7 +96,7 @@ ncclResult_t initNet() {
// Always initialize bootstrap network
NCCLCHECK(bootstrapNetInit());
- NCCLCHECK(initNetPlugin(&ncclNet));
+ NCCLCHECK(initNetPlugin(&ncclNet, &ncclCollNet));
if (ncclNet != NULL) return ncclSuccess;
if (initNet(&ncclNetIb) == ncclSuccess) {
ncclNet = &ncclNetIb;
@@ -95,6 +107,8 @@ ncclResult_t initNet() {
return ncclSuccess;
}
+NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0);
+
pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
static bool initialized = false;
static ncclResult_t ncclInit() {
@@ -103,6 +117,7 @@ static ncclResult_t ncclInit() {
if (!initialized) {
initEnv();
initNet();
+ INFO(NCCL_INIT, "Using network %s", ncclNetName());
initialized = true;
}
pthread_mutex_unlock(&initLock);
@@ -220,6 +235,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
*comm->abortFlag = 0;
comm->argsptr = &comm->args;
+ comm->collNetSupport = 0;
*comret = comm;
return ncclSuccess;
@@ -233,7 +249,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) {
// Copy userRanks and peers
for (int r=0; r<comm->nChannels; r++) {
NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
- NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks));
+ NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks+1));
}
// Duplicate the dev comm on the device
@@ -269,14 +285,8 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
info->shmDev = statbuf.st_dev;
info->busId = comm->busId;
- int netDevs;
- NCCLCHECK(ncclNetDevices(&netDevs));
- for (int n=0; n<netDevs; n++) {
- int ptrSupport;
- NCCLCHECK(ncclNetPtrSupport(n, &ptrSupport));
- if (ptrSupport & NCCL_PTR_CUDA) info->gdrSupport |= (1 << n);
- }
+ NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
return ncclSuccess;
}
@@ -396,7 +406,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph,
struct ncclConnector* conn;
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
- if (peer == -1) continue;
+ if (peer == -1 || peer >= comm->nRanks) continue;
conn = &channel->peers[peer].recv;
if (conn->connected) { ++nSkippedRecv; continue; }
memset(&connect, 0, sizeof(connect));
@@ -405,7 +415,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph,
}
for (int i=0; i<nsend; i++) {
int peer = peerSend[i];
- if (peer == -1) continue;
+ if (peer == -1 || peer >= comm->nRanks) continue;
conn = &channel->peers[peer].send;
if (conn->connected) { ++nSkippedSend; continue; }
memset(&connect, 0, sizeof(connect));
@@ -414,29 +424,148 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph,
}
for (int i=0; i<nsend; i++) {
int peer = peerSend[i];
- if (peer == -1) continue;
+ if (peer == -1 || peer >= comm->nRanks) continue;
conn = &channel->peers[peer].send;
if (conn->connected) {++nSkippedSend; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
- NCCLCHECK(conn->transportComm->connect(&connect, conn));
+ NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
conn->connected = 1;
}
for (int i=0; i<nrecv; i++) {
int peer = peerRecv[i];
- if (peer == -1) continue;
+ if (peer == -1 || peer >= comm->nRanks) continue;
conn = &channel->peers[peer].recv;
if (conn->connected) {++nSkippedRecv; continue; }
memset(&connect, 0, sizeof(connect));
NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect)));
- NCCLCHECK(conn->transportComm->connect(&connect, conn));
+ NCCLCHECK(conn->transportComm->connect(&connect, 1, comm->rank, conn));
conn->connected = 1;
}
TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv);
return ncclSuccess;
}
+extern struct ncclTransport collNetTransport;
+
+// All ranks must participate in collNetSetup call
+// type: 0 for send, 1 for recv
+// return: 0 - unsupported, 1 - supported
+static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int collNetChannels, int rank, int nranks, int masterRank, int masterPeer, int nMasters, int type) {
+ int rankInCollNet = -1;
+ int supported = 0;
+ int isMaster = (rank == masterRank) ? 1 : 0;
+ struct {
+ int collNetRank;
+ ncclConnect connect;
+ } sendrecvExchange;
+
+ // check if we can connect to collnet, whose root is the nranks-th rank
+ struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks;
+ peerInfo->rank = nranks;
+ int ret = 1;
+ if (isMaster) {
+ NCCLCHECK(collNetTransport.canConnect(&ret, comm->topo, collNetGraph, myInfo, peerInfo));
+ }
+
+ // send master receives connect info from peer recv master
+ if (isMaster && type == 0) {
+ NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)));
+ rankInCollNet = sendrecvExchange.collNetRank;
+ INFO(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer);
+ }
+
+ // select the connector and transport comm for this side (send or recv)
+ struct ncclPeer* root = channel->peers+nranks;
+ struct ncclConnector* conn = (type == 1) ? &root->recv : &root->send;
+ struct ncclTransportComm* transportComm = (type == 1) ? &(collNetTransport.recv) : &(collNetTransport.send);
+ conn->transportComm = transportComm;
+ // setup
+ struct ncclConnect myConnect;
+ if (isMaster && ret > 0) {
+ NCCLCHECK(transportComm->setup(comm->topo, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->buffSize, channel->id));
+ }
+ // prepare connect handles
+ ncclResult_t res;
+ struct {
+ int isMaster;
+ ncclConnect connect;
+ } *allConnects = NULL;
+ ncclConnect *masterConnects = NULL;
+ NCCLCHECK(ncclCalloc(&masterConnects, nMasters));
+ if (type == 1) { // recv side: AllGather
+ // all ranks must participate
+ NCCLCHECK(ncclCalloc(&allConnects, nranks));
+ allConnects[rank].isMaster = isMaster;
+ memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect));
+ NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup);
+ // consolidate
+ int c = 0;
+ for (int r = 0; r < nranks; r++) {
+ if (allConnects[r].isMaster) {
+ memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect));
+ if (r == rank) rankInCollNet = c;
+ c++;
+ }
+ }
+ } else { // send side : copy in connect info received from peer recv master
+ if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect));
+ }
+ // connect
+ if (isMaster && ret > 0) {
+ NCCLCHECKGOTO(transportComm->connect(masterConnects, nMasters, rankInCollNet, conn), res, cleanup);
+ }
+ // recv side sends connect info to send side
+ if (isMaster && type == 1) {
+ sendrecvExchange.collNetRank = rankInCollNet;
+ memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect));
+ NCCLCHECK(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)));
+ INFO(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer);
+ }
+ if (ret > 0) {
+ supported = 1;
+ }
+cleanup:
+ if (allConnects != NULL) free(allConnects);
+ if (masterConnects != NULL) free(masterConnects);
+ return supported;
+}
+
+static ncclResult_t checkCollNetSetup(struct ncclComm* comm, int rank, int collNetSetupFail) {
+ int nranks = comm->nRanks;
+ // AllGather collNet setup results
+ int* allGatherFailures;
+ NCCLCHECK(ncclCalloc(&allGatherFailures, nranks));
+ allGatherFailures[rank] = collNetSetupFail;
+ NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGatherFailures, sizeof(int)));
+ for (int i=0; i<nranks; i++) {
+ if (allGatherFailures[i] != 0) {
+ collNetSetupFail = 1;
+ break;
+ }
+ }
+ free(allGatherFailures);
+ if (collNetSetupFail) {
+ if (rank == 0) WARN("Cannot initialize CollNet, using %s instead", ncclNetName());
+ // Free collNet resources
+ for (int r=0; r<comm->nChannels; r++) {
+ struct ncclChannel* channel = comm->channels+r;
+ struct ncclPeer* peer = channel->peers+nranks;
+ if (peer->send.transportResources && peer->send.transportComm) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
+ if (peer->recv.transportResources && peer->recv.transportComm) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
+ peer->send.transportResources = NULL; // avoid double free
+ peer->recv.transportResources = NULL; // avoid double free
+ }
+ // Set support to 0
+ comm->collNetSupport = 0;
+ } else {
+ comm->collNetSupport = 1;
+ }
+ return ncclSuccess;
+}
+
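To make the handshake above easier to follow, a hedged sketch of the per-channel call order during init (the master/peer variables are illustrative, not the literal init.cc code): the recv side runs first so its collNetRank can be forwarded to the matching send master, and checkCollNetSetup then agrees on a global verdict.

int fail = 0;
// recv side first: recv masters AllGather their connect info and send
// their collNetRank to the peer send master at the end of setup.
if (collNetSetup(comm, &collNetGraph, channel, nChannels, rank, nranks,
                 recvMaster, sendMaster, nMasters, 1 /*recv*/) != 1) fail = 1;
// then send side: send masters block until the peer recv master's info arrives.
else if (collNetSetup(comm, &collNetGraph, channel, nChannels, rank, nranks,
                      sendMaster, recvMaster, nMasters, 0 /*send*/) != 1) fail = 1;
NCCLCHECK(checkCollNetSetup(comm, rank, fail));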
NCCL_PARAM(CrossNic, "CROSS_NIC", 2);
+NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0);
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
// We use 3 AllGathers
@@ -462,7 +591,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(fillInfo(comm, myInfo, commHash));
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
- NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
+ NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks+1)); // Extra rank to represent CollNet root
for (int i = 0; i < nranks; i++) {
memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
@@ -481,60 +610,82 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(ncclTopoTrimSystem(comm->topo, comm));
// Recompute paths after trimming
NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo));
- // Compute max speed to accelerate search
- NCCLCHECK(ncclTopoGetMaxSpeed(comm->topo));
+ // Init search
+ NCCLCHECK(ncclTopoSearchInit(comm->topo));
// Print final topology
NCCLCHECK(ncclTopoPrint(comm->topo));
// Get rings and trees
- struct ncclTopoGraph treeGraph;
- treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
- treeGraph.crossNic = ncclParamCrossNic();
- // We communicate only half the data between node with trees on 2 nodes.
- NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
- NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
struct ncclTopoGraph ringGraph;
+ ringGraph.id = 0;
ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
ringGraph.crossNic = ncclParamCrossNic();
+ ringGraph.collNet = 0;
+ ringGraph.minChannels = 1;
+ ringGraph.maxChannels = MAXCHANNELS/2;
NCCLCHECK(ncclTopoCompute(comm->topo, &ringGraph));
NCCLCHECK(ncclTopoPrintGraph(comm->topo, &ringGraph));
+ struct ncclTopoGraph treeGraph;
+ treeGraph.id = 1;
+ treeGraph.pattern = NCCL_TOPO_PATTERN_SPLIT_TREE;
+ treeGraph.crossNic = ncclParamCrossNic();
+ treeGraph.collNet = 0;
+ treeGraph.minChannels = 1;
+ treeGraph.maxChannels = ringGraph.nChannels;
+ NCCLCHECK(ncclTopoCompute(comm->topo, &treeGraph));
+ NCCLCHECK(ncclTopoPrintGraph(comm->topo, &treeGraph));
+
+ struct ncclTopoGraph collNetGraph;
+ collNetGraph.id = 2;
+ collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE;
+ collNetGraph.collNet = 1;
+ collNetGraph.crossNic = ncclParamCrossNic();
+ collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels;
+ NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph));
+ NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph));
+
+ if (comm->rank == ncclParamGraphDumpFileRank()) {
+ struct ncclTopoGraph* graphs[3] = { &ringGraph, &treeGraph, &collNetGraph };
+ NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs));
+ }
+
// AllGather3 - begin
+ struct ncclGraphInfo {
+ int sameChannels;
+ float speedIntra;
+ float speedInter;
+ int typeIntra;
+ };
struct {
int cudaCompCap;
int fullCudaCompCap;
- int nvlink;
int nChannels;
- struct {
- int sameChannels;
- int speedIntra;
- int speedInter;
- int nvlink;
- } tree;
- struct {
- int sameChannels;
- int speedIntra;
- int speedInter;
- int nvlink;
- } ring;
+ struct ncclGraphInfo tree;
+ struct ncclGraphInfo ring;
+ struct ncclGraphInfo collNet;
struct ncclTopoRanks topoRanks;
} *allGather3Data;
NCCLCHECK(ncclCalloc(&allGather3Data, nranks));
allGather3Data[rank].cudaCompCap = ncclCudaCompCap();
- allGather3Data[rank].nvlink = treeGraph.nvlink;
- allGather3Data[rank].nChannels = comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
+ allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
+ std::min(treeGraph.nChannels, ringGraph.nChannels);
allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;
allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra;
allGather3Data[rank].tree.speedInter = treeGraph.speedInter;
- allGather3Data[rank].tree.nvlink = treeGraph.nvlink;
+ allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra;
allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels;
allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra;
allGather3Data[rank].ring.speedInter = ringGraph.speedInter;
- allGather3Data[rank].ring.nvlink = ringGraph.nvlink;
+ allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra;
+ allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels;
+ allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra;
+ allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter;
+ allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra;
- NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks));
+ NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks));
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)));
@@ -562,9 +713,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
maxCompCap = std::max(allGather3Data[i].cudaCompCap, maxCompCap);
}
- comm->nvlink = 1;
- for (int i = 0; i < nranks; i++) comm->nvlink &= allGather3Data[i].nvlink;
-
int nChannelsOrig = comm->nChannels;
struct ncclTopoRanks** allTopoRanks;
NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
@@ -575,11 +723,15 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra);
treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter);
- treeGraph.nvlink = std::min(allGather3Data[i].tree.nvlink, treeGraph.nvlink);
+ treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra);
ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels);
ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra);
ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter);
- ringGraph.nvlink = std::min(allGather3Data[i].ring.nvlink, ringGraph.nvlink);
+ ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra);
+ collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels);
+ collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra);
+ collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter);
+ collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra);
}
if (comm->nChannels < nChannelsOrig) {
@@ -592,6 +744,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings));
+ if (comm->nNodes > 1 &&
+ ncclParamCollNetEnable() == 1 &&
+ collNetSupport()) {
+ NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank));
+ }
free(allTopoRanks);
free(nodesFirstRank);
@@ -601,7 +758,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
TRACE(NCCL_INIT, "rank %d nranks %d - BUILT %d TREES/RINGS", rank, nranks, comm->nChannels);
- NCCLCHECK(ncclSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph));
+ NCCLCHECK(ncclTopoSetThresholds(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph));
char line[1024];
line[0]='\0';
@@ -615,21 +772,58 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
line[1023] = '\0';
INFO(NCCL_INIT, "Trees%s", line);
+ // Set affinity to a CPU local to our GPU, so that all memory we allocate
+ // on the host is local.
+ cpu_set_t affinitySave;
+ sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ NCCLCHECK(ncclTopoSetAffinity(comm->topo, comm->rank));
+ ncclResult_t ret = ncclSuccess; // initialize: the success path also falls through to affinity_restore
+
// Connect with prev/next for each ring
struct ncclConnect *connect;
- NCCLCHECK(ncclCalloc(&connect, 2));
+ NCCLCHECKGOTO(ncclCalloc(&connect, 2), ret, affinity_restore);
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
- NCCLCHECK(setupChannel(comm, c, rank, nranks, rings+c*nranks));
+ NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore);
if (comm->nRanks == 1) continue;
- NCCLCHECK(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next));
- NCCLCHECK(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up));
- NCCLCHECK(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down));
+ NCCLCHECKGOTO(p2pSetup(comm, &ringGraph, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore);
+ NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, NCCL_MAX_TREE_ARITY, channel->treeUp.down, 1, &channel->treeUp.up), ret, affinity_restore);
+ NCCLCHECKGOTO(p2pSetup(comm, &treeGraph, channel, 1, &channel->treeDn.up, NCCL_MAX_TREE_ARITY, channel->treeDn.down), ret, affinity_restore);
+ }
+
+ // Check if we can setup CollNet
+ if (comm->nNodes > 1 &&
+ ncclParamCollNetEnable() == 1 &&
+ collNetSupport()) {
+ int logicChannels = comm->nChannels/2;
+ int collNetSetupFail = 0;
+ const int recvIndex = 0; // recv GPU index is always 0
+ const int sendIndex = collNetGraph.pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern
+ for (int c=0; c<logicChannels; c++) {
+ struct ncclChannel* channelRecv = comm->channels+logicChannels+c;
+ struct ncclChannel* channelSend = comm->channels+c;
+ NCCLCHECK(p2pSetup(comm, &collNetGraph, channelRecv, 1, &channelRecv->collTreeDn.up, 1, channelRecv->collTreeDn.down));
+ NCCLCHECK(p2pSetup(comm, &collNetGraph, channelSend, 1, channelSend->collTreeUp.down, 1, &channelSend->collTreeUp.up));
+ const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex];
+ const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex];
+ if (collNetSetup(comm, &collNetGraph, channelRecv, logicChannels, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1)
+ collNetSetupFail = 1;
+ if (collNetSetup(comm, &collNetGraph, channelSend, logicChannels, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1)
+ collNetSetupFail = 1;
+ }
+ // Verify CollNet setup across ranks
+ NCCLCHECK(checkCollNetSetup(comm, rank, collNetSetupFail));
}
TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels);
free(connect);
free(rings);
+ // All buffers, collective fifos, ... should have been allocated by now,
+ // so we can restore the affinity.
+affinity_restore:
+ sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+ if (ret != ncclSuccess) return ret;
+
// Compute intra ranks (using AllGather1 data)
int intraRank0 = -1, intraRank = -1, intraRanks = 0;
for (int i = 0; i < nranks; i++) {
@@ -658,98 +852,20 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
return ncclSuccess;
}
-static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) {
- CPU_ZERO_S(sizeof(cpu_set_t), mask);
- char* cudaPath;
- NCCLCHECK(ncclTopoCudaPath(cudaDev, &cudaPath));
- char path[PATH_MAX];
- strncpy(path, cudaPath, PATH_MAX-1);
- snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus");
- path[PATH_MAX-1] = '\0';
- int fd;
- SYSCHECKVAL(open(path, O_RDONLY), "open", fd);
- char affinityStr[sizeof(cpu_set_t)*2 + 1];
- int r = read(fd, affinityStr, sizeof(cpu_set_t)*2);
- if (r > 0) {
- affinityStr[r] = '\0';
- NCCLCHECK(ncclStrToCpuset(affinityStr, mask));
- }
- close(fd);
- free(cudaPath);
- return ncclSuccess;
-}
-
-NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0);
-
-static ncclResult_t setCpuAffinity(int cudaDev) {
- // Query the CPU affinity set we were provided
- cpu_set_t mask;
- SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity");
-
-#ifdef ENABLE_TRACE
- {
- char affinityStr[sizeof(cpu_set_t)*2];
- NCCLCHECK(ncclCpusetToStr(&mask, affinityStr));
- TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr);
- }
-#endif
-
- // Find the CPUs that are local to the supplied GPU
- cpu_set_t gpuMask;
- NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask));
-
-#ifdef ENABLE_TRACE
- {
- char affinityStr[sizeof(cpu_set_t)*2];
- NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr));
- TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr);
- }
-#endif
-
- cpu_set_t finalMask;
- if (ncclParamIgnoreCpuAffinity())
- // Ignore the CPU affinity set and use the GPU one instead
- finalMask = gpuMask;
- else
- // Use a subset of the GPU affinity set
- CPU_AND(&finalMask, &mask, &gpuMask);
-
- // If there is a non empty set, use it to set affinity
- if (CPU_COUNT(&finalMask)) {
- char affinityStr[sizeof(cpu_set_t)*2];
- NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr));
- INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr);
- SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity");
- }
- return ncclSuccess;
-}
-
ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) {
- cpu_set_t affinitySave;
- sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);
-
- NCCLCHECK(wrapNvmlSymbols());
- NCCLCHECK(wrapNvmlInit());
-
- // Make sure all host memory allocation are close to the GPU
- CUDACHECK(cudaSetDevice(cudaDev));
- NCCLCHECK(setCpuAffinity(cudaDev));
ncclResult_t res;
+ CUDACHECK(cudaSetDevice(cudaDev));
NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup);
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
- sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
- NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup);
-
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
return ncclSuccess;
cleanup:
if ((*newcomm) && (*newcomm)->bootstrap) bootstrapAbort((*newcomm)->bootstrap);
*newcomm = NULL;
- sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
return res;
}
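The per-communicator setCpuAffinity/getCpuGpuAffinity pair removed above is replaced by ncclTopoSetAffinity, called inside initTransportsRank, with the affinity_restore label guaranteeing the caller's mask comes back on every exit path via NCCLCHECKGOTO. A condensed sketch of that save/set/restore-on-any-exit pattern, assuming only sched_getaffinity/sched_setaffinity (checkedStep is a hypothetical stand-in for the NCCLCHECKGOTO'd setup calls):

  #include <sched.h>

  extern ncclResult_t checkedStep(struct ncclComm* comm); // hypothetical stand-in

  static ncclResult_t withLocalAffinity(struct ncclComm* comm) {
    cpu_set_t affinitySave;
    sched_getaffinity(0, sizeof(cpu_set_t), &affinitySave);   // remember the caller's mask
    NCCLCHECK(ncclTopoSetAffinity(comm->topo, comm->rank));   // bind close to our GPU
    ncclResult_t ret = ncclSuccess;
    NCCLCHECKGOTO(checkedStep(comm), ret, affinity_restore);  // any failure jumps below
  affinity_restore:
    sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);   // always undo the binding
    return ret;
  }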
diff --git a/src/misc/utils.cc b/src/misc/utils.cc
index 5158529..782e9c0 100644
--- a/src/misc/utils.cc
+++ b/src/misc/utils.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -24,7 +24,7 @@ ncclResult_t int64ToBusId(int64_t id, char* busId) {
return ncclSuccess;
}
-ncclResult_t busIdToInt64(char* busId, int64_t* id) {
+ncclResult_t busIdToInt64(const char* busId, int64_t* id) {
const int size = strlen(busId);
char* hexStr;
NCCLCHECK(ncclCalloc(&hexStr, size));
diff --git a/src/transport.cc b/src/transport.cc
index 4059849..cc8d5d1 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -100,6 +100,7 @@ static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) {
struct ncclPeer* peerComm = args->channel->peers+peer;
struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
+ if (connector->transportComm == NULL) return ncclInternalError;
if (connector->transportComm->proxy == NULL) return ncclSuccess;
struct ncclProxyArgs* op;
@@ -130,6 +131,18 @@ ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int r
for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy<proxySend>(tree->down[i], args));
NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
}
+ if (pattern == ncclPatternCollTreeUp) {
+ // CollTree up
+ struct ncclTree* tree = &args->channel->collTreeUp;
+ NCCLCHECK(SaveProxy<proxyRecv>(tree->down[0], args));
+ NCCLCHECK(SaveProxy<proxySend>(tree->up, args));
+ }
+ if (pattern == ncclPatternCollTreeDown) {
+ // CollTree down
+ struct ncclTree* tree = &args->channel->collTreeDn;
+ NCCLCHECK(SaveProxy<proxySend>(tree->down[0], args));
+ NCCLCHECK(SaveProxy<proxyRecv>(tree->up, args));
+ }
return ncclSuccess;
}
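The new coll_net transport below sends only the payload halves of LL lines over the network: an ncclLLFifoLine carries two 32-bit data words plus two 32-bit flags, and the static_assert pins ncclLLDataLine to exactly half that size. The send proxy strips the flags before collNetIallreduce and the recv proxy re-attaches them, roughly as in this sketch (types mirror the structs declared below; packLL/unpackLL are illustrative names):

  #include <stdint.h>

  // Strip the flags: keep only the two payload words of each LL line.
  static void packLL(struct ncclLLDataLine* out, const union ncclLLFifoLine* in, int n) {
    for (int i = 0; i < n; i++) { out[i].data1 = in[i].data1; out[i].data2 = in[i].data2; }
  }

  // Re-attach the step's flag in the high 32 bits of each 64-bit word.
  static void unpackLL(union ncclLLFifoLine* out, const struct ncclLLDataLine* in, int n, uint32_t flag) {
    for (int i = 0; i < n; i++) {
      out[i].v[0] = ((uint64_t)flag << 32) + in[i].data1;
      out[i].v[1] = ((uint64_t)flag << 32) + in[i].data2;
    }
  }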
diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc
new file mode 100644
index 0000000..73e9fdd
--- /dev/null
+++ b/src/transport/coll_net.cc
@@ -0,0 +1,430 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "coll_net.h"
+#include "graph.h"
+#include <assert.h>
+
+struct collNetRecvConnectInfo {
+ collNetHandle_t collNetHandle;
+};
+
+struct collNetSendConnectInfo {
+ void* collNetComm;
+ void* mhandle;
+ void* llMhandle;
+ struct reqSlot* reqFifo;
+};
+
+struct ncclLLDataLine {
+ uint32_t data1;
+ uint32_t data2;
+};
+static_assert(sizeof(struct ncclLLDataLine) == sizeof(union ncclLLFifoLine)>>1, "ncclLLDataLine is not half the size of ncclLLFifoLine");
+
+struct reqSlot {
+ volatile void* recvBuff;
+ volatile int size;
+};
+
+struct collNetSendResources {
+ void* collNetSendComm;
+ struct ncclSendMem* hostSendMem;
+ struct ncclRecvMem* hostRecvMem;
+ struct ncclSendMem* devHostSendMem;
+ struct ncclRecvMem* devHostRecvMem;
+ struct ncclLLDataLine* llData;
+ int netDev;
+ int useGdr;
+ int buffSize;
+ void* sendMhandle;
+ void* llSendMhandle;
+ void* recvMhandle;
+ void* llRecvMhandle;
+ struct ncclRecvMem* devRecvMem;
+ uint64_t step;
+ uint64_t llLastCleaning;
+ struct reqSlot* reqFifo;
+ int collNetRank;
+};
+
+struct collNetRecvResources {
+ void* netListenComm;
+ void* collNetRecvComm;
+ struct ncclSendMem* hostSendMem;
+ struct ncclRecvMem* hostRecvMem;
+ struct ncclSendMem* devHostSendMem;
+ struct ncclRecvMem* devHostRecvMem;
+ struct ncclLLDataLine* llData;
+ int netDev;
+ int useGdr;
+ int buffSize;
+ void* mhandle;
+ void* llMhandle;
+ struct ncclRecvMem* devRecvMem;
+ uint64_t step;
+ uint64_t llLastCleaning;
+ struct reqSlot* reqFifo;
+ int collNetRank;
+};
+
+/* Determine if we can communicate with the peer */
+ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
+ *ret = 1;
+ return ncclSuccess;
+}
+
+/* Setup send connector, and return connect information for others in the coll communicator to connect to me */
+ncclResult_t collNetSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
+ struct collNetSendResources* sendResources;
+ NCCLCHECK(ncclCalloc(&sendResources, 1));
+ send->transportResources = sendResources;
+
+ NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &sendResources->netDev));
+ NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, sendResources->netDev, 1, &sendResources->useGdr));
+
+ int sendSize = sizeof(struct ncclSendMem);
+ NCCLCHECK(ncclCudaHostAlloc((void**)&sendResources->hostSendMem, (void**)&sendResources->devHostSendMem, sendSize));
+
+ int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+ if (sendResources->useGdr) {
+ NCCLCHECK(ncclCudaCalloc((char**)(&sendResources->devRecvMem), recvSize));
+ }
+ NCCLCHECK(ncclCudaHostAlloc((void**)&sendResources->hostRecvMem, (void**)&sendResources->devHostRecvMem, recvSize));
+ NCCLCHECK(ncclIbMalloc((void**)&(sendResources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine)));
+ sendResources->buffSize = buffSize;
+
+ INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), sendResources->netDev,
+ sendResources->useGdr ? "/GDRDMA" : "");
+
+ return ncclSuccess;
+}
+
+/* Setup recv connector */
+ncclResult_t collNetRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) {
+ struct collNetRecvResources* recvResources;
+ NCCLCHECK(ncclCalloc(&recvResources, 1));
+ recv->transportResources = recvResources;
+
+ NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &recvResources->netDev));
+ NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, recvResources->netDev, 0, &recvResources->useGdr));
+
+ int sendSize = sizeof(struct ncclSendMem);
+ NCCLCHECK(ncclCudaHostAlloc((void**)&recvResources->hostSendMem, (void**)&recvResources->devHostSendMem, sendSize));
+
+ int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize;
+ if (recvResources->useGdr) {
+ NCCLCHECK(ncclCudaCalloc((char**)(&recvResources->devRecvMem), recvSize));
+ }
+ NCCLCHECK(ncclCudaHostAlloc((void**)&recvResources->hostRecvMem, (void**)&recvResources->devHostRecvMem, recvSize));
+ NCCLCHECK(ncclIbMalloc((void**)&(recvResources->llData), NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine)));
+ recvResources->buffSize = buffSize;
+
+ INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), recvResources->netDev,
+ recvResources->useGdr ? "/GDRDMA" : "");
+
+ struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo;
+ NCCLCHECK(collNetListen(recvResources->netDev, &info->collNetHandle, &recvResources->netListenComm));
+
+ return ncclSuccess;
+}
+
+ncclResult_t collNetSendConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* send) {
+ // Setup device pointers
+ struct collNetSendResources* sendResources = (struct collNetSendResources*)send->transportResources;
+ sendResources->collNetRank = rank;
+
+ // Get info from recv side
+ struct collNetSendConnectInfo* sInfo = (struct collNetSendConnectInfo*)(connectInfos+rank);
+ sendResources->reqFifo = sInfo->reqFifo;
+ sendResources->collNetSendComm = sInfo->collNetComm;
+ sendResources->recvMhandle = sInfo->mhandle;
+ sendResources->llRecvMhandle = sInfo->llMhandle;
+
+ // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host
+ struct ncclRecvMem* sRecvMem = sendResources->useGdr ? sendResources->devRecvMem : sendResources->devHostRecvMem;
+ // Register buffers
+ NCCLCHECK(collNetRegMr(sendResources->collNetSendComm, sRecvMem->buff, sendResources->buffSize,
+ sendResources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &sendResources->sendMhandle));
+ NCCLCHECK(collNetRegMr(sendResources->collNetSendComm, sendResources->llData,
+ NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &sendResources->llSendMhandle));
+
+ send->conn.buff = sRecvMem->buff;
+ send->conn.llBuff = sendResources->devHostRecvMem->llBuff;
+ send->conn.direct |= sendResources->useGdr ? NCCL_DIRECT_NIC : 0;
+
+ // Head/Tail/Opcount/Fifos are always on host
+ send->conn.tail = &sendResources->devHostRecvMem->tail;
+ send->conn.opCountRem = &sendResources->devHostRecvMem->opCount;
+ send->conn.fifo = sendResources->devHostRecvMem->sizesFifo;
+ send->conn.head = &sendResources->devHostSendMem->head;
+ send->conn.opCountLoc = &sendResources->devHostSendMem->opCount;
+ for (int i=0; i<NCCL_STEPS; i++) send->conn.fifo[i] = -1;
+
+ return ncclSuccess;
+}
+
+ncclResult_t collNetRecvConnect(struct ncclConnect* connectInfos, int nranks, int rank, struct ncclConnector* recv) {
+ // Setup device pointers
+ struct collNetRecvResources* recvResources = (struct collNetRecvResources*)recv->transportResources;
+ struct collNetSendConnectInfo* sInfo = (struct collNetSendConnectInfo*)(connectInfos+rank);
+ recvResources->collNetRank = rank;
+
+ // Intermediate buffering on GPU for GPU Direct RDMA
+ struct ncclRecvMem* rRecvMem = recvResources->useGdr ? recvResources->devRecvMem : recvResources->devHostRecvMem;
+ recv->conn.buff = rRecvMem->buff;
+ recv->conn.llBuff = recvResources->devHostRecvMem->llBuff; // recv LL buff always on host
+ recv->conn.direct |= recvResources->useGdr ? NCCL_DIRECT_NIC : 0;
+
+ // Head/Tail/Opcount are always on host
+ recv->conn.tail = &recvResources->devHostRecvMem->tail;
+ recv->conn.opCountLoc = &recvResources->devHostRecvMem->opCount;
+ recv->conn.head = &recvResources->devHostSendMem->head;
+ recv->conn.opCountRem = &recvResources->devHostSendMem->opCount;
+
+ // Connect to coll comm
+ collNetHandle_t** handlePtrs = NULL;
+ NCCLCHECK(ncclCalloc(&handlePtrs, nranks));
+ for (int i = 0; i < nranks; i++) {
+ struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i);
+ handlePtrs[i] = &(info->collNetHandle);
+ }
+ ncclResult_t res;
+ NCCLCHECKGOTO(collNetConnect((void**)handlePtrs, nranks, rank, recvResources->netListenComm, &recvResources->collNetRecvComm), res, cleanup);
+
+ // Register buffers
+ NCCLCHECK(collNetRegMr(recvResources->collNetRecvComm, rRecvMem->buff, recvResources->buffSize,
+ recvResources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &recvResources->mhandle));
+ NCCLCHECK(collNetRegMr(recvResources->collNetRecvComm, recvResources->llData,
+ NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine), NCCL_PTR_HOST, &recvResources->llMhandle));
+
+ // Create shared info between send and recv proxies
+ NCCLCHECK(ncclCalloc(&(recvResources->reqFifo), NCCL_STEPS));
+
+ // Pass info to send side
+ sInfo->reqFifo = recvResources->reqFifo;
+ sInfo->collNetComm = recvResources->collNetRecvComm;
+ sInfo->mhandle = recvResources->mhandle;
+ sInfo->llMhandle = recvResources->llMhandle;
+
+cleanup:
+ if (handlePtrs != NULL) free(handlePtrs);
+ // Close listen comm
+ NCCLCHECK(collNetCloseListen(recvResources->netListenComm));
+
+ return res;
+}
+
+ncclResult_t collNetSendFree(void* sendTransportResources) {
+ struct collNetSendResources* sendResources = (struct collNetSendResources*)sendTransportResources;
+ NCCLCHECK(ncclCudaHostFree(sendResources->hostSendMem));
+ NCCLCHECK(ncclCudaHostFree(sendResources->hostRecvMem));
+ if (sendResources->collNetSendComm) {
+ NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->sendMhandle));
+ NCCLCHECK(collNetDeregMr(sendResources->collNetSendComm, sendResources->llSendMhandle));
+ }
+ if (sendResources->useGdr)
+ CUDACHECK(cudaFree(sendResources->devRecvMem));
+ free(sendResources->llData);
+ free(sendResources);
+ return ncclSuccess;
+}
+
+ncclResult_t collNetRecvFree(void* recvTransportResources) {
+ struct collNetRecvResources* recvResources = (struct collNetRecvResources*)recvTransportResources;
+ NCCLCHECK(ncclCudaHostFree(recvResources->hostSendMem));
+ if (recvResources->collNetRecvComm) {
+ NCCLCHECK(collNetDeregMr(recvResources->collNetRecvComm, recvResources->mhandle));
+ NCCLCHECK(collNetDeregMr(recvResources->collNetRecvComm, recvResources->llMhandle));
+ }
+ NCCLCHECK(ncclCudaHostFree(recvResources->hostRecvMem));
+ if (recvResources->useGdr)
+ CUDACHECK(cudaFree(recvResources->devRecvMem));
+ free(recvResources->llData);
+ free(recvResources->reqFifo);
+
+ // Make sure SendFree is called before RecvFree
+ if (recvResources->collNetRecvComm) {
+ NCCLCHECK(collNetCloseColl(recvResources->collNetRecvComm));
+ }
+ free(recvResources);
+ return ncclSuccess;
+}
+
+ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
+ if (args->protocol == NCCL_PROTO_LL128) {
+ WARN("CollNet does not support LL128");
+ return ncclInternalError;
+ }
+ struct collNetSendResources* resources = (struct collNetSendResources*) (args->connector->transportResources);
+ if (args->state == ncclProxyOpReady) {
+ // Update opCount
+ resources->hostRecvMem->opCount = args->opCount;
+
+ // Round to next multiple of sliceSteps
+ resources->step = ROUNDUP(resources->step, args->chunkSteps);
+ args->head = resources->step;
+ args->tail = resources->step;
+ args->end = args->head + args->nsteps;
+ args->state = ncclProxyOpProgress;
+ }
+ if (args->state == ncclProxyOpProgress) {
+ args->idle = 1;
+ struct reqSlot* reqFifo = resources->reqFifo;
+ if (args->head < args->end) {
+ int buffSlot = args->tail%NCCL_STEPS;
+ if (args->tail < args->end && args->tail < args->head + NCCL_STEPS
+ && reqFifo[buffSlot].recvBuff != NULL) {
+ volatile int* sizesFifo = resources->hostRecvMem->sizesFifo;
+ volatile uint64_t* recvTail = &resources->hostRecvMem->tail;
+ if (args->protocol == NCCL_PROTO_LL) {
+ int size = sizesFifo[buffSlot];
+ if (size != -1) {
+ uint32_t flag = NCCL_LL_FLAG(args->tail + 1);
+ int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine));
+ union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES;
+ int ready = 1;
+ for (int i=0; i<nFifoLines; i++) {
+ volatile uint32_t *f1 = &lines[i].flag1;
+ volatile uint32_t *f2 = &lines[i].flag2;
+ if (f1[0] != flag || f2[0] != flag) { ready = 0; break; }
+ }
+ if (ready) {
+ // Separate the data words from the flags
+ struct ncclLLDataLine* sendBuff = resources->llData+buffSlot*NCCL_LL_SLICE_LINES;
+ for (int i=0; i<nFifoLines; i++) {
+ volatile uint32_t *d1 = &lines[i].data1;
+ volatile uint32_t *d2 = &lines[i].data2;
+ sendBuff[i].data1 = d1[0];
+ sendBuff[i].data2 = d2[0];
+ }
+ int count = nFifoLines*sizeof(struct ncclLLDataLine) / ncclTypeSize(args->dtype);
+ NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*)sendBuff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->llSendMhandle, resources->llRecvMhandle, args->requests+buffSlot));
+ if (args->requests[buffSlot] != NULL) {
+ TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce (LL) posted, req %p", args->head, buffSlot, args->requests[buffSlot]);
+ sizesFifo[buffSlot] = -1;
+ // Make sure the size is reset to -1 before we advance the tail and reuse the slot.
+ __sync_synchronize();
+ args->tail += args->sliceSteps;
+ args->idle = 0;
+ }
+ }
+ }
+ } else if (args->tail < *recvTail) {
+ int stepSize = args->channel->buffSize/NCCL_STEPS;
+ struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+ // Send through network
+ if (sizesFifo[buffSlot] != -1) {
+ int count = sizesFifo[buffSlot]/ncclTypeSize(args->dtype);
+ NCCLCHECK(collNetIallreduce(resources->collNetSendComm, localMem->buff+buffSlot*stepSize, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, resources->sendMhandle, resources->recvMhandle, args->requests+buffSlot));
+ if (args->requests[buffSlot] != NULL) {
+ TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p count %d", args->head, buffSlot, args->requests[buffSlot], count);
+ sizesFifo[buffSlot] = -1;
+ // Make sure the size is reset to -1 before we advance the tail and reuse the slot.
+ __sync_synchronize();
+ args->tail += args->sliceSteps;
+ args->idle = 0;
+ }
+ }
+ }
+ }
+ if (args->head < args->tail) {
+ int done, size;
+ int buffSlot = args->head%NCCL_STEPS;
+ NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
+ if (done) {
+ TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->head, buffSlot, args->requests[buffSlot], size);
+ reqFifo[buffSlot].size = size;
+ // Make sure size is updated before we clear recvBuff: the recv proxy treats a NULL recvBuff as completion
+ // and reads the size right away, e.g. for the flush. (Store-store reordering is possible on POWER, though not on x86.)
+ __sync_synchronize();
+ reqFifo[buffSlot].recvBuff = NULL; // Notify recvProxy
+ args->head += args->sliceSteps;
+ resources->hostSendMem->head = args->head;
+ args->idle = 0;
+ }
+ }
+ }
+ if (args->head == args->end) {
+ resources->step = args->end;
+ args->idle = 0;
+ args->state = ncclProxyOpNone;
+ }
+ }
+ return ncclSuccess;
+}
+
+ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
+ if (args->protocol == NCCL_PROTO_LL128) {
+ WARN("CollNet does not support LL128");
+ return ncclInternalError;
+ }
+ struct collNetRecvResources* resources = (struct collNetRecvResources*) (args->connector->transportResources);
+ if (args->state == ncclProxyOpReady) {
+ // Update opCount
+ resources->hostSendMem->opCount = args->opCount;
+
+ // Round to next multiple of sliceSteps
+ resources->step = ROUNDUP(resources->step, args->chunkSteps);
+ args->head = resources->step;
+ args->tail = resources->step;
+ args->end = args->head + args->nsteps;
+ args->state = ncclProxyOpProgress;
+ }
+ if (args->state == ncclProxyOpProgress) {
+ args->idle = 1;
+ int stepSize = ( args->protocol == NCCL_PROTO_LL ? NCCL_LL_BUFF_LINES*sizeof(struct ncclLLDataLine) : args->channel->buffSize ) / NCCL_STEPS;
+ struct reqSlot* reqFifo = resources->reqFifo;
+ if (args->head < args->end) {
+ struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem;
+ char* localBuff = args->protocol == NCCL_PROTO_LL ? (char*)resources->llData : localMem->buff;
+ void* mhandle = args->protocol == NCCL_PROTO_LL ? resources->llMhandle : resources->mhandle;
+ if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) {
+ int buffSlot = args->tail%NCCL_STEPS;
+ reqFifo[buffSlot].recvBuff = localBuff+buffSlot*stepSize;
+ TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->tail, buffSlot, localBuff+buffSlot*stepSize);
+ args->tail += args->sliceSteps;
+ args->idle = 0;
+ }
+ if (args->tail > args->head) {
+ int buffSlot = args->head%NCCL_STEPS;
+ if (reqFifo[buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete
+ TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->head, buffSlot, reqFifo[buffSlot].size);
+ args->head += args->sliceSteps;
+ if (args->protocol == NCCL_PROTO_LL) { // ll
+ // re-attach flag
+ uint32_t flag = args->head;
+ union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(resources->hostRecvMem->llBuff)+buffSlot*NCCL_LL_SLICE_LINES;
+ struct ncclLLDataLine* recvData = resources->llData+buffSlot*NCCL_LL_SLICE_LINES;
+ int nFifoLines = DIVUP(reqFifo[buffSlot].size, sizeof(struct ncclLLDataLine));
+ for (int i=0; i<nFifoLines; i++) {
+ lines[i].v[0] = ((uint64_t)flag << 32) + recvData[i].data1;
+ lines[i].v[1] = ((uint64_t)flag << 32) + recvData[i].data2;
+ }
+ } else if (args->protocol == NCCL_PROTO_SIMPLE) {
+ if (resources->useGdr) collNetFlush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, reqFifo[buffSlot].size, mhandle);
+ resources->hostRecvMem->tail = args->head;
+ }
+ args->idle = 0;
+ }
+ }
+ }
+ if (args->head == args->end) {
+ resources->step = args->end;
+ args->idle = 0;
+ args->state = ncclProxyOpNone;
+ }
+ }
+ return ncclSuccess;
+}
+
+struct ncclTransport collNetTransport = {
+ "COL",
+ collNetCanConnect,
+ { collNetSendSetup, collNetSendConnect, collNetSendFree, collNetSendProxy },
+ { collNetRecvSetup, collNetRecvConnect, collNetRecvFree, collNetRecvProxy }
+};
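The two proxies above synchronize through the reqFifo that the recv side allocates: the recv proxy arms a slot by publishing a destination pointer in recvBuff, the send proxy reduces into it and, once the request completes, writes the size and clears the pointer, with __sync_synchronize ordering the two stores. A condensed sketch of one slot's life cycle (tail/head/localBuff/stepSize/size are the loop variables from the proxies above; error handling elided):

  // Recv proxy: arm slot 'tail' with a destination buffer.
  reqFifo[tail%NCCL_STEPS].recvBuff = localBuff + (tail%NCCL_STEPS)*stepSize;

  // Send proxy: once its collNetIallreduce into that buffer has completed:
  struct reqSlot* slot = reqFifo + head%NCCL_STEPS;
  slot->size = size;        // publish the result size first...
  __sync_synchronize();     // ...fence so the stores cannot be reordered...
  slot->recvBuff = NULL;    // ...then clear the pointer to signal completion

  // Recv proxy: a cleared pointer means this step's collective is done.
  if (reqFifo[head%NCCL_STEPS].recvBuff == NULL) { /* consume slot->size */ }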
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 928a6a9..db82a40 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -53,40 +53,6 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
return ncclSuccess;
}
-NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
-NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
-
-static ncclResult_t netGetGdrSupport(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr) {
- *useGdr = 0;
-
- if (read) { // For reads (sends) only enable under certain conditions
- int gdrReadParam = ncclParamNetGdrRead();
- if (gdrReadParam == 0) return ncclSuccess;
- if (gdrReadParam < 0) {
- int nvlink;
- NCCLCHECK(ncclTopoHasNvlink(topo, busId, &nvlink));
- if (!nvlink) return ncclSuccess;
- }
- }
-
- // Check if we are close enough that it makes sense to enable GDR
- int netGdrLevel = ncclParamNetGdrLevel();
- int distance;
- NCCLCHECK(ncclTopoNetDistance(topo, busId, netDev, &distance));
- if (distance >= netGdrLevel) {
- INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d >= %d)", ncclNetName(), busId, netDev, distance, netGdrLevel);
- return ncclSuccess;
- }
-
- // Finally, check if the NIC supports it
- int flags;
- NCCLCHECK(ncclNetPtrSupport(netDev, &flags));
- if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess;
- *useGdr = 1;
- INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %lx / HCA %d (distance %d < %d), read %d", ncclNetName(), busId, netDev, distance, netGdrLevel, read);
- return ncclSuccess;
-}
-
/* Determine if we will use this transport for this peer and return connect
* information for this peer */
ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) {
@@ -95,7 +61,7 @@ ncclResult_t netSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
send->transportResources = resources;
NCCLCHECK(ncclTopoGetNetDev(graph, 1, channelId, &resources->netDev));
- NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
+ NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 1, &resources->useGdr));
int sendSize = sizeof(struct ncclSendMem);
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -118,7 +84,7 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
recv->transportResources = resources;
NCCLCHECK(ncclTopoGetNetDev(graph, 0, channelId, &resources->netDev));
- NCCLCHECK(netGetGdrSupport(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
+ NCCLCHECK(ncclTopoCheckGdr(topo, myInfo->busId, resources->netDev, 0, &resources->useGdr));
int sendSize = sizeof(struct ncclSendMem);
NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize));
@@ -137,7 +103,7 @@ ncclResult_t netRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
return ncclSuccess;
}
-ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+ncclResult_t netSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers
struct netSendResources* resources = (struct netSendResources*)send->transportResources;
@@ -146,6 +112,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
send->conn.buff = recvMem->buff;
send->conn.llBuff = resources->devHostRecvMem->llBuff;
send->conn.ll128Buff = recvMem->ll128Buff;
+ send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
// Head/Tail/Opcount/Fifos are always on host
send->conn.tail = &resources->devHostRecvMem->tail;
@@ -170,7 +137,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
}
/* Connect to this peer */
-ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
// Setup device pointers
struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources;
@@ -179,6 +146,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto
recv->conn.buff = recvMem->buff;
recv->conn.llBuff = recvMem->llBuff;
recv->conn.ll128Buff = recvMem->ll128Buff;
+ recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0;
// Head/Tail/Opcount are always on host
recv->conn.tail = &resources->devHostRecvMem->tail;
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index 0d5307c..1a832f2 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -28,13 +28,19 @@
#define MAXNAMESIZE 64
static char ncclIbIfName[MAX_IF_NAME_SIZE];
static union socketAddress ncclIbIfAddr;
+
static int ncclNIbDevs = -1;
struct ncclIbDev {
int device;
+ uint64_t guid;
uint8_t port;
uint8_t link;
+ int speed;
ibv_context* context;
char devName[MAXNAMESIZE];
+ char* pciPath;
+ int realPort;
+ int maxQp;
};
#define MAX_IB_PORT 15
@@ -53,20 +59,7 @@ NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 14);
NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7);
NCCL_PARAM(IbSl, "IB_SL", 0);
NCCL_PARAM(IbTc, "IB_TC", 0);
-
-// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
-// allocated on separate pages as those pages will be marked DONTFORK
-// and if they are shared, that could cause a crash in a child process
-static ncclResult_t ncclIbMalloc(void** ptr, size_t size) {
- size_t page_size = sysconf(_SC_PAGESIZE);
- void* p;
- int size_aligned = ROUNDUP(size, page_size);
- int ret = posix_memalign(&p, page_size, size_aligned);
- if (ret != 0) return ncclSystemError;
- memset(p, 0, size);
- *ptr = p;
- return ncclSuccess;
-}
+NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192);
pthread_t ncclIbAsyncThread;
static void* ncclIbAsyncThreadMain(void* args) {
@@ -85,6 +78,39 @@ static void* ncclIbAsyncThreadMain(void* args) {
NCCL_PARAM(IbDisable, "IB_DISABLE", 0);
+static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) {
+ char devicePath[PATH_MAX];
+ snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName);
+ char* p = realpath(devicePath, NULL);
+ if (p == NULL) {
+ WARN("Could not find real path of %s", *devicePath);
+ } else {
+ // Merge multi-port NICs into the same PCI device
+ p[strlen(p)-1] = '0';
+ // And keep the real port aside (the ibv port is always 1 on recent cards)
+ *realPort = 0;
+ for (int d=0; d<ncclNIbDevs; d++) {
+ if (strcmp(p, ncclIbDevs[d].pciPath) == 0) (*realPort)++;
+ }
+ }
+ *path = p;
+ return ncclSuccess;
+}
+
+static int ibvWidths[] = { 1, 4, 8, 12 };
+static int ibvSpeeds[] = { 2500, 5000, 10000, 10000, 14000, 25000, 50000 };
+static int firstBitSet(int val, int max) {
+ int i = 0;
+ while (i<max && ((val & (1<<i)) == 0)) i++;
+ return i;
+}
+static int ncclIbWidth(int width) {
+ return ibvWidths[firstBitSet(width, sizeof(ibvWidths)/sizeof(int)-1)];
+}
+static int ncclIbSpeed(int speed) {
+ return ibvSpeeds[firstBitSet(speed, sizeof(ibvSpeeds)/sizeof(int)-1)];
+}
+
ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
if (ncclParamIbDisable()) return ncclInternalError;
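The ncclIbWidth/ncclIbSpeed helpers above translate the bit-encoded active_width and active_speed fields of ibv_port_attr into a lane count and a per-lane rate in Mbps, which ncclIbInit multiplies into a single link speed. A worked example, assuming an EDR x4 port (active_width 2, i.e. bit 1; active_speed 32, i.e. bit 5):

  int lanes = ncclIbWidth(2);    // firstBitSet(2,3) == 1  -> ibvWidths[1] == 4
  int mbps  = ncclIbSpeed(32);   // firstBitSet(32,6) == 5 -> ibvSpeeds[5] == 25000
  int speed = mbps * lanes;      // 100000 Mbps, i.e. a 100 Gb/s EDR x4 link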
@@ -145,10 +171,14 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port,
portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE");
ncclIbDevs[ncclNIbDevs].device = d;
+ ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid;
ncclIbDevs[ncclNIbDevs].port = port;
ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer;
+ ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width);
ncclIbDevs[ncclNIbDevs].context = context;
strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE);
+ NCCLCHECK(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort));
+ ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp;
ncclNIbDevs++;
nPorts++;
pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context);
@@ -180,17 +210,6 @@ ncclResult_t ncclIbDevices(int* ndev) {
return ncclSuccess;
}
-ncclResult_t ncclIbPciPath(int dev, char** path) {
- char devicepath[PATH_MAX];
- snprintf(devicepath, PATH_MAX, "/sys/class/infiniband/%s/device", ncclIbDevs[dev].devName);
- *path = realpath(devicepath, NULL);
- if (*path == NULL) {
- WARN("Could not find real path of %s", devicepath);
- return ncclSystemError;
- }
- return ncclSuccess;
-}
-
// Detect whether GDR can work on a given NIC with the current CUDA device
// Returns :
// ncclSuccess : GDR works
@@ -204,19 +223,24 @@ ncclResult_t ncclIbGdrSupport(int ibDev) {
return ncclSuccess;
}
-ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) {
- *supportedTypes = NCCL_PTR_HOST;
+static ncclResult_t GetSocketAddr(union socketAddress* addr) {
+ memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
+ return ncclSuccess;
+}
+ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
+ props->name = ncclIbDevs[dev].devName;
+ props->pciPath = ncclIbDevs[dev].pciPath;
+ props->guid = ncclIbDevs[dev].guid;
+ props->ptrSupport = NCCL_PTR_HOST;
if (ncclIbGdrSupport(dev) != ncclSuccess) {
INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
- return ncclSuccess;
+ } else {
+ props->ptrSupport |= NCCL_PTR_CUDA;
}
- *supportedTypes |= NCCL_PTR_CUDA;
- return ncclSuccess;
-}
-
-static ncclResult_t GetSocketAddr(union socketAddress* addr) {
- memcpy(addr, &ncclIbIfAddr, sizeof(*addr));
+ props->speed = ncclIbDevs[dev].speed;
+ props->port = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort;
+ props->maxComms = ncclIbDevs[dev].maxQp;
return ncclSuccess;
}
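ncclIbPciPath and ncclIbPtrSupport collapse into a single ncclIbGetProperties call that also reports guid, speed, port and maxComms; the socket transport gets the same treatment further down. A net plugin built against this interface would populate the same struct, roughly like this sketch (myNetGetProperties and all field values are placeholders, not real device data):

  ncclResult_t myNetGetProperties(int dev, ncclNetProperties_t* props) {
    props->name       = (char*)"example0";  // device name shown in NCCL logs
    props->pciPath    = (char*)"/sys/devices/pci0000:00/0000:00:02.0"; // used for topology
    props->guid       = 0x1234;             // stable id, merges duplicate devices
    props->ptrSupport = NCCL_PTR_HOST;      // OR in NCCL_PTR_CUDA if GDR works
    props->speed      = 100000;             // link speed in Mbps
    props->port       = 1;                  // physical port number
    props->maxComms   = 65536;              // concurrent comms the device supports
    return ncclSuccess;
  }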
@@ -325,7 +349,8 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbVerbs* verbs, int acce
qpInitAttr.send_cq = verbs->cq;
qpInitAttr.recv_cq = verbs->cq;
qpInitAttr.qp_type = IBV_QPT_RC;
- qpInitAttr.cap.max_send_wr = MAX_REQUESTS;
+ // We might send 2 requests per send (RDMA_WRITE+RDMA_WRITE_WITH_IMM)
+ qpInitAttr.cap.max_send_wr = 2*MAX_REQUESTS;
qpInitAttr.cap.max_recv_wr = MAX_REQUESTS;
qpInitAttr.cap.max_send_sge = 1;
qpInitAttr.cap.max_recv_sge = 1;
@@ -627,6 +652,10 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
wr.opcode = IBV_WR_SEND;
wr.send_flags = IBV_SEND_SIGNALED;
+ int useAr = 0;
+ if (size > ncclParamIbArThreshold()) {
+ useAr = 1;
+ }
#if USE_RDMA_WRITE
__sync_synchronize(); // order the readyPtr load against rkey load below
// Sanity checks to catch user collective call count/size mismatches
@@ -636,7 +665,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
size, slot->size, slot->addr, slot->rkey, slot->seq, comm->fifoHead);
return ncclInternalError;
}
- wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+ wr.opcode = useAr ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_WRITE_WITH_IMM;
wr.wr.rdma.remote_addr = slot->addr;
wr.wr.rdma.rkey = slot->rkey;
wr.imm_data = size; // Send the message size via imm_data
@@ -651,6 +680,19 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo
struct ibv_send_wr* bad_wr;
NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
+
+#if USE_RDMA_WRITE
+ // When using adaptive routing, send the bulk of the data first as an
+ // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote
+ // completion.
+ if (useAr) {
+ wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+ wr.sg_list = NULL;
+ wr.num_sge = 0;
+ wr.send_flags &= ~IBV_SEND_SIGNALED;
+ NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr));
+ }
+#endif
*request = req;
return ncclSuccess;
}
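For messages above the new NCCL_IB_AR_THRESHOLD knob, the send is split so adaptive routing can spray the payload across paths: the bulk goes out as a plain RDMA_WRITE, then a zero-byte RDMA_WRITE_WITH_IMM triggers the remote completion. A reduced sketch of the two work requests (other wr fields keep their values from the hunk above):

  // First WR: the bulk payload, a plain RDMA_WRITE with no immediate.
  wr.opcode = IBV_WR_RDMA_WRITE;
  // ... posted with the data sg_list, as in the code above ...

  // Second WR: zero-byte write with immediate. RC QPs complete requests in
  // order, so its remote completion implies the payload has fully landed.
  wr.opcode      = IBV_WR_RDMA_WRITE_WITH_IMM;
  wr.sg_list     = NULL;
  wr.num_sge     = 0;
  wr.send_flags &= ~IBV_SEND_SIGNALED;  // no second local completion needed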
@@ -835,8 +877,7 @@ ncclNet_t ncclNetIb = {
"IB",
ncclIbInit,
ncclIbDevices,
- ncclIbPciPath,
- ncclIbPtrSupport,
+ ncclIbGetProperties,
ncclIbListen,
ncclIbConnect,
ncclIbAccept,
diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc
index 1b1fc4f..5bc22c3 100644
--- a/src/transport/net_socket.cc
+++ b/src/transport/net_socket.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -19,16 +19,31 @@
#include <fcntl.h>
/* Init functions */
-static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
-static union socketAddress ncclNetIfAddrs[MAX_IFS];
static int ncclNetIfs = -1;
+struct ncclSocketDev {
+ union socketAddress addr;
+ char devName[MAX_IF_NAME_SIZE];
+ char* pciPath;
+};
+static struct ncclSocketDev ncclSocketDevs[MAX_IFS];
+
pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
+static ncclResult_t ncclSocketGetPciPath(char* devName, char** pciPath) {
+ char devicePath[PATH_MAX];
+ snprintf(devicePath, PATH_MAX, "/sys/class/net/%s/device", devName);
+ // May return NULL if the file doesn't exist.
+ *pciPath = realpath(devicePath, NULL);
+ return ncclSuccess;
+}
+
ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
if (ncclNetIfs == -1) {
pthread_mutex_lock(&ncclSocketLock);
if (ncclNetIfs == -1) {
- ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
+ char names[MAX_IF_NAME_SIZE*MAX_IFS];
+ union socketAddress addrs[MAX_IFS];
+ ncclNetIfs = findInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS);
if (ncclNetIfs <= 0) {
WARN("NET/Socket : no interface found");
return ncclInternalError;
@@ -37,8 +52,11 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
char addrline[1024];
line[0] = '\0';
for (int i=0; i<ncclNetIfs; i++) {
- snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, ncclNetIfNames+i*MAX_IF_NAME_SIZE,
- socketToString(&ncclNetIfAddrs[i].sa, addrline));
+ strcpy(ncclSocketDevs[i].devName, names+i*MAX_IF_NAME_SIZE);
+ memcpy(&ncclSocketDevs[i].addr, addrs+i, sizeof(union socketAddress));
+ NCCLCHECK(ncclSocketGetPciPath(ncclSocketDevs[i].devName, &ncclSocketDevs[i].pciPath));
+ snprintf(line+strlen(line), 1023-strlen(line), " [%d]%s:%s", i, names+i*MAX_IF_NAME_SIZE,
+ socketToString(&addrs[i].sa, addrline));
}
line[1023] = '\0';
INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line);
@@ -49,30 +67,44 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
return ncclSuccess;
}
-ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
- *supportedTypes = NCCL_PTR_HOST;
- return ncclSuccess;
-}
-
ncclResult_t ncclSocketDevices(int* ndev) {
*ndev = ncclNetIfs;
return ncclSuccess;
}
-ncclResult_t ncclSocketPciPath(int dev, char** path) {
- char devicepath[PATH_MAX];
- snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
- *path = realpath(devicepath, NULL);
- if (*path == NULL) {
- INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
- return ncclSystemError;
+static ncclResult_t ncclSocketGetSpeed(char* devName, int* speed) {
+ *speed = 0;
+ char speedPath[PATH_MAX];
+ sprintf(speedPath, "/sys/class/net/%s/speed", devName);
+ int fd = open(speedPath, O_RDONLY);
+ if (fd != -1) {
+ char speedStr[] = "        "; // wide enough for values like "100000"
+ if (read(fd, speedStr, sizeof(speedStr)-1) > 0) {
+ *speed = strtol(speedStr, NULL, 0);
+ }
+ close(fd);
+ }
+ if (*speed <= 0) {
+ INFO(NCCL_NET, "Could not get speed from %s. Defaulting to 10 Gbps.", speedPath);
+ *speed = 10000;
}
return ncclSuccess;
}
+ncclResult_t ncclSocketGetProperties(int dev, ncclNetProperties_t* props) {
+ props->name = ncclSocketDevs[dev].devName;
+ props->pciPath = ncclSocketDevs[dev].pciPath;
+ props->guid = dev;
+ props->ptrSupport = NCCL_PTR_HOST;
+ NCCLCHECK(ncclSocketGetSpeed(props->name, &props->speed));
+ props->port = 0;
+ props->maxComms = 65536;
+ return ncclSuccess;
+}
+
ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
if (dev >= ncclNetIfs) return ncclInternalError;
- memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
+ memcpy(addr, &ncclSocketDevs[dev].addr, sizeof(*addr));
return ncclSuccess;
}
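ncclSocketGetSpeed above reads the link speed the kernel exposes in /sys/class/net/<dev>/speed and falls back to 10 Gbps when the value is missing or unusable. Callers would typically reach it through the new properties interface; a minimal usage sketch (device index 0 is assumed to exist):

  ncclNetProperties_t props;
  NCCLCHECK(ncclSocketGetProperties(0, &props));
  INFO(NCCL_INIT|NCCL_NET, "NET/Socket : %s speed %d Mbps", props.name, props.speed);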
@@ -196,7 +228,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
// Auto-detection
int autoNt=0, autoNs=1; // By default, we only use the main thread and do not spawn extra threads
char vendorPath[PATH_MAX];
- snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+ snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclSocketDevs[dev].devName);
char* rPath = realpath(vendorPath, NULL);
int fd = open(rPath, O_RDONLY);
free(rPath);
@@ -486,8 +518,7 @@ ncclNet_t ncclNetSocket = {
"Socket",
ncclSocketInit,
ncclSocketDevices,
- ncclSocketPciPath,
- ncclSocketPtrSupport,
+ ncclSocketGetProperties,
ncclSocketListen,
ncclSocketConnect,
ncclSocketAccept,
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 0cc92f3..6586ce7 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -1,5 +1,5 @@
/*************************************************************************
- * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -28,9 +28,6 @@ struct p2pRecvResources {
#include <sys/types.h>
-NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
-NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
-
/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
static int busIdToCudaDev(int64_t busId) {
int ndev;
@@ -50,73 +47,44 @@ static int busIdToCudaDev(int64_t busId) {
/* Determine if two peers can communicate through p2p */
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
- int cpuCount;
- NCCLCHECK(ncclTopoCpuCount(topo, &cpuCount));
- // Do not use P2P across sockets by default (provided CUDA permits it).
- // When we are on a single socket, don't even use P2P through the CPU as
- // it should be able to sustain two flows to sysmem faster than PCI P2P.
- int p2pLevel = cpuCount == 1 ? PATH_PHB : PATH_NODE;
- if (ncclParamP2pDisable() == 1) p2pLevel = 0;
- if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel();
-
- // Disable P2P
- *ret = 0;
-
- if (p2pLevel == 0) return ncclSuccess;
-
// Rule out different nodes
- if (info1->hostHash != info2->hostHash) return ncclSuccess;
+ if (info1->hostHash != info2->hostHash) {
+ *ret = 0;
+ return ncclSuccess;
+ }
+
+ // Check topology / p2p level.
+ NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret));
+ if (*ret == 0) return ncclSuccess;
// Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES)
int cudaDev1 = busIdToCudaDev(info1->busId);
int cudaDev2 = busIdToCudaDev(info2->busId);
if (cudaDev1 == -1 || cudaDev2 == -1) {
- // Peer's CUDA device is not visible in this process
#if CUDART_VERSION >= 10010
- // But in CUDA 10.1 we can still communicate with 'invisible' devices
- TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %lx and %lx", info1->busId, info2->busId);
- // Check for NVLink/NVswitch including P2P access
- int nvlink;
- NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
- if (nvlink > 0) {
- *ret = 1;
- return ncclSuccess;
- }
-#endif
+ // CUDA 10.1 and later can use P2P with invisible devices.
return ncclSuccess;
- }
-
- TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%lx] and [%d=%lx]", cudaDev1, info1->busId, cudaDev2, info2->busId);
-
- // Do not detect topology if we're on the same GPU. Note this is not really supported.
- if (cudaDev1 == cudaDev2) {
- *ret = 1;
+#else
+ // Peer's CUDA device is not visible in this process : we can't communicate with it.
+ *ret = 0;
return ncclSuccess;
+#endif
}
- // See if CUDA can do P2P
+ // Check that CUDA can do P2P
int p2p;
if (cudaDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != cudaSuccess) {
INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%lx) and dev %d(=%lx)",
cudaDev1, info1->busId, cudaDev2, info2->busId);
+ *ret = 0;
return ncclSuccess;
}
- if (p2p == 0) return ncclSuccess;
-
- // Check for NVLink/NVswitch
- int nvlink;
- NCCLCHECK(ncclTopoGetNvlink(topo, info1->busId, info2->busId, &nvlink));
- if (nvlink > 0) {
- *ret = 1;
+ if (p2p == 0) {
+ INFO(NCCL_INIT|NCCL_P2P,"Could not enable P2P between dev %d(=%lx) and dev %d(=%lx)",
+ cudaDev1, info1->busId, cudaDev2, info2->busId);
+ *ret = 0;
return ncclSuccess;
}
-
- // Finally compute the PCI distance and compare with the p2pLevel.
- int distance;
- NCCLCHECK(ncclTopoGpuDistance(topo, info1->busId, info2->busId, &distance));
- if (distance < p2pLevel) {
- *ret = 1;
- }
return ncclSuccess;
}
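p2pCanConnect now delegates the distance/level policy to ncclTopoCheckP2p (the P2P_LEVEL and P2P_DISABLE knobs move into the topology code) and only then confirms the result with the CUDA runtime. The runtime check it relies on is the standard peer-access query, sketched here (cudaP2pOk and the dev0/dev1 ordinals are illustrative):

  #include <cuda_runtime.h>

  // Returns 1 if the CUDA runtime reports dev0 can access dev1's memory directly.
  static int cudaP2pOk(int dev0, int dev1) {
    int canAccess = 0;
    if (cudaDeviceCanAccessPeer(&canAccess, dev0, dev1) != cudaSuccess) return 0;
    return canAccess;
  }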
@@ -227,13 +195,13 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
}
/* Connect/Send to this peer */
-static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources;
struct ncclRecvMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
if (info->direct) {
remDevMem = (struct ncclRecvMem*)(info->directPtr);
- send->conn.direct = 1;
+ send->conn.direct |= NCCL_DIRECT_GPU;
} else {
//TRACE_DUMP_IPC(&info->devIpc);
cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess);
@@ -257,13 +225,13 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC
}
/* Connect/Recv from this peer */
-ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources;
struct ncclSendMem* remDevMem;
struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo;
if (info->direct) {
remDevMem = (struct ncclSendMem*)(info->directPtr);
- recv->conn.direct = 1;
+ recv->conn.direct |= NCCL_DIRECT_GPU;
recv->conn.ptrExchange = &remDevMem->ptrExchange;
} else {
//TRACE_DUMP_IPC(&info->devIpc);
diff --git a/src/transport/shm.cc b/src/transport/shm.cc
index 60f16c8..0b1d8ee 100644
--- a/src/transport/shm.cc
+++ b/src/transport/shm.cc
@@ -104,7 +104,7 @@ ncclResult_t shmRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
}
/* Connect to this peer */
-ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) {
+ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) {
// Setup device pointers
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;
struct shmSendResources* resources = (struct shmSendResources*)send->transportResources;
@@ -129,7 +129,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto
return ncclSuccess;
}
-ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) {
+ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) {
// Setup device pointers
struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources;
struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo;