Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Sylvain Jeaugey <sjeaugey@nvidia.com> 2020-01-17 02:40:36 +0300
committer: GitHub <noreply@github.com> 2020-01-17 02:40:36 +0300
commit: 44c34e5d102f7f62936b5eee2db88b15ea11bc06 (patch)
tree: 787fb8b1dd9df8cca71960c37bc60df55092dcbf
parent: 3899f6e0f219b0cbf58537f791b0dd104b377750 (diff)
parent: 7a18fe07847300fbe7fec8d5512b3b44d8bc1716 (diff)
Merge pull request #283 from lukeyeager/topo-trim-net-links
Topo trim net links
-rw-r--r-- makefiles/common.mk 7
-rw-r--r-- src/graph/paths.cc 23
2 files changed, 27 insertions, 3 deletions
diff --git a/makefiles/common.mk b/makefiles/common.mk
index 37e81be..2e44826 100644
--- a/makefiles/common.mk
+++ b/makefiles/common.mk
@@ -42,9 +42,10 @@ else
endif
#$(info NVCC_GENCODE is ${NVCC_GENCODE})
-CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden
-CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla
-CXXFLAGS += -I $(CUDA_INC)
+CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \
+ -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \
+ -I $(CUDA_INC) \
+ $(CXXFLAGS)
# Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors)
# 512 : 120, 640 : 96, 768 : 80, 1024 : 60
# We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions.
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index ce1772c..eba1964 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -179,11 +179,18 @@ static ncclResult_t addCpuStep(struct ncclTopoSystem* system, int c, int t1, int
// Remove/free paths for a given type
static void ncclTopoRemovePathType(struct ncclTopoSystem* system, int nodeType) {
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+ // Remove links _to_ the given type
for (int n=0; n<system->nodes[t].count; n++) {
struct ncclTopoNode* node = system->nodes[t].nodes+n;
free(node->paths[nodeType]);
node->paths[nodeType] = NULL;
}
+ // Remove links _from_ the given type
+ for (int n=0; n<system->nodes[nodeType].count; n++) {
+ struct ncclTopoNode* node = system->nodes[nodeType].nodes+n;
+ free(node->paths[t]);
+ node->paths[t] = NULL;
+ }
}
}
@@ -309,6 +316,22 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
// Trim network
ncclTopoRemovePathType(system, NET);
system->nodes[NET].count = 0;
+ for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) {
+ for (int n=0; n<system->nodes[t].count; n++) {
+ struct ncclTopoNode* node = system->nodes[t].nodes+n;
+ for (int l=0; l<node->nlinks; l++) {
+ struct ncclTopoLink* link = &(node->links[l]);
+ if (link->remNode->type == NET) {
+ // Remove the link
+ for (int i=l; i<(node->nlinks-1); i++) {
+ memcpy(&(node->links[i]), &(node->links[i+1]), sizeof(ncclTopoLink));
+ }
+ node->nlinks--;
+ l--; // revisit the same value of "l" for the next iteration, since we edited the list in the middle of the loop
+ }
+ }
+ }
+ }
}
free(domains);
free(ids);