2.7.5-1

Minor fixes for A100 platforms. Add a WARN for an invalid ncclGroupEnd call (one made outside of a group).
author: Sylvain Jeaugey <sjeaugey@nvidia.com> 2020-06-22 19:36:20 +0300
committer: Sylvain Jeaugey <sjeaugey@nvidia.com> 2020-06-27 00:39:49 +0300
commit: 01afd20a77b5804e0ecf1042509dd9d20ebf9e93 (patch)
tree: 04b04147f6e616b9cc9f282c341ffd0693d142c4
parent: 5949d96f36d050e59d05872f8bbffd2549318e95 (diff)
4 files changed, 8 insertions, 5 deletions
diff --git a/makefiles/version.mk b/makefiles/version.mk
index 4a82cb9..6f3b266 100644
--- a/makefiles/version.mk
+++ b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
 NCCL_MINOR   := 7
-NCCL_PATCH   := 3
+NCCL_PATCH   := 5
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 42e1bb9..cb52921 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -774,8 +774,8 @@ done:
     int dupChannels = std::min(graph->nChannels*2, graph->maxChannels);
     memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int));
     memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int));
-    graph->speedIntra /= 2;
-    graph->speedInter /= 2;
+    graph->speedIntra /= DIVUP(dupChannels, graph->nChannels);
+    graph->speedInter /= DIVUP(dupChannels, graph->nChannels);
     graph->nChannels = dupChannels;
   }
   return ncclSuccess;
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
index 29424b0..62f50ef 100644
--- a/src/graph/tuning.cc
+++ b/src/graph/tuning.cc
@@ -107,9 +107,9 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
       for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
         float speed = comm->nNodes <= 2 || a == NCCL_ALGO_COLLNET ? graphs[a]->speedIntra : graphs[a]->speedInter;
         float busBw = graphs[a]->nChannels * speed;
-        if (compCap80) busBw *= 0.92;
 
         // Various model refinements
+        if (compCap80) busBw = std::min(busBw, 235.0f);
         if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL)    busBw *= (comm->nNodes > 1 || coll == ncclCollAllReduce || coll == ncclCollReduce) ? 1.0/4.0 : 1.0/3.0;
         if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
         double maxTreeBw = comm->nNodes > 2 ?
diff --git a/src/group.cc b/src/group.cc
index 549a4fd..5ce4901 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -145,7 +145,10 @@ void* ncclAsyncThreadPreconnect(void* args_) {
 
 NCCL_API(ncclResult_t, ncclGroupEnd);
 ncclResult_t ncclGroupEnd() {
-  if (ncclGroupMode == 0) return ncclInvalidUsage;
+  if (ncclGroupMode == 0) {
+    WARN("ncclGroupEnd: not in a group call.");
+    return ncclInvalidUsage;
+  }
   ncclGroupMode--;
   if (ncclGroupMode > 0) return ncclSuccess;
   int savedDev;
author	Sylvain Jeaugey <sjeaugey@nvidia.com>	2020-06-22 19:36:20 +0300
committer	Sylvain Jeaugey <sjeaugey@nvidia.com>	2020-06-27 00:39:49 +0300
commit	01afd20a77b5804e0ecf1042509dd9d20ebf9e93 (patch)
tree	04b04147f6e616b9cc9f282c341ffd0693d142c4
parent	5949d96f36d050e59d05872f8bbffd2549318e95 (diff)