Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Sylvain Jeaugey <sjeaugey@nvidia.com>, 2020-04-17 20:03:14 +0300
committer: Sylvain Jeaugey <sjeaugey@nvidia.com>, 2020-04-17 20:03:14 +0300
commit: f36540f55a15683a121b6c330657af442b85c796 (patch)
tree: 31b541332f17c5e723d11c309c23b4815f92df68
parent: 23a9fbb788ff1150496238026bdf21eef741bbfd (diff)
Fix crash when only a subset of GPUs are visible within a container.
Fixes #326.
-rw-r--r-- src/graph/topo.cc 1
-rw-r--r-- src/graph/xml.cc 2
2 files changed, 2 insertions, 1 deletions
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index 3767ba9..ac6b111 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -520,6 +520,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
struct ncclXmlNode* node;
NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
+ if (node == NULL) continue;
NCCLCHECK(xmlSetAttrInt(node, "rank", r));
NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
}
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index 550cfcd..f138d0b 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -569,7 +569,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev));
}
NCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev));
- if (dev == -1) return ncclSuccess;
+ if (dev == -1) { *gpuNodeRet = NULL; return ncclSuccess; }
NCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index));
if (index == -1) {