diff options
author | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2020-04-17 20:03:14 +0300 |
---|---|---|
committer | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2020-04-17 20:03:14 +0300 |
commit | f36540f55a15683a121b6c330657af442b85c796 (patch) | |
tree | 31b541332f17c5e723d11c309c23b4815f92df68 | |
parent | 23a9fbb788ff1150496238026bdf21eef741bbfd (diff) |
Fix crash when only a subset of GPUs are visible within a container.
Fixes #326.
-rw-r--r-- | src/graph/topo.cc | 1 | ||||
-rw-r--r-- | src/graph/xml.cc | 2 |
2 files changed, 2 insertions, 1 deletion
```diff
diff --git a/src/graph/topo.cc b/src/graph/topo.cc
index 3767ba9..ac6b111 100644
--- a/src/graph/topo.cc
+++ b/src/graph/topo.cc
@@ -520,6 +520,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy
       NCCLCHECK(int64ToBusId(comm->peerInfo[r].busId, busId));
       struct ncclXmlNode* node;
       NCCLCHECK(ncclTopoFillGpu(xml, busId, &node));
+      if (node == NULL) continue;
       NCCLCHECK(xmlSetAttrInt(node, "rank", r));
       NCCLCHECK(xmlInitAttrInt(node, "gdr", comm->peerInfo[r].gdrSupport));
     }
diff --git a/src/graph/xml.cc b/src/graph/xml.cc
index 550cfcd..f138d0b 100644
--- a/src/graph/xml.cc
+++ b/src/graph/xml.cc
@@ -569,7 +569,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
     NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev));
   }
   NCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev));
-  if (dev == -1) return ncclSuccess;
+  if (dev == -1) { *gpuNodeRet = NULL; return ncclSuccess; }
   NCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index));
   if (index == -1) {
```