From 23a9fbb788ff1150496238026bdf21eef741bbfd Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Thu, 16 Apr 2020 14:27:50 -0700 Subject: Improve robustness of PCI detection Fall back to default values when class/speed is unknown. --- src/graph/topo.cc | 6 ++---- src/graph/xml.h | 5 +++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 5cd8d4e..3767ba9 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -303,8 +303,8 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s return ncclSuccess; } -struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, 0 } }; -struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { NULL, 0 } }; // x100 Mbps per lane +struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } }; +struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent) { const char* str; @@ -356,8 +356,6 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s // Manage cases where speed was not indicated in /sys if (width == 0) width = 16; - if (strlen(str) == 0 || strcasecmp(str, "Unknown speed") == 0) str = "8 GT/s"; - NCCLCHECK(kvConvertToInt(str, &speed, kvDictPciGen)); // Values in 100Mbps, per lane (we want GB/s in the end) NCCLCHECK(ncclTopoConnectNodes(node, parent, LINK_PCI, width*speed/80.0)); diff --git a/src/graph/xml.h b/src/graph/xml.h index fa04527..22e016f 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -218,8 +218,9 @@ static ncclResult_t 
kvConvertToInt(const char* str, int* value, struct kvDict* d } d++; } - WARN("KV Convert to int : could not find value of '%s' in dictionary", str); - return ncclInternalError; + INFO(NCCL_GRAPH, "KV Convert to int : could not find value of '%s' in dictionary, falling back to %d", str, d->value); + *value = d->value; + return ncclSuccess; } static ncclResult_t kvConvertToStr(int value, const char** str, struct kvDict* dict) { struct kvDict* d = dict; -- cgit v1.2.3