diff options
Diffstat (limited to 'src/graph/xml.cc')
-rw-r--r-- | src/graph/xml.cc | 780 |
1 files changed, 780 insertions, 0 deletions
diff --git a/src/graph/xml.cc b/src/graph/xml.cc new file mode 100644 index 0000000..550cfcd --- /dev/null +++ b/src/graph/xml.cc @@ -0,0 +1,780 @@ +/************************************************************************* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <fcntl.h> +#include <ctype.h> +#include "core.h" +#include "nvmlwrap.h" +#include "xml.h" + +/*******************/ +/* XML File Parser */ +/*******************/ + +ncclResult_t xmlGetChar(FILE* file, char* c) { + if (fread(c, 1, 1, file) == 0) { + WARN("XML Parse : Unexpected EOF"); + return ncclInternalError; + } + return ncclSuccess; +} + +ncclResult_t xmlGetValue(FILE* file, char* value, char* last) { + char c; + NCCLCHECK(xmlGetChar(file, &c)); + if (c != '"' && c != '\'') { +#if INT_OK + int o = 0; + do { + value[o++] = c; + NCCLCHECK(xmlGetChar(file, &c)); + } while (c >= '0' && c <= '9'); + value[o] = '\0'; + *last = c; + return ncclSuccess; +#else + WARN("XML Parse : Expected (double) quote."); + return ncclInternalError; +#endif + } + int o = 0; + do { + NCCLCHECK(xmlGetChar(file, &c)); + value[o++] = c; + } while (c != '"'); + value[o-1] = '\0'; + NCCLCHECK(xmlGetChar(file, last)); + return ncclSuccess; +} + +ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) { + char c; + char* ptr = name; + int o = 0; + do { + NCCLCHECK(xmlGetChar(file, &c)); + if (c == '=') { + ptr[o] = '\0'; + if (value == NULL) { + WARN("XML Parse : Unexpected value with name %s\n", ptr); + return ncclInternalError; + } + return xmlGetValue(file, value, last); + } + ptr[o] = c; + if (o == MAX_STR_LEN-1) { + ptr[o] = '\0'; + WARN("Error : name %s too long (max %d)", ptr, MAX_STR_LEN); + return ncclInternalError; + } + o++; + } while (c != ' ' && c != '>' && c != '/' && c != '\n' && c != '\r'); + ptr[o-1] = '\0'; + *last = c; + return ncclSuccess; +} + +// Shift the 3-chars string by one char and append c at the end +#define SHIFT_APPEND(s, c) do { s[0]=s[1]; s[1]=s[2]; s[2]=c; } while(0) +ncclResult_t xmlSkipComment(FILE* file, char* start, char next) { + // Start from something neutral with \0 at the end. + char end[4] = "..."; + + // Inject all trailing chars from previous reads. We don't need + // to check for --> here because there cannot be a > in the name. + for (int i=0; i<strlen(start); i++) SHIFT_APPEND(end, start[i]); + SHIFT_APPEND(end, next); + + // Stop when we find "-->" + while (strcmp(end, "-->") != 0) { + int c; + if (fread(&c, 1, 1, file) != 1) { + WARN("XML Parse error : unterminated comment"); + return ncclInternalError; + } + SHIFT_APPEND(end, c); + } + return ncclSuccess; +} + +ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) { + node->type = NODE_TYPE_NONE; + char c = ' '; + while (c == ' ' || c == '\n' || c == '\r') { + if (fread(&c, 1, 1, file) == 0) return ncclSuccess; + } + if (c != '<') { + WARN("XML Parse error : expecting '<', got '%c'", c); + return ncclInternalError; + } + // Read XML element name + NCCLCHECK(xmlGetToken(file, node->name, NULL, &c)); + + // Check for comments + if (strncmp(node->name, "!--", 3) == 0) { + NCCLCHECK(xmlSkipComment(file, node->name+3, c)); + return xmlGetNode(file, node); + } + + // Check for closing tag + if (node->name[0] == '\0' && c == '/') { + node->type = NODE_TYPE_CLOSE; + // Re-read the name, we got '/' in the first call + NCCLCHECK(xmlGetToken(file, node->name, NULL, &c)); + if (c != '>') { + WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name); + return ncclInternalError; + } + return ncclSuccess; + } + + node->type = NODE_TYPE_OPEN; + + // Get Attributes + int a = 0; + while (c == ' ') { + NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c)); + if (a == MAX_ATTR_COUNT) { + INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT); + // Actually we need to still consume the extra attributes so we have an extra one. + } else a++; + } + node->nAttrs = a; + if (c == '/') { + node->type = NODE_TYPE_SINGLE; + char str[MAX_STR_LEN]; + NCCLCHECK(xmlGetToken(file, str, NULL, &c)); + } + if (c != '>') { + WARN("XML Parse : expected >, got '%c'", c); + return ncclInternalError; + } + return ncclSuccess; +} + +typedef ncclResult_t (*xmlHandlerFunc_t)(FILE*, struct ncclXml*, struct ncclXmlNode*); + +struct xmlHandler { + const char * name; + xmlHandlerFunc_t func; +}; + +ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head, struct xmlHandler handlers[], int nHandlers) { + if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess; + while (1) { + if (xml->maxIndex == MAX_NODES) { + WARN("Error : XML parser is limited to 1024 nodes\n"); + return ncclInternalError; + } + struct ncclXmlNode* node = xml->nodes+xml->maxIndex; + memset(node, 0, sizeof(struct ncclXmlNode)); + NCCLCHECK(xmlGetNode(file, node)); + if (node->type == NODE_TYPE_NONE) { + if (head) { + WARN("XML Parse : unterminated %s", head->name); + return ncclInternalError; + } else { + // All done + return ncclSuccess; + } + } + if (head && node->type == NODE_TYPE_CLOSE) { + if (strcmp(node->name, head->name) != 0) { + WARN("XML Mismatch : %s / %s", head->name, node->name); + return ncclInternalError; + } + return ncclSuccess; + } + int found = 0; + for (int h=0; h<nHandlers; h++) { + if (strcmp(node->name, handlers[h].name) == 0) { + if (head) head->subs[head->nSubs++] = node; + node->parent = head; + node->nSubs = 0; + xml->maxIndex++; + NCCLCHECK(handlers[h].func(file, xml, node)); + found = 1; + break; + } + } + if (!found) { + if (nHandlers) INFO(NCCL_GRAPH, "Ignoring element %s", node->name); + NCCLCHECK(xmlLoadSub(file, xml, node, NULL, 0)); + } + } +} + +/**************/ +/* XML Writer */ +/**************/ + +ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNode* node) { + for (int i=0; i<indent; i++) fprintf(file, " "); + fprintf(file, "<%s", node->name); + + for (int a=0; a<node->nAttrs; a++) { + fprintf(file, " %s=\"%s\"", node->attrs[a].key, node->attrs[a].value); + } + if (node->nSubs == 0) { + fprintf(file, "/>\n"); + } else { + fprintf(file, ">\n"); + for (int s=0; s<node->nSubs; s++) { + NCCLCHECK(ncclTopoDumpXmlRec(indent+2, file, node->subs[s])); + } + for (int i=0; i<indent; i++) fprintf(file, " "); + fprintf(file, "</%s>\n", node->name); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml) { + FILE* file = fopen(xmlTopoFile, "w"); + if (file == NULL) { + WARN("Unable to open %s, not dumping topology.", xmlTopoFile); + return ncclSuccess; + } + NCCLCHECK(ncclTopoDumpXmlRec(0, file, xml->nodes)); + fclose(file); + return ncclSuccess; +} + +/****************************************/ +/* Parser rules for our specific format */ +/****************************************/ + +ncclResult_t ncclTopoXmlLoadNvlink(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink } }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlLoadNic(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + struct xmlHandler handlers[] = { { "net", ncclTopoXmlLoadNet } }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlLoadPci(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "gpu", ncclTopoXmlLoadGpu }, { "nic", ncclTopoXmlLoadNic} }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 3)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlLoadCpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + struct xmlHandler handlers[] = { { "pci", ncclTopoXmlLoadPci }, { "nic", ncclTopoXmlLoadNic } }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlLoadSystem(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + int version; + NCCLCHECK(xmlGetAttrInt(head, "version", &version)); + if (version != NCCL_TOPO_XML_VERSION) { + WARN("XML Topology has wrong version %d, %d needed", version, NCCL_TOPO_XML_VERSION); + return ncclInvalidUsage; + } + const char* name; + NCCLCHECK(xmlGetAttr(head, "name", &name)); + if (name != NULL) INFO(NCCL_GRAPH, "Loading topology %s", name); + else INFO(NCCL_GRAPH, "Loading unnamed topology"); + + struct xmlHandler handlers[] = { { "cpu", ncclTopoXmlLoadCpu } }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); + return ncclSuccess; +} + +ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml) { + FILE* file = fopen(xmlTopoFile, "r"); + if (file == NULL) { + WARN("Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno)); + return ncclSuccess; + } + struct xmlHandler handlers[] = { { "system", ncclTopoXmlLoadSystem } }; + xml->maxIndex = 0; + NCCLCHECK(xmlLoadSub(file, xml, NULL, handlers, 1)); + fclose(file); + return ncclSuccess; +} + +/**********************/ +/* XML creation */ +/* from autodetection */ +/**********************/ + +#define BUSID_SIZE (sizeof("0000:00:00.0")) +#define BUSID_REDUCED_SIZE (sizeof("0000:00")) +static void memcpylower(char* dst, const char* src, const size_t size) { + for (int i=0; i<size; i++) dst[i] = tolower(src[i]); +} +static ncclResult_t getPciPath(const char* busId, char** path) { + char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0"; + memcpylower(busPath+sizeof("/sys/class/pci_bus/")-1, busId, BUSID_REDUCED_SIZE-1); + memcpylower(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, BUSID_SIZE-1); + *path = realpath(busPath, NULL); + if (*path == NULL) { + WARN("Could not find real path of %s", busPath); + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) { + char filePath[PATH_MAX]; + sprintf(filePath, "%s/%s", path, fileName); + int offset = 0; + FILE* file; + if ((file = fopen(filePath, "r")) != NULL) { + while (feof(file) == 0 && ferror(file) == 0 && offset < MAX_STR_LEN) { + int len = fread(strValue+offset, 1, MAX_STR_LEN-offset, file); + offset += len; + } + fclose(file); + } + if (offset == 0) { + strValue[0] = '\0'; + INFO(NCCL_GRAPH, "Topology detection : could not read %s, ignoring", filePath); + } else { + strValue[offset-1] = '\0'; + } + return ncclSuccess; +} + +ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* path, const char* fileName, const char* attrName) { + char strValue[MAX_STR_LEN]; + NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue)); + if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); } + TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue); + return ncclSuccess; +} + +ncclResult_t ncclTopoGetXmlFromCpu(struct ncclXmlNode* cpuNode, struct ncclXml* xml) { + int index; + NCCLCHECK(xmlGetAttrIndex(cpuNode, "affinity", &index)); + if (index == -1) { + const char* numaId; + NCCLCHECK(xmlGetAttr(cpuNode, "numaid", &numaId)); + if (numaId == NULL) { + WARN("GetXmlFromCpu : could not find CPU numa ID."); + return ncclInternalError; + } + // Set affinity + char cpumaskPath[] = "/sys/devices/system/node/node0000"; + sprintf(cpumaskPath, "/sys/devices/system/node/node%s", numaId); + NCCLCHECK(ncclTopoSetAttrFromSys(cpuNode, cpumaskPath, "cpumap", "affinity")); + } + + NCCLCHECK(xmlGetAttrIndex(cpuNode, "arch", &index)); + if (index == -1) { + // Fill CPU type / vendor / model +#if defined(__PPC__) + NCCLCHECK(xmlSetAttr(cpuNode, "arch", "ppc64")); +#elif defined(__aarch64__) + NCCLCHECK(xmlSetAttr(cpuNode, "arch", "arm64")); +#elif defined(__x86_64__) + NCCLCHECK(xmlSetAttr(cpuNode, "arch", "x86_64")); +#endif + } + +#if defined(__x86_64__) + NCCLCHECK(xmlGetAttrIndex(cpuNode, "vendor", &index)); + if (index == -1) { + union { + struct { + // CPUID 0 String register order + uint32_t ebx; + uint32_t edx; + uint32_t ecx; + }; + char vendor[12]; + } cpuid0; + + asm volatile("cpuid" : "=b" (cpuid0.ebx), "=c" (cpuid0.ecx), "=d" (cpuid0.edx) : "a" (0) : "memory"); + char vendor[13]; + strncpy(vendor, cpuid0.vendor, 12); + vendor[12] = '\0'; + NCCLCHECK(xmlSetAttr(cpuNode, "vendor", vendor)); + } + + NCCLCHECK(xmlGetAttrIndex(cpuNode, "familyid", &index)); + if (index == -1) { + union { + struct { + unsigned steppingId:4; + unsigned modelId:4; + unsigned familyId:4; + unsigned processorType:2; + unsigned resv0:2; + unsigned extModelId:4; + unsigned extFamilyId:8; + unsigned resv1:4; + }; + uint32_t val; + } cpuid1; + asm volatile("cpuid" : "=a" (cpuid1.val) : "a" (1) : "memory"); + int familyId = cpuid1.familyId + (cpuid1.extFamilyId << 4); + int modelId = cpuid1.modelId + (cpuid1.extModelId << 4); + NCCLCHECK(xmlSetAttrInt(cpuNode, "familyid", familyId)); + NCCLCHECK(xmlSetAttrInt(cpuNode, "modelid", modelId)); + } +#endif + return ncclSuccess; +} + +ncclResult_t ncclTopoGetPciNode(struct ncclXml* xml, const char* busId, struct ncclXmlNode** pciNode) { + NCCLCHECK(xmlFindTagKv(xml, "pci", pciNode, "busid", busId)); + if (*pciNode == NULL) { + NCCLCHECK(xmlAddNode(xml, NULL, "pci", pciNode)); + } + NCCLCHECK(xmlSetAttr(*pciNode, "busid", busId)); + return ncclSuccess; +} + +// Check whether a string is in BDF format or not. +// BDF (Bus-Device-Function) is "BBBB:BB:DD.F" where B, D and F are hex digits. +// There can be trailing chars. +int isHex(char c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')); } +int checkBDFFormat(char* bdf) { + if (bdf[4] != ':' || bdf[7] != ':' || bdf[10] != '.') return 0; + if (isHex(bdf[0]) == 0 || isHex(bdf[1] == 0) || isHex(bdf[2] == 0) || isHex(bdf[3] == 0) || + isHex(bdf[5] == 0) || isHex(bdf[6] == 0) || isHex(bdf[8] == 0) || isHex(bdf[9] == 0) || + isHex(bdf[11] == 0)) return 0; + return 1; +} + +ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* xml) { + // Fill info, then parent + const char* busId; + NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); + char* path = NULL; + int index; + NCCLCHECK(xmlGetAttrIndex(pciNode, "class", &index)); + if (index == -1) { + if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); + NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class")); + } + NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index)); + if (index == -1) { + if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); + char deviceSpeedStr[MAX_STR_LEN]; + float deviceSpeed; + NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_speed", deviceSpeedStr)); + sscanf(deviceSpeedStr, "%f GT/s", &deviceSpeed); + char portSpeedStr[MAX_STR_LEN]; + float portSpeed; + NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_speed", portSpeedStr)); + sscanf(portSpeedStr, "%f GT/s", &portSpeed); + NCCLCHECK(xmlSetAttr(pciNode, "link_speed", portSpeed < deviceSpeed ? portSpeedStr : deviceSpeedStr)); + } + NCCLCHECK(xmlGetAttrIndex(pciNode, "link_width", &index)); + if (index == -1) { + if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); + char strValue[MAX_STR_LEN]; + NCCLCHECK(ncclTopoGetStrFromSys(path, "max_link_width", strValue)); + int deviceWidth = strtol(strValue, NULL, 0); + NCCLCHECK(ncclTopoGetStrFromSys(path, "../max_link_width", strValue)); + int portWidth = strtol(strValue, NULL, 0); + NCCLCHECK(xmlSetAttrInt(pciNode, "link_width", std::min(deviceWidth,portWidth))); + } + struct ncclXmlNode* parent = pciNode->parent; + if (parent == NULL) { + if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); + + // Save that for later in case next step is a CPU + char numaIdStr[MAX_STR_LEN]; + NCCLCHECK(ncclTopoGetStrFromSys(path, "numa_node", numaIdStr)); + + // Go up one level in the PCI tree. Rewind two "/" and follow the upper PCI + // switch, or stop if we reach a CPU root complex. + int slashCount = 0; + int parentOffset; + for (parentOffset = strlen(path)-1; parentOffset>0; parentOffset--) { + if (path[parentOffset] == '/') { + slashCount++; + path[parentOffset] = '\0'; + int start = parentOffset - 1; + while (start>0 && path[start] != '/') start--; + // Check whether the parent path looks like "BBBB:BB:DD.F" or not. + if (checkBDFFormat(path+start+1) == 0) { + // This a CPU root complex. Create a CPU tag and stop there. + struct ncclXmlNode* topNode; + NCCLCHECK(xmlFindTag(xml, "system", &topNode)); + NCCLCHECK(xmlGetSubKv(topNode, "cpu", &parent, "numaid", numaIdStr)); + if (parent == NULL) { + NCCLCHECK(xmlAddNode(xml, topNode, "cpu", &parent)); + NCCLCHECK(xmlSetAttr(parent, "numaid", numaIdStr)); + } + } else if (slashCount == 2) { + // Continue on the upper PCI switch + for (int i = strlen(path)-1; i>0; i--) { + if (path[i] == '/') { + NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", path+i+1)); + if (parent == NULL) { + NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent)); + NCCLCHECK(xmlSetAttr(parent, "busid", path+i+1)); + } + break; + } + } + } + } + if (parent) break; + } + pciNode->parent = parent; + parent->subs[parent->nSubs++] = pciNode; + } + if (strcmp(parent->name, "pci") == 0) { + NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); + } else if (strcmp(parent->name, "cpu") == 0) { + NCCLCHECK(ncclTopoGetXmlFromCpu(parent, xml)); + } + free(path); + return ncclSuccess; +} + +ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvmlDev, struct ncclXml* xml, struct ncclXmlNode** gpuNodeRet) { + struct ncclXmlNode* gpuNode = NULL; + NCCLCHECK(xmlGetSub(pciNode, "gpu", &gpuNode)); + if (gpuNode == NULL) NCCLCHECK(xmlAddNode(xml, pciNode, "gpu", &gpuNode)); + + int index = -1; + + int dev = -1; + NCCLCHECK(xmlGetAttrIndex(gpuNode, "dev", &index)); + if (index == -1) { + if (nvmlDev == NULL) { + WARN("No NVML, trying to use CUDA instead"); + const char* busId; + NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); + if (busId == NULL || cudaDeviceGetByPCIBusId(&dev, busId) != cudaSuccess) dev = -1; + } else { + NCCLCHECK(wrapNvmlDeviceGetIndex(nvmlDev, (unsigned int*)&dev)); + } + NCCLCHECK(xmlSetAttrInt(gpuNode, "dev", dev)); + } + NCCLCHECK(xmlGetAttrInt(gpuNode, "dev", &dev)); + if (dev == -1) return ncclSuccess; + + NCCLCHECK(xmlGetAttrIndex(gpuNode, "sm", &index)); + if (index == -1) { + int cudaMajor, cudaMinor; + if (nvmlDev == NULL) { + cudaDeviceProp devProp; + CUDACHECK(cudaGetDeviceProperties(&devProp, dev)); + cudaMajor = devProp.major; cudaMinor = devProp.minor; + } else { + NCCLCHECK(wrapNvmlDeviceGetCudaComputeCapability(nvmlDev, &cudaMajor, &cudaMinor)); + } + NCCLCHECK(xmlSetAttrInt(gpuNode, "sm", cudaMajor*10+cudaMinor)); + } + int sm; + NCCLCHECK(xmlGetAttrInt(gpuNode, "sm", &sm)); + + struct ncclXmlNode* nvlNode = NULL; + NCCLCHECK(xmlGetSub(pciNode, "nvlink", &nvlNode)); + if (nvlNode == NULL) { + // NVML NVLink detection + int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : 6; + + if (maxNvLinks > 0 && nvmlDev == NULL) { + WARN("No NVML device handle. Skipping nvlink detection.\n"); + maxNvLinks = 0; + } + + for (int l=0; l<maxNvLinks; ++l) { + // Check whether we can use this NVLink for P2P + unsigned canP2P; + if ((wrapNvmlDeviceGetNvLinkCapability(nvmlDev, l, NVML_NVLINK_CAP_P2P_SUPPORTED, &canP2P) != ncclSuccess) || !canP2P) continue; + + // Make sure the Nvlink is up. The previous call should have trained the link. + nvmlEnableState_t isActive; + if ((wrapNvmlDeviceGetNvLinkState(nvmlDev, l, &isActive) != ncclSuccess) || (isActive != NVML_FEATURE_ENABLED)) continue; + + // Try to figure out what's on the other side of the NVLink + nvmlPciInfo_t remoteProc; + if (wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDev, l, &remoteProc) != ncclSuccess) continue; + + // Make a lower case copy of the bus ID for calling ncclDeviceType + // PCI system path is in lower case + char* p = remoteProc.busId; + char lowerId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + for (int c=0; c<NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE; c++) { + lowerId[c] = tolower(p[c]); + if (p[c] == 0) break; + } + + NCCLCHECK(xmlGetSubKv(gpuNode, "nvlink", &nvlNode, "target", lowerId)); + if (nvlNode == NULL) { + NCCLCHECK(xmlAddNode(xml, gpuNode, "nvlink", &nvlNode)); + NCCLCHECK(xmlSetAttr(nvlNode, "target", lowerId)); + NCCLCHECK(xmlSetAttrInt(nvlNode, "count", 1)); + } else { + int count; + NCCLCHECK(xmlGetAttrInt(nvlNode, "count", &count)); + NCCLCHECK(xmlSetAttrInt(nvlNode, "count", count+1)); + } + } + } + // Fill target classes + for (int s=0; s<gpuNode->nSubs; s++) { + struct ncclXmlNode* sub = gpuNode->subs[s]; + if (strcmp(sub->name, "nvlink") != 0) continue; + int index; + NCCLCHECK(xmlGetAttrIndex(sub, "tclass", &index)); + if (index == -1) { + const char* busId; + NCCLCHECK(xmlGetAttr(sub, "target", &busId)); + char* path; + NCCLCHECK(getPciPath(busId, &path)); + NCCLCHECK(ncclTopoSetAttrFromSys(sub, path, "class", "tclass")); + } + } + *gpuNodeRet = gpuNode; + return ncclSuccess; +} + +ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode) { + struct ncclXmlNode* node; + NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node)); + NCCLCHECK(ncclTopoGetXmlFromSys(node, xml)); + NCCLCHECK(wrapNvmlSymbols()); + NCCLCHECK(wrapNvmlInit()); + nvmlDevice_t nvmlDev; + if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDev) != ncclSuccess) nvmlDev = NULL; + NCCLCHECK(ncclTopoGetXmlFromGpu(node, nvmlDev, xml, gpuNode)); + return ncclSuccess; +} + +// Returns the subsystem name of a path, i.e. the end of the path +// where sysPath/subsystem points to. +ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) { + char subSysPath[PATH_MAX]; + sprintf(subSysPath, "%s/subsystem", sysPath); + char* path = realpath(subSysPath, NULL); + if (path == NULL) { + subSys[0] = '\0'; + } else { + int offset; + for (offset = strlen(path); offset > 0 && path[offset] != '/'; offset--); + strcpy(subSys, path+offset+1); + free(path); + } + return ncclSuccess; +} + +ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode) { + NCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName)); + if (*netNode != NULL) return ncclSuccess; + + const char* pciSysPath = pciPath; + if (pciSysPath) { + char subSystem[PATH_MAX]; + NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem)); + // This is not a PCI device (virtual, usb, ...). + if (strcmp(subSystem, "pci") != 0) { + INFO(NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem); + pciSysPath = NULL; + } + } + + struct ncclXmlNode* parent = NULL; + if (pciSysPath) { + int offset; + for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--); + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + strcpy(busId, pciSysPath+offset+1); + NCCLCHECK(xmlFindTagKv(xml, "pci", &parent, "busid", busId)); + if (parent == NULL) { + NCCLCHECK(xmlAddNode(xml, NULL, "pci", &parent)); + NCCLCHECK(xmlSetAttr(parent, "busid", busId)); + NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); + } + } else { + // Virtual NIC, no PCI device, attach to first CPU + NCCLCHECK(xmlFindTag(xml, "cpu", &parent)); + } + + struct ncclXmlNode* nicNode = NULL; + NCCLCHECK(xmlGetSub(parent, "nic", &nicNode)); + if (nicNode == NULL) { + NCCLCHECK(xmlAddNode(xml, parent, "nic", &nicNode)); + } + + // We know that this net does not exist yet (we searched for it at the + // beginning of this function), so we can add it. + NCCLCHECK(xmlAddNode(xml, nicNode, "net", netNode)); + NCCLCHECK(xmlSetAttr(*netNode, "name", netName)); + return ncclSuccess; +} + +/**************************************************/ +/* Parser rules for the user-defined graph search */ +/**************************************************/ + +ncclResult_t ncclTopoXmlGraphLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlGraphLoadNet(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + NCCLCHECK(xmlLoadSub(file, xml, head, NULL, 0)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlGraphLoadChannel(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + struct xmlHandler handlers[] = { { "net", ncclTopoXmlGraphLoadNet }, { "gpu", ncclTopoXmlGraphLoadGpu } }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 2)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlGraphLoadGraph(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { + struct xmlHandler handlers[] = { { "channel", ncclTopoXmlGraphLoadChannel } }; + NCCLCHECK(xmlLoadSub(file, xml, head, handlers, 1)); + return ncclSuccess; +} + +ncclResult_t ncclTopoXmlGraphLoadGraphs(FILE* file, struct ncclXml* xmlGraph, struct ncclXmlNode* head) { + int version; + NCCLCHECK(xmlGetAttrInt(head, "version", &version)); + if (version != NCCL_GRAPH_XML_VERSION) { + WARN("XML Graph has wrong version %d, %d needed", version, NCCL_GRAPH_XML_VERSION); + return ncclInvalidUsage; + } + const char* name; + NCCLCHECK(xmlGetAttr(head, "name", &name)); + if (name != NULL) INFO(NCCL_GRAPH, "Loading graphs for topology %s", name); + else INFO(NCCL_GRAPH, "Loading graphs"); + + struct xmlHandler handlers[] = { { "graph", ncclTopoXmlGraphLoadGraph } }; + NCCLCHECK(xmlLoadSub(file, xmlGraph, head, handlers, 1)); + return ncclSuccess; +} + +ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXml* xml) { + FILE* file = fopen(xmlGraphFile, "r"); + if (file == NULL) { + WARN("Could not open XML graph file %s : %s", xmlGraphFile, strerror(errno)); + return ncclSystemError; + } + struct xmlHandler handlers[] = { { "graphs", ncclTopoXmlGraphLoadGraphs } }; + xml->maxIndex = 0; + NCCLCHECK(xmlLoadSub(file, xml, NULL, handlers, 1)); + fclose(file); + return ncclSuccess; +} |