diff options
author | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2018-11-13 21:37:20 +0300 |
---|---|---|
committer | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2018-11-27 03:24:31 +0300 |
commit | 0d3a20f96d4887bee86a0fd7bf79feb14e5a01f5 (patch) | |
tree | 117f10ef33e424e7a62886c46b4994b9e17bfd30 /src/transport/net_ib.cu | |
parent | d7a58cfa5865c4f627a128c3238cc72502649881 (diff) |
Add support for external network.
Dynamically load external network from libnccl-net.so.
Add an init function to networks.
Move PCI scoring to net.cu; only ask the transport to provide a PCI path.
Simplify CUDA PCI path detection.
Add dummy external network.
Diffstat (limited to 'src/transport/net_ib.cu')
-rw-r--r-- | src/transport/net_ib.cu | 103 |
1 files changed, 34 insertions, 69 deletions
diff --git a/src/transport/net_ib.cu b/src/transport/net_ib.cu index 7b1fc99..fb8bd7b 100644 --- a/src/transport/net_ib.cu +++ b/src/transport/net_ib.cu @@ -82,8 +82,12 @@ static void* ncclIbAsyncThreadMain(void* args) { return NULL; } -static void initDevices() { - if(wrap_ibv_symbols() != ncclSuccess) { return; } +NCCL_PARAM(IbDisable, "IB_DISABLE", 0); + +ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { + if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; } + if (ncclParamIbDisable()) return ncclInternalError; + if (ncclNIbDevs == -1) { pthread_mutex_lock(&ncclIbLock); wrap_ibv_fork_init(); @@ -91,9 +95,9 @@ static void initDevices() { ncclNIbDevs = 0; if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) { WARN("NET/IB : No IP interface found."); - return; + return ncclInternalError; } - INFO(INIT|NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName); + INFO(NCCL_INIT|NCCL_NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName); // Detect IB cards int nIbDevs; @@ -105,7 +109,7 @@ static void initDevices() { bool searchNot = userIbEnv && userIbEnv[0] == '^'; int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS); - if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return; + if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError; for (int d=0; d<nIbDevs; d++) { struct ibv_context * context; @@ -134,7 +138,7 @@ static void initDevices() { if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) { continue; } - INFO(INIT|NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port, + INFO(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port, portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? 
"IB" : "RoCE"); ncclIbDevs[ncclNIbDevs].device = d; ncclIbDevs[ncclNIbDevs].port = port; @@ -145,38 +149,29 @@ static void initDevices() { pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); } - if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return; } } + if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } } } } - if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return; }; + if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; }; } - pthread_mutex_unlock(&ncclIbLock); } + return ncclSuccess; } -ncclResult_t ncclIbDevices(int* ndev, int** scores) { - initDevices(); +ncclResult_t ncclIbDevices(int* ndev) { *ndev = ncclNIbDevs; - int cudaDev; - cudaGetDevice(&cudaDev); - char* cudaPath; - ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath); - int* sc; - NCCLCHECK(ncclCalloc(&sc, ncclNIbDevs)); - char line[1024]; - sprintf(line, "CUDA Dev %d, IB Ports : ", cudaDev); - for (int d=0; d<ncclNIbDevs; d++) { - char* mlxPath; - ncclResult_t err2 = getMlxPath(ncclIbDevs[d].devName, &mlxPath); - int distance = (err1 != ncclSuccess || err2 != ncclSuccess || mlxPath == NULL || cudaPath == NULL) ? 
PATH_SOC : pciDistance(mlxPath, cudaPath); - sprintf(line+strlen(line), "%s/%d(%s) ", ncclIbDevs[d].devName, ncclIbDevs[d].port, pathDists[distance]); - sc[d] = 1+PATH_SOC-distance; - if (err2 == ncclSuccess) free(mlxPath); + return ncclSuccess; +} + +ncclResult_t ncclIbPciPath(int dev, char** path) { + char devicepath[PATH_MAX]; + snprintf(devicepath, PATH_MAX, "/sys/class/infiniband/%s/device", ncclIbDevs[dev].devName); + *path = realpath(devicepath, NULL); + if (*path == NULL) { + WARN("Could not find real path of %s", devicepath); + return ncclSystemError; } - INFO(INIT|NET,"%s", line); - if (err1 == ncclSuccess) free(cudaPath); - *scores = sc; return ncclSuccess; } @@ -207,45 +202,21 @@ ncclResult_t ncclIbGdrSupport(int ibDev) { return ret; } -NCCL_PARAM(IbGdrLevel, "IB_GDR_LEVEL", -2); -NCCL_PARAM(IbCudaSupport, "IB_CUDA_SUPPORT", -2); - ncclResult_t ncclIbPtrSupport(int dev, int* supportedTypes) { - initDevices(); *supportedTypes = NCCL_PTR_HOST; int cudaDev; - if (cudaGetDevice(&cudaDev) != cudaSuccess) return ncclSuccess; - - int ibGdrLevel = PATH_PHB; - if (ncclParamIbCudaSupport() != -2) ibGdrLevel = ncclParamIbCudaSupport() ? PATH_SOC + 1 : 0; - if (ncclParamIbGdrLevel() != -2) ibGdrLevel = ncclParamIbGdrLevel(); - if (ibGdrLevel > 0) { - int gdrSupport = ncclIbGdrSupport(dev); - if (gdrSupport > 0) { - INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (%s)", cudaDev, ncclIbDevs[dev].devName, gdrSupport == 1 ? "no module" : "not supported by GPU"); - ibGdrLevel = 0; - } - } - - if (ibGdrLevel <= 0) return ncclSuccess; + CUDACHECK(cudaGetDevice(&cudaDev)); - char* cudaPath; - if (getCudaPath(cudaDev, &cudaPath) != ncclSuccess) return ncclSuccess; - char* mlxPath; - if (getMlxPath(ncclIbDevs[dev].devName, &mlxPath) != ncclSuccess) { free(cudaPath); return ncclSuccess; } - int distance = (mlxPath == NULL || cudaPath == NULL) ? 
PATH_SOC : pciDistance(mlxPath, cudaPath); - free(mlxPath); free(cudaPath); - if (distance < ibGdrLevel) { - *supportedTypes |= NCCL_PTR_CUDA; - } else { - INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (distance %d >= %d)", cudaDev, ncclIbDevs[dev].devName, distance, ibGdrLevel); + if (ncclIbGdrSupport(dev) != ncclSuccess) { + INFO(NCCL_INIT|NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (no module or not supported by GPU)", cudaDev, ncclIbDevs[dev].devName); + return ncclSuccess; } + *supportedTypes |= NCCL_PTR_CUDA; return ncclSuccess; } static ncclResult_t GetSocketAddr(union socketAddress* addr) { - if (ncclNIbDevs == -1) initDevices(); memcpy(addr, &ncclIbIfAddr, sizeof(*addr)); return ncclSuccess; } @@ -442,7 +413,6 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { *sendComm = comm; // IB Setup - initDevices(); /*NOTE: We need to do this for ncclNet unit test that bypasses nccl initialization*/ ibv_context* ctx = ncclIbDevs[dev].context; NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs)); uint8_t ib_port = ncclIbDevs[dev].port; @@ -464,13 +434,13 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { // RoCE support qpInfo.lid = portAttr.lid; if (qpInfo.lid) { // IB - INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid); + INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid); } else { // RoCE union ibv_gid gid; NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid)); qpInfo.spn = gid.global.subnet_prefix; qpInfo.iid = gid.global.interface_id; - INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid); + INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, 
ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid); } NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo))); @@ -649,7 +619,7 @@ ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); *mrRet = verbs->mrPool+elem; verbs->mrPool[elem].refcnt++; - TRACE(INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey); + TRACE(NCCL_INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey); return ncclSuccess; } @@ -903,7 +873,9 @@ ncclResult_t ncclIbCloseListen(void* listenComm) { ncclNet_t ncclNetIb = { "IB", + ncclIbInit, ncclIbDevices, + ncclIbPciPath, ncclIbPtrSupport, ncclIbListen, ncclIbConnect, @@ -917,10 +889,3 @@ ncclNet_t ncclNetIb = { ncclIbCloseListen }; -NCCL_PARAM(IbDisable, "IB_DISABLE", 0); - -bool ncclIbSupport() { - if (ncclParamIbDisable()) return 0; - initDevices(); - return ncclNIbDevs > 0; -} |