diff options
author | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2018-11-13 21:37:20 +0300 |
---|---|---|
committer | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2018-11-27 03:24:31 +0300 |
commit | 0d3a20f96d4887bee86a0fd7bf79feb14e5a01f5 (patch) | |
tree | 117f10ef33e424e7a62886c46b4994b9e17bfd30 /src/transport/net_socket.cu | |
parent | d7a58cfa5865c4f627a128c3238cc72502649881 (diff) |
Add support for external network.
Dynamically load external network from libnccl-net.so.
Add init function in networks.
Move PCI scoring to net.cu, only ask transport to provide a path.
Simplify CUDA PCI path detection.
Add dummy external network
Diffstat (limited to 'src/transport/net_socket.cu')
-rw-r--r-- | src/transport/net_socket.cu | 53 |
1 file changed, 23 insertions, 30 deletions
```diff
diff --git a/src/transport/net_socket.cu b/src/transport/net_socket.cu
index cff1973..a8ae866 100644
--- a/src/transport/net_socket.cu
+++ b/src/transport/net_socket.cu
@@ -8,67 +8,58 @@
 #include "core.h"
 #include "socket.h"
 #include "net.h"
-#include "topo.h"
 
 #include <assert.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <poll.h>
+#include <limits.h>
 
 /* Init functions */
-
-ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
-  *supportedTypes = NCCL_PTR_HOST;
-  return ncclSuccess;
-}
-
 static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
 static union socketAddress ncclNetIfAddrs[MAX_IFS];
 static int ncclNetIfs = -1;
 pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
 
-static void initDevices() {
+ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
   if (ncclNetIfs == -1) {
     pthread_mutex_lock(&ncclSocketLock);
     if (ncclNetIfs == -1) {
       ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
-      INFO(INIT|NET,"NET/Socket : %d interfaces found", ncclNetIfs);
+      INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs);
       if (ncclNetIfs <= 0) {
         WARN("NET/Socket : no interface found");
+        return ncclInternalError;
       }
     }
     pthread_mutex_unlock(&ncclSocketLock);
   }
+  return ncclSuccess;
 }
 
-ncclResult_t ncclSocketDevices(int* ndev, int** scores) {
-  initDevices();
+ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
+  *supportedTypes = NCCL_PTR_HOST;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketDevices(int* ndev) {
   *ndev = ncclNetIfs;
-  int cudaDev;
-  cudaGetDevice(&cudaDev);
-  char* cudaPath;
-  ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath);
-  int* sc;
-  NCCLCHECK(ncclCalloc(&sc, ncclNetIfs));
-  char line[1024];
-  sprintf(line, "CUDA Dev %d, IP Interfaces : ", cudaDev);
-  for (int i=0; i<ncclNetIfs; i++) {
-    char* sockPath;
-    ncclResult_t err2 = getSockPath(ncclNetIfNames+i*MAX_IF_NAME_SIZE, &sockPath);
-    int distance = (err1 != ncclSuccess || err2 != ncclSuccess || sockPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(sockPath, cudaPath);
-    sprintf(line+strlen(line), "%s(%s) ", ncclNetIfNames+i*MAX_IF_NAME_SIZE, pathDists[distance]);
-    sc[i] = 1+PATH_SOC-distance;
-    if (err2 == ncclSuccess) free(sockPath);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclSocketPciPath(int dev, char** path) {
+  char devicepath[PATH_MAX];
+  snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+  *path = realpath(devicepath, NULL);
+  if (*path == NULL) {
+    INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
+    return ncclSystemError;
   }
-  INFO(INIT|NET,"%s", line);
-  if (err1 == ncclSuccess) free(cudaPath);
-  *scores = sc;
   return ncclSuccess;
 }
 
 static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
-  if (ncclNetIfs == -1) initDevices();
   if (dev >= ncclNetIfs) return ncclInternalError;
   memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
   return ncclSuccess;
@@ -223,7 +214,9 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
 
 ncclNet_t ncclNetSocket = {
   "Socket",
+  ncclSocketInit,
   ncclSocketDevices,
+  ncclSocketPciPath,
   ncclSocketPtrSupport,
   ncclSocketListen,
   ncclSocketConnect,
```