Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Sylvain Jeaugey <sjeaugey@nvidia.com>, 2018-11-13 21:37:20 +0300
committer: Sylvain Jeaugey <sjeaugey@nvidia.com>, 2018-11-27 03:24:31 +0300
commit: 0d3a20f96d4887bee86a0fd7bf79feb14e5a01f5 (patch)
tree: 117f10ef33e424e7a62886c46b4994b9e17bfd30 /src/transport/net_socket.cu
parent: d7a58cfa5865c4f627a128c3238cc72502649881 (diff)
Add support for external network.
Dynamically load external network from libnccl-net.so. Add init function in networks. Move PCI scoring to net.cu, only ask transport to provide a path. Simplify CUDA PCI path detection. Add dummy external network
Diffstat (limited to 'src/transport/net_socket.cu')
-rw-r--r-- src/transport/net_socket.cu 53
1 files changed, 23 insertions, 30 deletions
diff --git a/src/transport/net_socket.cu b/src/transport/net_socket.cu
index cff1973..a8ae866 100644
--- a/src/transport/net_socket.cu
+++ b/src/transport/net_socket.cu
@@ -8,67 +8,58 @@
#include "core.h"
#include "socket.h"
#include "net.h"
-#include "topo.h"
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <poll.h>
+#include <limits.h>
/* Init functions */
-
-ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
- *supportedTypes = NCCL_PTR_HOST;
- return ncclSuccess;
-}
-
static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS];
static union socketAddress ncclNetIfAddrs[MAX_IFS];
static int ncclNetIfs = -1;
pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER;
-static void initDevices() {
+ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) {
if (ncclNetIfs == -1) {
pthread_mutex_lock(&ncclSocketLock);
if (ncclNetIfs == -1) {
ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS);
- INFO(INIT|NET,"NET/Socket : %d interfaces found", ncclNetIfs);
+ INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs);
if (ncclNetIfs <= 0) {
WARN("NET/Socket : no interface found");
+ return ncclInternalError;
}
}
pthread_mutex_unlock(&ncclSocketLock);
}
+ return ncclSuccess;
}
-ncclResult_t ncclSocketDevices(int* ndev, int** scores) {
- initDevices();
+ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) {
+ *supportedTypes = NCCL_PTR_HOST;
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketDevices(int* ndev) {
*ndev = ncclNetIfs;
- int cudaDev;
- cudaGetDevice(&cudaDev);
- char* cudaPath;
- ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath);
- int* sc;
- NCCLCHECK(ncclCalloc(&sc, ncclNetIfs));
- char line[1024];
- sprintf(line, "CUDA Dev %d, IP Interfaces : ", cudaDev);
- for (int i=0; i<ncclNetIfs; i++) {
- char* sockPath;
- ncclResult_t err2 = getSockPath(ncclNetIfNames+i*MAX_IF_NAME_SIZE, &sockPath);
- int distance = (err1 != ncclSuccess || err2 != ncclSuccess || sockPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(sockPath, cudaPath);
- sprintf(line+strlen(line), "%s(%s) ", ncclNetIfNames+i*MAX_IF_NAME_SIZE, pathDists[distance]);
- sc[i] = 1+PATH_SOC-distance;
- if (err2 == ncclSuccess) free(sockPath);
+ return ncclSuccess;
+}
+
+ncclResult_t ncclSocketPciPath(int dev, char** path) {
+ char devicepath[PATH_MAX];
+ snprintf(devicepath, PATH_MAX, "/sys/class/net/%s/device", ncclNetIfNames+dev*MAX_IF_NAME_SIZE);
+ *path = realpath(devicepath, NULL);
+ if (*path == NULL) {
+ INFO(NCCL_NET|NCCL_INIT, "Could not find real path of %s", devicepath);
+ return ncclSystemError;
}
- INFO(INIT|NET,"%s", line);
- if (err1 == ncclSuccess) free(cudaPath);
- *scores = sc;
return ncclSuccess;
}
static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) {
- if (ncclNetIfs == -1) initDevices();
if (dev >= ncclNetIfs) return ncclInternalError;
memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr));
return ncclSuccess;
@@ -223,7 +214,9 @@ ncclResult_t ncclSocketClose(void* opaqueComm) {
ncclNet_t ncclNetSocket = {
"Socket",
+ ncclSocketInit,
ncclSocketDevices,
+ ncclSocketPciPath,
ncclSocketPtrSupport,
ncclSocketListen,
ncclSocketConnect,