diff options
author | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2018-11-06 03:56:14 +0300 |
---|---|---|
committer | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2018-11-29 23:52:13 +0300 |
commit | eed8218e17a8a3c10dbfb1ebf739e5896b583ebd (patch) | |
tree | 62de00818d7446165e2dd285b88dabde5644a4d4 | |
parent | 302d538b7308c704aa689d327ef4a9ca161e889a (diff) |
Rework shared memory code to use SYSCHECK macros.
This is to handle EINTR/EAGAIN properly (issue #137), and also
make the code consistent with the rest.
Unfortunately posix_fallocate and mmap do not follow the classic
return code/errno pattern, so we need to write wrappers around those
functions.
-rw-r--r-- | src/include/shm.h | 73 |
1 files changed, 34 insertions, 39 deletions
```diff
diff --git a/src/include/shm.h b/src/include/shm.h
index ce46a16..4fb49cb 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -12,51 +12,46 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 
-static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPtr, void** devShmPtr, int create) {
-  *shmPtr = NULL;
-  int fd = shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
-  if (fd == -1) {
-    WARN("shm_open failed to open %s : %s", shmname, strerror(errno));
-    return ncclSystemError;
-  }
+// Change functions behavior to match other SYS functions
+static int shm_allocate(int fd, const int shmsize) {
+  int err = posix_fallocate(fd, 0, shmsize);
+  if (err) { errno = err; return -1; }
+  return 0;
+}
+static int shm_map(int fd, const int shmsize, void** ptr) {
+  *ptr = mmap(NULL, shmsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  return (*ptr == MAP_FAILED) ? -1 : 0;
+}
 
-  if (create) {
-    int res = posix_fallocate(fd, 0, shmsize);
-    if (res != 0) {
-      WARN("Unable to allocate shared memory (%d bytes) : %s", shmsize, strerror(res));
-      shm_unlink(shmname);
-      close(fd);
-      return ncclSystemError;
-    }
-  }
+static ncclResult_t shmSetup(const char* shmname, const int shmsize, int* fd, void** ptr, int create) {
+  SYSCHECKVAL(shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "shm_open", *fd);
+  if (create) SYSCHECK(shm_allocate(*fd, shmsize), "posix_fallocate");
+  SYSCHECK(shm_map(*fd, shmsize, ptr), "mmap");
+  close(*fd);
+  *fd = -1;
+  if (create) memset(*ptr, 0, shmsize);
+  return ncclSuccess;
+}
 
-  void *ptr = mmap(NULL, shmsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-  close(fd);
-  if (ptr == MAP_FAILED) {
-    WARN("failure in mmap of %s (size %d) : %s", shmname, shmsize, strerror(errno));
-    shm_unlink(shmname);
-    return ncclSystemError;
-  }
-  if (create) {
-    memset(ptr, 0, shmsize);
-  }
+static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPtr, void** devShmPtr, int create) {
+  int fd = -1;
+  void* ptr = MAP_FAILED;
+  ncclResult_t res = ncclSuccess;
 
-  cudaError_t e;
-  if ((e=cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped)) != cudaSuccess) {
-    WARN("failed to register host buffer %p : %s", ptr, cudaGetErrorString(e));
-    if (create) shm_unlink(shmname);
-    munmap(ptr, shmsize);
-    return ncclUnhandledCudaError;
-  }
+  NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError);
+  CUDACHECKGOTO(cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped), res, cudaError);
+  CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
 
-  if ((e=cudaHostGetDevicePointer(devShmPtr, ptr, 0)) != cudaSuccess) {
-    WARN("failed to get device pointer for local shmem %p : %s", ptr, cudaGetErrorString(e));
-    if (create) shm_unlink(shmname);
-    munmap(ptr, shmsize);
-    return ncclUnhandledCudaError;
-  }
   *shmPtr = ptr;
   return ncclSuccess;
+sysError:
+  WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
+cudaError:
+  if (fd != -1) close(fd);
+  if (create) shm_unlink(shmname);
+  if (ptr != MAP_FAILED) munmap(ptr, shmsize);
+  *shmPtr = NULL;
+  return res;
 }
 
 static ncclResult_t shmUnlink(const char* shmname) {
```