Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Sylvain Jeaugey <sjeaugey@nvidia.com> 2018-11-06 03:56:14 +0300
committer: Sylvain Jeaugey <sjeaugey@nvidia.com> 2018-11-29 23:52:13 +0300
commit: eed8218e17a8a3c10dbfb1ebf739e5896b583ebd (patch)
tree: 62de00818d7446165e2dd285b88dabde5644a4d4
parent: 302d538b7308c704aa689d327ef4a9ca161e889a (diff)
Rework shared memory code to use SYSCHECK macros.
This is to handle EINTR/EAGAIN properly (issue #137), and also make the code consistent with the rest. Unfortunately, posix_fallocate and mmap do not follow the classic return code/errno pattern, so we need to write wrappers around those functions.
-rw-r--r-- src/include/shm.h 73
1 files changed, 34 insertions, 39 deletions
diff --git a/src/include/shm.h b/src/include/shm.h
index ce46a16..4fb49cb 100644
--- a/src/include/shm.h
+++ b/src/include/shm.h
@@ -12,51 +12,46 @@
#include <sys/stat.h>
#include <fcntl.h>
-static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPtr, void** devShmPtr, int create) {
- *shmPtr = NULL;
- int fd = shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
- if (fd == -1) {
- WARN("shm_open failed to open %s : %s", shmname, strerror(errno));
- return ncclSystemError;
- }
+// Change functions behavior to match other SYS functions
+static int shm_allocate(int fd, const int shmsize) {
+ int err = posix_fallocate(fd, 0, shmsize);
+ if (err) { errno = err; return -1; }
+ return 0;
+}
+static int shm_map(int fd, const int shmsize, void** ptr) {
+ *ptr = mmap(NULL, shmsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ return (*ptr == MAP_FAILED) ? -1 : 0;
+}
- if (create) {
- int res = posix_fallocate(fd, 0, shmsize);
- if (res != 0) {
- WARN("Unable to allocate shared memory (%d bytes) : %s", shmsize, strerror(res));
- shm_unlink(shmname);
- close(fd);
- return ncclSystemError;
- }
- }
+static ncclResult_t shmSetup(const char* shmname, const int shmsize, int* fd, void** ptr, int create) {
+ SYSCHECKVAL(shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR), "shm_open", *fd);
+ if (create) SYSCHECK(shm_allocate(*fd, shmsize), "posix_fallocate");
+ SYSCHECK(shm_map(*fd, shmsize, ptr), "mmap");
+ close(*fd);
+ *fd = -1;
+ if (create) memset(*ptr, 0, shmsize);
+ return ncclSuccess;
+}
- void *ptr = mmap(NULL, shmsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- close(fd);
- if (ptr == MAP_FAILED) {
- WARN("failure in mmap of %s (size %d) : %s", shmname, shmsize, strerror(errno));
- shm_unlink(shmname);
- return ncclSystemError;
- }
- if (create) {
- memset(ptr, 0, shmsize);
- }
+static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPtr, void** devShmPtr, int create) {
+ int fd = -1;
+ void* ptr = MAP_FAILED;
+ ncclResult_t res = ncclSuccess;
- cudaError_t e;
- if ((e=cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped)) != cudaSuccess) {
- WARN("failed to register host buffer %p : %s", ptr, cudaGetErrorString(e));
- if (create) shm_unlink(shmname);
- munmap(ptr, shmsize);
- return ncclUnhandledCudaError;
- }
+ NCCLCHECKGOTO(shmSetup(shmname, shmsize, &fd, &ptr, create), res, sysError);
+ CUDACHECKGOTO(cudaHostRegister(ptr, shmsize, cudaHostRegisterMapped), res, cudaError);
+ CUDACHECKGOTO(cudaHostGetDevicePointer(devShmPtr, ptr, 0), res, cudaError);
- if ((e=cudaHostGetDevicePointer(devShmPtr, ptr, 0)) != cudaSuccess) {
- WARN("failed to get device pointer for local shmem %p : %s", ptr, cudaGetErrorString(e));
- if (create) shm_unlink(shmname);
- munmap(ptr, shmsize);
- return ncclUnhandledCudaError;
- }
*shmPtr = ptr;
return ncclSuccess;
+sysError:
+ WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
+cudaError:
+ if (fd != -1) close(fd);
+ if (create) shm_unlink(shmname);
+ if (ptr != MAP_FAILED) munmap(ptr, shmsize);
+ *shmPtr = NULL;
+ return res;
}
static ncclResult_t shmUnlink(const char* shmname) {