
github.com/marian-nmt/nccl.git
author    Nathan Luehr <nluehr@nvidia.com>      2016-02-18 22:59:54 +0300
committer Przemek Tredak <ptredak@nvidia.com>  2016-02-19 00:45:42 +0300
commit    5554a4c9f0d81a39f1a737c92ade1fe88de6760e (patch)
tree      60d3d7ac08aa7e36eb2af29e1d163df6f186e0ca /src
parent    9442285526c082ba85520e722389fb8b87092c35 (diff)
Fixed useRemoteRecv consistency issue.
Change-Id: Ib093a8dc3bb093eddc89dad81d3fffa53c03a6a2
Reviewed-on: http://git-master/r/1013543
Reviewed-by: Cliff Woolley <jwoolley@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
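In short: before this change, commBuildMaps cleared each rank's comm->useRemoteRecv locally whenever a connection fell back from direct device memory, so different ranks could disagree about whether the PushToRecv algorithms were safe to use. The fix records any fallback in a ringDirectFail flag shared through the RankGather, adds a barrier (syncRingDirect) so every rank reads the same verdict before enabling PushToRecv, and, in the single-process ncclCommInitAll path, applies one local flag uniformly to all communicators.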
Diffstat (limited to 'src')
-rw-r--r--  src/core.cu | 143
1 file changed, 85 insertions(+), 58 deletions(-)
diff --git a/src/core.cu b/src/core.cu
index 67ac7e3..6adeb36 100644
--- a/src/core.cu
+++ b/src/core.cu
@@ -133,6 +133,7 @@ typedef struct {
union {
struct {
volatile int bar;
+ int ringDirectFail;
};
char pad[16];
};
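Note that the new ringDirectFail field sits next to the barrier counter inside the 16-byte padded union, so the two ints still fit in the existing padded slot and the struct layout is otherwise unchanged.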
@@ -178,6 +179,22 @@ static ncclResult_t initGather(RankGather** gather, ncclUniqueId commId,
return ncclSuccess;
}
+static void syncRingDirect(RankGather* gather, int* ringDirectOk) {
+ int bar_tmp = gather->bar - 1;
+ int ndev = gather->ranks[0].ndev;
+ bool swapped;
+ do {
+ bar_tmp += 1;
+ swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
+ } while(!swapped);
+
+ while (gather->bar != 2*ndev) // Wait for all ranks to arrive at this second barrier
+ sched_yield();
+ __sync_synchronize();
+
+ *ringDirectOk = gather->ringDirectFail ? 0 : 1;
+}
+
static ncclResult_t closeGather(RankGather* gather, int ndev) {
int bar_tmp = gather->bar - 1;
bool swapped;
@@ -186,7 +203,7 @@ static ncclResult_t closeGather(RankGather* gather, int ndev) {
swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
} while(!swapped);
- while (gather->bar != 2*ndev)
+ while (gather->bar != 3*ndev) // Wait for all ranks to arrive at this third barrier
sched_yield();
__sync_synchronize();
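For readers unfamiliar with the pattern, here is a minimal standalone sketch of the compare-and-swap barrier that syncRingDirect and closeGather both implement against gather->bar. The function and its names (barrierArriveAndWait, phase) are invented for illustration; this is not NCCL code.

    #include <sched.h>

    /* Each of the ndev participants atomically increments the shared
     * counter, then spins until every participant has passed `phase`
     * barriers in total. */
    static void barrierArriveAndWait(volatile int* bar, int ndev, int phase) {
        int seen = *bar;
        /* Bump the arrival counter; retry if another rank raced us. */
        while (!__sync_bool_compare_and_swap(bar, seen, seen + 1))
            seen = *bar;
        /* All ranks have arrived once the counter reaches phase * ndev. */
        while (*bar != phase * ndev)
            sched_yield();
        __sync_synchronize();  /* make pre-barrier writes visible to all ranks */
    }

The wait targets imply three phases on the same counter: the barrier taken during initGather, the new second barrier in syncRingDirect (2*ndev), and the final one in closeGather, whose wait condition accordingly moves from 2*ndev to 3*ndev.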
@@ -347,7 +364,7 @@ static ncclResult_t commClearMaps(ncclComm_t comm) {
return retval;
}
-static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int rank, RankEntry* ranks) {
+static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int rank, RankEntry* ranks, int* ringDirectFailed) {
int ndev = comm->nDev;
for(int i=0; i<ndev; ++i) {
// Check for inconsistencies between ranks
@@ -418,64 +435,65 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
}
}
- if (iPid == myPid && (canpeer || myDev == iDev)) {
- INFO("rank access %d -> %d via P2P device mem", rank, iRank);
- comm->ptrs[i].local = ranks[myId].devptr;
- comm->ptrs[i].remote = ranks[i].devptr;
- comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
- } else if (iPid == myPid) {
- INFO("rank access %d -> %d via zero-copy host mem", rank, iRank);
- if (j <= 2) {
- comm->useRemoteRecv = 0;
- }
- if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
- WARN("rank %d failed to map zero copy buffer to device", rank);
- commClearMaps(comm);
- return ncclUnhandledCudaError;
- }
- if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, ranks[i].hostptr, 0) != cudaSuccess) {
- WARN("rank %d failed to map %d's zero copy buffer to device", rank, iRank);
- commClearMaps(comm);
- return ncclUnhandledCudaError;
- }
- comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
- } else if (canpeer || myDev == iDev) {
- INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank);
- comm->useRemoteRecv = 0;
- comm->ptrs[i].local = ranks[myId].devptr;
- if (wrapCuIpcOpenMemHandle((CUdeviceptr*)(&comm->ptrs[i].remote),
- ranks[i].devipc, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) != ncclSuccess) {
- WARN("rank %d failed to open Ipc handle to rank %d", rank, iRank);
- commClearMaps(comm);
- return ncclUnhandledCudaError;
- }
- comm->ptrs[i].remoteCleanup = CLEANUP_CUIPC;
- comm->ptrs[i].cleanupHandle = comm->ptrs[i].remote;
- } else {
- INFO("rank access %d -> %d via zero copy host shm", rank, iRank);
- comm->useRemoteRecv = 0;
- if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
- WARN("rank %d failed to obtain dev ptr to sysmem buffer", rank);
- commClearMaps(comm);
- return ncclUnhandledCudaError;
- }
- char rankname[1024];
- sprintf(rankname, "%s-%d", commId->internal, ranks[i].rank);
- if (openHostMemShm(rankname, (ncclMem**)&comm->ptrs[i].cleanupHandle, ranks[i].buffSize)
- != ncclSuccess) {
- WARN("rank %d failed to open sysmem buffer of rank %d", rank, iRank);
- commClearMaps(comm);
- return ncclUnhandledCudaError;
+ if (iPid == myPid) {
+ if (canpeer || myDev == iDev) {
+ INFO("rank access %d -> %d via P2P device mem", rank, iRank);
+ comm->ptrs[i].local = ranks[myId].devptr;
+ comm->ptrs[i].remote = ranks[i].devptr;
+ comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
+ } else { // go through hostmem
+ INFO("rank access %d -> %d via zero-copy host mem", rank, iRank);
+ if (j <= 2)
+ *ringDirectFailed = 1;
+ if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
+ WARN("rank %d failed to map zero copy buffer to device", rank);
+ commClearMaps(comm);
+ return ncclUnhandledCudaError;
+ }
+ if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, ranks[i].hostptr, 0) != cudaSuccess) {
+ WARN("rank %d failed to map %d's zero copy buffer to device", rank, iRank);
+ commClearMaps(comm);
+ return ncclUnhandledCudaError;
+ }
+ comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
}
- if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, comm->ptrs[i].cleanupHandle, 0) != cudaSuccess) {
- WARN("rank %d failed to obtain dev ptr for rank %d", rank, iRank);
- commClearMaps(comm);
- return ncclUnhandledCudaError;
+ } else { // multi-process!
+ *ringDirectFailed = 1;
+ if (canpeer || myDev == iDev) {
+ INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank);
+ comm->ptrs[i].local = ranks[myId].devptr;
+ if (wrapCuIpcOpenMemHandle((CUdeviceptr*)(&comm->ptrs[i].remote),
+ ranks[i].devipc, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) != ncclSuccess) {
+ WARN("rank %d failed to open Ipc handle to rank %d", rank, iRank);
+ commClearMaps(comm);
+ return ncclUnhandledCudaError;
+ }
+ comm->ptrs[i].remoteCleanup = CLEANUP_CUIPC;
+ comm->ptrs[i].cleanupHandle = comm->ptrs[i].remote;
+ } else { // go through hostmem
+ INFO("rank access %d -> %d via zero copy host shm", rank, iRank);
+ if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
+ WARN("rank %d failed to obtain dev ptr to sysmem buffer", rank);
+ commClearMaps(comm);
+ return ncclUnhandledCudaError;
+ }
+ char rankname[1024];
+ sprintf(rankname, "%s-%d", commId->internal, ranks[i].rank);
+ if (openHostMemShm(rankname, (ncclMem**)&comm->ptrs[i].cleanupHandle, ranks[i].buffSize)
+ != ncclSuccess) {
+ WARN("rank %d failed to open sysmem buffer of rank %d", rank, iRank);
+ commClearMaps(comm);
+ return ncclUnhandledCudaError;
+ }
+ if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, comm->ptrs[i].cleanupHandle, 0) != cudaSuccess) {
+ WARN("rank %d failed to obtain dev ptr for rank %d", rank, iRank);
+ commClearMaps(comm);
+ return ncclUnhandledCudaError;
+ }
+ comm->ptrs[i].remoteCleanup = CLEANUP_UNMAP;
}
- comm->ptrs[i].remoteCleanup = CLEANUP_UNMAP;
}
}
- INFO("PushToRecv algos are %s\n", (comm->useRemoteRecv) ? "enabled" : "disabled");
return ncclSuccess;
}
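The restructuring above replaces a flat else-if chain with nested same-process / cross-process branches. A hypothetical condensed view of the resulting decision (invented names, not the actual NCCL API):

    enum Transport { P2P_DIRECT, HOST_ZEROCOPY, IPC_P2P, HOST_SHM };

    /* Any path that cannot hand this rank a direct pointer into the peer's
     * device buffer marks the shared failure flag; syncRingDirect later
     * turns that into a communicator-wide decision. */
    static enum Transport pickTransport(int samePid, int canPeer, int sameDev,
                                        int j, int* ringDirectFailed) {
        if (samePid) {
            if (canPeer || sameDev)
                return P2P_DIRECT;      /* direct device pointers, flag untouched */
            if (j <= 2)                 /* ring neighbor forced through host memory */
                *ringDirectFailed = 1;
            return HOST_ZEROCOPY;
        }
        *ringDirectFailed = 1;          /* every cross-process path disables PushToRecv */
        return (canPeer || sameDev) ? IPC_P2P : HOST_SHM;
    }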
@@ -679,12 +697,15 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId
goto cleanup;
}
- res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks);
+ res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks, &gath->ringDirectFail);
if (res != ncclSuccess) {
WARN("rank %d failed to build comm maps", myrank);
goto cleanup;
}
+ syncRingDirect(gath, &((*newcomm)->useRemoteRecv));
+ INFO("PushToRecv algos are %s\n", (*newcomm)->useRemoteRecv ? "enabled" : "disabled");
+
res = closeGather(gath, ndev); // includes a barrier
gath = NULL;
if (res != ncclSuccess) {
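Because syncRingDirect runs only after every rank has finished commBuildMaps, the value it returns reflects failures on any rank; the PushToRecv INFO message moves here so it logs that agreed result rather than the rank-local one.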
@@ -722,6 +743,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, int* devlist) {
char busId[13];
nvmlDevice_t nvmlHandle;
int affinity_set = 0;
+ int ringDirectFail = 0; // Assume direct access to recv ptr OK
res = wrapSymbols();
if (res != ncclSuccess) {
@@ -792,13 +814,18 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, int* devlist) {
for(rank=0; rank<ndev; ++rank) {
comm = comms[rank];
cudaSetDevice(comm->cudaDev);
- res = commBuildMaps(comm, NULL, rank, ranks);
+ res = commBuildMaps(comm, NULL, rank, ranks, &ringDirectFail);
if (res != ncclSuccess) {
WARN("rank %d failed to build comm maps", rank);
goto cleanup;
}
}
+ INFO("PushToRecv algos are %s\n", (ringDirectFail) ? "disabled" : "enabled");
+ for(rank=0; rank<ndev; ++rank) {
+ comms[rank]->useRemoteRecv = ringDirectFail ? 0 : 1;
+ }
+
free(ranks);
ranks = NULL;
res = ncclSuccess;
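No barrier is needed in the single-process ncclCommInitAll path: one thread builds all maps sequentially into a single local ringDirectFail, which is then copied into every communicator's useRemoteRecv.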