author | Nathan Luehr <nluehr@nvidia.com> | 2016-02-18 22:59:54 +0300 |
---|---|---|
committer | Przemek Tredak <ptredak@nvidia.com> | 2016-02-19 00:45:42 +0300 |
commit | 5554a4c9f0d81a39f1a737c92ade1fe88de6760e (patch) | |
tree | 60d3d7ac08aa7e36eb2af29e1d163df6f186e0ca /src | |
parent | 9442285526c082ba85520e722389fb8b87092c35 (diff) | |
Fixed useRemoteRecv consistency issue.
Change-Id: Ib093a8dc3bb093eddc89dad81d3fffa53c03a6a2
Reviewed-on: http://git-master/r/1013543
Reviewed-by: Cliff Woolley <jwoolley@nvidia.com>
Tested-by: Przemek Tredak <ptredak@nvidia.com>
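
Why this was needed: before this change, each rank cleared its own copy of `comm->useRemoteRecv` while building its maps, so ranks that took different access paths (P2P device memory, zero-copy host memory, IPC, or host shm) could disagree about whether the push-to-recv ring was usable. The patch instead OR-reduces a single `ringDirectFail` flag through the shared `RankGather` state and has every rank read it back only after a barrier (`syncRingDirect`), so all ranks reach the same verdict. Below is a minimal standalone sketch of that agreement pattern; it is not NCCL code, and the rank count, the rank that "fails", and all names are invented for illustration, but the CAS-counter barrier and the final flag read mirror what the patch does:

```c
/* Sketch (not NCCL code): ranks OR a failure flag into shared state,
 * barrier, then all read the same flag. Mirrors syncRingDirect(). */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define NDEV 4  /* assumed rank count */

static volatile int bar = 0;   /* shared arrival counter, like RankGather's bar */
static int ringDirectFail = 0; /* OR-reduced failure flag */

/* CAS-increment then spin: the same barrier idiom the patch uses. */
static void barrier_wait(int target) {
  int bar_tmp = bar - 1;
  int swapped;
  do {
    bar_tmp += 1;
    swapped = __sync_bool_compare_and_swap(&bar, bar_tmp, bar_tmp + 1);
  } while (!swapped);
  while (bar != target)  /* wait until all NDEV ranks have arrived */
    sched_yield();
  __sync_synchronize();  /* make earlier writes (the flag) visible */
}

static void* rank_main(void* arg) {
  int rank = (int)(long)arg;
  if (rank == 2)         /* pretend this rank must fall back to host memory */
    ringDirectFail = 1;
  barrier_wait(NDEV);    /* corresponds to syncRingDirect's barrier */
  int useRemoteRecv = ringDirectFail ? 0 : 1;
  printf("rank %d: useRemoteRecv=%d\n", rank, useRemoteRecv);
  return NULL;
}

int main(void) {
  pthread_t t[NDEV];
  for (long i = 0; i < NDEV; ++i)
    pthread_create(&t[i], NULL, rank_main, (void*)i);
  for (int i = 0; i < NDEV; ++i)
    pthread_join(t[i], NULL);
  return 0;
}
```

Built with `gcc -pthread`, every thread prints the same `useRemoteRecv` value regardless of which rank raised the flag, which is exactly the consistency property the commit restores.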
Diffstat (limited to 'src')
-rw-r--r-- | src/core.cu | 143 |
1 file changed, 85 insertions, 58 deletions
```diff
diff --git a/src/core.cu b/src/core.cu
index 67ac7e3..6adeb36 100644
--- a/src/core.cu
+++ b/src/core.cu
@@ -133,6 +133,7 @@ typedef struct {
   union {
     struct {
       volatile int bar;
+      int ringDirectFail;
     };
     char pad[16];
   };
@@ -178,6 +179,22 @@ static ncclResult_t initGather(RankGather** gather, ncclUniqueId commId,
   return ncclSuccess;
 }
 
+static void syncRingDirect(RankGather* gather, int* ringDirectOk) {
+  int bar_tmp = gather->bar - 1;
+  int ndev = gather->ranks[0].ndev;
+  bool swapped;
+  do {
+    bar_tmp += 1;
+    swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
+  } while(!swapped);
+
+  while (gather->bar != 2*ndev) // Wait for all ranks to arrive at this second barrier
+    sched_yield();
+  __sync_synchronize();
+
+  *ringDirectOk = gather->ringDirectFail ? 0 : 1;
+}
+
 static ncclResult_t closeGather(RankGather* gather, int ndev) {
   int bar_tmp = gather->bar - 1;
   bool swapped;
@@ -186,7 +203,7 @@ static ncclResult_t closeGather(RankGather* gather, int ndev) {
     swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
   } while(!swapped);
 
-  while (gather->bar != 2*ndev)
+  while (gather->bar != 3*ndev) // Wait for all ranks to arrive at this third barrier
     sched_yield();
   __sync_synchronize();
 
@@ -347,7 +364,7 @@ static ncclResult_t commClearMaps(ncclComm_t comm) {
   return retval;
 }
 
-static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int rank, RankEntry* ranks) {
+static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int rank, RankEntry* ranks, int* ringDirectFailed) {
   int ndev = comm->nDev;
   for(int i=0; i<ndev; ++i) {
     // Check for inconsistencies between ranks
@@ -418,64 +435,65 @@ static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int ran
       }
     }
 
-    if (iPid == myPid && (canpeer || myDev == iDev)) {
-      INFO("rank access %d -> %d via P2P device mem", rank, iRank);
-      comm->ptrs[i].local = ranks[myId].devptr;
-      comm->ptrs[i].remote = ranks[i].devptr;
-      comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
-    } else if (iPid == myPid) {
-      INFO("rank access %d -> %d via zero-copy host mem", rank, iRank);
-      if (j <= 2) {
-        comm->useRemoteRecv = 0;
-      }
-      if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
-        WARN("rank %d failed to map zero copy buffer to device", rank);
-        commClearMaps(comm);
-        return ncclUnhandledCudaError;
-      }
-      if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, ranks[i].hostptr, 0) != cudaSuccess) {
-        WARN("rank %d failed to map %d's zero copy buffer to device", rank, iRank);
-        commClearMaps(comm);
-        return ncclUnhandledCudaError;
-      }
-      comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
-    } else if (canpeer || myDev == iDev) {
-      INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank);
-      comm->useRemoteRecv = 0;
-      comm->ptrs[i].local = ranks[myId].devptr;
-      if (wrapCuIpcOpenMemHandle((CUdeviceptr*)(&comm->ptrs[i].remote),
-          ranks[i].devipc, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) != ncclSuccess) {
-        WARN("rank %d failed to open Ipc handle to rank %d", rank, iRank);
-        commClearMaps(comm);
-        return ncclUnhandledCudaError;
-      }
-      comm->ptrs[i].remoteCleanup = CLEANUP_CUIPC;
-      comm->ptrs[i].cleanupHandle = comm->ptrs[i].remote;
-    } else {
-      INFO("rank access %d -> %d via zero copy host shm", rank, iRank);
-      comm->useRemoteRecv = 0;
-      if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
-        WARN("rank %d failed to obtain dev ptr to sysmem buffer", rank);
-        commClearMaps(comm);
-        return ncclUnhandledCudaError;
-      }
-      char rankname[1024];
-      sprintf(rankname, "%s-%d", commId->internal, ranks[i].rank);
-      if (openHostMemShm(rankname, (ncclMem**)&comm->ptrs[i].cleanupHandle, ranks[i].buffSize)
-          != ncclSuccess) {
-        WARN("rank %d failed to open sysmem buffer of rank %d", rank, iRank);
-        commClearMaps(comm);
-        return ncclUnhandledCudaError;
-      }
-      if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, comm->ptrs[i].cleanupHandle, 0) != cudaSuccess) {
-        WARN("rank %d failed to obtain dev ptr for rank %d", rank, iRank);
-        commClearMaps(comm);
-        return ncclUnhandledCudaError;
-      }
-      comm->ptrs[i].remoteCleanup = CLEANUP_UNMAP;
+    if (iPid == myPid) {
+      if (canpeer || myDev == iDev) {
+        INFO("rank access %d -> %d via P2P device mem", rank, iRank);
+        comm->ptrs[i].local = ranks[myId].devptr;
+        comm->ptrs[i].remote = ranks[i].devptr;
+        comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
+      } else { // go through hostmem
+        INFO("rank access %d -> %d via zero-copy host mem", rank, iRank);
+        if (j <= 2)
+          *ringDirectFailed = 1;
+        if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
+          WARN("rank %d failed to map zero copy buffer to device", rank);
+          commClearMaps(comm);
+          return ncclUnhandledCudaError;
+        }
+        if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, ranks[i].hostptr, 0) != cudaSuccess) {
+          WARN("rank %d failed to map %d's zero copy buffer to device", rank, iRank);
+          commClearMaps(comm);
+          return ncclUnhandledCudaError;
+        }
+        comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
+      }
+    } else { // multi-process!
+      *ringDirectFailed = 1;
+      if (canpeer || myDev == iDev) {
+        INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank);
+        comm->ptrs[i].local = ranks[myId].devptr;
+        if (wrapCuIpcOpenMemHandle((CUdeviceptr*)(&comm->ptrs[i].remote),
+            ranks[i].devipc, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) != ncclSuccess) {
+          WARN("rank %d failed to open Ipc handle to rank %d", rank, iRank);
+          commClearMaps(comm);
+          return ncclUnhandledCudaError;
+        }
+        comm->ptrs[i].remoteCleanup = CLEANUP_CUIPC;
+        comm->ptrs[i].cleanupHandle = comm->ptrs[i].remote;
+      } else { // go through hostmem
+        INFO("rank access %d -> %d via zero copy host shm", rank, iRank);
+        if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
+          WARN("rank %d failed to obtain dev ptr to sysmem buffer", rank);
+          commClearMaps(comm);
+          return ncclUnhandledCudaError;
+        }
+        char rankname[1024];
+        sprintf(rankname, "%s-%d", commId->internal, ranks[i].rank);
+        if (openHostMemShm(rankname, (ncclMem**)&comm->ptrs[i].cleanupHandle, ranks[i].buffSize)
+            != ncclSuccess) {
+          WARN("rank %d failed to open sysmem buffer of rank %d", rank, iRank);
+          commClearMaps(comm);
+          return ncclUnhandledCudaError;
+        }
+        if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, comm->ptrs[i].cleanupHandle, 0) != cudaSuccess) {
+          WARN("rank %d failed to obtain dev ptr for rank %d", rank, iRank);
+          commClearMaps(comm);
+          return ncclUnhandledCudaError;
+        }
+        comm->ptrs[i].remoteCleanup = CLEANUP_UNMAP;
+      }
     }
   }
 
-  INFO("PushToRecv algos are %s\n", (comm->useRemoteRecv) ? "enabled" : "disabled");
   return ncclSuccess;
 }
@@ -679,12 +697,15 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId
     goto cleanup;
   }
 
-  res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks);
+  res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks, &gath->ringDirectFail);
   if (res != ncclSuccess) {
     WARN("rank %d failed to build comm maps", myrank);
     goto cleanup;
   }
 
+  syncRingDirect(gath, &((*newcomm)->useRemoteRecv));
+  INFO("PushToRecv algos are %s\n", (*newcomm)->useRemoteRecv ? "enabled" : "disabled");
+
   res = closeGather(gath, ndev); // includes a barrier
   gath = NULL;
   if (res != ncclSuccess) {
@@ -722,6 +743,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, int* devlist) {
   char busId[13];
   nvmlDevice_t nvmlHandle;
   int affinity_set = 0;
+  int ringDirectFail = 0; // Assume direct access to recv ptr OK
 
   res = wrapSymbols();
   if (res != ncclSuccess) {
@@ -792,13 +814,18 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, int* devlist) {
   for(rank=0; rank<ndev; ++rank) {
     comm = comms[rank];
     cudaSetDevice(comm->cudaDev);
-    res = commBuildMaps(comm, NULL, rank, ranks);
+    res = commBuildMaps(comm, NULL, rank, ranks, &ringDirectFail);
     if (res != ncclSuccess) {
       WARN("rank %d failed to build comm maps", rank);
       goto cleanup;
     }
   }
 
+  INFO("PushToRecv algos are %s\n", (ringDirectFail) ? "disabled" : "enabled");
+  for(rank=0; rank<ndev; ++rank) {
+    comms[rank]->useRemoteRecv = ringDirectFail ? 0 : 1;
+  }
+
   free(ranks);
   ranks = NULL;
   res = ncclSuccess;
```