Fix shared memory collision in multi-communicator case.

The current SHM object name uses only the pidHash and the ranks as identification, so names collide with each other when a program runs with multiple communicators. Here we add the commId info into pidHash, so that the pidHash values of different communicators living in the same process are distinct from each other.
author: Cao Zongyan <zongyan.cao@alibaba-inc.com> 2019-03-13 12:13:39 +0300
committer: Cao Zongyan <zongyan.cao@alibaba-inc.com> 2019-03-15 07:50:32 +0300
commit: 161763aab2befd038da7cdecbbe8e52a56677200 (patch)
tree: 76fea7e18201a9b35e0cc3a78a5fb103f03be447
parent: 14e0cf644b9ba2214f2b6d2e299e8218f6145d32 (diff)
3 files changed, 17 insertions, 6 deletions
diff --git a/src/include/utils.h b/src/include/utils.h
index 5a6a588..0ed875c 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -11,6 +11,7 @@
 #include <stdint.h>
 
 ncclResult_t getHostName(char* hostname, int maxlen);
+uint64_t getnHash(const char* string, int n);
 uint64_t getHostHash();
 uint64_t getPidHash();
 
diff --git a/src/init.cu b/src/init.cu
index 75822e6..b8032e8 100644
--- a/src/init.cu
+++ b/src/init.cu
@@ -302,12 +302,12 @@ static void showVersion() {
   }
 }
 
-static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) {
+static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) {
   info->rank = rank;
   CUDACHECK(cudaGetDevice(&info->cudaDev));
   NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev))
-  info->hostHash=getHostHash();
-  info->pidHash=getPidHash();
+  info->hostHash=getHostHash()+commHash;
+  info->pidHash=getPidHash()+commHash;
 
   // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
   // cudaDev is a CUDA runtime dev number which could be different from the
@@ -679,7 +679,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
   int rank = comm->rank;
   int nranks = comm->nRanks;
-  TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
+  uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
+  TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
   NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
 
   // AllGather1 - begin
@@ -690,7 +691,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
   NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
   allGather1Data[rank].comm = comm;
-  NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
+  NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash));
   NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
 
   NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
@@ -960,7 +961,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
   NCCLCHECK(ncclCalloc(&allInfo, nranks));
   for (int rank=0; rank<nranks; rank++) {
     CUDACHECK(cudaSetDevice(devs[rank]));
-    NCCLCHECK(fillInfo(allInfo+rank, rank));
+    NCCLCHECK(fillInfo(allInfo+rank, rank, 0));
   }
 
   int* connectTransport;
diff --git a/src/misc/utils.cu b/src/misc/utils.cu
index c618e71..c605c41 100644
--- a/src/misc/utils.cu
+++ b/src/misc/utils.cu
@@ -96,6 +96,15 @@ uint64_t getHash(const char* string) {
   return result;
 }
 
+uint64_t getnHash(const char* string, int n) {
+  // Based on DJB2, result = result * 33 + char; hashes exactly n bytes of string (no stop at a NUL byte)
+  uint64_t result = 9527;  // starting value of the hash
+  for (int c = 0; c < n; c++) {
+    result = ((result << 5) + result) + string[c];  // (result << 5) + result == result * 33; NOTE(review): plain char may be signed, so bytes >= 0x80 could sign-extend - confirm
+  }
+  return result;
+}
+
 /* Generate a hash of the unique identifying string for this host
  * that will be unique for both bare-metal and container instances
  * Equivalent of a hash of;
author	Cao Zongyan <zongyan.cao@alibaba-inc.com>	2019-03-13 12:13:39 +0300
committer	Cao Zongyan <zongyan.cao@alibaba-inc.com>	2019-03-15 07:50:32 +0300
commit	161763aab2befd038da7cdecbbe8e52a56677200 (patch)
tree	76fea7e18201a9b35e0cc3a78a5fb103f03be447
parent	14e0cf644b9ba2214f2b6d2e299e8218f6145d32 (diff)