Merge branch 'shm' of git://github.com/lowintelligence/nccl into lowintelligence-shm

author: David Addison <daddison@nvidia.com> 2019-08-14 19:45:45 +0300
committer: David Addison <daddison@nvidia.com> 2019-08-14 19:45:45 +0300
commit: 01d1836668ad59bfc26d14aede5fec593da6ae42 (patch)
tree: a1a63ee00fc11f48a3cfa143881e7a0db5da9ecb
parent: 7f2b337e703d73ed369937c9996e1f3d5f664ad0 (diff)
parent: 161763aab2befd038da7cdecbbe8e52a56677200 (diff)
3 files changed, 17 insertions, 6 deletions
diff --git a/src/include/utils.h b/src/include/utils.h
index 93e72c8..3038e68 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -11,6 +11,7 @@
 #include <stdint.h>
 
 ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
+uint64_t getnHash(const char* string, int n);
 uint64_t getHostHash();
 uint64_t getPidHash();
 
diff --git a/src/init.cc b/src/init.cc
index 0158f8d..229742c 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -308,12 +308,12 @@ static void showVersion() {
   }
 }
 
-static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) {
+static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) {
   info->rank = rank;
   CUDACHECK(cudaGetDevice(&info->cudaDev));
   NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev))
-  info->hostHash=getHostHash();
-  info->pidHash=getPidHash();
+  info->hostHash=getHostHash()+commHash;
+  info->pidHash=getPidHash()+commHash;
 
   // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
   // cudaDev is a CUDA runtime dev number which could be different from the
@@ -691,7 +691,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
   int rank = comm->rank;
   int nranks = comm->nRanks;
-  TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
+  uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
+  TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
   NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
 
   // AllGather1 - begin
@@ -702,7 +703,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
   NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
   allGather1Data[rank].comm = comm;
-  NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
+  NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash));
   NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
 
   NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
@@ -998,7 +999,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
   NCCLCHECK(ncclCalloc(&allInfo, nranks));
   for (int rank=0; rank<nranks; rank++) {
     CUDACHECK(cudaSetDevice(devs[rank]));
-    NCCLCHECK(fillInfo(allInfo+rank, rank));
+    NCCLCHECK(fillInfo(allInfo+rank, rank, 0));
   }
 
   int* connectTransport;
diff --git a/src/misc/utils.cc b/src/misc/utils.cc
index 5093755..b511af1 100644
--- a/src/misc/utils.cc
+++ b/src/misc/utils.cc
@@ -96,6 +96,15 @@ uint64_t getHash(const char* string) {
   return result;
 }
 
+uint64_t getnHash(const char* string, int n) {
+  // DJB2-style hash of exactly n bytes (result = result * 33 + byte, seed 9527); NUL bytes do not stop it.
+  uint64_t result = 9527;
+  for (int c = 0; c < n; c++) {
+    result = ((result << 5) + result) + (unsigned char)string[c];  // unsigned: no sign-extension of bytes >= 0x80
+  }
+  return result;
+}
+
 /* Generate a hash of the unique identifying string for this host
  * that will be unique for both bare-metal and container instances
  * Equivalent of a hash of;
author	David Addison <daddison@nvidia.com>	2019-08-14 19:45:45 +0300
committer	David Addison <daddison@nvidia.com>	2019-08-14 19:45:45 +0300
commit	01d1836668ad59bfc26d14aede5fec593da6ae42 (patch)
tree	a1a63ee00fc11f48a3cfa143881e7a0db5da9ecb
parent	7f2b337e703d73ed369937c9996e1f3d5f664ad0 (diff)
parent	161763aab2befd038da7cdecbbe8e52a56677200 (diff)