diff options
| field | value | date |
|---|---|---|
| author | David Addison <daddison@nvidia.com> | 2019-08-14 20:09:53 +0300 |
| committer | David Addison <daddison@nvidia.com> | 2019-08-14 20:09:53 +0300 |
| commit | ccb1298148327bacb9b83452ed6ae0b29417e7e2 (patch) | |
| tree | 45e399f0568e26cd57e28deeb892d4aca8093d60 | |
| parent | 7f2b337e703d73ed369937c9996e1f3d5f664ad0 (diff) | |
| parent | fad079a8aeb72f4fb30f3564a48ad4ec37ea58f6 (diff) | |
Merge branch 'lowintelligence-shm'
PR#196
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/include/utils.h | 1 |
| -rw-r--r-- | src/init.cc | 13 |
| -rw-r--r-- | src/misc/utils.cc | 8 |
3 files changed, 12 insertions, 10 deletions
diff --git a/src/include/utils.h b/src/include/utils.h index 93e72c8..5acccc2 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -11,6 +11,7 @@ #include <stdint.h> ncclResult_t getHostName(char* hostname, int maxlen, const char delim); +uint64_t getHash(const char* string, int n); uint64_t getHostHash(); uint64_t getPidHash(); diff --git a/src/init.cc b/src/init.cc index 0158f8d..706d3a6 100644 --- a/src/init.cc +++ b/src/init.cc @@ -308,12 +308,12 @@ static void showVersion() { } } -static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) { +static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) { info->rank = rank; CUDACHECK(cudaGetDevice(&info->cudaDev)); NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev)) - info->hostHash=getHostHash(); - info->pidHash=getPidHash(); + info->hostHash=getHostHash()+commHash; + info->pidHash=getPidHash()+commHash; // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the // cudaDev is a CUDA runtime dev number which could be different from the @@ -691,7 +691,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int rank = comm->rank; int nranks = comm->nRanks; - TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks); + uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES); + TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap)); // AllGather1 - begin @@ -702,7 +703,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); allGather1Data[rank].comm = comm; - NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank)); + NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash)); NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data))); NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks)); @@ 
-998,7 +999,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, NCCLCHECK(ncclCalloc(&allInfo, nranks)); for (int rank=0; rank<nranks; rank++) { CUDACHECK(cudaSetDevice(devs[rank])); - NCCLCHECK(fillInfo(allInfo+rank, rank)); + NCCLCHECK(fillInfo(allInfo+rank, rank, 0)); } int* connectTransport; diff --git a/src/misc/utils.cc b/src/misc/utils.cc index 5093755..da99774 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -87,10 +87,10 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file } } -uint64_t getHash(const char* string) { +uint64_t getHash(const char* string, int n) { // Based on DJB2, result = result * 33 + char uint64_t result = 5381; - for (int c = 0; string[c] != '\0'; c++) { + for (int c = 0; c < n; c++) { result = ((result << 5) + result) + string[c]; } return result; @@ -120,7 +120,7 @@ uint64_t getHostHash(void) { uname[offset]='\0'; TRACE(NCCL_INIT,"unique hostname '%s'", uname); - return getHash(uname); + return getHash(uname, strlen(uname)); } /* Generate a hash of the unique identifying string for this process @@ -140,7 +140,7 @@ uint64_t getPidHash(void) { pname[plen+len]='\0'; TRACE(NCCL_INIT,"unique PID '%s'", pname); - return getHash(pname); + return getHash(pname, strlen(pname)); } int parseStringList(const char* string, struct netIf* ifList, int maxList) { |