diff options
author | David Addison <daddison@nvidia.com> | 2019-08-14 19:45:45 +0300 |
---|---|---|
committer | David Addison <daddison@nvidia.com> | 2019-08-14 19:45:45 +0300 |
commit | 01d1836668ad59bfc26d14aede5fec593da6ae42 (patch) | |
tree | a1a63ee00fc11f48a3cfa143881e7a0db5da9ecb | |
parent | 7f2b337e703d73ed369937c9996e1f3d5f664ad0 (diff) | |
parent | 161763aab2befd038da7cdecbbe8e52a56677200 (diff) |
Merge branch 'shm' of git://github.com/lowintelligence/nccl into lowintelligence-shm
-rw-r--r-- | src/include/utils.h | 1 | ||||
-rw-r--r-- | src/init.cc | 13 | ||||
-rw-r--r-- | src/misc/utils.cc | 9 |
3 files changed, 17 insertions, 6 deletions
```diff
diff --git a/src/include/utils.h b/src/include/utils.h
index 93e72c8..3038e68 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -11,6 +11,7 @@
 #include <stdint.h>
 
 ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
+uint64_t getnHash(const char* string, int n);
 uint64_t getHostHash();
 uint64_t getPidHash();
 
diff --git a/src/init.cc b/src/init.cc
index 0158f8d..229742c 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -308,12 +308,12 @@ static void showVersion() {
   }
 }
 
-static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) {
+static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) {
   info->rank = rank;
   CUDACHECK(cudaGetDevice(&info->cudaDev));
   NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev))
-  info->hostHash=getHostHash();
-  info->pidHash=getPidHash();
+  info->hostHash=getHostHash()+commHash;
+  info->pidHash=getPidHash()+commHash;
 
   // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
   // cudaDev is a CUDA runtime dev number which could be different from the
@@ -691,7 +691,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
   int rank = comm->rank;
   int nranks = comm->nRanks;
-  TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
+  uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
+  TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
   NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
 
   // AllGather1 - begin
@@ -702,7 +703,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
 
   NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
   allGather1Data[rank].comm = comm;
-  NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
+  NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash));
   NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
 
   NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
@@ -998,7 +999,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
   NCCLCHECK(ncclCalloc(&allInfo, nranks));
   for (int rank=0; rank<nranks; rank++) {
     CUDACHECK(cudaSetDevice(devs[rank]));
-    NCCLCHECK(fillInfo(allInfo+rank, rank));
+    NCCLCHECK(fillInfo(allInfo+rank, rank, 0));
   }
 
   int* connectTransport;
diff --git a/src/misc/utils.cc b/src/misc/utils.cc
index 5093755..b511af1 100644
--- a/src/misc/utils.cc
+++ b/src/misc/utils.cc
@@ -96,6 +96,15 @@ uint64_t getHash(const char* string) {
   return result;
 }
 
+uint64_t getnHash(const char* string, int n) {
+  // Based on DJB2, result = result * 33 + char
+  uint64_t result = 9527;
+  for (int c = 0; c < n; c++) {
+    result = ((result << 5) + result) + string[c];
+  }
+  return result;
+}
+
 /* Generate a hash of the unique identifying string for this host
  * that will be unique for both bare-metal and container instances
  * Equivalent of a hash of;
```