Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: David Addison <daddison@nvidia.com>, 2019-08-14 20:09:53 +0300
committer: David Addison <daddison@nvidia.com>, 2019-08-14 20:09:53 +0300
commit: ccb1298148327bacb9b83452ed6ae0b29417e7e2 (patch)
tree: 45e399f0568e26cd57e28deeb892d4aca8093d60
parent: 7f2b337e703d73ed369937c9996e1f3d5f664ad0 (diff)
parent: fad079a8aeb72f4fb30f3564a48ad4ec37ea58f6 (diff)
Merge branch 'lowintelligence-shm'
PR#196
-rw-r--r-- src/include/utils.h | 1
-rw-r--r-- src/init.cc | 13
-rw-r--r-- src/misc/utils.cc | 8
3 files changed, 12 insertions, 10 deletions
diff --git a/src/include/utils.h b/src/include/utils.h
index 93e72c8..5acccc2 100644
--- a/src/include/utils.h
+++ b/src/include/utils.h
@@ -11,6 +11,7 @@
#include <stdint.h>
ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
+uint64_t getHash(const char* string, int n);
uint64_t getHostHash();
uint64_t getPidHash();
diff --git a/src/init.cc b/src/init.cc
index 0158f8d..706d3a6 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -308,12 +308,12 @@ static void showVersion() {
}
}
-static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) {
+static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) {
info->rank = rank;
CUDACHECK(cudaGetDevice(&info->cudaDev));
NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev))
- info->hostHash=getHostHash();
- info->pidHash=getPidHash();
+ info->hostHash=getHostHash()+commHash;
+ info->pidHash=getPidHash()+commHash;
// Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
// cudaDev is a CUDA runtime dev number which could be different from the
@@ -691,7 +691,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int rank = comm->rank;
int nranks = comm->nRanks;
- TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks);
+ uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
+ TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
// AllGather1 - begin
@@ -702,7 +703,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
allGather1Data[rank].comm = comm;
- NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank));
+ NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash));
NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data)));
NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks));
@@ -998,7 +999,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
NCCLCHECK(ncclCalloc(&allInfo, nranks));
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(cudaSetDevice(devs[rank]));
- NCCLCHECK(fillInfo(allInfo+rank, rank));
+ NCCLCHECK(fillInfo(allInfo+rank, rank, 0));
}
int* connectTransport;
diff --git a/src/misc/utils.cc b/src/misc/utils.cc
index 5093755..da99774 100644
--- a/src/misc/utils.cc
+++ b/src/misc/utils.cc
@@ -87,10 +87,10 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file
}
}
-uint64_t getHash(const char* string) {
+uint64_t getHash(const char* string, int n) {
// Based on DJB2, result = result * 33 + char
uint64_t result = 5381;
- for (int c = 0; string[c] != '\0'; c++) {
+ for (int c = 0; c < n; c++) {
result = ((result << 5) + result) + string[c];
}
return result;
@@ -120,7 +120,7 @@ uint64_t getHostHash(void) {
uname[offset]='\0';
TRACE(NCCL_INIT,"unique hostname '%s'", uname);
- return getHash(uname);
+ return getHash(uname, strlen(uname));
}
/* Generate a hash of the unique identifying string for this process
@@ -140,7 +140,7 @@ uint64_t getPidHash(void) {
pname[plen+len]='\0';
TRACE(NCCL_INIT,"unique PID '%s'", pname);
- return getHash(pname);
+ return getHash(pname, strlen(pname));
}
int parseStringList(const char* string, struct netIf* ifList, int maxList) {