Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/NVIDIA/nccl.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Sylvain Jeaugey <sjeaugey@nvidia.com>, 2021-02-18 04:43:39 +0300
committer: Sylvain Jeaugey <sjeaugey@nvidia.com>, 2021-02-18 04:45:11 +0300
commit: ef5f37461fdbf11104cf0ee13da80d80b84b4cbc (patch)
tree: c02a9e64ec996e8a220fb2424e19f6320960bc5e
parent: 99b8a0393ffa379f3b0b81f3d5c0baa6aad7abef (diff)
Fix segfault in send/recv due to bootstrap tag. (ref: bootstrap_tag)
-rw-r--r--  src/transport.cc  13
1 file changed, 7 insertions, 6 deletions
diff --git a/src/transport.cc b/src/transport.cc
index 55d3291..7972398 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -63,6 +63,7 @@ void dumpData(struct ncclConnect* data, int ndata) {
ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph) {
struct ncclConnect data[2*MAXCHANNELS];
for (int i=1; i<comm->nRanks; i++) {
+ int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
int sendPeer = (comm->rank + i) % comm->nRanks;
uint32_t recvMask = comm->connectRecv[recvPeer];
@@ -86,16 +87,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
if (sendPeer == recvPeer) {
if (recvChannels+sendChannels) {
- NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, (i<<8)+graph->id, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
- NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, (i<<8)+graph->id, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
+ NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
+ NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
sendData = data;
recvData = data+sendChannels;
}
} else {
- if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, (i<<8)+graph->id, recvData, sizeof(struct ncclConnect)*recvChannels));
- if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, (i<<8)+graph->id, sendData, sizeof(struct ncclConnect)*sendChannels));
- if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, (i<<8)+graph->id, sendData, sizeof(struct ncclConnect)*sendChannels));
- if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, (i<<8)+graph->id, recvData, sizeof(struct ncclConnect)*recvChannels));
+ if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
+ if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
+ if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
+ if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
}
for (int c=0; c<MAXCHANNELS; c++) {