diff options
author | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2021-02-18 04:43:39 +0300 |
---|---|---|
committer | Sylvain Jeaugey <sjeaugey@nvidia.com> | 2021-02-18 04:45:11 +0300 |
commit | ef5f37461fdbf11104cf0ee13da80d80b84b4cbc (patch) | |
tree | c02a9e64ec996e8a220fb2424e19f6320960bc5e | |
parent | 99b8a0393ffa379f3b0b81f3d5c0baa6aad7abef (diff) |
Fix segfault in send/recv due to bootstrap tag. [bootstrap_tag]
-rw-r--r-- | src/transport.cc | 13 |
1 file changed, 7 insertions, 6 deletions
diff --git a/src/transport.cc b/src/transport.cc
index 55d3291..7972398 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -63,6 +63,7 @@ void dumpData(struct ncclConnect* data, int ndata) {
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph) {
   struct ncclConnect data[2*MAXCHANNELS];
   for (int i=1; i<comm->nRanks; i++) {
+    int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
     int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks;
     int sendPeer = (comm->rank + i) % comm->nRanks;
     uint32_t recvMask = comm->connectRecv[recvPeer];
@@ -86,16 +87,16 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*

     if (sendPeer == recvPeer) {
       if (recvChannels+sendChannels) {
-        NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, (i<<8)+graph->id, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
-        NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, (i<<8)+graph->id, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
+        NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
+        NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels)));
         sendData = data;
         recvData = data+sendChannels;
       }
     } else {
-      if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, (i<<8)+graph->id, recvData, sizeof(struct ncclConnect)*recvChannels));
-      if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, (i<<8)+graph->id, sendData, sizeof(struct ncclConnect)*sendChannels));
-      if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, (i<<8)+graph->id, sendData, sizeof(struct ncclConnect)*sendChannels));
-      if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, (i<<8)+graph->id, recvData, sizeof(struct ncclConnect)*recvChannels));
+      if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
+      if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
+      if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels));
+      if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels));
     }

     for (int c=0; c<MAXCHANNELS; c++) {