diff options
Diffstat (limited to 'src/graph/connect.cc')
-rw-r--r-- | src/graph/connect.cc | 190 |
1 files changed, 107 insertions, 83 deletions
diff --git a/src/graph/connect.cc b/src/graph/connect.cc index af481d2..a0f1265 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -14,7 +14,7 @@ /******************************************************************/ ncclResult_t ncclTopoPreset(struct ncclComm* comm, - struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, + struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph, struct ncclTopoRanks* topoRanks) { int rank = comm->rank; int localRanks = comm->localRanks; @@ -23,13 +23,14 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, for (int c=0; c<nChannels; c++) { struct ncclChannel* channel = comm->channels+c; channel->ring.prev = channel->ring.next = -1; - channel->treeUp.up = -1; - for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeUp.down[i] = -1; - channel->treeDn.up = -1; - for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->treeDn.down[i] = -1; + channel->tree.up = -1; + for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1; + channel->collTree.up = -1; + for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTree.down[i] = -1; int* ringIntra = ringGraph->intra+c*localRanks; int* treeIntra = treeGraph->intra+c*localRanks; + int* collNetIntra = collNetGraph->intra+c*localRanks; for (int i=0; i<localRanks; i++) { if (ringIntra[i] == rank) { @@ -39,23 +40,21 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, channel->ring.next = (i == localRanks-1) ? -1 : ringIntra[i+1]; } if (treeIntra[i] == rank) { - int recvIndex = 0, sendIndex = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; - int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks; + int parentIndex = 0; + int child0Index = treeGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; + int child1Index = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE ? 1 : 0; - // Tree loop always flows in the same direction. Other trees are symmetric, i.e. - // up/down go in reverse directions - int sym = treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE_LOOP ? 0 : 1; + topoRanks->treeToParent[c] = treeIntra[parentIndex]; + topoRanks->treeToChild0[c] = treeIntra[child0Index]; + topoRanks->treeToChild1[c] = treeIntra[child1Index]; + channel->tree.up = i == 0 ? -1 : treeIntra[i-1]; + channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1]; + } + if (collNetIntra[i] == rank) { + int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks; - // Down tree is common - topoRanks->treeDnRecv[c] = treeIntra[recvIndex]; - topoRanks->treeDnSend[c] = treeIntra[sendIndex]; - channel->treeDn.up = treeIntra[prev]; - channel->treeDn.down[0] = treeIntra[next]; - // Up tree depends on the pattern - topoRanks->treeUpRecv[c] = sym ? topoRanks->treeDnSend[c] : topoRanks->treeDnRecv[c]; - topoRanks->treeUpSend[c] = sym ? topoRanks->treeDnRecv[c] : topoRanks->treeDnSend[c]; - channel->treeUp.down[0] = sym ? channel->treeDn.down[0] : channel->treeDn.up ; - channel->treeUp.up = sym ? channel->treeDn.up : channel->treeDn.down[0]; + channel->collTree.up = collNetIntra[prev]; + channel->collTree.down[0] = collNetIntra[next]; } } topoRanks->ringPrev[c] = channel->ring.prev; @@ -105,72 +104,100 @@ static ncclResult_t getIndexes(int* ranks, int* indexes, int nNodes, int* firstR return ncclSuccess; } -static ncclResult_t setTreeUp(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int u0, int u1) { - if (u0 != -1) tree0->up = indexes[u0]; - if (u1 != -1) tree1->up = indexes[u1]; +static ncclResult_t setTreeUp(struct ncclTree* tree, int* indexes, int u) { + if (u == -1) return ncclSuccess; + tree->up = indexes[u]; return ncclSuccess; } -static ncclResult_t addRanksDown(int* down, int* indexes, int r0, int r1) { +static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) { + if (d == -1) return ncclSuccess; int x = 0; - if (down[x] >= 0) x++; - if (down[x] >= 0) { - WARN("Internal error : tree already has more than one child (%d %d %d)\n", down[0], down[1], down[2]); + while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++; + if (x == NCCL_MAX_TREE_ARITY) { + WARN("Internal error : tree already has %d children (%d %d %d)\n", x, tree->down[0], tree->down[1], tree->down[2]); return ncclInternalError; } - if (r0 != -1) down[x++] = indexes[r0]; - if (r1 != -1) down[x++] = indexes[r1]; - return ncclSuccess; -} - -static ncclResult_t setTreeDown(struct ncclTree* tree0, struct ncclTree* tree1, int* indexes, int d0_0, int d0_1, int d1_0, int d1_1) { - NCCLCHECK(addRanksDown(tree0->down, indexes, d0_0, d0_1)); - NCCLCHECK(addRanksDown(tree1->down, indexes, d1_0, d1_1)); + tree->down[x] = indexes[d]; return ncclSuccess; } -static ncclResult_t openRing(struct ncclTree* tree, int rank, int upRank) { - if (tree->down[0] == upRank) tree->down[0] = -1; - if (rank == upRank) tree->up = -1; - return ncclSuccess; -} - -static ncclResult_t connectTrees(struct ncclComm* comm, int* treeUpRecv, int* treeUpSend, int* treeDnRecv, int* treeDnSend, int* firstRanks) { +static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* firstRanks, int* treePatterns) { const int nChannels = comm->nChannels, nNodes = comm->nNodes, node = comm->node; - int* indexesSend, *indexesRecv; - NCCLCHECK(ncclCalloc(&indexesSend, nNodes)); - NCCLCHECK(ncclCalloc(&indexesRecv, nNodes)); + int* ranksToParent, *ranksToChild0, *ranksToChild1; + NCCLCHECK(ncclCalloc(&ranksToParent, nNodes)); + NCCLCHECK(ncclCalloc(&ranksToChild0, nNodes)); + NCCLCHECK(ncclCalloc(&ranksToChild1, nNodes)); // Compute tree depth. Not an exact value but a good approximation in most // cases int depth = comm->nRanks/nNodes - 1 + log2i(nNodes); - int u0, d0_0, d0_1, u1, d1_0, d1_1; - NCCLCHECK(ncclGetDtree(nNodes, node, &u0, &d0_0, &d0_1, &u1, &d1_0, &d1_1)); + int t0u, t0d0, t0d1, t0ChildType, t1u, t1d0, t1d1, t1ChildType; + NCCLCHECK(ncclGetDtree(nNodes, node, &t0u, &t0d0, &t0d1, &t0ChildType, &t1u, &t1d0, &t1d1, &t1ChildType)); for (int c=0; c<nChannels; c++) { struct ncclChannel* channel0 = comm->channels+c; struct ncclChannel* channel1 = channel0+nChannels; - NCCLCHECK(getIndexes(treeUpSend+c*comm->nRanks, indexesSend, nNodes, firstRanks)); - NCCLCHECK(getIndexes(treeUpRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks)); - NCCLCHECK(openRing(&channel0->treeUp, comm->rank, indexesSend[node])); - NCCLCHECK(openRing(&channel1->treeUp, comm->rank, indexesSend[node])); - int root = indexesSend[node]; - if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeUp, &channel1->treeUp, indexesRecv, u0, u1)); - if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeUp, &channel1->treeUp, indexesSend, d0_0, d0_1, d1_0, d1_1)); - NCCLCHECK(getIndexes(treeDnSend+c*comm->nRanks, indexesSend, nNodes, firstRanks)); - NCCLCHECK(getIndexes(treeDnRecv+c*comm->nRanks, indexesRecv, nNodes, firstRanks)); - NCCLCHECK(openRing(&channel0->treeDn, comm->rank, u0 == -1 ? root : indexesRecv[node])); - NCCLCHECK(openRing(&channel1->treeDn, comm->rank, u1 == -1 ? root : indexesRecv[node])); - if (indexesSend[node] == comm->rank) NCCLCHECK(setTreeDown(&channel0->treeDn, &channel1->treeDn, indexesRecv, d0_0, d0_1, d1_0, d1_1)); - if (indexesRecv[node] == comm->rank) NCCLCHECK(setTreeUp(&channel0->treeDn, &channel1->treeDn, indexesSend, u0, u1)); - TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c, channel0->treeUp.up, channel0->treeUp.down[0], channel0->treeUp.down[1], channel0->treeUp.down[2]); - TRACE(NCCL_GRAPH, "TreeUp %d : %d -> %d/%d/%d", c+nChannels, channel1->treeUp.up, channel1->treeUp.down[0], channel1->treeUp.down[1], channel1->treeUp.down[2]); - TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c, channel0->treeDn.up, channel0->treeDn.down[0], channel0->treeDn.down[1], channel0->treeDn.down[2]); - TRACE(NCCL_GRAPH, "TreeDn %d : %d -> %d/%d/%d", c+nChannels, channel1->treeDn.up, channel1->treeDn.down[0], channel1->treeDn.down[1], channel1->treeDn.down[2]); - channel0->treeUp.depth = channel1->treeUp.depth = depth; + NCCLCHECK(getIndexes(treeToParent+c*comm->nRanks, ranksToParent, nNodes, firstRanks)); + NCCLCHECK(getIndexes(treeToChild0+c*comm->nRanks, ranksToChild0, nNodes, firstRanks)); + NCCLCHECK(getIndexes(treeToChild1+c*comm->nRanks, ranksToChild1, nNodes, firstRanks)); + if (comm->rank == ranksToParent[node]) { + NCCLCHECK(setTreeUp(&channel0->tree, t0ChildType == 0 ? ranksToChild0 : ranksToChild1, t0u)); + NCCLCHECK(setTreeUp(&channel1->tree, t1ChildType == 0 ? ranksToChild0 : ranksToChild1, t1u)); + } + if (comm->rank == ranksToChild0[node]) { + NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d0)); + NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d0)); + } + if (comm->rank == ranksToChild1[node]) { + NCCLCHECK(setTreeDown(&channel0->tree, ranksToParent, t0d1)); + NCCLCHECK(setTreeDown(&channel1->tree, ranksToParent, t1d1)); + } + if (comm->rank == ranksToParent[node] || + comm->rank == ranksToChild0[node] || + comm->rank == ranksToChild1[node]) { + INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c, channel0->tree.up, comm->rank, channel0->tree.down[0], channel0->tree.down[1], channel0->tree.down[2]); + INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", c+nChannels, channel1->tree.up, comm->rank, channel1->tree.down[0], channel1->tree.down[1], channel1->tree.down[2]); + } + channel0->tree.depth = channel1->tree.depth = depth; + } + free(ranksToParent); + free(ranksToChild0); + free(ranksToChild1); + return ncclSuccess; +} + +ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank) { + int nranks = comm->nRanks; + int depth = nranks/comm->nNodes; + int sendIndex = collNetGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern + int sendEndIndex = (sendIndex+comm->localRanks-1)%comm->localRanks; + for (int c=0; c<comm->nChannels/2; c++) { + struct ncclChannel* channel = comm->channels+c; + // Set root of collTree to id nranks + if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master + channel->collTree.up = nranks; + } + if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain + channel->collTree.down[0] = -1; + } + channel->collTree.depth = depth; + INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTree.up, channel->collTree.down[0]); + } + int recvIndex = 0; // recv GPU index is always 0 + int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks; + for (int c=0; c<comm->nChannels/2; c++) { + struct ncclChannel* channel = comm->channels+comm->nChannels/2+c; + // Set root of collTree to id nranks + if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master + channel->collTree.up = nranks; + } + if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain + channel->collTree.down[0] = -1; + } + channel->collTree.depth = depth; + INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTree.up, channel->collTree.down[0]); } - free(indexesSend); - free(indexesRecv); return ncclSuccess; } @@ -204,35 +231,33 @@ int ncclMaxNchannels() { return maxNchannels; } -ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings) { +ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings) { // Gather data from all ranks - int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend; + int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1; int nranks = comm->nRanks; int nChannels = comm->nChannels; NCCLCHECK(ncclCalloc(&ringRecv, nranks*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringSend, nranks*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringPrev, nranks*MAXCHANNELS)); NCCLCHECK(ncclCalloc(&ringNext, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeUpRecv, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeUpSend, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeDnRecv, nranks*MAXCHANNELS)); - NCCLCHECK(ncclCalloc(&treeDnSend, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeToParent, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeToChild0, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeToChild1, nranks*MAXCHANNELS)); for (int i=0; i<nranks; i++) { for (int c=0; c<nChannels;c++) { ringRecv[c*nranks+i] = allTopoRanks[i]->ringRecv[c]; ringSend[c*nranks+i] = allTopoRanks[i]->ringSend[c]; ringPrev[c*nranks+i] = allTopoRanks[i]->ringPrev[c]; ringNext[c*nranks+i] = allTopoRanks[i]->ringNext[c]; - treeUpRecv[c*nranks+i] = allTopoRanks[i]->treeUpRecv[c]; - treeUpSend[c*nranks+i] = allTopoRanks[i]->treeUpSend[c]; - treeDnRecv[c*nranks+i] = allTopoRanks[i]->treeDnRecv[c]; - treeDnSend[c*nranks+i] = allTopoRanks[i]->treeDnSend[c]; + treeToParent[c*nranks+i] = allTopoRanks[i]->treeToParent[c]; + treeToChild0[c*nranks+i] = allTopoRanks[i]->treeToChild0[c]; + treeToChild1[c*nranks+i] = allTopoRanks[i]->treeToChild1[c]; } } // Connect rings and trees. This should also duplicate the channels. NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext, firstRanks)); - NCCLCHECK(connectTrees(comm, treeUpRecv, treeUpSend, treeDnRecv, treeDnSend, firstRanks)); + NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, firstRanks, treePatterns)); // Duplicate ringPrev/ringNext for ncclBuildRing memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); @@ -259,10 +284,9 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl free(ringSend); free(ringPrev); free(ringNext); - free(treeUpRecv); - free(treeUpSend); - free(treeDnRecv); - free(treeDnSend); + free(treeToParent); + free(treeToChild0); + free(treeToChild1); return ncclSuccess; } |