src/misc/rings.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385

/*************************************************************************
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "core.h"
#include "param.h"

/* Score at which ncclGetRings starts its ring search: it first passes this
 * value as the minimum score to the transports, then lowers it by one per
 * attempt until at least one ring is found or the score reaches 0. */
#define NCCL_MAX_SCORE 7

/* Parse user-defined rings. The format looks like:
 * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
 * Rings are separated by '|'; inside a ring, any non-digit character separates
 * rank numbers. Rings whose number of ranks does not match nranks are ignored,
 * so a single string can provide rings for several communicator sizes. Rings
 * naming a rank outside [0, nranks) or naming the same rank twice are ignored
 * as well.
 *
 * str       : NUL-terminated ring description.
 * nringsRet : receives the number of rings accepted.
 * nranks    : number of ranks every accepted ring must contain.
 * prev,next : accepted ring k is stored at [k*nranks + rank]; next[] holds the
 *             rank that follows and prev[] the rank that precedes, with the
 *             last rank of a ring wrapping around to the first. Nothing is
 *             written for a ring that ends up rejected.
 *
 * Always returns ncclSuccess; "no valid ring" is reported as *nringsRet == 0.
 * Parsing stops as soon as a ring reaches MAX_ENV_RANKS ranks; rings accepted
 * before that point are kept.
 *
 * NOTE(review): the number of accepted rings is not bounded here, so prev/next
 * must be able to hold every matching ring found in str -- confirm the
 * callers' buffer sizes or add a cap.
 */
#define MAX_ENV_RANKS 512
static ncclResult_t parseRings(const char* str, int* nringsRet, int nranks, int* prev, int* next) {
  int ranks[MAX_ENV_RANKS];
  unsigned char seen[MAX_ENV_RANKS];
  int nrings = 0;
  int count = 0;     // ranks collected so far for the ring being parsed
  int inNumber = 0;  // 1 while consuming the digits of a rank number

  for (int offset = 0; ; offset++) {
    const char c = str[offset];
    if (c >= '0' && c <= '9') {
      const int digit = c - '0';
      if (!inNumber) {
        ranks[count] = digit;
        inNumber = 1;
      } else {
        // Saturate at MAX_ENV_RANKS so that an overlong digit run cannot
        // overflow the int. A matching ring has fewer than MAX_ENV_RANKS
        // ranks, so a saturated value always fails the bounds check below.
        const int value = ranks[count]*10 + digit;
        ranks[count] = value < MAX_ENV_RANKS ? value : MAX_ENV_RANKS;
      }
    } else {
      if (inNumber) {
        inNumber = 0;
        count++;
        if (count == MAX_ENV_RANKS) break;
      }
      if (c == '|' || c == '\0') {
        // End of a ring: validate it completely before touching prev/next.
        int valid = (count == nranks && nranks > 0);
        if (valid) {
          for (int r=0; r<nranks; r++) seen[r] = 0;
          for (int r=0; r<nranks && valid; r++) {
            const int cur = ranks[r];
            if (cur < 0 || cur >= nranks || seen[cur]) {
              valid = 0;  // out of bounds or duplicate
            } else {
              seen[cur] = 1;
            }
          }
        }
        if (valid) {
          int* ringPrev = prev + nrings*nranks;
          int* ringNext = next + nrings*nranks;
          // Start from the last rank so the ring closes on itself.
          int prevRank = ranks[nranks-1];
          for (int r=0; r<nranks; r++) {
            const int cur = ranks[r];
            ringNext[prevRank] = cur;
            ringPrev[cur] = prevRank;
            prevRank = cur;
          }
          nrings++;
        }
        count = 0;
      }
      if (c == '\0') break;
    }
  }

  *nringsRet = nrings;
  return ncclSuccess;
}

/*
 * Ring creation algorithm
 *
 * First, we establish hierarchical coordinates depending on the way ranks can
 * communicate. After fillCoords, we have for each rank a unique 3-int array
 * {   node, pci_domain,   rank } corresponding to the three transports :
 * { 2[NET],     1[SHM], 0[P2P] }.
 * We also renumber ranks into indexes, in increasing order of their coordinates.
 *
 * Then, we ask transports to connect groups together. We start with net, then
 * shm, then p2p. We maintain two arrays, prev and next, where values are equal
 * to -1 when ranks are not yet connected, and a rank otherwise. We never
 * connect ranks outside our group, meaning that on 4 nodes of 2 sockets of 4
 * ranks, if we are rank 13, we should see something like (provided we have a
 * single net interface, hence a single ring) :
 *
 * Connecting all nodes                                <13>
 * 2[NET] : prev 31 -1 -1 -1 -1 -1 -1 -1  7 -1 -1 -1 -1 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
 *          next -1 -1 -1 -1 -1 -1 -1  8 -1 -1 -1 -1 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1  0
 *
 * Connecting P2P domains with shared memory           <13>
 * 1[SHM] : prev 31 -1 -1 -1 -1 -1 -1 -1  7 -1 -1 -1 11 -1 -1 -1 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
 *          next -1 -1 -1 -1 -1 -1 -1  8 -1 -1 -1 12 -1 -1 -1 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1  0
 *
 * Connecting ranks (only inside the P2P domain)       <13>
 * 0[P2P] : prev 31 -1 -1 -1 -1 -1 -1 -1  7 -1 -1 -1 11 12 13 14 15 -1 -1 -1 -1 -1 -1 -1 23 -1 -1 -1 -1 -1 -1 -1
 *          next -1 -1 -1 -1 -1 -1 -1  8 -1 -1 -1 12 13 14 15 16 -1 -1 -1 -1 -1 -1 -1 24 -1 -1 -1 -1 -1 -1 -1  0
 *
 * Hence, when we ask a transport to connect groups, we provide it with a subview of the ranks (except for net,
 * which always sees the full world). That way, P2P can brute-force all combinations inside the node without
 * risking a combinatorial explosion, and we scale better.
 *
 * Finally, we loop over Network scores to try to create rings with high scores (=locality) and decrease until
 * we get at least one ring.
 */

/* Recursively flag every rank reachable from `rank` by following links whose
 * entry in the nranks x nranks `matrix` equals `transport`.
 * connected[] holds one flag per rank; ranks already flagged are not visited
 * again, which bounds the recursion. */
static void recIsConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
  const int* links = matrix + rank*nranks;  // row of outgoing links for `rank`
  connected[rank] = 1;
  int peer = 0;
  while (peer < nranks) {
    if (links[peer] == transport && !connected[peer]) {
      recIsConnected(peer, connected, nranks, matrix, transport);
    }
    peer++;
  }
}

/* Compute the set of ranks reachable from `rank` through `transport` links.
 * On return connected[r] is 1 for `rank` itself and every rank reachable from
 * it, and 0 for all other ranks. */
static void isConnected(int rank, int* connected, int nranks, int* matrix, int transport) {
  // Start from a clean slate, then let the recursive walk set the flags.
  for (int r = nranks; r-- > 0; ) {
    connected[r] = 0;
  }
  recIsConnected(rank, connected, nranks, matrix, transport);
}

/* Assign the next free index to `rank` and record its coordinates.
 * Relies on these names being in scope at the expansion site: rankToIdx,
 * idxToRank, coords, current and idx. It stores the rank<->index mapping,
 * copies the NTRANSPORTS entries of current[] into the rank's slot in coords,
 * then increments idx.
 * The argument is parenthesized so that an expression argument expands
 * correctly; it is still evaluated several times, so it must not have side
 * effects. */
#define NEW_IDX(rank) do { \
  rankToIdx[(rank)] = idx; \
  idxToRank[idx] = (rank); \
  for (int t=0; t<NTRANSPORTS; t++) coords[(rank)*NTRANSPORTS+t] = current[t]; \
  idx++; \
} while (0)

/* Return the lowest-numbered rank that has not been given coordinates yet
 * (its first coords entry is still -1) and whose link from `rank` in the
 * nranks x nranks `matrix` equals `transport`. Returns -1 when there is none. */
int findConnected(int rank, int* matrix, int nranks, int transport, int* coords) {
  const int* links = matrix + rank*nranks;  // row of outgoing links for `rank`
  int found = -1;
  for (int cand = 0; cand < nranks && found == -1; cand++) {
    const int unassigned = (coords[cand*NTRANSPORTS] == -1);
    if (unassigned && links[cand] == transport) {
      found = cand;
    }
  }
  return found;
}

/* Give every rank hierarchical coordinates and a topological index.
 *
 * nranks    : number of ranks.
 * matrix    : nranks x nranks table of transport ids between ranks.
 * coords    : NTRANSPORTS ints per rank; the search for unassigned ranks
 *             relies on entries being -1 until a rank has been handled.
 * rankToIdx : filled with the index assigned to each rank.
 * idxToRank : inverse mapping of rankToIdx.
 *
 * Returns ncclSuccess once all nranks ranks have an index, or
 * ncclInternalError when no transport links the current rank to a rank that
 * is still unassigned.
 */
static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankToIdx, int* idxToRank) {
  int current[NTRANSPORTS];  // coordinate counters, one per transport level
  int* p2pConnected;
  NCCLCHECK(ncclCalloc(&p2pConnected, nranks));
  for (int level=0; level<NTRANSPORTS; level++) current[level] = 0;

  int curRank = 0;
  int idx = 0;
  for (;;) {
    // Transport 0 has no level below it and ranks may be linked to each other
    // only indirectly, so collect the whole set reachable from curRank first
    // and index all of its members in one go.
    isConnected(curRank, p2pConnected, nranks, matrix, 0);
    for (int peer=0; peer<nranks; peer++) {
      if (!p2pConnected[peer]) continue;
      NEW_IDX(peer);
      curRank = peer;
      current[0]++;
    }
    current[0] = 0;

    // Every rank has an index: done.
    if (idx == nranks) break;

    // Look for the next group, trying transport 1 first and then higher
    // transports. Counters of levels that yielded nothing restart at 0.
    int transport = 1;
    int nextRank = findConnected(curRank, matrix, nranks, transport, coords);
    while (nextRank == -1) {
      current[transport] = 0;
      transport++;
      if (transport == NTRANSPORTS) {
        WARN("Error : Could not find transport to connect next group\n");
        free(p2pConnected);
        return ncclInternalError;
      }
      nextRank = findConnected(curRank, matrix, nranks, transport, coords);
    }
    curRank = nextRank;
    current[transport]++;
  }

  free(p2pConnected);
  return ncclSuccess;
}

/* Bounds on the ring count, read in ncclGetRings through ncclParamMinNrings()
 * and ncclParamMaxNrings(); the warnings there refer to them as
 * NCCL_MIN_NRINGS and NCCL_MAX_NRINGS. ncclGetRings applies a maximum only
 * when it is > 0 and raises any minimum that is below its built-in default,
 * so a value of 0 acts as "no user limit".
 * NOTE(review): NCCL_PARAM is defined outside this file; the third argument
 * is presumably the default value -- confirm. */
NCCL_PARAM(MinNrings, "MIN_NRINGS", 0);
NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0);

/* Users can force the number of threads through the Nthreads parameter.
 * getEnvThreads overwrites *nthreads with that parameter unless it equals -2,
 * the value given as the last NCCL_PARAM argument below (presumably the
 * "not set" default -- NCCL_PARAM is defined outside this file).
 * Always returns ncclSuccess. */
NCCL_PARAM(Nthreads, "NTHREADS", -2);
ncclResult_t getEnvThreads(int* nthreads) {
  const int64_t requested = ncclParamNthreads();
  if (requested == -2) {
    // No override: leave the caller's value untouched.
    return ncclSuccess;
  }
  *nthreads = requested;
  return ncclSuccess;
}

/* Grow a set of rings from nrings to newNrings (clamped to MAXCHANNELS) by
 * duplicating existing rings: ring r is filled from ring r-nrings, in each of
 * the four per-ring tables a, b, c and d (nranks entries per ring).
 * Returns the clamped ring count, including when it is not larger than nrings,
 * in which case nothing is copied. With nrings == 0 each ring is copied onto
 * itself, so the tables are left unchanged. */
static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) {
  const int limit = (newNrings > MAXCHANNELS) ? MAXCHANNELS : newNrings;
  for (int ring = nrings; ring < limit; ring++) {
    const int dst = ring*nranks;
    const int src = (ring-nrings)*nranks;
    for (int i = 0; i < nranks; i++) {
      a[dst+i] = a[src+i];
      b[dst+i] = b[src+i];
      c[dst+i] = c[src+i];
      d[dst+i] = d[src+i];
    }
  }
  return limit;
}
/* Main ring creation function.
 *
 * Fills prev/next with the ring neighbours of each rank and treeIn/treeOut
 * with 0/1 flags, all indexed [ring*nranks + rank], and reports how many
 * rings were built.
 *
 * nrings     : out, number of rings created.
 * nthreads   : in/out; its incoming value is handed to the transports, which
 *              may adjust it, and getEnvThreads may override the result.
 * rank       : the calling rank; selects the local subview at each transport
 *              level and limits some log messages to rank 0.
 * nranks     : number of ranks; nothing is built when it is 1.
 * transports : nranks x nranks table, transports[i*nranks+j] being the
 *              transport id between ranks i and j.
 * values     : nranks x nranks table forwarded to the transports' getRings.
 * prev, next : out, ring neighbours.
 * treeIn, treeOut : out, flags marking ranks with a link at the top transport
 *              level.
 *
 * Rings come from the NCCL_RINGS environment variable when it holds at least
 * one valid ring; otherwise they are built level by level by the transports,
 * lowering the minimum score until a ring is found. If that also fails, a
 * simple rank-order ring is used.
 *
 * NOTE(review): NCCLCHECK is defined outside this file; if it returns from
 * the function on error (as its use here suggests), the buffers allocated
 * below are not released on those paths -- confirm.
 */
ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* transports, ncclTvalue_t* values, int* prev, int* next, int* treeIn, int* treeOut) {
  *nrings = 0;

  if (nranks == 1) return ncclSuccess;

  // User-provided rings take precedence over the topology search.
  char* str = getenv("NCCL_RINGS");
  if (str && strlen(str)>0) {
    int ret = parseRings(str, nrings, nranks, prev, next);
    if (ret == ncclSuccess && *nrings > 0) {
      if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings);
      NCCLCHECK(getEnvThreads(nthreads));
      // Flag ranks whose ring neighbour is reached through transport 2 (NET
      // in the numbering described in the comment above recIsConnected).
      // Flags are only ever set to 1 here, never cleared.
      for (int r = 0; r<*nrings; r++) {
        for (int i = 0; i<nranks; i++) {
          if (transports[i*nranks+prev[r*nranks+i]] == 2) treeIn[r*nranks+i] = 1;
          if (transports[i*nranks+next[r*nranks+i]] == 2) treeOut[r*nranks+i] = 1;
        }
      }
      return ncclSuccess;
    }
    if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring");
    *nrings = 0;
  }

  // Compute hierarchical topology groups, indexes, and rank<->index tables
  int* coords, *globalIdxToRank, *globalRankToIdx;
  NCCLCHECK(ncclCalloc(&coords, nranks*NTRANSPORTS));
  // fillCoords treats -1 as "rank not assigned yet".
  for (int i=0; i<nranks*NTRANSPORTS; i++) coords[i] = -1;
  NCCLCHECK(ncclCalloc(&globalIdxToRank, nranks));
  NCCLCHECK(ncclCalloc(&globalRankToIdx, nranks));

  // NOTE(review): if fillCoords fails, coords and the two global tables are
  // presumably leaked by the NCCLCHECK early return -- confirm.
  NCCLCHECK(fillCoords(nranks, transports, coords, globalRankToIdx, globalIdxToRank));

  // Start with a high score, then decrease until we find rings
  int minScore = NCCL_MAX_SCORE;
  int nringsTmp;
  int *prevTmp, *nextTmp, *idxToRank, *rankToIdx, *groups, *subgroups;
  NCCLCHECK(ncclCalloc(&prevTmp, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&nextTmp, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(&idxToRank, nranks));
  NCCLCHECK(ncclCalloc(&rankToIdx, nranks));
  NCCLCHECK(ncclCalloc(&groups, nranks));
  NCCLCHECK(ncclCalloc(&subgroups, nranks));

  int nThreads;
  do {
    // Each attempt restarts from the caller's thread count and from fully
    // disconnected rings (-1 means "no neighbour yet").
    nThreads = *nthreads;
    for (int i=0; i<nranks*MAXCHANNELS; i++) prevTmp[i] = nextTmp[i] = -1;
    nringsTmp = MAXCHANNELS;
    // Loop over transports to connect groups
    for (int t=NTRANSPORTS-1; t>=0; t--) {
      for (int i=0; i<nranks; i++) idxToRank[i] = rankToIdx[i] = -1;

      int nidx = 0;
      for (int i=0; i<nranks; i++) {
        // Extract only ranks in the same local area as rank
        // We need to extract them in the topological order, hence we iterate over indexes, not ranks
        int r = globalIdxToRank[i];
        int sameLocal = 1;
        // "Same local area" means equal coordinates at every level above t.
        for (int tr = NTRANSPORTS-1; tr > t; tr--) if (coords[r*NTRANSPORTS+tr] != coords[rank*NTRANSPORTS+tr]) sameLocal = 0;
        if (!sameLocal) continue;

        // Group = coordinate at level t; subgroup = coordinate one level
        // below, or the local index itself at the lowest level.
        groups[nidx] = coords[r*NTRANSPORTS+t];
        subgroups[nidx] = t ? coords[r*NTRANSPORTS+t-1] : nidx;
        rankToIdx[r] = nidx;
        idxToRank[nidx] = r;
        nidx++;
      }

      int ngroups = groups[nidx-1] + 1; // Coords should be ordered

      ncclTvalue_t* subvalues;
      int *subprev, *subnext;
      NCCLCHECK(ncclCalloc(&subvalues, nidx*nidx));
      NCCLCHECK(ncclCalloc(&subprev, nidx*nringsTmp));
      NCCLCHECK(ncclCalloc(&subnext, nidx*nringsTmp));
      // With a single group there is nothing to connect at this level.
      if (ngroups > 1) {
        /* Extract subvalues */
        // Only pairs linked by exactly transport t keep their value.
        for (int i=0; i<nidx; i++) {
          for (int j=0; j<nidx; j++) {
            if (transports[idxToRank[i]*nranks+idxToRank[j]] == t)
              subvalues[i*nidx+j] = values[idxToRank[i]*nranks+idxToRank[j]];
            else
              subvalues[i*nidx+j] = 0;
          }
        }
        /* Extract subprev/subnext */
        for (int i=0; i<nidx*nringsTmp; i++) {
          subprev[i] = subnext[i] = -1;
        }
        for (int r=0; r<nringsTmp; r++) {
          // Among the ranks of the subview, the last one that already has a
          // prev (start) and the last one that already has a next (end) are
          // linked to each other inside the subview.
          int start = -1, end = -1;
          for (int i=0; i<nranks; i++) {
            if (rankToIdx[i] == -1) continue;
            if (prevTmp[r*nranks+i] != -1) start = i;
            if (nextTmp[r*nranks+i] != -1) end = i;
          }
          if (start != -1 && end != -1) {
            subprev[r*nidx+rankToIdx[start]] = rankToIdx[end];
            subnext[r*nidx+rankToIdx[end]] = rankToIdx[start];
          }
        }
        /* Get rings */
        // NOTE(review): a failure here presumably returns through NCCLCHECK
        // without freeing subvalues/subprev/subnext or the buffers allocated
        // before the loop -- confirm.
        NCCLCHECK(ncclTransports[t].getRings(nidx, groups, subgroups, subvalues, &nringsTmp, subprev, subnext, minScore, &nThreads));
        /* Merge subprev/subnext into prev/next */
        // Links found at a higher level are never overwritten; local indexes
        // are translated back to ranks.
        for (int r=0; r<nringsTmp; r++) {
          for (int i=0; i<nidx; i++) {
            if ((prevTmp[r*nranks+idxToRank[i]] == -1) && (subprev[r*nidx+i] != -1)) prevTmp[r*nranks+idxToRank[i]] = idxToRank[subprev[r*nidx+i]];
            if ((nextTmp[r*nranks+idxToRank[i]] == -1) && (subnext[r*nidx+i] != -1)) nextTmp[r*nranks+idxToRank[i]] = idxToRank[subnext[r*nidx+i]];
            if (t == NTRANSPORTS-1) {
              // Save node-level masters for trees
              treeIn[r*nranks+idxToRank[i]] = prevTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
              treeOut[r*nranks+idxToRank[i]] = nextTmp[r*nranks+idxToRank[i]] == -1 ? 0 : 1;
            }
          }
        }
        //for (int r=0; r<nringsTmp; r++) {
        //printf("[%d] [%d] [%d] [%d] Prev ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", prevTmp[r*nranks+i]); printf("\n");
        //printf("[%d] [%d] [%d] [%d] Next ", rank, minScore, t, r); for (int i=0; i<nranks; i++) printf("%d ", nextTmp[r*nranks+i]); printf("\n");
        //}
      }
      free(subvalues);
      free(subprev);
      free(subnext);
      // No ring survived this level: give up on this score.
      if (nringsTmp == 0) break;
    }
    minScore--;
    // Keep the result only if it yields more rings than what we already have.
    if (nringsTmp > *nrings) {
      *nrings = nringsTmp;
      for (int i=0; i<nranks*(*nrings); i++) {
        prev[i] = prevTmp[i];
        next[i] = nextTmp[i];
      }
    }
  } while (nringsTmp == 0 && minScore);

  free(coords);
  free(globalRankToIdx);
  free(globalIdxToRank);
  free(prevTmp);
  free(nextTmp);
  free(idxToRank);
  free(rankToIdx);
  free(groups);
  free(subgroups);

  *nthreads = nThreads;

  /* Duplicate the rings in case of multinode+NVLink */
  // nnodes is the number of treeIn flags set in the first ring.
  int nnodes = 0;
  for (int r=0; r<nranks; r++) nnodes += treeIn[r];
  int nvlink;
  NCCLCHECK(ncclNvlinkGpu(&nvlink));
  if (nnodes > 1 && nvlink) {
    *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut);
  }

  if (*nrings == 0) {
    WARN("Could not create rings, falling back on simple ring");
    *nrings = 1;
    // Only this rank's own entries of ring 0 are written here.
    prev[rank] = (rank-1+nranks) % nranks;
    next[rank] = (rank+1)%nranks;
  }

  // Apply the user-requested bounds on the number of rings.
  int maxNrings = ncclParamMaxNrings();
  int minNrings = ncclParamMinNrings();
  if (maxNrings > 0 && minNrings > maxNrings) {
    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS");
    minNrings = 0;
  }
  if (minNrings > MAXCHANNELS) {
    if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS);
    minNrings = MAXCHANNELS;
  }
  if (maxNrings > 0 && maxNrings <= *nrings) {
    if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings);
    *nrings = maxNrings;
  } else {
    // The built-in minimum is 2 rings when ncclCudaCompCap() reports 3, and
    // 1 otherwise; missing rings are obtained by duplicating existing ones.
    int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1;
    if (minNrings < defaultMinNrings) minNrings = defaultMinNrings;
    if (minNrings > 0 && minNrings > *nrings) {
      if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings);
      *nrings = copyRings(*nrings, minNrings, nranks, prev, next, treeIn, treeOut);
    }
  }

  // A user-forced thread count overrides whatever was computed above.
  NCCLCHECK(getEnvThreads(nthreads));
  return ncclSuccess;
}