1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
// Device-side implementation of point-to-point SendRecv for the SIMPLE
// protocol. Each thread block walks the list of up to NCCL_MAX_WORK_ELEMENTS
// work elements in its ncclWorkElem array; each element describes one
// send/recv pair and the number of threads assigned to it. Threads peel off
// their segment, run it, then fall through to the next element.
template<class FUNC, typename T, int UNROLL>
class ncclFunction<ncclFuncSendRecv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
public:
// Entry point for the whole block. `firstArgs` points at the first work
// element; subsequent elements follow contiguously (args++ below).
__device__ void run(struct ncclWorkElem* firstArgs) {
struct ncclWorkElem* args = firstArgs;
int tid = threadIdx.x;
// `group` numbers the synchronization groups handed to ncclPrimitives so
// that the recv side and send side of each element sync independently.
int group = 0;
for (int s=0; s<NCCL_MAX_WORK_ELEMENTS; s++) {
// Number of threads this element was assigned by the host-side scheduler.
int nThreadsSegment = args->p2p.nThreads;
// A zero-thread element terminates the list.
if (nThreadsSegment == 0) return; // Nothing else to do
int groupRecv = group;
group += 1;
int groupSend = group;
// Large segments (>128 threads) consume an extra group id for the send
// side; presumably this mirrors how groups map onto sync resources —
// TODO confirm against the ncclPrimitives group/shmem layout.
group += nThreadsSegment > 128 ? 2 : 1;
// Only threads belonging to this segment do the work; the rest skip to
// the tid adjustment below.
if (tid < nThreadsSegment) {
// For large segments one warp is held back from the data path
// (nThreads excludes it); NOTE(review): that warp appears reserved for
// synchronization duties inside the primitives — confirm.
const int nThreads = nThreadsSegment > 128 ? nThreadsSegment-WARP_SIZE : nThreadsSegment;
// Compute pointers
const T* sendbuff = (const T*)args->sendbuff;
T* recvbuff = (T*)args->recvbuff;
const ssize_t sendCount = args->p2p.sendCount;
const ssize_t recvCount = args->p2p.recvCount;
// `delta` is the rank offset to the peer; 0 means self (local copy).
const int delta = args->p2p.delta;
if (delta == 0) {
// Self send/recv with distinct buffers is a plain device-side copy;
// identical pointers need no work at all.
if (tid < nThreads && sendbuff != recvbuff) {
// local copy : ReduceOrCopyMulti takes an int as number of elements,
// so we split it in blocks of 1G elements.
int blockSize = 1<<30;
for (size_t offset=0; offset<sendCount; offset += blockSize) {
size_t remaining = sendCount - offset;
// Safe mixed-sign compare: blockSize never exceeds 1<<30.
if (remaining < blockSize) blockSize = remaining;
ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nThreads, 1, &sendbuff, 1, &recvbuff, blockSize);
sendbuff += blockSize; recvbuff += blockSize;
}
}
} else {
struct ncclDevComm* comm = args->comm;
// One channel per thread block.
struct ncclChannel* channel = comm->channels+blockIdx.x;
// FIFO step size in elements; chunks are a fraction of a step.
const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS);
const int chunkSize = stepSize/SENDRECV_SLICEFACTOR;
// Lower half of the segment receives, upper half sends, so both
// directions progress concurrently.
int nThreadsSplit = nThreads/2;
// NOTE(review): a negative count appears to mean "no operation in
// this direction" — confirm against the host-side p2p enqueue logic.
if ((tid < nThreadsSplit) && recvCount >= 0) {
// Receive from the rank `delta` below us (mod nRanks).
int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks;
int nt = nThreadsSplit;
// Template args <UNROLL, SLICESPERCHUNK=1, SLICESTEPS=1, T, NRECV=1, NSEND=0, DIRECT=1, FUNC>
// — recv-only primitives; presumably, confirm against primitives.h.
ncclPrimitives<UNROLL, 1, 1, T, 1, 0, 1, FUNC>
prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv);
if (recvCount == 0) {
// Zero-byte receive: still performs the protocol handshake.
prims.recv(recvbuff, 0);
} else for (ssize_t offset = 0; offset < recvCount; offset += chunkSize) {
int realChunkSize = min(chunkSize, recvCount-offset);
// Round the chunk up to a per-thread 8-byte multiple for aligned transfers.
ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
// Re-clamp after alignment so the tail chunk never overruns the buffer.
int nelem = min(realChunkSize, recvCount-offset);
prims.directRecv(recvbuff+offset, offset, nelem);
}
}
if ((tid >= nThreadsSplit) && sendCount >= 0) {
// Send to the rank `delta` above us (mod nRanks).
int peer = (comm->rank+delta)%comm->nRanks;
int nt = nThreads-nThreadsSplit;
// Send-only primitives (NRECV=0, NSEND=1); tid is rebased to this half.
ncclPrimitives<UNROLL, 1, 1, T, 0, 1, 1, FUNC>
prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend);
if (sendCount == 0) {
// Zero-byte send: protocol handshake only.
prims.send(sendbuff, 0);
} else for (ssize_t offset = 0; offset < sendCount; offset += chunkSize) {
int realChunkSize = min(chunkSize, sendCount-offset);
ALIGN_SIZE(realChunkSize, nt*sizeof(uint64_t)/sizeof(T));
int nelem = min(realChunkSize, sendCount-offset);
prims.directSend(sendbuff+offset, offset, nelem);
}
}
}
}
// Consume this segment's threads and move to the next work element;
// threads whose tid falls below zero have no further work.
tid -= nThreadsSegment;
if (tid < 0) return;
args++;
}
}
};
|