1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
|
/*************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
#include "core.h"
#include "info.h"
#define FUNC_INDEX(coll, redop, dtype, al, pr) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
#define NCCL_COLL_NAME(coll, op, dtype) \
coll##_##op##_##dtype
#define NCCL_KERN_NAME(coll, op, dtype) \
coll##Kernel_##op##_##dtype
/* Declare all collective operations */
#define DECL_COLL5(coll, op, dtype) \
extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \
extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \
#define DECL_COLL4(coll, op, dtype) \
DECL_COLL5(coll, op, dtype) \
DECL_COLL5(coll##LL, op, dtype) \
DECL_COLL5(coll##LL128, op, dtype)
#define DECL_COLL3(coll, op, dtype) \
DECL_COLL4(coll##Ring, op, dtype) \
DECL_COLL4(coll##Tree, op, dtype)
#define DECL_COLL2(coll, op) \
DECL_COLL3(coll, op, i8) \
DECL_COLL3(coll, op, u8) \
DECL_COLL3(coll, op, i32) \
DECL_COLL3(coll, op, u32) \
DECL_COLL3(coll, op, i64) \
DECL_COLL3(coll, op, u64) \
DECL_COLL3(coll, op, f16) \
DECL_COLL3(coll, op, f32) \
DECL_COLL3(coll, op, f64)
#define DECL_COLL(coll) \
DECL_COLL2(coll, sum) \
DECL_COLL2(coll, prod) \
DECL_COLL2(coll, min) \
DECL_COLL2(coll, max)
#define DECL_ALL_COLLS \
DECL_COLL2(ncclBroadcast, copy) \
DECL_COLL(ncclReduce) \
DECL_COLL2(ncclAllGather, copy) \
DECL_COLL(ncclReduceScatter) \
DECL_COLL(ncclAllReduce) \
DECL_ALL_COLLS
// CHUNKSIZE must be a multiple of SLICESIZE
#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
#define BROADCAST_SLICESTEPS 1
#define BROADCAST_CHUNKSTEPS 1
#define REDUCE_SLICESTEPS 1
#define REDUCE_CHUNKSTEPS 1
#endif
|