intern/cycles/kernel/device/cuda/config.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131

/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Device data taken from CUDA occupancy calculator.
 *
 * Terminology
 * - CUDA GPUs have multiple streaming multiprocessors
 * - Each multiprocessor executes multiple thread blocks
 * - Each thread block contains a number of threads, also known as the block size
 * - Multiprocessors have a fixed number of registers, and the amount of registers
 *   used by each threads limits the number of threads per block.
 */

/* 3.0 and 3.5 */
#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
#  define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
#  define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
#  define GPU_BLOCK_MAX_THREADS 1024
#  define GPU_THREAD_MAX_REGISTERS 63

/* tunable parameters */
#  define GPU_KERNEL_BLOCK_NUM_THREADS 256
#  define GPU_KERNEL_MAX_REGISTERS 63

/* 3.2 */
#elif __CUDA_ARCH__ == 320
#  define GPU_MULTIPRESSOR_MAX_REGISTERS 32768
#  define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
#  define GPU_BLOCK_MAX_THREADS 1024
#  define GPU_THREAD_MAX_REGISTERS 63

/* tunable parameters */
#  define GPU_KERNEL_BLOCK_NUM_THREADS 256
#  define GPU_KERNEL_MAX_REGISTERS 63

/* 3.7 */
#elif __CUDA_ARCH__ == 370
#  define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
#  define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
#  define GPU_BLOCK_MAX_THREADS 1024
#  define GPU_THREAD_MAX_REGISTERS 255

/* tunable parameters */
#  define GPU_KERNEL_BLOCK_NUM_THREADS 256
#  define GPU_KERNEL_MAX_REGISTERS 63

/* 5.x, 6.x */
#elif __CUDA_ARCH__ <= 699
#  define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
#  define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
#  define GPU_BLOCK_MAX_THREADS 1024
#  define GPU_THREAD_MAX_REGISTERS 255

/* tunable parameters */
#  define GPU_KERNEL_BLOCK_NUM_THREADS 256
/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
 * registers */
#  if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
#    define GPU_KERNEL_MAX_REGISTERS 64
#  else
#    define GPU_KERNEL_MAX_REGISTERS 48
#  endif

/* 7.x, 8.x */
#elif __CUDA_ARCH__ <= 899
#  define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
#  define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
#  define GPU_BLOCK_MAX_THREADS 1024
#  define GPU_THREAD_MAX_REGISTERS 255

/* tunable parameters */
#  define GPU_KERNEL_BLOCK_NUM_THREADS 512
#  define GPU_KERNEL_MAX_REGISTERS 96

/* unknown architecture */
#else
#  error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
#endif

/* Compute number of threads per block and minimum blocks per multiprocessor
 * given the maximum number of registers per thread. */
#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
  extern "C" __global__ void __launch_bounds__(block_num_threads, \
                                               GPU_MULTIPRESSOR_MAX_REGISTERS / \
                                                   (block_num_threads * thread_num_registers))

#define ccl_gpu_kernel_threads(block_num_threads) \
  extern "C" __global__ void __launch_bounds__(block_num_threads)

#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)

#define ccl_gpu_kernel_call(x) x

/* Define a function object where "func" is the lambda body, and additional parameters are used to
 * specify captured state  */
#define ccl_gpu_kernel_lambda(func, ...) \
  struct KernelLambda { \
    __VA_ARGS__; \
    __device__ int operator()(const int state) \
    { \
      return (func); \
    } \
  } ccl_gpu_kernel_lambda_pass

/* sanity checks */

#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
#  error "Maximum number of threads per block exceeded"
#endif

#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
    GPU_MULTIPROCESSOR_MAX_BLOCKS
#  error "Maximum number of blocks per multiprocessor exceeded"
#endif

#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
#  error "Maximum number of registers per thread exceeded"
#endif