intern/cycles/kernel/device/metal/compat.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318

/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

#pragma once

#define __KERNEL_GPU__
#define __KERNEL_METAL__
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END

#ifndef ATTR_FALLTHROUGH
#  define ATTR_FALLTHROUGH
#endif

#include <metal_atomic>
#include <metal_pack>
#include <metal_stdlib>
#include <simd/simd.h>

using namespace metal;

#ifdef __METALRT__
using namespace metal::raytracing;
#endif

#pragma clang diagnostic ignored "-Wunused-variable"
#pragma clang diagnostic ignored "-Wsign-compare"
#pragma clang diagnostic ignored "-Wuninitialized"

/* Qualifiers */

#define ccl_device
#define ccl_device_inline ccl_device __attribute__((always_inline))
#define ccl_device_forceinline ccl_device __attribute__((always_inline))
#if defined(__KERNEL_METAL_APPLE__)
#  define ccl_device_noinline ccl_device
#else
#  define ccl_device_noinline ccl_device __attribute__((noinline))
#endif

#define ccl_device_noinline_cpu ccl_device
#define ccl_device_inline_method ccl_device
#define ccl_global device
#define ccl_inline_constant static constant constexpr
#define ccl_device_constant constant
#define ccl_constant constant
#define ccl_gpu_shared threadgroup
#define ccl_private thread
#define ccl_may_alias
#define ccl_restrict __restrict
#define ccl_loop_no_unroll
#define ccl_align(n) alignas(n)
#define ccl_optional_struct_init

/* No assert supported for Metal */

#define kernel_assert(cond)

#define ccl_gpu_global_id_x() metal_global_id
#define ccl_gpu_warp_size simdgroup_size
#define ccl_gpu_thread_idx_x simd_group_index
#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1)

#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup);

// clang-format off

/* kernel.h adapters */

#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
#define ccl_gpu_kernel_threads(block_num_threads)

/* Convert a comma-separated list into a semicolon-separated list
 * (so that we can generate a struct based on kernel entry-point parameters). */
#define FN0()
#define FN1(p1) p1;
#define FN2(p1, p2) p1; p2;
#define FN3(p1, p2, p3) p1; p2; p3;
#define FN4(p1, p2, p3, p4) p1; p2; p3; p4;
#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5;
#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6;
#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7;
#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8;
#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9;
#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10;
#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11;
#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12;
#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13;
#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14;
#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15;
#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16;
#define FN17(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17;
#define FN18(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18;
#define FN19(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19;
#define FN20(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19; p20;
#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, ...) p20
#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN20, FN19, FN18, FN17, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)

/* Generate a struct containing the entry-point parameters and a "run"
 * method which can access them implicitly via this-> */
#define ccl_gpu_kernel_signature(name, ...) \
struct kernel_gpu_##name \
{ \
  PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
  void run(thread MetalKernelContext& context, \
           threadgroup int *simdgroup_offset, \
           const uint metal_global_id, \
           const ushort metal_local_id, \
           const ushort metal_local_size, \
           uint simdgroup_size, \
           uint simd_lane_index, \
           uint simd_group_index, \
           uint num_simd_groups) ccl_global const; \
}; \
kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
                                constant KernelParamsMetal &ccl_restrict   _launch_params_metal, \
                                constant MetalAncillaries *_metal_ancillaries, \
                                threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
                                const uint metal_global_id [[thread_position_in_grid]], \
                                const ushort metal_local_id   [[thread_position_in_threadgroup]], \
                                const ushort metal_local_size [[threads_per_threadgroup]], \
                                uint simdgroup_size [[threads_per_simdgroup]], \
                                uint simd_lane_index [[thread_index_in_simdgroup]], \
                                uint simd_group_index [[simdgroup_index_in_threadgroup]], \
                                uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
  MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
  params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
} \
void kernel_gpu_##name::run(thread MetalKernelContext& context, \
                  threadgroup int *simdgroup_offset, \
                  const uint metal_global_id, \
                  const ushort metal_local_id, \
                  const ushort metal_local_size, \
                  uint simdgroup_size, \
                  uint simd_lane_index, \
                  uint simd_group_index, \
                  uint num_simd_groups) ccl_global const

#define ccl_gpu_kernel_postfix
#define ccl_gpu_kernel_call(x) context.x

/* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state  */
#define ccl_gpu_kernel_lambda(func, ...) \
  struct KernelLambda \
  { \
    KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \
    ccl_private MetalKernelContext &context; \
    __VA_ARGS__; \
    int operator()(const int state) const { return (func); } \
  } ccl_gpu_kernel_lambda_pass(context)

// clang-format on

/* volumetric lambda functions - use function objects for lambda-like functionality */
#define VOLUME_READ_LAMBDA(function_call) \
  struct FnObjectRead { \
    KernelGlobals kg; \
    ccl_private MetalKernelContext *context; \
    int state; \
\
    VolumeStack operator()(const int i) const \
    { \
      return context->function_call; \
    } \
  } volume_read_lambda_pass{kg, this, state};

#define VOLUME_WRITE_LAMBDA(function_call) \
  struct FnObjectWrite { \
    KernelGlobals kg; \
    ccl_private MetalKernelContext *context; \
    int state; \
\
    void operator()(const int i, VolumeStack entry) const \
    { \
      context->function_call; \
    } \
  } volume_write_lambda_pass{kg, this, state};

/* make_type definitions with Metal style element initializers */
ccl_device_forceinline float2 make_float2(const float x, const float y)
{
  return float2(x, y);
}

ccl_device_forceinline float3 make_float3(const float x, const float y, const float z)
{
  return float3(x, y, z);
}

ccl_device_forceinline float4 make_float4(const float x,
                                          const float y,
                                          const float z,
                                          const float w)
{
  return float4(x, y, z, w);
}

ccl_device_forceinline int2 make_int2(const int x, const int y)
{
  return int2(x, y);
}

ccl_device_forceinline int3 make_int3(const int x, const int y, const int z)
{
  return int3(x, y, z);
}

ccl_device_forceinline int4 make_int4(const int x, const int y, const int z, const int w)
{
  return int4(x, y, z, w);
}

ccl_device_forceinline uchar4 make_uchar4(const uchar x,
                                          const uchar y,
                                          const uchar z,
                                          const uchar w)
{
  return uchar4(x, y, z, w);
}

/* Math functions */

#define __uint_as_float(x) as_type<float>(x)
#define __float_as_uint(x) as_type<uint>(x)
#define __int_as_float(x) as_type<float>(x)
#define __float_as_int(x) as_type<int>(x)
#define __float2half(x) half(x)
#define powf(x, y) pow(float(x), float(y))
#define fabsf(x) fabs(float(x))
#define copysignf(x, y) copysign(float(x), float(y))
#define asinf(x) asin(float(x))
#define acosf(x) acos(float(x))
#define atanf(x) atan(float(x))
#define floorf(x) floor(float(x))
#define ceilf(x) ceil(float(x))
#define hypotf(x, y) hypot(float(x), float(y))
#define atan2f(x, y) atan2(float(x), float(y))
#define fmaxf(x, y) fmax(float(x), float(y))
#define fminf(x, y) fmin(float(x), float(y))
#define fmodf(x, y) fmod(float(x), float(y))
#define sinhf(x) sinh(float(x))
#define coshf(x) cosh(float(x))
#define tanhf(x) tanh(float(x))
#define saturatef(x) saturate(float(x))

/* Use native functions with possibly lower precision for performance,
 * no issues found so far. */
#define trigmode fast
#define sinf(x) trigmode::sin(float(x))
#define cosf(x) trigmode::cos(float(x))
#define tanf(x) trigmode::tan(float(x))
#define expf(x) trigmode::exp(float(x))
#define sqrtf(x) trigmode::sqrt(float(x))
#define logf(x) trigmode::log(float(x))

#define NULL 0

#define __device__

#ifdef __METALRT__

#  if defined(__METALRT_MOTION__)
#    define METALRT_TAGS instancing, instance_motion, primitive_motion
#  else
#    define METALRT_TAGS instancing
#  endif /* __METALRT_MOTION__ */

typedef acceleration_structure<METALRT_TAGS> metalrt_as_type;
typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type;
typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type;

#endif /* __METALRT__ */

/* texture bindings and sampler setup */

struct Texture2DParamsMetal {
  texture2d<float, access::sample> tex;
};
struct Texture3DParamsMetal {
  texture3d<float, access::sample> tex;
};

struct MetalAncillaries {
  device Texture2DParamsMetal *textures_2d;
  device Texture3DParamsMetal *textures_3d;

#ifdef __METALRT__
  metalrt_as_type accel_struct;
  metalrt_ift_type ift_default;
  metalrt_ift_type ift_shadow;
  metalrt_ift_type ift_local;
#endif
};

#include "util/half.h"
#include "util/types.h"

enum SamplerType {
  SamplerFilterNearest_AddressRepeat,
  SamplerFilterNearest_AddressClampEdge,
  SamplerFilterNearest_AddressClampZero,

  SamplerFilterLinear_AddressRepeat,
  SamplerFilterLinear_AddressClampEdge,
  SamplerFilterLinear_AddressClampZero,

  SamplerCount
};

constant constexpr array<sampler, SamplerCount> metal_samplers = {
    sampler(address::repeat, filter::nearest),
    sampler(address::clamp_to_edge, filter::nearest),
    sampler(address::clamp_to_zero, filter::nearest),
    sampler(address::repeat, filter::linear),
    sampler(address::clamp_to_edge, filter::linear),
    sampler(address::clamp_to_zero, filter::linear),
};