diff options
Diffstat (limited to 'intern/cycles/kernel/device/metal/compat.h')
-rw-r--r-- | intern/cycles/kernel/device/metal/compat.h | 161 |
1 files changed, 160 insertions, 1 deletions
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h index 77cea30914c..61597a4acfc 100644 --- a/intern/cycles/kernel/device/metal/compat.h +++ b/intern/cycles/kernel/device/metal/compat.h @@ -34,6 +34,7 @@ using namespace metal; #pragma clang diagnostic ignored "-Wunused-variable" #pragma clang diagnostic ignored "-Wsign-compare" +#pragma clang diagnostic ignored "-Wuninitialized" /* Qualifiers */ @@ -42,8 +43,9 @@ using namespace metal; #define ccl_device_forceinline ccl_device #define ccl_device_noinline ccl_device __attribute__((noinline)) #define ccl_device_noinline_cpu ccl_device +#define ccl_device_inline_method ccl_device #define ccl_global device -#define ccl_static_constant static constant constexpr +#define ccl_inline_constant static constant constexpr #define ccl_device_constant constant #define ccl_constant const device #define ccl_gpu_shared threadgroup @@ -58,6 +60,122 @@ using namespace metal; #define kernel_assert(cond) +#define ccl_gpu_global_id_x() metal_global_id +#define ccl_gpu_warp_size simdgroup_size +#define ccl_gpu_thread_idx_x simd_group_index +#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1) + +#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate))) +#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup); + +// clang-format off + +/* kernel.h adapters */ + +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) +#define ccl_gpu_kernel_threads(block_num_threads) + +/* Convert a comma-separated list into a semicolon-separated list + * (so that we can generate a struct based on kernel entry-point parameters). */ +#define FN0() +#define FN1(p1) p1; +#define FN2(p1, p2) p1; p2; +#define FN3(p1, p2, p3) p1; p2; p3; +#define FN4(p1, p2, p3, p4) p1; p2; p3; p4; +#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5; +#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6; +#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7; +#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8; +#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9; +#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; +#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; +#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; +#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; +#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; +#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; +#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; +#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, ...) p16 +#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0) + +/* Generate a struct containing the entry-point parameters and a "run" + * method which can access them implicitly via this-> */ +#define ccl_gpu_kernel_signature(name, ...) \ +struct kernel_gpu_##name \ +{ \ + PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \ + void run(thread MetalKernelContext& context, \ + threadgroup int *simdgroup_offset, \ + const uint metal_global_id, \ + const ushort metal_local_id, \ + const ushort metal_local_size, \ + uint simdgroup_size, \ + uint simd_lane_index, \ + uint simd_group_index, \ + uint num_simd_groups) ccl_global const; \ +}; \ +kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \ + constant KernelParamsMetal &ccl_restrict _launch_params_metal, \ + constant MetalAncillaries *_metal_ancillaries, \ + threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \ + const uint metal_global_id [[thread_position_in_grid]], \ + const ushort metal_local_id [[thread_position_in_threadgroup]], \ + const ushort metal_local_size [[threads_per_threadgroup]], \ + uint simdgroup_size [[threads_per_simdgroup]], \ + uint simd_lane_index [[thread_index_in_simdgroup]], \ + uint simd_group_index [[simdgroup_index_in_threadgroup]], \ + uint num_simd_groups [[simdgroups_per_threadgroup]]) { \ + MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \ + params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \ +} \ +void kernel_gpu_##name::run(thread MetalKernelContext& context, \ + threadgroup int *simdgroup_offset, \ + const uint metal_global_id, \ + const ushort metal_local_id, \ + const ushort metal_local_size, \ + uint simdgroup_size, \ + uint simd_lane_index, \ + uint simd_group_index, \ + uint num_simd_groups) ccl_global const + +#define ccl_gpu_kernel_call(x) context.x + +/* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state */ +#define ccl_gpu_kernel_lambda(func, ...) \ + struct KernelLambda \ + { \ + KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \ + ccl_private MetalKernelContext &context; \ + __VA_ARGS__; \ + int operator()(const int state) const { return (func); } \ + } ccl_gpu_kernel_lambda_pass(context) + +// clang-format on + +/* volumetric lambda functions - use function objects for lambda-like functionality */ +#define VOLUME_READ_LAMBDA(function_call) \ + struct FnObjectRead { \ + KernelGlobals kg; \ + ccl_private MetalKernelContext *context; \ + int state; \ +\ + VolumeStack operator()(const int i) const \ + { \ + return context->function_call; \ + } \ + } volume_read_lambda_pass{kg, this, state}; + +#define VOLUME_WRITE_LAMBDA(function_call) \ + struct FnObjectWrite { \ + KernelGlobals kg; \ + ccl_private MetalKernelContext *context; \ + int state; \ +\ + void operator()(const int i, VolumeStack entry) const \ + { \ + context->function_call; \ + } \ + } volume_write_lambda_pass{kg, this, state}; + /* make_type definitions with Metal style element initializers */ #ifdef make_float2 # undef make_float2 @@ -112,6 +230,7 @@ using namespace metal; #define sinhf(x) sinh(float(x)) #define coshf(x) cosh(float(x)) #define tanhf(x) tanh(float(x)) +#define saturatef(x) saturate(float(x)) /* Use native functions with possibly lower precision for performance, * no issues found so far. */ @@ -124,3 +243,43 @@ using namespace metal; #define logf(x) trigmode::log(float(x)) #define NULL 0 + +#define __device__ + +/* texture bindings and sampler setup */ + +struct Texture2DParamsMetal { + texture2d<float, access::sample> tex; +}; +struct Texture3DParamsMetal { + texture3d<float, access::sample> tex; +}; + +struct MetalAncillaries { + device Texture2DParamsMetal *textures_2d; + device Texture3DParamsMetal *textures_3d; +}; + +#include "util/half.h" +#include "util/types.h" + +enum SamplerType { + SamplerFilterNearest_AddressRepeat, + SamplerFilterNearest_AddressClampEdge, + SamplerFilterNearest_AddressClampZero, + + SamplerFilterLinear_AddressRepeat, + SamplerFilterLinear_AddressClampEdge, + SamplerFilterLinear_AddressClampZero, + + SamplerCount +}; + +constant constexpr array<sampler, SamplerCount> metal_samplers = { + sampler(address::repeat, filter::nearest), + sampler(address::clamp_to_edge, filter::nearest), + sampler(address::clamp_to_zero, filter::nearest), + sampler(address::repeat, filter::linear), + sampler(address::clamp_to_edge, filter::linear), + sampler(address::clamp_to_zero, filter::linear), +}; |