diff options
Diffstat (limited to 'intern/cycles/kernel/device/oneapi/compat.h')
-rw-r--r-- | intern/cycles/kernel/device/oneapi/compat.h | 194 |
1 files changed, 194 insertions, 0 deletions
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h new file mode 100644 index 00000000000..5c49674f247 --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/compat.h @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +#define __KERNEL_GPU__ +#define __KERNEL_ONEAPI__ + +#define CCL_NAMESPACE_BEGIN +#define CCL_NAMESPACE_END + +#include <cstdint> + +#ifndef __NODES_MAX_GROUP__ +# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX +#endif +#ifndef __NODES_FEATURES__ +# define __NODES_FEATURES__ NODE_FEATURE_ALL +#endif + +/* This one does not have an abstraction. + * It's used by other devices directly. + */ + +#define __device__ + +/* Qualifier wrappers for different names on different devices */ + +#define ccl_device +#define ccl_global +#define ccl_always_inline __attribute__((always_inline)) +#define ccl_device_inline inline +#define ccl_noinline __attribute__((noinline)) +#define ccl_inline_constant const constexpr +#define ccl_static_constant const +#define ccl_device_forceinline __attribute__((always_inline)) +#define ccl_device_noinline ccl_device ccl_noinline +#define ccl_device_noinline_cpu ccl_device +#define ccl_device_inline_method ccl_device +#define ccl_restrict __restrict__ +#define ccl_loop_no_unroll +#define ccl_optional_struct_init +#define ccl_private +#define ATTR_FALLTHROUGH __attribute__((fallthrough)) +#define ccl_constant const +#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) +#define ccl_align(n) __attribute__((aligned(n))) +#define kernel_assert(cond) +#define ccl_may_alias + +/* clang-format off */ + +/* kernel.h adapters */ +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) +#define ccl_gpu_kernel_threads(block_num_threads) + +#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED +# define KG_ND_ITEMS \ + kg->nd_item_local_id_0 = item.get_local_id(0); \ + kg->nd_item_local_range_0 = item.get_local_range(0); \ + kg->nd_item_group_0 = item.get_group(0); \ + kg->nd_item_group_range_0 = item.get_group_range(0); \ + kg->nd_item_global_id_0 = item.get_global_id(0); \ + kg->nd_item_global_range_0 = item.get_global_range(0); +#else +# define KG_ND_ITEMS +#endif + +#define ccl_gpu_kernel_signature(name, ...) \ +void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ + size_t kernel_global_size, \ + size_t kernel_local_size, \ + sycl::handler &cgh, \ + __VA_ARGS__) { \ + (kg); \ + cgh.parallel_for<class kernel_##name>( \ + sycl::nd_range<1>(kernel_global_size, kernel_local_size), \ + [=](sycl::nd_item<1> item) { \ + KG_ND_ITEMS + +#define ccl_gpu_kernel_postfix \ + }); \ + } + +#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x + +#define ccl_gpu_kernel_lambda(func, ...) \ + struct KernelLambda \ + { \ + KernelLambda(const ONEAPIKernelContext *_kg) : kg(_kg) {} \ + ccl_private const ONEAPIKernelContext *kg; \ + __VA_ARGS__; \ + int operator()(const int state) const { return (func); } \ + } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg) + +/* GPU thread, block, grid size and index */ +#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED +# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0)) +# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0)) +# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0)) +# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0)) +# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) +# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) + +# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0)) +# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0)) +#else +# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0) +# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0) +# define ccl_gpu_block_idx_x (kg->nd_item_group_0) +# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0) +# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) +# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) + +# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0) +# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0) +#endif + + +/* GPU warp synchronization */ + +#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier() +#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space) +#ifdef __SYCL_DEVICE_ONLY__ + #define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), predicate).count()) +#else + #define ccl_gpu_ballot(predicate) (predicate ? 1 : 0) +#endif + +/* Debug defines */ +#if defined(__SYCL_DEVICE_ONLY__) +# define CONSTANT __attribute__((opencl_constant)) +#else +# define CONSTANT +#endif + +#define sycl_printf(format, ...) { \ + static const CONSTANT char fmt[] = format; \ + sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ ); \ + } + +#define sycl_printf_(format) { \ + static const CONSTANT char fmt[] = format; \ + sycl::ext::oneapi::experimental::printf(fmt); \ + } + +/* GPU texture objects */ + +/* clang-format on */ + +/* Types */ + +/* It's not possible to use sycl types like sycl::float3, sycl::int3, etc + * because these types have different interfaces from blender version. */ + +using uchar = unsigned char; +using sycl::half; + +/* math functions */ +#define fabsf(x) sycl::fabs((x)) +#define copysignf(x, y) sycl::copysign((x), (y)) +#define asinf(x) sycl::asin((x)) +#define acosf(x) sycl::acos((x)) +#define atanf(x) sycl::atan((x)) +#define floorf(x) sycl::floor((x)) +#define ceilf(x) sycl::ceil((x)) +#define sinhf(x) sycl::sinh((x)) +#define coshf(x) sycl::cosh((x)) +#define tanhf(x) sycl::tanh((x)) +#define hypotf(x, y) sycl::hypot((x), (y)) +#define atan2f(x, y) sycl::atan2((x), (y)) +#define fmaxf(x, y) sycl::fmax((x), (y)) +#define fminf(x, y) sycl::fmin((x), (y)) +#define fmodf(x, y) sycl::fmod((x), (y)) +#define lgammaf(x) sycl::lgamma((x)) + +#define __forceinline __attribute__((always_inline)) + +/* Types */ +#include "util/half.h" +#include "util/types.h" + +/* NOTE(@nsirgien): Declaring these functions after types headers is very important because they + * include oneAPI headers, which transitively include math.h headers which will cause redefinitions + * of the math defines because math.h also uses them and having them defined before math.h include + * is actually UB. */ +/* Use fast math functions - get them from sycl::native namespace for native math function + * implementations */ +#define cosf(x) sycl::native::cos(((float)(x))) +#define sinf(x) sycl::native::sin(((float)(x))) +#define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y))) +#define tanf(x) sycl::native::tan(((float)(x))) +#define logf(x) sycl::native::log(((float)(x))) +#define expf(x) sycl::native::exp(((float)(x))) |