diff options
Diffstat (limited to 'intern/cycles/kernel/device/oneapi')
-rw-r--r-- | intern/cycles/kernel/device/oneapi/compat.h | 82 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/dll_interface_template.h | 53 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/globals.h | 9 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/image.h | 26 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/kernel.cpp | 566 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/kernel.h | 21 | ||||
-rw-r--r-- | intern/cycles/kernel/device/oneapi/kernel_templates.h | 2 |
7 files changed, 136 insertions, 623 deletions
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h index 1b25259bcf5..dfaec65130c 100644 --- a/intern/cycles/kernel/device/oneapi/compat.h +++ b/intern/cycles/kernel/device/oneapi/compat.h @@ -10,6 +10,7 @@ #define CCL_NAMESPACE_END #include <cstdint> +#include <math.h> #ifndef __NODES_MAX_GROUP__ # define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX @@ -30,7 +31,7 @@ #define ccl_global #define ccl_always_inline __attribute__((always_inline)) #define ccl_device_inline inline -#define ccl_noinline +#define ccl_noinline __attribute__((noinline)) #define ccl_inline_constant const constexpr #define ccl_static_constant const #define ccl_device_forceinline __attribute__((always_inline)) @@ -54,18 +55,6 @@ #define ccl_gpu_kernel(block_num_threads, thread_num_registers) #define ccl_gpu_kernel_threads(block_num_threads) -#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED -# define KG_ND_ITEMS \ - kg->nd_item_local_id_0 = item.get_local_id(0); \ - kg->nd_item_local_range_0 = item.get_local_range(0); \ - kg->nd_item_group_0 = item.get_group(0); \ - kg->nd_item_group_range_0 = item.get_group_range(0); \ - kg->nd_item_global_id_0 = item.get_global_id(0); \ - kg->nd_item_global_range_0 = item.get_global_range(0); -#else -# define KG_ND_ITEMS -#endif - #define ccl_gpu_kernel_signature(name, ...) \ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ size_t kernel_global_size, \ @@ -75,8 +64,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ (kg); \ cgh.parallel_for<class kernel_##name>( \ sycl::nd_range<1>(kernel_global_size, kernel_local_size), \ - [=](sycl::nd_item<1> item) { \ - KG_ND_ITEMS + [=](sycl::nd_item<1> item) { #define ccl_gpu_kernel_postfix \ }); \ @@ -94,31 +82,17 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg) /* GPU thread, block, grid size and index */ -#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED -# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0)) -# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0)) -# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0)) -# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0)) -# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) -# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) - -# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0)) -# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0)) -#else -# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0) -# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0) -# define ccl_gpu_block_idx_x (kg->nd_item_group_0) -# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0) -# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) -# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) - -# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0) -# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0) -#endif +#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0)) +#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0)) +#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0)) +#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0)) +#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) +#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) +#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0)) +#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0)) /* GPU warp synchronization */ - #define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier() #define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space) #ifdef __SYCL_DEVICE_ONLY__ @@ -149,25 +123,13 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ /* clang-format on */ /* Types */ + /* It's not possible to use sycl types like sycl::float3, sycl::int3, etc - * because these types have different interfaces from blender version */ + * because these types have different interfaces from blender version. */ using uchar = unsigned char; using sycl::half; -struct float3 { - float x, y, z; -}; - -ccl_always_inline float3 make_float3(float x, float y, float z) -{ - return {x, y, z}; -} -ccl_always_inline float3 make_float3(float x) -{ - return {x, x, x}; -} - /* math functions */ #define fabsf(x) sycl::fabs((x)) #define copysignf(x, y) sycl::copysign((x), (y)) @@ -186,21 +148,15 @@ ccl_always_inline float3 make_float3(float x) #define fmodf(x, y) sycl::fmod((x), (y)) #define lgammaf(x) sycl::lgamma((x)) -#define __forceinline __attribute__((always_inline)) - -/* Types */ -#include "util/half.h" -#include "util/types.h" - -/* NOTE(@nsirgien): Declaring these functions after types headers is very important because they - * include oneAPI headers, which transitively include math.h headers which will cause redefinitions - * of the math defines because math.h also uses them and having them defined before math.h include - * is actually UB. */ -/* Use fast math functions - get them from sycl::native namespace for native math function - * implementations */ #define cosf(x) sycl::native::cos(((float)(x))) #define sinf(x) sycl::native::sin(((float)(x))) #define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y))) #define tanf(x) sycl::native::tan(((float)(x))) #define logf(x) sycl::native::log(((float)(x))) #define expf(x) sycl::native::exp(((float)(x))) + +#define __forceinline __attribute__((always_inline)) + +/* Types */ +#include "util/half.h" +#include "util/types.h" diff --git a/intern/cycles/kernel/device/oneapi/dll_interface_template.h b/intern/cycles/kernel/device/oneapi/dll_interface_template.h deleted file mode 100644 index 662068c0fed..00000000000 --- a/intern/cycles/kernel/device/oneapi/dll_interface_template.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: Apache-2.0 - * Copyright 2022 Intel Corporation */ - -/* device_capabilities() returns a C string that must be free'd with oneapi_free(). */ -DLL_INTERFACE_CALL(oneapi_device_capabilities, char *) -DLL_INTERFACE_CALL(oneapi_free, void, void *) -DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue) - -DLL_INTERFACE_CALL(oneapi_get_compute_units_amount, size_t, SyclQueue *queue) -DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr) -DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr) - -DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index) -DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue) -DLL_INTERFACE_CALL( - oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment) -DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size) -DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr) - -DLL_INTERFACE_CALL( - oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes) -DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue) -DLL_INTERFACE_CALL(oneapi_usm_memset, - bool, - SyclQueue *queue, - void *usm_ptr, - unsigned char value, - size_t num_bytes) - -DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue) - -/* Operation with Kernel globals structure - map of global/constant allocation - filled before - * render/kernel execution As we don't know in cycles `sizeof` this - Cycles will manage just as - * pointer. */ -DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size) -DLL_INTERFACE_CALL(oneapi_set_global_memory, - void, - SyclQueue *queue, - void *kernel_globals, - const char *memory_name, - void *memory_device_pointer) - -DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size, - size_t, - SyclQueue *queue, - const DeviceKernel kernel, - const size_t kernel_global_size) -DLL_INTERFACE_CALL(oneapi_enqueue_kernel, - bool, - KernelContext *context, - int kernel, - size_t global_size, - void **args) diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h index d60f4f135ba..116620eb725 100644 --- a/intern/cycles/kernel/device/oneapi/globals.h +++ b/intern/cycles/kernel/device/oneapi/globals.h @@ -23,15 +23,6 @@ typedef struct KernelGlobalsGPU { #undef KERNEL_DATA_ARRAY IntegratorStateGPU *integrator_state; const KernelData *__data; -#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED - size_t nd_item_local_id_0; - size_t nd_item_local_range_0; - size_t nd_item_group_0; - size_t nd_item_group_range_0; - - size_t nd_item_global_id_0; - size_t nd_item_global_range_0; -#endif } KernelGlobalsGPU; typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals; diff --git a/intern/cycles/kernel/device/oneapi/image.h b/intern/cycles/kernel/device/oneapi/image.h index 6681977a675..2417b8eac3b 100644 --- a/intern/cycles/kernel/device/oneapi/image.h +++ b/intern/cycles/kernel/device/oneapi/image.h @@ -81,10 +81,15 @@ ccl_device_inline float4 svm_image_texture_read_2d(int id, int x, int y) x = svm_image_texture_wrap_periodic(x, info.width); y = svm_image_texture_wrap_periodic(y, info.height); } - else { + else if (info.extension == EXTENSION_EXTEND) { x = svm_image_texture_wrap_clamp(x, info.width); y = svm_image_texture_wrap_clamp(y, info.height); } + else { + if (x < 0 || x >= info.width || y < 0 || y >= info.height) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } return svm_image_texture_read(info, x, y, 0); } @@ -99,11 +104,16 @@ ccl_device_inline float4 svm_image_texture_read_3d(int id, int x, int y, int z) y = svm_image_texture_wrap_periodic(y, info.height); z = svm_image_texture_wrap_periodic(z, info.depth); } - else { + else if (info.extension == EXTENSION_EXTEND) { x = svm_image_texture_wrap_clamp(x, info.width); y = svm_image_texture_wrap_clamp(y, info.height); z = svm_image_texture_wrap_clamp(z, info.depth); } + else { + if (x < 0 || x >= info.width || y < 0 || y >= info.height || z < 0 || z >= info.depth) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } return svm_image_texture_read(info, x, y, z); } @@ -128,12 +138,6 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals, int id, float x, float { const TextureInfo &info = kernel_data_fetch(texture_info, id); - if (info.extension == EXTENSION_CLIP) { - if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - if (info.interpolation == INTERPOLATION_CLOSEST) { /* Closest interpolation. */ int ix, iy; @@ -315,12 +319,6 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals, int id, float3 P, in } #endif else { - if (info.extension == EXTENSION_CLIP) { - if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { - return make_float4(0.0f, 0.0f, 0.0f, 0.0f); - } - } - x *= info.width; y *= info.height; z *= info.depth; diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp index 300e201600c..525ae288f0c 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.cpp +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -3,208 +3,79 @@ #ifdef WITH_ONEAPI -/* clang-format off */ # include "kernel.h" # include <iostream> # include <map> # include <set> -# include <CL/sycl.hpp> +# include <sycl/sycl.hpp> # include "kernel/device/oneapi/compat.h" # include "kernel/device/oneapi/globals.h" # include "kernel/device/oneapi/kernel_templates.h" # include "kernel/device/gpu/kernel.h" -/* clang-format on */ static OneAPIErrorCallback s_error_cb = nullptr; static void *s_error_user_ptr = nullptr; -static std::vector<sycl::device> oneapi_available_devices(); - void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr) { s_error_cb = cb; s_error_user_ptr = user_ptr; } -void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false) -{ -# ifdef _DEBUG - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - sycl::info::device_type device_type = - queue->get_device().get_info<sycl::info::device::device_type>(); - sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context()); - (void)usm_type; - assert(usm_type == sycl::usm::alloc::device || - ((device_type == sycl::info::device_type::host || - device_type == sycl::info::device_type::is_cpu || allow_host) && - usm_type == sycl::usm::alloc::host)); -# endif -} - -bool oneapi_create_queue(SyclQueue *&external_queue, int device_index) -{ - bool finished_correct = true; - try { - std::vector<sycl::device> devices = oneapi_available_devices(); - if (device_index < 0 || device_index >= devices.size()) { - return false; - } - sycl::queue *created_queue = new sycl::queue(devices[device_index], - sycl::property::queue::in_order()); - external_queue = reinterpret_cast<SyclQueue *>(created_queue); - } - catch (sycl::exception const &e) { - finished_correct = false; - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - } - return finished_correct; -} - -void oneapi_free_queue(SyclQueue *queue_) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - delete queue; -} - -void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - return sycl::aligned_alloc_host(alignment, memory_size, *queue); -} - -void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - return sycl::malloc_device(memory_size, *queue); -} - -void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr) +/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality like + * memory allocations, memory transfers and execution of kernel with USM memory. */ +bool oneapi_run_test_kernel(SyclQueue *queue_) { assert(queue_); sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - oneapi_check_usm(queue_, usm_ptr, true); - sycl::free(usm_ptr, *queue); -} + const size_t N = 8; + const size_t memory_byte_size = sizeof(int) * N; -bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - oneapi_check_usm(queue_, dest, true); - oneapi_check_usm(queue_, src, true); - sycl::event mem_event = queue->memcpy(dest, src, num_bytes); -# ifdef WITH_CYCLES_DEBUG + bool is_computation_correct = true; try { - /* NOTE(@nsirgien) Waiting on memory operation may give more precise error - * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug. - */ - mem_event.wait_and_throw(); - return true; - } - catch (sycl::exception const &e) { - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - return false; - } -# else - sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context()); - sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context()); - bool from_device_to_host = dest_type == sycl::usm::alloc::host && - src_type == sycl::usm::alloc::device; - bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown || - src_type == sycl::usm::alloc::unknown; - /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host - * may not wait until the end of the transfer before using the memory. - */ - if (from_device_to_host || host_or_device_memop_with_offset) - mem_event.wait(); - return true; -# endif -} + int *A_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue); -bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - oneapi_check_usm(queue_, usm_ptr, true); - sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes); -# ifdef WITH_CYCLES_DEBUG - try { - /* NOTE(@nsirgien) Waiting on memory operation may give more precise error - * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug. - */ - mem_event.wait_and_throw(); - return true; - } - catch (sycl::exception const &e) { - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); + for (size_t i = (size_t)0; i < N; i++) { + A_host[i] = rand() % 32; } - return false; - } -# else - (void)mem_event; - return true; -# endif -} -bool oneapi_queue_synchronize(SyclQueue *queue_) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - try { - queue->wait_and_throw(); - return true; - } - catch (sycl::exception const &e) { - if (s_error_cb) { - s_error_cb(e.what(), s_error_user_ptr); - } - return false; - } -} + int *A_device = (int *)sycl::malloc_device(memory_byte_size, *queue); + int *B_device = (int *)sycl::malloc_device(memory_byte_size, *queue); -/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and - * also trigger runtime compilation of all existing oneAPI kernels */ -bool oneapi_run_test_kernel(SyclQueue *queue_) -{ - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); - size_t N = 8; - sycl::buffer<float, 1> A(N); - sycl::buffer<float, 1> B(N); - - { - sycl::host_accessor A_host_acc(A, sycl::write_only); - for (size_t i = (size_t)0; i < N; i++) - A_host_acc[i] = rand() % 32; - } + queue->memcpy(A_device, A_host, memory_byte_size); + queue->wait_and_throw(); - try { queue->submit([&](sycl::handler &cgh) { - sycl::accessor A_acc(A, cgh, sycl::read_only); - sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init); - - cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); }); + cgh.parallel_for(N, [=](sycl::id<1> idx) { B_device[idx] = A_device[idx] + idx.get(0); }); }); queue->wait_and_throw(); - sycl::host_accessor A_host_acc(A, sycl::read_only); - sycl::host_accessor B_host_acc(B, sycl::read_only); + int *B_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue); + + queue->memcpy(B_host, B_device, memory_byte_size); + queue->wait_and_throw(); for (size_t i = (size_t)0; i < N; i++) { - float result = A_host_acc[i] + B_host_acc[i]; - (void)result; + const int expected_result = i + A_host[i]; + if (B_host[i] != expected_result) { + is_computation_correct = false; + if (s_error_cb) { + s_error_cb(("Incorrect result in test kernel execution - expected " + + std::to_string(expected_result) + ", got " + std::to_string(B_host[i])) + .c_str(), + s_error_user_ptr); + } + } } + + sycl::free(A_host, *queue); + sycl::free(B_host, *queue); + sycl::free(A_device, *queue); + sycl::free(B_device, *queue); + queue->wait_and_throw(); } catch (sycl::exception const &e) { if (s_error_cb) { @@ -213,63 +84,16 @@ bool oneapi_run_test_kernel(SyclQueue *queue_) return false; } - return true; -} - -bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size) -{ - kernel_global_size = sizeof(KernelGlobalsGPU); - - return true; -} - -void oneapi_set_global_memory(SyclQueue *queue_, - void *kernel_globals, - const char *memory_name, - void *memory_device_pointer) -{ - assert(queue_); - assert(kernel_globals); - assert(memory_name); - assert(memory_device_pointer); - KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals; - oneapi_check_usm(queue_, memory_device_pointer); - oneapi_check_usm(queue_, kernel_globals, true); - - std::string matched_name(memory_name); - -/* This macro will change global ptr of KernelGlobals via name matching. */ -# define KERNEL_DATA_ARRAY(type, name) \ - else if (#name == matched_name) \ - { \ - globals->__##name = (type *)memory_device_pointer; \ - return; \ - } - if (false) { - } - else if ("integrator_state" == matched_name) { - globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer; - return; - } - KERNEL_DATA_ARRAY(KernelData, data) -# include "kernel/data_arrays.h" - else - { - std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!" - << std::endl; - assert(false); - } -# undef KERNEL_DATA_ARRAY + return is_computation_correct; } /* TODO: Move device information to OneapiDevice initialized on creation and use it. */ /* TODO: Move below function to oneapi/queue.cpp. */ -size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_, +size_t oneapi_kernel_preferred_local_size(SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size) { - assert(queue_); - sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + assert(queue); (void)kernel_global_size; const static size_t preferred_work_group_size_intersect_shading = 32; const static size_t preferred_work_group_size_technical = 1024; @@ -311,11 +135,63 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_, preferred_work_group_size = 512; } - const size_t limit_work_group_size = - queue->get_device().get_info<sycl::info::device::max_work_group_size>(); + const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue) + ->get_device() + .get_info<sycl::info::device::max_work_group_size>(); + return std::min(limit_work_group_size, preferred_work_group_size); } +bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features) +{ +# ifdef SYCL_SKIP_KERNELS_PRELOAD + (void)queue_; + (void)requested_features; +# else + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + + try { + sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle = + sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), + {queue->get_device()}); + + for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) { + const std::string &kernel_name = kernel_id.get_name(); + + /* NOTE(@nsirgien): Names in this conditions below should match names from + * oneapi_call macro in oneapi_enqueue_kernel below */ + if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) && + kernel_name.find("oneapi_kernel_integrator_shade_volume") != std::string::npos) { + continue; + } + + if (((requested_features & KERNEL_FEATURE_MNEE) == 0) && + kernel_name.find("oneapi_kernel_integrator_shade_surface_mnee") != std::string::npos) { + continue; + } + + if (((requested_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) && + kernel_name.find("oneapi_kernel_integrator_shade_surface_raytrace") != + std::string::npos) { + continue; + } + + sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle = + sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id}); + sycl::build(one_kernel_bundle); + } + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +# endif + return true; +} + bool oneapi_enqueue_kernel(KernelContext *kernel_context, int kernel, size_t global_size, @@ -354,13 +230,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices, * we extend work size to fit uniformity requirements. */ global_size = groups_count * local_size; - -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED - if (queue->get_device().is_host()) { - global_size = 1; - local_size = 1; - } -# endif } /* Let the compiler throw an error if there are any kernels missing in this implementation. */ @@ -645,13 +514,9 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, /* Unsupported kernels */ case DEVICE_KERNEL_NUM: case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL: - assert(0); - return false; + kernel_assert(0); + break; } - - /* Unknown kernel. */ - assert(0); - return false; }); } catch (sycl::exception const &e) { @@ -668,247 +533,4 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context, # endif return success; } - -static const int lowest_supported_driver_version_win = 1011660; -static const int lowest_supported_driver_version_neo = 23570; - -static int parse_driver_build_version(const sycl::device &device) -{ - const std::string &driver_version = device.get_info<sycl::info::device::driver_version>(); - int driver_build_version = 0; - - size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1); - if (second_dot_position == std::string::npos) { - std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version - << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," - << " xx.xx.xxx.xxxx (Windows) for device \"" - << device.get_info<sycl::info::device::name>() << "\"." << std::endl; - } - else { - try { - size_t third_dot_position = driver_version.find('.', second_dot_position + 1); - if (third_dot_position != std::string::npos) { - const std::string &third_number_substr = driver_version.substr( - second_dot_position + 1, third_dot_position - second_dot_position - 1); - const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1); - if (third_number_substr.length() == 3 && forth_number_substr.length() == 4) - driver_build_version = std::stoi(third_number_substr) * 10000 + - std::stoi(forth_number_substr); - } - else { - const std::string &third_number_substr = driver_version.substr(second_dot_position + 1); - driver_build_version = std::stoi(third_number_substr); - } - } - catch (std::invalid_argument &e) { - std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version - << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," - << " xx.xx.xxx.xxxx (Windows) for device \"" - << device.get_info<sycl::info::device::name>() << "\"." << std::endl; - } - } - - return driver_build_version; -} - -static std::vector<sycl::device> oneapi_available_devices() -{ - bool allow_all_devices = false; - if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) - allow_all_devices = true; - - /* Host device is useful only for debugging at the moment - * so we hide this device with default build settings. */ -# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED - bool allow_host = true; -# else - bool allow_host = false; -# endif - - const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms(); - - std::vector<sycl::device> available_devices; - for (const sycl::platform &platform : oneapi_platforms) { - /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL. - */ - if (platform.get_backend() == sycl::backend::opencl) { - continue; - } - - const std::vector<sycl::device> &oneapi_devices = - (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) : - platform.get_devices(sycl::info::device_type::gpu); - - for (const sycl::device &device : oneapi_devices) { - if (allow_all_devices) { - /* still filter out host device if build doesn't support it. */ - if (allow_host || !device.is_host()) { - available_devices.push_back(device); - } - } - else { - bool filter_out = false; - - /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU, - * assuming they have either more than 96 Execution Units or not 7 threads per EU. - * Official support can be broaden to older and smaller GPUs once ready. */ - if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) { - /* Filtered-out defaults in-case these values aren't available through too old L0 - * runtime. */ - int number_of_eus = 96; - int threads_per_eu = 7; - if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) { - number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>(); - } - if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { - threads_per_eu = - device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>(); - } - /* This filters out all Level-Zero supported GPUs from older generation than Arc. */ - if (number_of_eus <= 96 && threads_per_eu == 7) { - filter_out = true; - } - /* if not already filtered out, check driver version. */ - if (!filter_out) { - int driver_build_version = parse_driver_build_version(device); - if ((driver_build_version > 100000 && - driver_build_version < lowest_supported_driver_version_win) || - (driver_build_version > 0 && - driver_build_version < lowest_supported_driver_version_neo)) { - filter_out = true; - } - } - } - else if (!allow_host && device.is_host()) { - filter_out = true; - } - else if (!allow_all_devices) { - filter_out = true; - } - - if (!filter_out) { - available_devices.push_back(device); - } - } - } - } - - return available_devices; -} - -char *oneapi_device_capabilities() -{ - std::stringstream capabilities; - - const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices(); - for (const sycl::device &device : oneapi_devices) { - const std::string &name = device.get_info<sycl::info::device::name>(); - - capabilities << std::string("\t") << name << "\n"; -# define WRITE_ATTR(attribute_name, attribute_variable) \ - capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \ - << "\n"; -# define GET_NUM_ATTR(attribute) \ - { \ - size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \ - capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \ - } - - GET_NUM_ATTR(vendor_id) - GET_NUM_ATTR(max_compute_units) - GET_NUM_ATTR(max_work_item_dimensions) - - sycl::id<3> max_work_item_sizes = device.get_info<sycl::info::device::max_work_item_sizes>(); - WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0))) - WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1))) - WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2))) - - GET_NUM_ATTR(max_work_group_size) - GET_NUM_ATTR(max_num_sub_groups) - GET_NUM_ATTR(sub_group_independent_forward_progress) - - GET_NUM_ATTR(preferred_vector_width_char) - GET_NUM_ATTR(preferred_vector_width_short) - GET_NUM_ATTR(preferred_vector_width_int) - GET_NUM_ATTR(preferred_vector_width_long) - GET_NUM_ATTR(preferred_vector_width_float) - GET_NUM_ATTR(preferred_vector_width_double) - GET_NUM_ATTR(preferred_vector_width_half) - - GET_NUM_ATTR(native_vector_width_char) - GET_NUM_ATTR(native_vector_width_short) - GET_NUM_ATTR(native_vector_width_int) - GET_NUM_ATTR(native_vector_width_long) - GET_NUM_ATTR(native_vector_width_float) - GET_NUM_ATTR(native_vector_width_double) - GET_NUM_ATTR(native_vector_width_half) - - size_t max_clock_frequency = - (size_t)(device.is_host() ? (size_t)0 : - device.get_info<sycl::info::device::max_clock_frequency>()); - WRITE_ATTR("max_clock_frequency", max_clock_frequency) - - GET_NUM_ATTR(address_bits) - GET_NUM_ATTR(max_mem_alloc_size) - - /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't - * supported so we always return false, even if device supports HW texture usage acceleration. - */ - bool image_support = false; - WRITE_ATTR("image_support", (size_t)image_support) - - GET_NUM_ATTR(max_parameter_size) - GET_NUM_ATTR(mem_base_addr_align) - GET_NUM_ATTR(global_mem_size) - GET_NUM_ATTR(local_mem_size) - GET_NUM_ATTR(error_correction_support) - GET_NUM_ATTR(profiling_timer_resolution) - GET_NUM_ATTR(is_available) - -# undef GET_NUM_ATTR -# undef WRITE_ATTR - capabilities << "\n"; - } - - return ::strdup(capabilities.str().c_str()); -} - -void oneapi_free(void *p) -{ - if (p) { - ::free(p); - } -} - -void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr) -{ - int num = 0; - std::vector<sycl::device> devices = oneapi_available_devices(); - for (sycl::device &device : devices) { - const std::string &platform_name = - device.get_platform().get_info<sycl::info::platform::name>(); - std::string name = device.get_info<sycl::info::device::name>(); - std::string id = "ONEAPI_" + platform_name + "_" + name; - if (device.has(sycl::aspect::ext_intel_pci_address)) { - id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>()); - } - (cb)(id.c_str(), name.c_str(), num, user_ptr); - num++; - } -} - -size_t oneapi_get_memcapacity(SyclQueue *queue) -{ - return reinterpret_cast<sycl::queue *>(queue) - ->get_device() - .get_info<sycl::info::device::global_mem_size>(); -} - -size_t oneapi_get_compute_units_amount(SyclQueue *queue) -{ - return reinterpret_cast<sycl::queue *>(queue) - ->get_device() - .get_info<sycl::info::device::max_compute_units>(); -} - #endif /* WITH_ONEAPI */ diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h index c5f853742ed..2bfc0b89c87 100644 --- a/intern/cycles/kernel/device/oneapi/kernel.h +++ b/intern/cycles/kernel/device/oneapi/kernel.h @@ -25,11 +25,6 @@ enum DeviceKernel : int; class SyclQueue; -typedef void (*OneAPIDeviceIteratorCallback)(const char *id, - const char *name, - int num, - void *user_ptr); - typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr); struct KernelContext { @@ -45,13 +40,17 @@ struct KernelContext { extern "C" { # endif -# define DLL_INTERFACE_CALL(function, return_type, ...) \ - CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__); -# include "kernel/device/oneapi/dll_interface_template.h" -# undef DLL_INTERFACE_CALL - +CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_); +CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr); +CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size( + SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size); +CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context, + int kernel, + size_t global_size, + void **args); +CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_load_kernels(SyclQueue *queue, + const unsigned int requested_features); # ifdef __cplusplus } # endif - #endif /* WITH_ONEAPI */ diff --git a/intern/cycles/kernel/device/oneapi/kernel_templates.h b/intern/cycles/kernel/device/oneapi/kernel_templates.h index d8964d9b672..0ae925cf748 100644 --- a/intern/cycles/kernel/device/oneapi/kernel_templates.h +++ b/intern/cycles/kernel/device/oneapi/kernel_templates.h @@ -80,7 +80,7 @@ void oneapi_call( (x, ##__VA_ARGS__) /* This template automatically casts entries in the void **args array to the types requested by the kernel func. - Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */ + * Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */ #define oneapi_template(...) \ template<ONEAPI_CALL_FOR(ONEAPI_TYP, __VA_ARGS__)> \ void oneapi_call( \ |