Welcome to mirror list, hosted at ThFree Co, Russian Federation.

git.blender.org/blender.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'intern/cycles/kernel/device/oneapi')
-rw-r--r--intern/cycles/kernel/device/oneapi/compat.h82
-rw-r--r--intern/cycles/kernel/device/oneapi/dll_interface_template.h53
-rw-r--r--intern/cycles/kernel/device/oneapi/globals.h9
-rw-r--r--intern/cycles/kernel/device/oneapi/image.h26
-rw-r--r--intern/cycles/kernel/device/oneapi/kernel.cpp566
-rw-r--r--intern/cycles/kernel/device/oneapi/kernel.h21
-rw-r--r--intern/cycles/kernel/device/oneapi/kernel_templates.h2
7 files changed, 136 insertions, 623 deletions
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
index 1b25259bcf5..dfaec65130c 100644
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -10,6 +10,7 @@
#define CCL_NAMESPACE_END
#include <cstdint>
+#include <math.h>
#ifndef __NODES_MAX_GROUP__
# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
@@ -30,7 +31,7 @@
#define ccl_global
#define ccl_always_inline __attribute__((always_inline))
#define ccl_device_inline inline
-#define ccl_noinline
+#define ccl_noinline __attribute__((noinline))
#define ccl_inline_constant const constexpr
#define ccl_static_constant const
#define ccl_device_forceinline __attribute__((always_inline))
@@ -54,18 +55,6 @@
#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
#define ccl_gpu_kernel_threads(block_num_threads)
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-# define KG_ND_ITEMS \
- kg->nd_item_local_id_0 = item.get_local_id(0); \
- kg->nd_item_local_range_0 = item.get_local_range(0); \
- kg->nd_item_group_0 = item.get_group(0); \
- kg->nd_item_group_range_0 = item.get_group_range(0); \
- kg->nd_item_global_id_0 = item.get_global_id(0); \
- kg->nd_item_global_range_0 = item.get_global_range(0);
-#else
-# define KG_ND_ITEMS
-#endif
-
#define ccl_gpu_kernel_signature(name, ...) \
void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
size_t kernel_global_size, \
@@ -75,8 +64,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
(kg); \
cgh.parallel_for<class kernel_##name>( \
sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
- [=](sycl::nd_item<1> item) { \
- KG_ND_ITEMS
+ [=](sycl::nd_item<1> item) {
#define ccl_gpu_kernel_postfix \
}); \
@@ -94,31 +82,17 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
} ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
/* GPU thread, block, grid size and index */
-#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED
-# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
-# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
-# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
-# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
-# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
-# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
-#else
-# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
-# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
-# define ccl_gpu_block_idx_x (kg->nd_item_group_0)
-# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
-# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
-# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
-#endif
+#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
+#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
+#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
+#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
+#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
+#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
+#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
/* GPU warp synchronization */
-
#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
#ifdef __SYCL_DEVICE_ONLY__
@@ -149,25 +123,13 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
/* clang-format on */
/* Types */
+
/* It's not possible to use sycl types like sycl::float3, sycl::int3, etc
- * because these types have different interfaces from blender version */
+ * because these types have different interfaces from blender version. */
using uchar = unsigned char;
using sycl::half;
-struct float3 {
- float x, y, z;
-};
-
-ccl_always_inline float3 make_float3(float x, float y, float z)
-{
- return {x, y, z};
-}
-ccl_always_inline float3 make_float3(float x)
-{
- return {x, x, x};
-}
-
/* math functions */
#define fabsf(x) sycl::fabs((x))
#define copysignf(x, y) sycl::copysign((x), (y))
@@ -186,21 +148,15 @@ ccl_always_inline float3 make_float3(float x)
#define fmodf(x, y) sycl::fmod((x), (y))
#define lgammaf(x) sycl::lgamma((x))
-#define __forceinline __attribute__((always_inline))
-
-/* Types */
-#include "util/half.h"
-#include "util/types.h"
-
-/* NOTE(@nsirgien): Declaring these functions after types headers is very important because they
- * include oneAPI headers, which transitively include math.h headers which will cause redefinitions
- * of the math defines because math.h also uses them and having them defined before math.h include
- * is actually UB. */
-/* Use fast math functions - get them from sycl::native namespace for native math function
- * implementations */
#define cosf(x) sycl::native::cos(((float)(x)))
#define sinf(x) sycl::native::sin(((float)(x)))
#define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y)))
#define tanf(x) sycl::native::tan(((float)(x)))
#define logf(x) sycl::native::log(((float)(x)))
#define expf(x) sycl::native::exp(((float)(x)))
+
+#define __forceinline __attribute__((always_inline))
+
+/* Types */
+#include "util/half.h"
+#include "util/types.h"
diff --git a/intern/cycles/kernel/device/oneapi/dll_interface_template.h b/intern/cycles/kernel/device/oneapi/dll_interface_template.h
deleted file mode 100644
index 662068c0fed..00000000000
--- a/intern/cycles/kernel/device/oneapi/dll_interface_template.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2022 Intel Corporation */
-
-/* device_capabilities() returns a C string that must be free'd with oneapi_free(). */
-DLL_INTERFACE_CALL(oneapi_device_capabilities, char *)
-DLL_INTERFACE_CALL(oneapi_free, void, void *)
-DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue)
-
-DLL_INTERFACE_CALL(oneapi_get_compute_units_amount, size_t, SyclQueue *queue)
-DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr)
-DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr)
-
-DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index)
-DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue)
-DLL_INTERFACE_CALL(
- oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment)
-DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size)
-DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr)
-
-DLL_INTERFACE_CALL(
- oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes)
-DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue)
-DLL_INTERFACE_CALL(oneapi_usm_memset,
- bool,
- SyclQueue *queue,
- void *usm_ptr,
- unsigned char value,
- size_t num_bytes)
-
-DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue)
-
-/* Operation with Kernel globals structure - map of global/constant allocation - filled before
- * render/kernel execution As we don't know in cycles `sizeof` this - Cycles will manage just as
- * pointer. */
-DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size)
-DLL_INTERFACE_CALL(oneapi_set_global_memory,
- void,
- SyclQueue *queue,
- void *kernel_globals,
- const char *memory_name,
- void *memory_device_pointer)
-
-DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size,
- size_t,
- SyclQueue *queue,
- const DeviceKernel kernel,
- const size_t kernel_global_size)
-DLL_INTERFACE_CALL(oneapi_enqueue_kernel,
- bool,
- KernelContext *context,
- int kernel,
- size_t global_size,
- void **args)
diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h
index d60f4f135ba..116620eb725 100644
--- a/intern/cycles/kernel/device/oneapi/globals.h
+++ b/intern/cycles/kernel/device/oneapi/globals.h
@@ -23,15 +23,6 @@ typedef struct KernelGlobalsGPU {
#undef KERNEL_DATA_ARRAY
IntegratorStateGPU *integrator_state;
const KernelData *__data;
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
- size_t nd_item_local_id_0;
- size_t nd_item_local_range_0;
- size_t nd_item_group_0;
- size_t nd_item_group_range_0;
-
- size_t nd_item_global_id_0;
- size_t nd_item_global_range_0;
-#endif
} KernelGlobalsGPU;
typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
diff --git a/intern/cycles/kernel/device/oneapi/image.h b/intern/cycles/kernel/device/oneapi/image.h
index 6681977a675..2417b8eac3b 100644
--- a/intern/cycles/kernel/device/oneapi/image.h
+++ b/intern/cycles/kernel/device/oneapi/image.h
@@ -81,10 +81,15 @@ ccl_device_inline float4 svm_image_texture_read_2d(int id, int x, int y)
x = svm_image_texture_wrap_periodic(x, info.width);
y = svm_image_texture_wrap_periodic(y, info.height);
}
- else {
+ else if (info.extension == EXTENSION_EXTEND) {
x = svm_image_texture_wrap_clamp(x, info.width);
y = svm_image_texture_wrap_clamp(y, info.height);
}
+ else {
+ if (x < 0 || x >= info.width || y < 0 || y >= info.height) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ }
return svm_image_texture_read(info, x, y, 0);
}
@@ -99,11 +104,16 @@ ccl_device_inline float4 svm_image_texture_read_3d(int id, int x, int y, int z)
y = svm_image_texture_wrap_periodic(y, info.height);
z = svm_image_texture_wrap_periodic(z, info.depth);
}
- else {
+ else if (info.extension == EXTENSION_EXTEND) {
x = svm_image_texture_wrap_clamp(x, info.width);
y = svm_image_texture_wrap_clamp(y, info.height);
z = svm_image_texture_wrap_clamp(z, info.depth);
}
+ else {
+ if (x < 0 || x >= info.width || y < 0 || y >= info.height || z < 0 || z >= info.depth) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ }
return svm_image_texture_read(info, x, y, z);
}
@@ -128,12 +138,6 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals, int id, float x, float
{
const TextureInfo &info = kernel_data_fetch(texture_info, id);
- if (info.extension == EXTENSION_CLIP) {
- if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
-
if (info.interpolation == INTERPOLATION_CLOSEST) {
/* Closest interpolation. */
int ix, iy;
@@ -315,12 +319,6 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals, int id, float3 P, in
}
#endif
else {
- if (info.extension == EXTENSION_CLIP) {
- if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
- return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- }
-
x *= info.width;
y *= info.height;
z *= info.depth;
diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp
index 300e201600c..525ae288f0c 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -3,208 +3,79 @@
#ifdef WITH_ONEAPI
-/* clang-format off */
# include "kernel.h"
# include <iostream>
# include <map>
# include <set>
-# include <CL/sycl.hpp>
+# include <sycl/sycl.hpp>
# include "kernel/device/oneapi/compat.h"
# include "kernel/device/oneapi/globals.h"
# include "kernel/device/oneapi/kernel_templates.h"
# include "kernel/device/gpu/kernel.h"
-/* clang-format on */
static OneAPIErrorCallback s_error_cb = nullptr;
static void *s_error_user_ptr = nullptr;
-static std::vector<sycl::device> oneapi_available_devices();
-
void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
{
s_error_cb = cb;
s_error_user_ptr = user_ptr;
}
-void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
-{
-# ifdef _DEBUG
- sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
- sycl::info::device_type device_type =
- queue->get_device().get_info<sycl::info::device::device_type>();
- sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
- (void)usm_type;
- assert(usm_type == sycl::usm::alloc::device ||
- ((device_type == sycl::info::device_type::host ||
- device_type == sycl::info::device_type::is_cpu || allow_host) &&
- usm_type == sycl::usm::alloc::host));
-# endif
-}
-
-bool oneapi_create_queue(SyclQueue *&external_queue, int device_index)
-{
- bool finished_correct = true;
- try {
- std::vector<sycl::device> devices = oneapi_available_devices();
- if (device_index < 0 || device_index >= devices.size()) {
- return false;
- }
- sycl::queue *created_queue = new sycl::queue(devices[device_index],
- sycl::property::queue::in_order());
- external_queue = reinterpret_cast<SyclQueue *>(created_queue);
- }
- catch (sycl::exception const &e) {
- finished_correct = false;
- if (s_error_cb) {
- s_error_cb(e.what(), s_error_user_ptr);
- }
- }
- return finished_correct;
-}
-
-void oneapi_free_queue(SyclQueue *queue_)
-{
- assert(queue_);
- sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
- delete queue;
-}
-
-void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
-{
- assert(queue_);
- sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
- return sycl::aligned_alloc_host(alignment, memory_size, *queue);
-}
-
-void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size)
-{
- assert(queue_);
- sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
- return sycl::malloc_device(memory_size, *queue);
-}
-
-void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr)
+/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality like
+ * memory allocations, memory transfers and execution of kernel with USM memory. */
+bool oneapi_run_test_kernel(SyclQueue *queue_)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
- oneapi_check_usm(queue_, usm_ptr, true);
- sycl::free(usm_ptr, *queue);
-}
+ const size_t N = 8;
+ const size_t memory_byte_size = sizeof(int) * N;
-bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
-{
- assert(queue_);
- sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
- oneapi_check_usm(queue_, dest, true);
- oneapi_check_usm(queue_, src, true);
- sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
-# ifdef WITH_CYCLES_DEBUG
+ bool is_computation_correct = true;
try {
- /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
- * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
- */
- mem_event.wait_and_throw();
- return true;
- }
- catch (sycl::exception const &e) {
- if (s_error_cb) {
- s_error_cb(e.what(), s_error_user_ptr);
- }
- return false;
- }
-# else
- sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
- sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
- bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
- src_type == sycl::usm::alloc::device;
- bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
- src_type == sycl::usm::alloc::unknown;
- /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host
- * may not wait until the end of the transfer before using the memory.
- */
- if (from_device_to_host || host_or_device_memop_with_offset)
- mem_event.wait();
- return true;
-# endif
-}
+ int *A_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue);
-bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes)
-{
- assert(queue_);
- sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
- oneapi_check_usm(queue_, usm_ptr, true);
- sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
-# ifdef WITH_CYCLES_DEBUG
- try {
- /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
- * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
- */
- mem_event.wait_and_throw();
- return true;
- }
- catch (sycl::exception const &e) {
- if (s_error_cb) {
- s_error_cb(e.what(), s_error_user_ptr);
+ for (size_t i = (size_t)0; i < N; i++) {
+ A_host[i] = rand() % 32;
}
- return false;
- }
-# else
- (void)mem_event;
- return true;
-# endif
-}
-bool oneapi_queue_synchronize(SyclQueue *queue_)
-{
- assert(queue_);
- sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
- try {
- queue->wait_and_throw();
- return true;
- }
- catch (sycl::exception const &e) {
- if (s_error_cb) {
- s_error_cb(e.what(), s_error_user_ptr);
- }
- return false;
- }
-}
+ int *A_device = (int *)sycl::malloc_device(memory_byte_size, *queue);
+ int *B_device = (int *)sycl::malloc_device(memory_byte_size, *queue);
-/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and
- * also trigger runtime compilation of all existing oneAPI kernels */
-bool oneapi_run_test_kernel(SyclQueue *queue_)
-{
- assert(queue_);
- sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
- size_t N = 8;
- sycl::buffer<float, 1> A(N);
- sycl::buffer<float, 1> B(N);
-
- {
- sycl::host_accessor A_host_acc(A, sycl::write_only);
- for (size_t i = (size_t)0; i < N; i++)
- A_host_acc[i] = rand() % 32;
- }
+ queue->memcpy(A_device, A_host, memory_byte_size);
+ queue->wait_and_throw();
- try {
queue->submit([&](sycl::handler &cgh) {
- sycl::accessor A_acc(A, cgh, sycl::read_only);
- sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init);
-
- cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); });
+ cgh.parallel_for(N, [=](sycl::id<1> idx) { B_device[idx] = A_device[idx] + idx.get(0); });
});
queue->wait_and_throw();
- sycl::host_accessor A_host_acc(A, sycl::read_only);
- sycl::host_accessor B_host_acc(B, sycl::read_only);
+ int *B_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue);
+
+ queue->memcpy(B_host, B_device, memory_byte_size);
+ queue->wait_and_throw();
for (size_t i = (size_t)0; i < N; i++) {
- float result = A_host_acc[i] + B_host_acc[i];
- (void)result;
+ const int expected_result = i + A_host[i];
+ if (B_host[i] != expected_result) {
+ is_computation_correct = false;
+ if (s_error_cb) {
+ s_error_cb(("Incorrect result in test kernel execution - expected " +
+ std::to_string(expected_result) + ", got " + std::to_string(B_host[i]))
+ .c_str(),
+ s_error_user_ptr);
+ }
+ }
}
+
+ sycl::free(A_host, *queue);
+ sycl::free(B_host, *queue);
+ sycl::free(A_device, *queue);
+ sycl::free(B_device, *queue);
+ queue->wait_and_throw();
}
catch (sycl::exception const &e) {
if (s_error_cb) {
@@ -213,63 +84,16 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
return false;
}
- return true;
-}
-
-bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
-{
- kernel_global_size = sizeof(KernelGlobalsGPU);
-
- return true;
-}
-
-void oneapi_set_global_memory(SyclQueue *queue_,
- void *kernel_globals,
- const char *memory_name,
- void *memory_device_pointer)
-{
- assert(queue_);
- assert(kernel_globals);
- assert(memory_name);
- assert(memory_device_pointer);
- KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
- oneapi_check_usm(queue_, memory_device_pointer);
- oneapi_check_usm(queue_, kernel_globals, true);
-
- std::string matched_name(memory_name);
-
-/* This macro will change global ptr of KernelGlobals via name matching. */
-# define KERNEL_DATA_ARRAY(type, name) \
- else if (#name == matched_name) \
- { \
- globals->__##name = (type *)memory_device_pointer; \
- return; \
- }
- if (false) {
- }
- else if ("integrator_state" == matched_name) {
- globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
- return;
- }
- KERNEL_DATA_ARRAY(KernelData, data)
-# include "kernel/data_arrays.h"
- else
- {
- std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
- << std::endl;
- assert(false);
- }
-# undef KERNEL_DATA_ARRAY
+ return is_computation_correct;
}
/* TODO: Move device information to OneapiDevice initialized on creation and use it. */
/* TODO: Move below function to oneapi/queue.cpp. */
-size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
+size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
const DeviceKernel kernel,
const size_t kernel_global_size)
{
- assert(queue_);
- sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+ assert(queue);
(void)kernel_global_size;
const static size_t preferred_work_group_size_intersect_shading = 32;
const static size_t preferred_work_group_size_technical = 1024;
@@ -311,11 +135,63 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
preferred_work_group_size = 512;
}
- const size_t limit_work_group_size =
- queue->get_device().get_info<sycl::info::device::max_work_group_size>();
+ const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue)
+ ->get_device()
+ .get_info<sycl::info::device::max_work_group_size>();
+
return std::min(limit_work_group_size, preferred_work_group_size);
}
+bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features)
+{
+# ifdef SYCL_SKIP_KERNELS_PRELOAD
+ (void)queue_;
+ (void)requested_features;
+# else
+ assert(queue_);
+ sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+
+ try {
+ sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle =
+ sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(),
+ {queue->get_device()});
+
+ for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) {
+ const std::string &kernel_name = kernel_id.get_name();
+
+ /* NOTE(@nsirgien): Names in this conditions below should match names from
+ * oneapi_call macro in oneapi_enqueue_kernel below */
+ if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) &&
+ kernel_name.find("oneapi_kernel_integrator_shade_volume") != std::string::npos) {
+ continue;
+ }
+
+ if (((requested_features & KERNEL_FEATURE_MNEE) == 0) &&
+ kernel_name.find("oneapi_kernel_integrator_shade_surface_mnee") != std::string::npos) {
+ continue;
+ }
+
+ if (((requested_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) &&
+ kernel_name.find("oneapi_kernel_integrator_shade_surface_raytrace") !=
+ std::string::npos) {
+ continue;
+ }
+
+ sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle =
+ sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id});
+ sycl::build(one_kernel_bundle);
+ }
+ }
+ catch (sycl::exception const &e) {
+ if (s_error_cb) {
+ s_error_cb(e.what(), s_error_user_ptr);
+ }
+ return false;
+ }
+# endif
+ return true;
+}
+
bool oneapi_enqueue_kernel(KernelContext *kernel_context,
int kernel,
size_t global_size,
@@ -354,13 +230,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
/* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
* we extend work size to fit uniformity requirements. */
global_size = groups_count * local_size;
-
-# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
- if (queue->get_device().is_host()) {
- global_size = 1;
- local_size = 1;
- }
-# endif
}
/* Let the compiler throw an error if there are any kernels missing in this implementation. */
@@ -645,13 +514,9 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
/* Unsupported kernels */
case DEVICE_KERNEL_NUM:
case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
- assert(0);
- return false;
+ kernel_assert(0);
+ break;
}
-
- /* Unknown kernel. */
- assert(0);
- return false;
});
}
catch (sycl::exception const &e) {
@@ -668,247 +533,4 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
# endif
return success;
}
-
-static const int lowest_supported_driver_version_win = 1011660;
-static const int lowest_supported_driver_version_neo = 23570;
-
-static int parse_driver_build_version(const sycl::device &device)
-{
- const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
- int driver_build_version = 0;
-
- size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
- if (second_dot_position == std::string::npos) {
- std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
- << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
- << " xx.xx.xxx.xxxx (Windows) for device \""
- << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
- }
- else {
- try {
- size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
- if (third_dot_position != std::string::npos) {
- const std::string &third_number_substr = driver_version.substr(
- second_dot_position + 1, third_dot_position - second_dot_position - 1);
- const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
- if (third_number_substr.length() == 3 && forth_number_substr.length() == 4)
- driver_build_version = std::stoi(third_number_substr) * 10000 +
- std::stoi(forth_number_substr);
- }
- else {
- const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
- driver_build_version = std::stoi(third_number_substr);
- }
- }
- catch (std::invalid_argument &e) {
- std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
- << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
- << " xx.xx.xxx.xxxx (Windows) for device \""
- << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
- }
- }
-
- return driver_build_version;
-}
-
-static std::vector<sycl::device> oneapi_available_devices()
-{
- bool allow_all_devices = false;
- if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
- allow_all_devices = true;
-
- /* Host device is useful only for debugging at the moment
- * so we hide this device with default build settings. */
-# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
- bool allow_host = true;
-# else
- bool allow_host = false;
-# endif
-
- const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
-
- std::vector<sycl::device> available_devices;
- for (const sycl::platform &platform : oneapi_platforms) {
- /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL.
- */
- if (platform.get_backend() == sycl::backend::opencl) {
- continue;
- }
-
- const std::vector<sycl::device> &oneapi_devices =
- (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
- platform.get_devices(sycl::info::device_type::gpu);
-
- for (const sycl::device &device : oneapi_devices) {
- if (allow_all_devices) {
- /* still filter out host device if build doesn't support it. */
- if (allow_host || !device.is_host()) {
- available_devices.push_back(device);
- }
- }
- else {
- bool filter_out = false;
-
- /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
- * assuming they have either more than 96 Execution Units or not 7 threads per EU.
- * Official support can be broaden to older and smaller GPUs once ready. */
- if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
- /* Filtered-out defaults in-case these values aren't available through too old L0
- * runtime. */
- int number_of_eus = 96;
- int threads_per_eu = 7;
- if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
- number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
- }
- if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
- threads_per_eu =
- device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
- }
- /* This filters out all Level-Zero supported GPUs from older generation than Arc. */
- if (number_of_eus <= 96 && threads_per_eu == 7) {
- filter_out = true;
- }
- /* if not already filtered out, check driver version. */
- if (!filter_out) {
- int driver_build_version = parse_driver_build_version(device);
- if ((driver_build_version > 100000 &&
- driver_build_version < lowest_supported_driver_version_win) ||
- (driver_build_version > 0 &&
- driver_build_version < lowest_supported_driver_version_neo)) {
- filter_out = true;
- }
- }
- }
- else if (!allow_host && device.is_host()) {
- filter_out = true;
- }
- else if (!allow_all_devices) {
- filter_out = true;
- }
-
- if (!filter_out) {
- available_devices.push_back(device);
- }
- }
- }
- }
-
- return available_devices;
-}
-
-char *oneapi_device_capabilities()
-{
- std::stringstream capabilities;
-
- const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices();
- for (const sycl::device &device : oneapi_devices) {
- const std::string &name = device.get_info<sycl::info::device::name>();
-
- capabilities << std::string("\t") << name << "\n";
-# define WRITE_ATTR(attribute_name, attribute_variable) \
- capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
- << "\n";
-# define GET_NUM_ATTR(attribute) \
- { \
- size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \
- capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
- }
-
- GET_NUM_ATTR(vendor_id)
- GET_NUM_ATTR(max_compute_units)
- GET_NUM_ATTR(max_work_item_dimensions)
-
- sycl::id<3> max_work_item_sizes = device.get_info<sycl::info::device::max_work_item_sizes>();
- WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
- WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
- WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
-
- GET_NUM_ATTR(max_work_group_size)
- GET_NUM_ATTR(max_num_sub_groups)
- GET_NUM_ATTR(sub_group_independent_forward_progress)
-
- GET_NUM_ATTR(preferred_vector_width_char)
- GET_NUM_ATTR(preferred_vector_width_short)
- GET_NUM_ATTR(preferred_vector_width_int)
- GET_NUM_ATTR(preferred_vector_width_long)
- GET_NUM_ATTR(preferred_vector_width_float)
- GET_NUM_ATTR(preferred_vector_width_double)
- GET_NUM_ATTR(preferred_vector_width_half)
-
- GET_NUM_ATTR(native_vector_width_char)
- GET_NUM_ATTR(native_vector_width_short)
- GET_NUM_ATTR(native_vector_width_int)
- GET_NUM_ATTR(native_vector_width_long)
- GET_NUM_ATTR(native_vector_width_float)
- GET_NUM_ATTR(native_vector_width_double)
- GET_NUM_ATTR(native_vector_width_half)
-
- size_t max_clock_frequency =
- (size_t)(device.is_host() ? (size_t)0 :
- device.get_info<sycl::info::device::max_clock_frequency>());
- WRITE_ATTR("max_clock_frequency", max_clock_frequency)
-
- GET_NUM_ATTR(address_bits)
- GET_NUM_ATTR(max_mem_alloc_size)
-
- /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't
- * supported so we always return false, even if device supports HW texture usage acceleration.
- */
- bool image_support = false;
- WRITE_ATTR("image_support", (size_t)image_support)
-
- GET_NUM_ATTR(max_parameter_size)
- GET_NUM_ATTR(mem_base_addr_align)
- GET_NUM_ATTR(global_mem_size)
- GET_NUM_ATTR(local_mem_size)
- GET_NUM_ATTR(error_correction_support)
- GET_NUM_ATTR(profiling_timer_resolution)
- GET_NUM_ATTR(is_available)
-
-# undef GET_NUM_ATTR
-# undef WRITE_ATTR
- capabilities << "\n";
- }
-
- return ::strdup(capabilities.str().c_str());
-}
-
-void oneapi_free(void *p)
-{
- if (p) {
- ::free(p);
- }
-}
-
-void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
-{
- int num = 0;
- std::vector<sycl::device> devices = oneapi_available_devices();
- for (sycl::device &device : devices) {
- const std::string &platform_name =
- device.get_platform().get_info<sycl::info::platform::name>();
- std::string name = device.get_info<sycl::info::device::name>();
- std::string id = "ONEAPI_" + platform_name + "_" + name;
- if (device.has(sycl::aspect::ext_intel_pci_address)) {
- id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
- }
- (cb)(id.c_str(), name.c_str(), num, user_ptr);
- num++;
- }
-}
-
-size_t oneapi_get_memcapacity(SyclQueue *queue)
-{
- return reinterpret_cast<sycl::queue *>(queue)
- ->get_device()
- .get_info<sycl::info::device::global_mem_size>();
-}
-
-size_t oneapi_get_compute_units_amount(SyclQueue *queue)
-{
- return reinterpret_cast<sycl::queue *>(queue)
- ->get_device()
- .get_info<sycl::info::device::max_compute_units>();
-}
-
#endif /* WITH_ONEAPI */
diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h
index c5f853742ed..2bfc0b89c87 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.h
+++ b/intern/cycles/kernel/device/oneapi/kernel.h
@@ -25,11 +25,6 @@ enum DeviceKernel : int;
class SyclQueue;
-typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
- const char *name,
- int num,
- void *user_ptr);
-
typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr);
struct KernelContext {
@@ -45,13 +40,17 @@ struct KernelContext {
extern "C" {
# endif
-# define DLL_INTERFACE_CALL(function, return_type, ...) \
- CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__);
-# include "kernel/device/oneapi/dll_interface_template.h"
-# undef DLL_INTERFACE_CALL
-
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_);
+CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr);
+CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size(
+ SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size);
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,
+ int kernel,
+ size_t global_size,
+ void **args);
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_load_kernels(SyclQueue *queue,
+ const unsigned int requested_features);
# ifdef __cplusplus
}
# endif
-
#endif /* WITH_ONEAPI */
diff --git a/intern/cycles/kernel/device/oneapi/kernel_templates.h b/intern/cycles/kernel/device/oneapi/kernel_templates.h
index d8964d9b672..0ae925cf748 100644
--- a/intern/cycles/kernel/device/oneapi/kernel_templates.h
+++ b/intern/cycles/kernel/device/oneapi/kernel_templates.h
@@ -80,7 +80,7 @@ void oneapi_call(
(x, ##__VA_ARGS__)
/* This template automatically casts entries in the void **args array to the types requested by the kernel func.
- Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */
+ * Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */
#define oneapi_template(...) \
template<ONEAPI_CALL_FOR(ONEAPI_TYP, __VA_ARGS__)> \
void oneapi_call( \