7 files changed, 136 insertions, 623 deletions
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
index 1b25259bcf5..dfaec65130c 100644
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -10,6 +10,7 @@
 #define CCL_NAMESPACE_END
 
 #include <cstdint>
+#include <math.h>
 
 #ifndef __NODES_MAX_GROUP__
 #  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
@@ -30,7 +31,7 @@
 #define ccl_global
 #define ccl_always_inline __attribute__((always_inline))
 #define ccl_device_inline inline
-#define ccl_noinline
+#define ccl_noinline __attribute__((noinline))
 #define ccl_inline_constant const constexpr
 #define ccl_static_constant const
 #define ccl_device_forceinline __attribute__((always_inline))
@@ -54,18 +55,6 @@
 #define ccl_gpu_kernel(block_num_threads, thread_num_registers)
 #define ccl_gpu_kernel_threads(block_num_threads)
 
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-#  define KG_ND_ITEMS \
-  kg->nd_item_local_id_0 = item.get_local_id(0); \
-  kg->nd_item_local_range_0 = item.get_local_range(0); \
-  kg->nd_item_group_0 = item.get_group(0); \
-  kg->nd_item_group_range_0 = item.get_group_range(0); \
-  kg->nd_item_global_id_0 = item.get_global_id(0); \
-  kg->nd_item_global_range_0 = item.get_global_range(0);
-#else
-# define KG_ND_ITEMS
-#endif
-
 #define ccl_gpu_kernel_signature(name, ...) \
 void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
                           size_t kernel_global_size, \
@@ -75,8 +64,7 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
       (kg); \
       cgh.parallel_for<class kernel_##name>( \
           sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
-          [=](sycl::nd_item<1> item) { \
-            KG_ND_ITEMS
+          [=](sycl::nd_item<1> item) {
 
 #define ccl_gpu_kernel_postfix \
           }); \
@@ -94,31 +82,17 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
   } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
 
 /* GPU thread, block, grid size and index */
-#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED
-#  define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
-#  define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
-#  define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
-#  define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
-#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#  define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
-#  define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
-#else
-#  define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
-#  define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
-#  define ccl_gpu_block_idx_x (kg->nd_item_group_0)
-#  define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
-#  define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-#  define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#  define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
-#  define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
-#endif
+#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
+#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
+#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
+#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
+#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
 
+#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
+#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
 
 /* GPU warp synchronization */
-
 #define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
 #define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
 #ifdef __SYCL_DEVICE_ONLY__
@@ -149,25 +123,13 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
 /* clang-format on */
 
 /* Types */
+
 /* It's not possible to use sycl types like sycl::float3, sycl::int3, etc
- * because these types have different interfaces from blender version */
+ * because these types have different interfaces from blender version. */
 
 using uchar = unsigned char;
 using sycl::half;
 
-struct float3 {
-  float x, y, z;
-};
-
-ccl_always_inline float3 make_float3(float x, float y, float z)
-{
-  return {x, y, z};
-}
-ccl_always_inline float3 make_float3(float x)
-{
-  return {x, x, x};
-}
-
 /* math functions */
 #define fabsf(x) sycl::fabs((x))
 #define copysignf(x, y) sycl::copysign((x), (y))
@@ -186,21 +148,15 @@ ccl_always_inline float3 make_float3(float x)
 #define fmodf(x, y) sycl::fmod((x), (y))
 #define lgammaf(x) sycl::lgamma((x))
 
-#define __forceinline __attribute__((always_inline))
-
-/* Types */
-#include "util/half.h"
-#include "util/types.h"
-
-/* NOTE(@nsirgien): Declaring these functions after types headers is very important because they
- * include oneAPI headers, which transitively include math.h headers which will cause redefinitions
- * of the math defines because math.h also uses them and having them defined before math.h include
- * is actually UB. */
-/* Use fast math functions - get them from sycl::native namespace for native math function
- * implementations */
 #define cosf(x) sycl::native::cos(((float)(x)))
 #define sinf(x) sycl::native::sin(((float)(x)))
 #define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y)))
 #define tanf(x) sycl::native::tan(((float)(x)))
 #define logf(x) sycl::native::log(((float)(x)))
 #define expf(x) sycl::native::exp(((float)(x)))
+
+#define __forceinline __attribute__((always_inline))
+
+/* Types */
+#include "util/half.h"
+#include "util/types.h"
diff --git a/intern/cycles/kernel/device/oneapi/dll_interface_template.h b/intern/cycles/kernel/device/oneapi/dll_interface_template.h
deleted file mode 100644
index 662068c0fed..00000000000
--- a/intern/cycles/kernel/device/oneapi/dll_interface_template.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2022 Intel Corporation */
-
-/* device_capabilities() returns a C string that must be free'd with oneapi_free(). */
-DLL_INTERFACE_CALL(oneapi_device_capabilities, char *)
-DLL_INTERFACE_CALL(oneapi_free, void, void *)
-DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue)
-
-DLL_INTERFACE_CALL(oneapi_get_compute_units_amount, size_t, SyclQueue *queue)
-DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr)
-DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr)
-
-DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index)
-DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue)
-DLL_INTERFACE_CALL(
-    oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment)
-DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size)
-DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr)
-
-DLL_INTERFACE_CALL(
-    oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes)
-DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue)
-DLL_INTERFACE_CALL(oneapi_usm_memset,
-                   bool,
-                   SyclQueue *queue,
-                   void *usm_ptr,
-                   unsigned char value,
-                   size_t num_bytes)
-
-DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue)
-
-/* Operation with Kernel globals structure - map of global/constant allocation - filled before
- * render/kernel execution As we don't know in cycles `sizeof` this - Cycles will manage just as
- * pointer. */
-DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size)
-DLL_INTERFACE_CALL(oneapi_set_global_memory,
-                   void,
-                   SyclQueue *queue,
-                   void *kernel_globals,
-                   const char *memory_name,
-                   void *memory_device_pointer)
-
-DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size,
-                   size_t,
-                   SyclQueue *queue,
-                   const DeviceKernel kernel,
-                   const size_t kernel_global_size)
-DLL_INTERFACE_CALL(oneapi_enqueue_kernel,
-                   bool,
-                   KernelContext *context,
-                   int kernel,
-                   size_t global_size,
-                   void **args)
diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h
index d60f4f135ba..116620eb725 100644
--- a/intern/cycles/kernel/device/oneapi/globals.h
+++ b/intern/cycles/kernel/device/oneapi/globals.h
@@ -23,15 +23,6 @@ typedef struct KernelGlobalsGPU {
 #undef KERNEL_DATA_ARRAY
   IntegratorStateGPU *integrator_state;
   const KernelData *__data;
-#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-  size_t nd_item_local_id_0;
-  size_t nd_item_local_range_0;
-  size_t nd_item_group_0;
-  size_t nd_item_group_range_0;
-
-  size_t nd_item_global_id_0;
-  size_t nd_item_global_range_0;
-#endif
 } KernelGlobalsGPU;
 
 typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
diff --git a/intern/cycles/kernel/device/oneapi/image.h b/intern/cycles/kernel/device/oneapi/image.h
index 6681977a675..2417b8eac3b 100644
--- a/intern/cycles/kernel/device/oneapi/image.h
+++ b/intern/cycles/kernel/device/oneapi/image.h
@@ -81,10 +81,15 @@ ccl_device_inline float4 svm_image_texture_read_2d(int id, int x, int y)
     x = svm_image_texture_wrap_periodic(x, info.width);
     y = svm_image_texture_wrap_periodic(y, info.height);
   }
-  else {
+  else if (info.extension == EXTENSION_EXTEND) {
     x = svm_image_texture_wrap_clamp(x, info.width);
     y = svm_image_texture_wrap_clamp(y, info.height);
   }
+  else {
+    if (x < 0 || x >= info.width || y < 0 || y >= info.height) {
+      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    }
+  }
 
   return svm_image_texture_read(info, x, y, 0);
 }
@@ -99,11 +104,16 @@ ccl_device_inline float4 svm_image_texture_read_3d(int id, int x, int y, int z)
     y = svm_image_texture_wrap_periodic(y, info.height);
     z = svm_image_texture_wrap_periodic(z, info.depth);
   }
-  else {
+  else if (info.extension == EXTENSION_EXTEND) {
     x = svm_image_texture_wrap_clamp(x, info.width);
     y = svm_image_texture_wrap_clamp(y, info.height);
     z = svm_image_texture_wrap_clamp(z, info.depth);
   }
+  else {
+    if (x < 0 || x >= info.width || y < 0 || y >= info.height || z < 0 || z >= info.depth) {
+      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    }
+  }
 
   return svm_image_texture_read(info, x, y, z);
 }
@@ -128,12 +138,6 @@ ccl_device float4 kernel_tex_image_interp(KernelGlobals, int id, float x, float
 {
   const TextureInfo &info = kernel_data_fetch(texture_info, id);
 
-  if (info.extension == EXTENSION_CLIP) {
-    if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) {
-      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-    }
-  }
-
   if (info.interpolation == INTERPOLATION_CLOSEST) {
     /* Closest interpolation. */
     int ix, iy;
@@ -315,12 +319,6 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals, int id, float3 P, in
   }
 #endif
   else {
-    if (info.extension == EXTENSION_CLIP) {
-      if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) {
-        return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-      }
-    }
-
     x *= info.width;
     y *= info.height;
     z *= info.depth;
diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp
index 300e201600c..525ae288f0c 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -3,208 +3,79 @@
 
 #ifdef WITH_ONEAPI
 
-/* clang-format off */
 #  include "kernel.h"
 #  include <iostream>
 #  include <map>
 #  include <set>
 
-#  include <CL/sycl.hpp>
+#  include <sycl/sycl.hpp>
 
 #  include "kernel/device/oneapi/compat.h"
 #  include "kernel/device/oneapi/globals.h"
 #  include "kernel/device/oneapi/kernel_templates.h"
 
 #  include "kernel/device/gpu/kernel.h"
-/* clang-format on */
 
 static OneAPIErrorCallback s_error_cb = nullptr;
 static void *s_error_user_ptr = nullptr;
 
-static std::vector<sycl::device> oneapi_available_devices();
-
 void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
 {
   s_error_cb = cb;
   s_error_user_ptr = user_ptr;
 }
 
-void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
-{
-#  ifdef _DEBUG
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  sycl::info::device_type device_type =
-      queue->get_device().get_info<sycl::info::device::device_type>();
-  sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
-  (void)usm_type;
-  assert(usm_type == sycl::usm::alloc::device ||
-         ((device_type == sycl::info::device_type::host ||
-           device_type == sycl::info::device_type::is_cpu || allow_host) &&
-          usm_type == sycl::usm::alloc::host));
-#  endif
-}
-
-bool oneapi_create_queue(SyclQueue *&external_queue, int device_index)
-{
-  bool finished_correct = true;
-  try {
-    std::vector<sycl::device> devices = oneapi_available_devices();
-    if (device_index < 0 || device_index >= devices.size()) {
-      return false;
-    }
-    sycl::queue *created_queue = new sycl::queue(devices[device_index],
-                                                 sycl::property::queue::in_order());
-    external_queue = reinterpret_cast<SyclQueue *>(created_queue);
-  }
-  catch (sycl::exception const &e) {
-    finished_correct = false;
-    if (s_error_cb) {
-      s_error_cb(e.what(), s_error_user_ptr);
-    }
-  }
-  return finished_correct;
-}
-
-void oneapi_free_queue(SyclQueue *queue_)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  delete queue;
-}
-
-void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  return sycl::aligned_alloc_host(alignment, memory_size, *queue);
-}
-
-void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  return sycl::malloc_device(memory_size, *queue);
-}
-
-void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr)
+/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality like
+ * memory allocations, memory transfers and execution of kernel with USM memory. */
+bool oneapi_run_test_kernel(SyclQueue *queue_)
 {
   assert(queue_);
   sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  oneapi_check_usm(queue_, usm_ptr, true);
-  sycl::free(usm_ptr, *queue);
-}
+  const size_t N = 8;
+  const size_t memory_byte_size = sizeof(int) * N;
 
-bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  oneapi_check_usm(queue_, dest, true);
-  oneapi_check_usm(queue_, src, true);
-  sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
-#  ifdef WITH_CYCLES_DEBUG
+  bool is_computation_correct = true;
   try {
-    /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
-     * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
-     */
-    mem_event.wait_and_throw();
-    return true;
-  }
-  catch (sycl::exception const &e) {
-    if (s_error_cb) {
-      s_error_cb(e.what(), s_error_user_ptr);
-    }
-    return false;
-  }
-#  else
-  sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
-  sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
-  bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
-                             src_type == sycl::usm::alloc::device;
-  bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
-                                          src_type == sycl::usm::alloc::unknown;
-  /* NOTE(@sirgienko) Host-side blocking wait on this operation is mandatory, otherwise the host
-   * may not wait until the end of the transfer before using the memory.
-   */
-  if (from_device_to_host || host_or_device_memop_with_offset)
-    mem_event.wait();
-  return true;
-#  endif
-}
+    int *A_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue);
 
-bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  oneapi_check_usm(queue_, usm_ptr, true);
-  sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
-#  ifdef WITH_CYCLES_DEBUG
-  try {
-    /* NOTE(@nsirgien) Waiting on memory operation may give more precise error
-     * messages. Due to impact on occupancy, it makes sense to enable it only during Cycles debug.
-     */
-    mem_event.wait_and_throw();
-    return true;
-  }
-  catch (sycl::exception const &e) {
-    if (s_error_cb) {
-      s_error_cb(e.what(), s_error_user_ptr);
+    for (size_t i = (size_t)0; i < N; i++) {
+      A_host[i] = rand() % 32;
     }
-    return false;
-  }
-#  else
-  (void)mem_event;
-  return true;
-#  endif
-}
 
-bool oneapi_queue_synchronize(SyclQueue *queue_)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  try {
-    queue->wait_and_throw();
-    return true;
-  }
-  catch (sycl::exception const &e) {
-    if (s_error_cb) {
-      s_error_cb(e.what(), s_error_user_ptr);
-    }
-    return false;
-  }
-}
+    int *A_device = (int *)sycl::malloc_device(memory_byte_size, *queue);
+    int *B_device = (int *)sycl::malloc_device(memory_byte_size, *queue);
 
-/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and
- * also trigger runtime compilation of all existing oneAPI kernels */
-bool oneapi_run_test_kernel(SyclQueue *queue_)
-{
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
-  size_t N = 8;
-  sycl::buffer<float, 1> A(N);
-  sycl::buffer<float, 1> B(N);
-
-  {
-    sycl::host_accessor A_host_acc(A, sycl::write_only);
-    for (size_t i = (size_t)0; i < N; i++)
-      A_host_acc[i] = rand() % 32;
-  }
+    queue->memcpy(A_device, A_host, memory_byte_size);
+    queue->wait_and_throw();
 
-  try {
     queue->submit([&](sycl::handler &cgh) {
-      sycl::accessor A_acc(A, cgh, sycl::read_only);
-      sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init);
-
-      cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); });
+      cgh.parallel_for(N, [=](sycl::id<1> idx) { B_device[idx] = A_device[idx] + idx.get(0); });
     });
     queue->wait_and_throw();
 
-    sycl::host_accessor A_host_acc(A, sycl::read_only);
-    sycl::host_accessor B_host_acc(B, sycl::read_only);
+    int *B_host = (int *)sycl::aligned_alloc_host(16, memory_byte_size, *queue);
+
+    queue->memcpy(B_host, B_device, memory_byte_size);
+    queue->wait_and_throw();
 
     for (size_t i = (size_t)0; i < N; i++) {
-      float result = A_host_acc[i] + B_host_acc[i];
-      (void)result;
+      const int expected_result = i + A_host[i];
+      if (B_host[i] != expected_result) {
+        is_computation_correct = false;
+        if (s_error_cb) {
+          s_error_cb(("Incorrect result in test kernel execution -  expected " +
+                      std::to_string(expected_result) + ", got " + std::to_string(B_host[i]))
+                         .c_str(),
+                     s_error_user_ptr);
+        }
+      }
     }
+
+    sycl::free(A_host, *queue);
+    sycl::free(B_host, *queue);
+    sycl::free(A_device, *queue);
+    sycl::free(B_device, *queue);
+    queue->wait_and_throw();
   }
   catch (sycl::exception const &e) {
     if (s_error_cb) {
@@ -213,63 +84,16 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
     return false;
   }
 
-  return true;
-}
-
-bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
-{
-  kernel_global_size = sizeof(KernelGlobalsGPU);
-
-  return true;
-}
-
-void oneapi_set_global_memory(SyclQueue *queue_,
-                              void *kernel_globals,
-                              const char *memory_name,
-                              void *memory_device_pointer)
-{
-  assert(queue_);
-  assert(kernel_globals);
-  assert(memory_name);
-  assert(memory_device_pointer);
-  KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
-  oneapi_check_usm(queue_, memory_device_pointer);
-  oneapi_check_usm(queue_, kernel_globals, true);
-
-  std::string matched_name(memory_name);
-
-/* This macro will change global ptr of KernelGlobals via name matching. */
-#  define KERNEL_DATA_ARRAY(type, name) \
-    else if (#name == matched_name) \
-    { \
-      globals->__##name = (type *)memory_device_pointer; \
-      return; \
-    }
-  if (false) {
-  }
-  else if ("integrator_state" == matched_name) {
-    globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
-    return;
-  }
-  KERNEL_DATA_ARRAY(KernelData, data)
-#  include "kernel/data_arrays.h"
-  else
-  {
-    std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
-              << std::endl;
-    assert(false);
-  }
-#  undef KERNEL_DATA_ARRAY
+  return is_computation_correct;
 }
 
 /* TODO: Move device information to OneapiDevice initialized on creation and use it. */
 /* TODO: Move below function to oneapi/queue.cpp. */
-size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
+size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
                                           const DeviceKernel kernel,
                                           const size_t kernel_global_size)
 {
-  assert(queue_);
-  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+  assert(queue);
   (void)kernel_global_size;
   const static size_t preferred_work_group_size_intersect_shading = 32;
   const static size_t preferred_work_group_size_technical = 1024;
@@ -311,11 +135,63 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
       preferred_work_group_size = 512;
   }
 
-  const size_t limit_work_group_size =
-      queue->get_device().get_info<sycl::info::device::max_work_group_size>();
+  const size_t limit_work_group_size = reinterpret_cast<sycl::queue *>(queue)
+                                           ->get_device()
+                                           .get_info<sycl::info::device::max_work_group_size>();
+
   return std::min(limit_work_group_size, preferred_work_group_size);
 }
 
+bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features)
+{
+#  ifdef SYCL_SKIP_KERNELS_PRELOAD
+  (void)queue_;
+  (void)requested_features;
+#  else
+  assert(queue_);
+  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+
+  try {
+    sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle =
+        sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(),
+                                                           {queue->get_device()});
+
+    for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) {
+      const std::string &kernel_name = kernel_id.get_name();
+
+      /* NOTE(@nsirgien): Names in this conditions below should match names from
+       * oneapi_call macro in oneapi_enqueue_kernel below */
+      if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) &&
+          kernel_name.find("oneapi_kernel_integrator_shade_volume") != std::string::npos) {
+        continue;
+      }
+
+      if (((requested_features & KERNEL_FEATURE_MNEE) == 0) &&
+          kernel_name.find("oneapi_kernel_integrator_shade_surface_mnee") != std::string::npos) {
+        continue;
+      }
+
+      if (((requested_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) &&
+          kernel_name.find("oneapi_kernel_integrator_shade_surface_raytrace") !=
+              std::string::npos) {
+        continue;
+      }
+
+      sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle =
+          sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id});
+      sycl::build(one_kernel_bundle);
+    }
+  }
+  catch (sycl::exception const &e) {
+    if (s_error_cb) {
+      s_error_cb(e.what(), s_error_user_ptr);
+    }
+    return false;
+  }
+#  endif
+  return true;
+}
+
 bool oneapi_enqueue_kernel(KernelContext *kernel_context,
                            int kernel,
                            size_t global_size,
@@ -354,13 +230,6 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
     /* NOTE(@nsirgien): As for now non-uniform work-groups don't work on most oneAPI devices,
      * we extend work size to fit uniformity requirements. */
     global_size = groups_count * local_size;
-
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-    if (queue->get_device().is_host()) {
-      global_size = 1;
-      local_size = 1;
-    }
-#  endif
   }
 
   /* Let the compiler throw an error if there are any kernels missing in this implementation. */
@@ -645,13 +514,9 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
         /* Unsupported kernels */
         case DEVICE_KERNEL_NUM:
         case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
-          assert(0);
-          return false;
+          kernel_assert(0);
+          break;
       }
-
-      /* Unknown kernel. */
-      assert(0);
-      return false;
     });
   }
   catch (sycl::exception const &e) {
@@ -668,247 +533,4 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
 #  endif
   return success;
 }
-
-static const int lowest_supported_driver_version_win = 1011660;
-static const int lowest_supported_driver_version_neo = 23570;
-
-static int parse_driver_build_version(const sycl::device &device)
-{
-  const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
-  int driver_build_version = 0;
-
-  size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
-  if (second_dot_position == std::string::npos) {
-    std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
-              << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
-              << " xx.xx.xxx.xxxx (Windows) for device \""
-              << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
-  }
-  else {
-    try {
-      size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
-      if (third_dot_position != std::string::npos) {
-        const std::string &third_number_substr = driver_version.substr(
-            second_dot_position + 1, third_dot_position - second_dot_position - 1);
-        const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
-        if (third_number_substr.length() == 3 && forth_number_substr.length() == 4)
-          driver_build_version = std::stoi(third_number_substr) * 10000 +
-                                 std::stoi(forth_number_substr);
-      }
-      else {
-        const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
-        driver_build_version = std::stoi(third_number_substr);
-      }
-    }
-    catch (std::invalid_argument &e) {
-      std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
-                << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
-                << " xx.xx.xxx.xxxx (Windows) for device \""
-                << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
-    }
-  }
-
-  return driver_build_version;
-}
-
-static std::vector<sycl::device> oneapi_available_devices()
-{
-  bool allow_all_devices = false;
-  if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
-    allow_all_devices = true;
-
-    /* Host device is useful only for debugging at the moment
-     * so we hide this device with default build settings. */
-#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
-  bool allow_host = true;
-#  else
-  bool allow_host = false;
-#  endif
-
-  const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
-
-  std::vector<sycl::device> available_devices;
-  for (const sycl::platform &platform : oneapi_platforms) {
-    /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL.
-     */
-    if (platform.get_backend() == sycl::backend::opencl) {
-      continue;
-    }
-
-    const std::vector<sycl::device> &oneapi_devices =
-        (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
-                                            platform.get_devices(sycl::info::device_type::gpu);
-
-    for (const sycl::device &device : oneapi_devices) {
-      if (allow_all_devices) {
-        /* still filter out host device if build doesn't support it. */
-        if (allow_host || !device.is_host()) {
-          available_devices.push_back(device);
-        }
-      }
-      else {
-        bool filter_out = false;
-
-        /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
-         * assuming they have either more than 96 Execution Units or not 7 threads per EU.
-         * Official support can be broaden to older and smaller GPUs once ready. */
-        if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
-          /* Filtered-out defaults in-case these values aren't available through too old L0
-           * runtime. */
-          int number_of_eus = 96;
-          int threads_per_eu = 7;
-          if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
-            number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
-          }
-          if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
-            threads_per_eu =
-                device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
-          }
-          /* This filters out all Level-Zero supported GPUs from older generation than Arc. */
-          if (number_of_eus <= 96 && threads_per_eu == 7) {
-            filter_out = true;
-          }
-          /* if not already filtered out, check driver version. */
-          if (!filter_out) {
-            int driver_build_version = parse_driver_build_version(device);
-            if ((driver_build_version > 100000 &&
-                 driver_build_version < lowest_supported_driver_version_win) ||
-                (driver_build_version > 0 &&
-                 driver_build_version < lowest_supported_driver_version_neo)) {
-              filter_out = true;
-            }
-          }
-        }
-        else if (!allow_host && device.is_host()) {
-          filter_out = true;
-        }
-        else if (!allow_all_devices) {
-          filter_out = true;
-        }
-
-        if (!filter_out) {
-          available_devices.push_back(device);
-        }
-      }
-    }
-  }
-
-  return available_devices;
-}
-
-char *oneapi_device_capabilities()
-{
-  std::stringstream capabilities;
-
-  const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices();
-  for (const sycl::device &device : oneapi_devices) {
-    const std::string &name = device.get_info<sycl::info::device::name>();
-
-    capabilities << std::string("\t") << name << "\n";
-#  define WRITE_ATTR(attribute_name, attribute_variable) \
-    capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
-                 << "\n";
-#  define GET_NUM_ATTR(attribute) \
-    { \
-      size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \
-      capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
-    }
-
-    GET_NUM_ATTR(vendor_id)
-    GET_NUM_ATTR(max_compute_units)
-    GET_NUM_ATTR(max_work_item_dimensions)
-
-    sycl::id<3> max_work_item_sizes = device.get_info<sycl::info::device::max_work_item_sizes>();
-    WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
-    WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
-    WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
-
-    GET_NUM_ATTR(max_work_group_size)
-    GET_NUM_ATTR(max_num_sub_groups)
-    GET_NUM_ATTR(sub_group_independent_forward_progress)
-
-    GET_NUM_ATTR(preferred_vector_width_char)
-    GET_NUM_ATTR(preferred_vector_width_short)
-    GET_NUM_ATTR(preferred_vector_width_int)
-    GET_NUM_ATTR(preferred_vector_width_long)
-    GET_NUM_ATTR(preferred_vector_width_float)
-    GET_NUM_ATTR(preferred_vector_width_double)
-    GET_NUM_ATTR(preferred_vector_width_half)
-
-    GET_NUM_ATTR(native_vector_width_char)
-    GET_NUM_ATTR(native_vector_width_short)
-    GET_NUM_ATTR(native_vector_width_int)
-    GET_NUM_ATTR(native_vector_width_long)
-    GET_NUM_ATTR(native_vector_width_float)
-    GET_NUM_ATTR(native_vector_width_double)
-    GET_NUM_ATTR(native_vector_width_half)
-
-    size_t max_clock_frequency =
-        (size_t)(device.is_host() ? (size_t)0 :
-                                    device.get_info<sycl::info::device::max_clock_frequency>());
-    WRITE_ATTR("max_clock_frequency", max_clock_frequency)
-
-    GET_NUM_ATTR(address_bits)
-    GET_NUM_ATTR(max_mem_alloc_size)
-
-    /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't
-     * supported so we always return false, even if device supports HW texture usage acceleration.
-     */
-    bool image_support = false;
-    WRITE_ATTR("image_support", (size_t)image_support)
-
-    GET_NUM_ATTR(max_parameter_size)
-    GET_NUM_ATTR(mem_base_addr_align)
-    GET_NUM_ATTR(global_mem_size)
-    GET_NUM_ATTR(local_mem_size)
-    GET_NUM_ATTR(error_correction_support)
-    GET_NUM_ATTR(profiling_timer_resolution)
-    GET_NUM_ATTR(is_available)
-
-#  undef GET_NUM_ATTR
-#  undef WRITE_ATTR
-    capabilities << "\n";
-  }
-
-  return ::strdup(capabilities.str().c_str());
-}
-
-void oneapi_free(void *p)
-{
-  if (p) {
-    ::free(p);
-  }
-}
-
-void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
-{
-  int num = 0;
-  std::vector<sycl::device> devices = oneapi_available_devices();
-  for (sycl::device &device : devices) {
-    const std::string &platform_name =
-        device.get_platform().get_info<sycl::info::platform::name>();
-    std::string name = device.get_info<sycl::info::device::name>();
-    std::string id = "ONEAPI_" + platform_name + "_" + name;
-    if (device.has(sycl::aspect::ext_intel_pci_address)) {
-      id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
-    }
-    (cb)(id.c_str(), name.c_str(), num, user_ptr);
-    num++;
-  }
-}
-
-size_t oneapi_get_memcapacity(SyclQueue *queue)
-{
-  return reinterpret_cast<sycl::queue *>(queue)
-      ->get_device()
-      .get_info<sycl::info::device::global_mem_size>();
-}
-
-size_t oneapi_get_compute_units_amount(SyclQueue *queue)
-{
-  return reinterpret_cast<sycl::queue *>(queue)
-      ->get_device()
-      .get_info<sycl::info::device::max_compute_units>();
-}
-
 #endif /* WITH_ONEAPI */
diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h
index c5f853742ed..2bfc0b89c87 100644
--- a/intern/cycles/kernel/device/oneapi/kernel.h
+++ b/intern/cycles/kernel/device/oneapi/kernel.h
@@ -25,11 +25,6 @@ enum DeviceKernel : int;
 
 class SyclQueue;
 
-typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
-                                             const char *name,
-                                             int num,
-                                             void *user_ptr);
-
 typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr);
 
 struct KernelContext {
@@ -45,13 +40,17 @@ struct KernelContext {
 extern "C" {
 #  endif
 
-#  define DLL_INTERFACE_CALL(function, return_type, ...) \
-    CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__);
-#  include "kernel/device/oneapi/dll_interface_template.h"
-#  undef DLL_INTERFACE_CALL
-
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_);
+CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr);
+CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size(
+    SyclQueue *queue, const DeviceKernel kernel, const size_t kernel_global_size);
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,
+                                                       int kernel,
+                                                       size_t global_size,
+                                                       void **args);
+CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_load_kernels(SyclQueue *queue,
+                                                     const unsigned int requested_features);
 #  ifdef __cplusplus
 }
 #  endif
-
 #endif /* WITH_ONEAPI */
diff --git a/intern/cycles/kernel/device/oneapi/kernel_templates.h b/intern/cycles/kernel/device/oneapi/kernel_templates.h
index d8964d9b672..0ae925cf748 100644
--- a/intern/cycles/kernel/device/oneapi/kernel_templates.h
+++ b/intern/cycles/kernel/device/oneapi/kernel_templates.h
@@ -80,7 +80,7 @@ void oneapi_call(
   (x, ##__VA_ARGS__)
 
 /* This template automatically casts entries in the void **args array to the types requested by the kernel func.
-   Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */
+ * Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */
 #define oneapi_template(...) \
   template<ONEAPI_CALL_FOR(ONEAPI_TYP, __VA_ARGS__)> \
   void oneapi_call( \