Diffstat (limited to 'intern/cycles/kernel/device/oneapi')
-rw-r--r--  intern/cycles/kernel/device/oneapi/compat.h                   194
-rw-r--r--  intern/cycles/kernel/device/oneapi/context_begin.h             13
-rw-r--r--  intern/cycles/kernel/device/oneapi/context_end.h                7
-rw-r--r--  intern/cycles/kernel/device/oneapi/dll_interface_template.h    54
-rw-r--r--  intern/cycles/kernel/device/oneapi/globals.h                    47
-rw-r--r--  intern/cycles/kernel/device/oneapi/image.h                     383
-rw-r--r--  intern/cycles/kernel/device/oneapi/kernel.cpp                  929
-rw-r--r--  intern/cycles/kernel/device/oneapi/kernel.h                     57
-rw-r--r--  intern/cycles/kernel/device/oneapi/kernel_templates.h          123
9 files changed, 1807 insertions, 0 deletions
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
new file mode 100644
index 00000000000..5c49674f247
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -0,0 +1,194 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#pragma once
+
+#define __KERNEL_GPU__
+#define __KERNEL_ONEAPI__
+
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#include <cstdint>
+
+#ifndef __NODES_MAX_GROUP__
+# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
+#endif
+#ifndef __NODES_FEATURES__
+# define __NODES_FEATURES__ NODE_FEATURE_ALL
+#endif
+
+/* This one does not have a ccl_ abstraction of its own;
+ * other device backends use it directly.
+ */
+
+#define __device__
+
+/* Qualifier wrappers for different names on different devices */
+
+#define ccl_device
+#define ccl_global
+#define ccl_always_inline __attribute__((always_inline))
+#define ccl_device_inline inline
+#define ccl_noinline __attribute__((noinline))
+#define ccl_inline_constant const constexpr
+#define ccl_static_constant const
+#define ccl_device_forceinline __attribute__((always_inline))
+#define ccl_device_noinline ccl_device ccl_noinline
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
+#define ccl_restrict __restrict__
+#define ccl_loop_no_unroll
+#define ccl_optional_struct_init
+#define ccl_private
+#define ATTR_FALLTHROUGH __attribute__((fallthrough))
+#define ccl_constant const
+#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
+#define ccl_align(n) __attribute__((aligned(n)))
+#define kernel_assert(cond)
+#define ccl_may_alias
+
+/* clang-format off */
+
+/* kernel.h adapters */
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
+#define ccl_gpu_kernel_threads(block_num_threads)
+
+#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+# define KG_ND_ITEMS \
+ kg->nd_item_local_id_0 = item.get_local_id(0); \
+ kg->nd_item_local_range_0 = item.get_local_range(0); \
+ kg->nd_item_group_0 = item.get_group(0); \
+ kg->nd_item_group_range_0 = item.get_group_range(0); \
+ kg->nd_item_global_id_0 = item.get_global_id(0); \
+ kg->nd_item_global_range_0 = item.get_global_range(0);
+#else
+# define KG_ND_ITEMS
+#endif
+
+#define ccl_gpu_kernel_signature(name, ...) \
+void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
+ size_t kernel_global_size, \
+ size_t kernel_local_size, \
+ sycl::handler &cgh, \
+ __VA_ARGS__) { \
+ (kg); \
+ cgh.parallel_for<class kernel_##name>( \
+ sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
+ [=](sycl::nd_item<1> item) { \
+ KG_ND_ITEMS
+
+#define ccl_gpu_kernel_postfix \
+ }); \
+ }
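
For reference, a sketch of what a kernel written with these adapters expands to; the kernel
name `foo` and its `int value` parameter are illustrative, not taken from an actual kernel:

/* ccl_gpu_kernel_signature(foo, int value) ... ccl_gpu_kernel_postfix expands roughly to: */
void oneapi_kernel_foo(KernelGlobalsGPU *ccl_restrict kg,
                       size_t kernel_global_size,
                       size_t kernel_local_size,
                       sycl::handler &cgh,
                       int value)
{
  (kg); /* Reference kg so it is not flagged as unused. */
  cgh.parallel_for<class kernel_foo>(
      sycl::nd_range<1>(kernel_global_size, kernel_local_size), [=](sycl::nd_item<1> item) {
        /* KG_ND_ITEMS caches the nd_item queries in kg on host-enabled builds. */
        /* ...kernel body goes here, between signature and postfix... */
      });
}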
+
+#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x
+
+#define ccl_gpu_kernel_lambda(func, ...) \
+ struct KernelLambda \
+ { \
+ KernelLambda(const ONEAPIKernelContext *_kg) : kg(_kg) {} \
+ ccl_private const ONEAPIKernelContext *kg; \
+ __VA_ARGS__; \
+ int operator()(const int state) const { return (func); } \
+ } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
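
Similarly, a sketch of how ccl_gpu_kernel_lambda expands; the predicate and the extra
`int threshold` member are illustrative:

/* ccl_gpu_kernel_lambda(state > threshold, int threshold) expands to: */
struct KernelLambda {
  KernelLambda(const ONEAPIKernelContext *_kg) : kg(_kg) {}
  ccl_private const ONEAPIKernelContext *kg;
  int threshold; /* Extra captured value, assigned by the caller after construction. */
  int operator()(const int state) const { return (state > threshold); }
} ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg);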
+
+/* GPU thread, block, grid size and index */
+#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED
+# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
+# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
+# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
+# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
+# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
+# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
+
+# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
+# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
+#else
+# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0)
+# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0)
+# define ccl_gpu_block_idx_x (kg->nd_item_group_0)
+# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0)
+# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
+# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
+
+# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0)
+# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0)
+#endif
+
+
+/* GPU warp synchronization */
+
+#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier()
+#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space)
+#ifdef __SYCL_DEVICE_ONLY__
+ #define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), predicate).count())
+#else
+ #define ccl_gpu_ballot(predicate) (predicate ? 1 : 0)
+#endif
+
+/* Debug defines */
+#if defined(__SYCL_DEVICE_ONLY__)
+# define CONSTANT __attribute__((opencl_constant))
+#else
+# define CONSTANT
+#endif
+
+#define sycl_printf(format, ...) { \
+ static const CONSTANT char fmt[] = format; \
+ sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ ); \
+ }
+
+#define sycl_printf_(format) { \
+ static const CONSTANT char fmt[] = format; \
+ sycl::ext::oneapi::experimental::printf(fmt); \
+ }
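
A usage sketch: the format string must be a compile-time constant so it can be placed in the
OpenCL constant address space on device. The arguments here are hypothetical:

sycl_printf("path %d, sample %d\n", path_index, sample);
sycl_printf_("entering kernel\n");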
+
+/* GPU texture objects */
+
+/* clang-format on */
+
+/* Types */
+
+/* It's not possible to use SYCL types like sycl::float3, sycl::int3, etc.,
+ * because these types have different interfaces from the Blender versions. */
+
+using uchar = unsigned char;
+using sycl::half;
+
+/* math functions */
+#define fabsf(x) sycl::fabs((x))
+#define copysignf(x, y) sycl::copysign((x), (y))
+#define asinf(x) sycl::asin((x))
+#define acosf(x) sycl::acos((x))
+#define atanf(x) sycl::atan((x))
+#define floorf(x) sycl::floor((x))
+#define ceilf(x) sycl::ceil((x))
+#define sinhf(x) sycl::sinh((x))
+#define coshf(x) sycl::cosh((x))
+#define tanhf(x) sycl::tanh((x))
+#define hypotf(x, y) sycl::hypot((x), (y))
+#define atan2f(x, y) sycl::atan2((x), (y))
+#define fmaxf(x, y) sycl::fmax((x), (y))
+#define fminf(x, y) sycl::fmin((x), (y))
+#define fmodf(x, y) sycl::fmod((x), (y))
+#define lgammaf(x) sycl::lgamma((x))
+
+#define __forceinline __attribute__((always_inline))
+
+/* Types */
+#include "util/half.h"
+#include "util/types.h"
+
+/* NOTE(@nsirgien): Declaring these functions after the types headers is very important, because
+ * those headers pull in oneAPI headers, which transitively include math.h. math.h declares
+ * functions with the same names, and having these macros defined before math.h is included is
+ * actually UB. */
+/* Use fast math functions: take them from the sycl::native namespace, which provides native
+ * math function implementations. */
+#define cosf(x) sycl::native::cos(((float)(x)))
+#define sinf(x) sycl::native::sin(((float)(x)))
+#define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y)))
+#define tanf(x) sycl::native::tan(((float)(x)))
+#define logf(x) sycl::native::log(((float)(x)))
+#define expf(x) sycl::native::exp(((float)(x)))
diff --git a/intern/cycles/kernel/device/oneapi/context_begin.h b/intern/cycles/kernel/device/oneapi/context_begin.h
new file mode 100644
index 00000000000..6d6f8cec4ca
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/context_begin.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#ifdef WITH_NANOVDB
+# include <nanovdb/NanoVDB.h>
+# include <nanovdb/util/SampleFromVoxels.h>
+#endif
+
+/* clang-format off */
+struct ONEAPIKernelContext : public KernelGlobalsGPU {
+ public:
+# include "kernel/device/oneapi/image.h"
+ /* clang-format on */
diff --git a/intern/cycles/kernel/device/oneapi/context_end.h b/intern/cycles/kernel/device/oneapi/context_end.h
new file mode 100644
index 00000000000..ddf0d1f1712
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/context_end.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+}; /* End of ONEAPIKernelContext class definition. */
+
+#undef kernel_integrator_state
+#define kernel_integrator_state (*(kg->integrator_state))
diff --git a/intern/cycles/kernel/device/oneapi/dll_interface_template.h b/intern/cycles/kernel/device/oneapi/dll_interface_template.h
new file mode 100644
index 00000000000..5dd0d4203a4
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/dll_interface_template.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2022 Intel Corporation */
+
+/* device_capabilities() returns a C string that must be freed with oneapi_free(). */
+DLL_INTERFACE_CALL(oneapi_device_capabilities, char *)
+DLL_INTERFACE_CALL(oneapi_free, void, void *)
+DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue)
+
+DLL_INTERFACE_CALL(oneapi_get_num_multiprocessors, int, SyclQueue *queue)
+DLL_INTERFACE_CALL(oneapi_get_max_num_threads_per_multiprocessor, int, SyclQueue *queue)
+DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr)
+DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr)
+
+DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index)
+DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue)
+DLL_INTERFACE_CALL(
+ oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment)
+DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size)
+DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr)
+
+DLL_INTERFACE_CALL(
+ oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes)
+DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue)
+DLL_INTERFACE_CALL(oneapi_usm_memset,
+ bool,
+ SyclQueue *queue,
+ void *usm_ptr,
+ unsigned char value,
+ size_t num_bytes)
+
+DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue)
+
+/* Operations with the kernel globals structure - a map of the global/constant allocations,
+ * filled before render/kernel execution. Cycles doesn't know its `sizeof`, so it is managed
+ * as an opaque pointer. */
+DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size)
+DLL_INTERFACE_CALL(oneapi_set_global_memory,
+ void,
+ SyclQueue *queue,
+ void *kernel_globals,
+ const char *memory_name,
+ void *memory_device_pointer)
+
+DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size,
+ size_t,
+ SyclQueue *queue,
+ const DeviceKernel kernel,
+ const size_t kernel_global_size)
+DLL_INTERFACE_CALL(oneapi_enqueue_kernel,
+ bool,
+ KernelContext *context,
+ int kernel,
+ size_t global_size,
+ void **args)
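
This header is an X-macro list: each consumer defines DLL_INTERFACE_CALL before including it
and gets one expansion per interface function. A sketch of two instantiations; the first
matches what kernel.h below does, the second (a loader struct of function pointers) is a
hypothetical illustration:

/* Generate extern declarations, as kernel.h does: */
#define DLL_INTERFACE_CALL(function, return_type, ...) \
  CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__);
#include "kernel/device/oneapi/dll_interface_template.h"
#undef DLL_INTERFACE_CALL

/* Hypothetical: generate matching function-pointer members, to be filled in from a
 * dynamically loaded library. */
struct OneAPIDLLInterface {
#define DLL_INTERFACE_CALL(function, return_type, ...) \
  return_type (*function)(__VA_ARGS__) = nullptr;
#include "kernel/device/oneapi/dll_interface_template.h"
#undef DLL_INTERFACE_CALL
};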
diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h
new file mode 100644
index 00000000000..d60f4f135ba
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/globals.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#pragma once
+
+#include "kernel/integrator/state.h"
+#include "kernel/types.h"
+#include "kernel/util/profiling.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* NOTE(@nsirgien): With SYCL we can't declare a __constant__ global variable that is accessible
+ * from device code, the way the Cycles CUDA backend does. Instead, the backend allocates these
+ * "constant" memory regions and stores pointers to them in the oneAPI context class. */
+
+struct IntegratorStateGPU;
+struct IntegratorQueueCounter;
+
+typedef struct KernelGlobalsGPU {
+
+#define KERNEL_DATA_ARRAY(type, name) const type *__##name = nullptr;
+#include "kernel/data_arrays.h"
+#undef KERNEL_DATA_ARRAY
+ IntegratorStateGPU *integrator_state;
+ const KernelData *__data;
+#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+ size_t nd_item_local_id_0;
+ size_t nd_item_local_range_0;
+ size_t nd_item_group_0;
+ size_t nd_item_group_range_0;
+
+ size_t nd_item_global_id_0;
+ size_t nd_item_global_range_0;
+#endif
+} KernelGlobalsGPU;
+
+typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals;
+
+#define kernel_data (*(__data))
+#define kernel_integrator_state (*(integrator_state))
+
+/* data lookup defines */
+
+#define kernel_data_fetch(name, index) __##name[index]
+#define kernel_data_array(name) __##name
+
+CCL_NAMESPACE_END
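
Under these defines, a data lookup in kernel code compiles down to a plain array access through
the pointers filled in by oneapi_set_global_memory() before execution. An illustrative
expansion (`texture_info` is one of the arrays declared in kernel/data_arrays.h, and the
`cam.width` member access is just an example):

const TextureInfo &info = kernel_data_fetch(texture_info, id);
/* expands to: */
const TextureInfo &info = __texture_info[id];

int w = kernel_data.cam.width;
/* expands to: */
int w = (*(__data)).cam.width;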
diff --git a/intern/cycles/kernel/device/oneapi/image.h b/intern/cycles/kernel/device/oneapi/image.h
new file mode 100644
index 00000000000..2417b8eac3b
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/image.h
@@ -0,0 +1,383 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+CCL_NAMESPACE_BEGIN
+
+/* For the oneAPI implementation we do manual lookup and interpolation. */
+/* TODO: share implementation with ../cpu/image.h. */
+
+template<typename T> ccl_device_forceinline T tex_fetch(const TextureInfo &info, int index)
+{
+ return reinterpret_cast<ccl_global T *>(info.data)[index];
+}
+
+ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width)
+{
+ x %= width;
+ if (x < 0)
+ x += width;
+ return x;
+}
+
+ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width)
+{
+ return clamp(x, 0, width - 1);
+}
+
+ccl_device_inline float4 svm_image_texture_read(const TextureInfo &info, int x, int y, int z)
+{
+ const int data_offset = x + info.width * y + info.width * info.height * z;
+ const int texture_type = info.data_type;
+
+ /* Float4 */
+ if (texture_type == IMAGE_DATA_TYPE_FLOAT4) {
+ return tex_fetch<float4>(info, data_offset);
+ }
+ /* Byte4 */
+ else if (texture_type == IMAGE_DATA_TYPE_BYTE4) {
+ uchar4 r = tex_fetch<uchar4>(info, data_offset);
+ float f = 1.0f / 255.0f;
+ return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+ }
+ /* Ushort4 */
+ else if (texture_type == IMAGE_DATA_TYPE_USHORT4) {
+ ushort4 r = tex_fetch<ushort4>(info, data_offset);
+ float f = 1.0f / 65535.f;
+ return make_float4(r.x * f, r.y * f, r.z * f, r.w * f);
+ }
+ /* Float */
+ else if (texture_type == IMAGE_DATA_TYPE_FLOAT) {
+ float f = tex_fetch<float>(info, data_offset);
+ return make_float4(f, f, f, 1.0f);
+ }
+ /* UShort */
+ else if (texture_type == IMAGE_DATA_TYPE_USHORT) {
+ ushort r = tex_fetch<ushort>(info, data_offset);
+ float f = r * (1.0f / 65535.0f);
+ return make_float4(f, f, f, 1.0f);
+ }
+ else if (texture_type == IMAGE_DATA_TYPE_HALF) {
+ float f = tex_fetch<half>(info, data_offset);
+ return make_float4(f, f, f, 1.0f);
+ }
+ else if (texture_type == IMAGE_DATA_TYPE_HALF4) {
+ half4 r = tex_fetch<half4>(info, data_offset);
+ return make_float4(r.x, r.y, r.z, r.w);
+ }
+ /* Byte */
+ else {
+ uchar r = tex_fetch<uchar>(info, data_offset);
+ float f = r * (1.0f / 255.0f);
+ return make_float4(f, f, f, 1.0f);
+ }
+}
+
+ccl_device_inline float4 svm_image_texture_read_2d(int id, int x, int y)
+{
+ const TextureInfo &info = kernel_data_fetch(texture_info, id);
+
+ /* Wrap */
+ if (info.extension == EXTENSION_REPEAT) {
+ x = svm_image_texture_wrap_periodic(x, info.width);
+ y = svm_image_texture_wrap_periodic(y, info.height);
+ }
+ else if (info.extension == EXTENSION_EXTEND) {
+ x = svm_image_texture_wrap_clamp(x, info.width);
+ y = svm_image_texture_wrap_clamp(y, info.height);
+ }
+ else {
+ if (x < 0 || x >= info.width || y < 0 || y >= info.height) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ }
+
+ return svm_image_texture_read(info, x, y, 0);
+}
+
+ccl_device_inline float4 svm_image_texture_read_3d(int id, int x, int y, int z)
+{
+ const TextureInfo &info = kernel_data_fetch(texture_info, id);
+
+ /* Wrap */
+ if (info.extension == EXTENSION_REPEAT) {
+ x = svm_image_texture_wrap_periodic(x, info.width);
+ y = svm_image_texture_wrap_periodic(y, info.height);
+ z = svm_image_texture_wrap_periodic(z, info.depth);
+ }
+ else if (info.extension == EXTENSION_EXTEND) {
+ x = svm_image_texture_wrap_clamp(x, info.width);
+ y = svm_image_texture_wrap_clamp(y, info.height);
+ z = svm_image_texture_wrap_clamp(z, info.depth);
+ }
+ else {
+ if (x < 0 || x >= info.width || y < 0 || y >= info.height || z < 0 || z >= info.depth) {
+ return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+ }
+
+ return svm_image_texture_read(info, x, y, z);
+}
+
+static float svm_image_texture_frac(float x, int *ix)
+{
+ int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0);
+ *ix = i;
+ return x - (float)i;
+}
+
+#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
+ { \
+ u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
+ u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \
+ u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \
+ u[3] = (1.0f / 6.0f) * t * t * t; \
+ } \
+ (void)0
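
Written out, these are the uniform cubic B-spline basis weights for the fractional
coordinate $t \in [0, 1)$; they are non-negative and sum to one:

\begin{aligned}
u_0(t) &= \tfrac{1}{6}(1 - t)^3, &
u_1(t) &= \tfrac{1}{6}(3t^3 - 6t^2 + 4), \\
u_2(t) &= \tfrac{1}{6}(-3t^3 + 3t^2 + 3t + 1), &
u_3(t) &= \tfrac{1}{6}t^3,
\end{aligned}
\qquad \sum_{i=0}^{3} u_i(t) = 1.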
+
+ccl_device float4 kernel_tex_image_interp(KernelGlobals, int id, float x, float y)
+{
+ const TextureInfo &info = kernel_data_fetch(texture_info, id);
+
+ if (info.interpolation == INTERPOLATION_CLOSEST) {
+ /* Closest interpolation. */
+ int ix, iy;
+ svm_image_texture_frac(x * info.width, &ix);
+ svm_image_texture_frac(y * info.height, &iy);
+
+ return svm_image_texture_read_2d(id, ix, iy);
+ }
+ else if (info.interpolation == INTERPOLATION_LINEAR) {
+ /* Bilinear interpolation. */
+ int ix, iy;
+ float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix);
+ float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy);
+
+ float4 r;
+ r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy);
+ r += (1.0f - ty) * tx * svm_image_texture_read_2d(id, ix + 1, iy);
+ r += ty * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy + 1);
+ r += ty * tx * svm_image_texture_read_2d(id, ix + 1, iy + 1);
+ return r;
+ }
+ else {
+ /* Bicubic interpolation. */
+ int ix, iy;
+ float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix);
+ float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy);
+
+ float u[4], v[4];
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+
+ float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ for (int y = 0; y < 4; y++) {
+ for (int x = 0; x < 4; x++) {
+ float weight = u[x] * v[y];
+ r += weight * svm_image_texture_read_2d(id, ix + x - 1, iy + y - 1);
+ }
+ }
+ return r;
+ }
+}
+
+#ifdef WITH_NANOVDB
+template<typename T> struct NanoVDBInterpolator {
+
+ typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
+
+ static ccl_always_inline float4 read(float r)
+ {
+ return make_float4(r, r, r, 1.0f);
+ }
+
+ static ccl_always_inline float4 read(nanovdb::Vec3f r)
+ {
+ return make_float4(r[0], r[1], r[2], 1.0f);
+ }
+
+ static ccl_always_inline float4 interp_3d_closest(const AccessorType &acc,
+ float x,
+ float y,
+ float z)
+ {
+ const nanovdb::Vec3f xyz(x, y, z);
+ return read(nanovdb::SampleFromVoxels<AccessorType, 0, false>(acc)(xyz));
+ }
+
+ static ccl_always_inline float4 interp_3d_linear(const AccessorType &acc,
+ float x,
+ float y,
+ float z)
+ {
+ const nanovdb::Vec3f xyz(x - 0.5f, y - 0.5f, z - 0.5f);
+ return read(nanovdb::SampleFromVoxels<AccessorType, 1, false>(acc)(xyz));
+ }
+
+ static float4 interp_3d_cubic(const AccessorType &acc, float x, float y, float z)
+ {
+ int ix, iy, iz;
+ int nix, niy, niz;
+ int pix, piy, piz;
+ int nnix, nniy, nniz;
+ /* Tri-cubic b-spline interpolation. */
+ const float tx = svm_image_texture_frac(x - 0.5f, &ix);
+ const float ty = svm_image_texture_frac(y - 0.5f, &iy);
+ const float tz = svm_image_texture_frac(z - 0.5f, &iz);
+ pix = ix - 1;
+ piy = iy - 1;
+ piz = iz - 1;
+ nix = ix + 1;
+ niy = iy + 1;
+ niz = iz + 1;
+ nnix = ix + 2;
+ nniy = iy + 2;
+ nniz = iz + 2;
+
+ const int xc[4] = {pix, ix, nix, nnix};
+ const int yc[4] = {piy, iy, niy, nniy};
+ const int zc[4] = {piz, iz, niz, nniz};
+ float u[4], v[4], w[4];
+
+    /* Helper macros to keep the code a reasonable size;
+     * the compiler is expected to inline all the matrix multiplications.
+     */
+# define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z]))))
+# define COL_TERM(col, row) \
+ (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
+ u[3] * DATA(3, col, row)))
+# define ROW_TERM(row) \
+ (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))
+
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+ SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+ /* Actual interpolation. */
+ return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+# undef COL_TERM
+# undef ROW_TERM
+# undef DATA
+ }
+
+ static ccl_always_inline float4
+ interp_3d(const TextureInfo &info, float x, float y, float z, int interp)
+ {
+ using namespace nanovdb;
+
+ NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
+ AccessorType acc = grid->getAccessor();
+
+ switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) {
+ case INTERPOLATION_CLOSEST:
+ return interp_3d_closest(acc, x, y, z);
+ case INTERPOLATION_LINEAR:
+ return interp_3d_linear(acc, x, y, z);
+ default:
+ return interp_3d_cubic(acc, x, y, z);
+ }
+ }
+};
+#endif /* WITH_NANOVDB */
+
+ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals, int id, float3 P, int interp)
+{
+ const TextureInfo &info = kernel_data_fetch(texture_info, id);
+
+ if (info.use_transform_3d) {
+ Transform tfm = info.transform_3d;
+ P = transform_point(&tfm, P);
+ }
+
+ float x = P.x;
+ float y = P.y;
+ float z = P.z;
+
+ uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp;
+
+#ifdef WITH_NANOVDB
+ if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) {
+ return NanoVDBInterpolator<float>::interp_3d(info, x, y, z, interpolation);
+ }
+ else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
+ return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, x, y, z, interpolation);
+ }
+ else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN) {
+ return NanoVDBInterpolator<nanovdb::FpN>::interp_3d(info, x, y, z, interpolation);
+ }
+ else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16) {
+ return NanoVDBInterpolator<nanovdb::Fp16>::interp_3d(info, x, y, z, interpolation);
+ }
+#else
+ if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT ||
+ info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3 ||
+ info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN ||
+ info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16) {
+ return make_float4(
+ TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A);
+ }
+#endif
+ else {
+ x *= info.width;
+ y *= info.height;
+ z *= info.depth;
+ }
+
+ if (interpolation == INTERPOLATION_CLOSEST) {
+ /* Closest interpolation. */
+ int ix, iy, iz;
+ svm_image_texture_frac(x, &ix);
+ svm_image_texture_frac(y, &iy);
+ svm_image_texture_frac(z, &iz);
+
+ return svm_image_texture_read_3d(id, ix, iy, iz);
+ }
+ else if (interpolation == INTERPOLATION_LINEAR) {
+ /* Trilinear interpolation. */
+ int ix, iy, iz;
+ float tx = svm_image_texture_frac(x - 0.5f, &ix);
+ float ty = svm_image_texture_frac(y - 0.5f, &iy);
+ float tz = svm_image_texture_frac(z - 0.5f, &iz);
+
+ float4 r;
+ r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz);
+ r += (1.0f - tz) * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz);
+ r += (1.0f - tz) * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz);
+ r += (1.0f - tz) * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz);
+
+ r += tz * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz + 1);
+ r += tz * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz + 1);
+ r += tz * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz + 1);
+ r += tz * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz + 1);
+ return r;
+ }
+ else {
+ /* Tri-cubic interpolation. */
+ int ix, iy, iz;
+ float tx = svm_image_texture_frac(x - 0.5f, &ix);
+ float ty = svm_image_texture_frac(y - 0.5f, &iy);
+ float tz = svm_image_texture_frac(z - 0.5f, &iz);
+
+ float u[4], v[4], w[4];
+ SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+ SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+ SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+ float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+
+ for (int z = 0; z < 4; z++) {
+ for (int y = 0; y < 4; y++) {
+ for (int x = 0; x < 4; x++) {
+ float weight = u[x] * v[y] * w[z];
+ r += weight * svm_image_texture_read_3d(id, ix + x - 1, iy + y - 1, iz + z - 1);
+ }
+ }
+ }
+ return r;
+ }
+}
+
+#undef SET_CUBIC_SPLINE_WEIGHTS
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp
new file mode 100644
index 00000000000..097d21b963f
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -0,0 +1,929 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#ifdef WITH_ONEAPI
+
+/* clang-format off */
+# include "kernel.h"
+# include <iostream>
+# include <map>
+# include <set>
+
+# include <CL/sycl.hpp>
+
+# include "kernel/device/oneapi/compat.h"
+# include "kernel/device/oneapi/globals.h"
+# include "kernel/device/oneapi/kernel_templates.h"
+
+# include "kernel/device/gpu/kernel.h"
+/* clang-format on */
+
+static OneAPIErrorCallback s_error_cb = nullptr;
+static void *s_error_user_ptr = nullptr;
+
+static std::vector<sycl::device> oneapi_available_devices();
+
+void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
+{
+ s_error_cb = cb;
+ s_error_user_ptr = user_ptr;
+}
+
+void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
+{
+# ifdef _DEBUG
+ sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+ sycl::info::device_type device_type =
+ queue->get_device().get_info<sycl::info::device::device_type>();
+ sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
+ (void)usm_type;
+ assert(usm_type == sycl::usm::alloc::device ||
+ ((device_type == sycl::info::device_type::host ||
+ device_type == sycl::info::device_type::is_cpu || allow_host) &&
+ usm_type == sycl::usm::alloc::host));
+# endif
+}
+
+bool oneapi_create_queue(SyclQueue *&external_queue, int device_index)
+{
+ bool finished_correct = true;
+ try {
+ std::vector<sycl::device> devices = oneapi_available_devices();
+ if (device_index < 0 || device_index >= devices.size()) {
+ return false;
+ }
+ sycl::queue *created_queue = new sycl::queue(devices[device_index],
+ sycl::property::queue::in_order());
+ external_queue = reinterpret_cast<SyclQueue *>(created_queue);
+ }
+ catch (sycl::exception const &e) {
+ finished_correct = false;
+ if (s_error_cb) {
+ s_error_cb(e.what(), s_error_user_ptr);
+ }
+ }
+ return finished_correct;
+}
+
+void oneapi_free_queue(SyclQueue *queue_)
+{
+ assert(queue_);
+ sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+ delete queue;
+}
+
+void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
+{
+ assert(queue_);
+ sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+ return sycl::aligned_alloc_host(alignment, memory_size, *queue);
+}
+
+void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size)
+{
+ assert(queue_);
+ sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+ return sycl::malloc_device(memory_size, *queue);
+}
+
+void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr)
+{
+ assert(queue_);
+ sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+ oneapi_check_usm(queue_, usm_ptr, true);
+ sycl::free(usm_ptr, *queue);
+}
+
+bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
+{
+ assert(queue_);
+ sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+ oneapi_check_usm(queue_, dest, true);
+ oneapi_check_usm(queue_, src, true);
+ sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
+# ifdef WITH_CYCLES_DEBUG
+ try {
+    /* NOTE(@nsirgien) Waiting on the memory operation may give more precise error
+     * messages. Due to the impact on occupancy, it only makes sense to enable this in Cycles
+     * debug builds.
+     */
+ mem_event.wait_and_throw();
+ return true;
+ }
+ catch (sycl::exception const &e) {
+ if (s_error_cb) {
+ s_error_cb(e.what(), s_error_user_ptr);
+ }
+ return false;
+ }
+# else
+ sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
+ sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
+ bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
+ src_type == sycl::usm::alloc::device;
+ bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
+ src_type == sycl::usm::alloc::unknown;
+  /* NOTE(@sirgienko) A host-side blocking wait on this operation is mandatory; otherwise the
+   * host may start using the memory before the transfer has finished.
+   */
+ if (from_device_to_host || host_or_device_memop_with_offset)
+ mem_event.wait();
+ return true;
+# endif
+}
+
+bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes)
+{
+ assert(queue_);
+ sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+ oneapi_check_usm(queue_, usm_ptr, true);
+ sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
+# ifdef WITH_CYCLES_DEBUG
+ try {
+    /* NOTE(@nsirgien) Waiting on the memory operation may give more precise error
+     * messages. Due to the impact on occupancy, it only makes sense to enable this in Cycles
+     * debug builds.
+     */
+ mem_event.wait_and_throw();
+ return true;
+ }
+ catch (sycl::exception const &e) {
+ if (s_error_cb) {
+ s_error_cb(e.what(), s_error_user_ptr);
+ }
+ return false;
+ }
+# else
+ (void)mem_event;
+ return true;
+# endif
+}
+
+bool oneapi_queue_synchronize(SyclQueue *queue_)
+{
+ assert(queue_);
+ sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+ try {
+ queue->wait_and_throw();
+ return true;
+ }
+ catch (sycl::exception const &e) {
+ if (s_error_cb) {
+ s_error_cb(e.what(), s_error_user_ptr);
+ }
+ return false;
+ }
+}
+
+/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and
+ * also trigger runtime compilation of all existing oneAPI kernels */
+bool oneapi_run_test_kernel(SyclQueue *queue_)
+{
+ assert(queue_);
+ sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+ size_t N = 8;
+ sycl::buffer<float, 1> A(N);
+ sycl::buffer<float, 1> B(N);
+
+ {
+ sycl::host_accessor A_host_acc(A, sycl::write_only);
+ for (size_t i = (size_t)0; i < N; i++)
+ A_host_acc[i] = rand() % 32;
+ }
+
+ try {
+ queue->submit([&](sycl::handler &cgh) {
+ sycl::accessor A_acc(A, cgh, sycl::read_only);
+ sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init);
+
+ cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); });
+ });
+ queue->wait_and_throw();
+
+ sycl::host_accessor A_host_acc(A, sycl::read_only);
+ sycl::host_accessor B_host_acc(B, sycl::read_only);
+
+ for (size_t i = (size_t)0; i < N; i++) {
+ float result = A_host_acc[i] + B_host_acc[i];
+ (void)result;
+ }
+ }
+ catch (sycl::exception const &e) {
+ if (s_error_cb) {
+ s_error_cb(e.what(), s_error_user_ptr);
+ }
+ return false;
+ }
+
+ return true;
+}
+
+bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
+{
+ kernel_global_size = sizeof(KernelGlobalsGPU);
+
+ return true;
+}
+
+void oneapi_set_global_memory(SyclQueue *queue_,
+ void *kernel_globals,
+ const char *memory_name,
+ void *memory_device_pointer)
+{
+ assert(queue_);
+ assert(kernel_globals);
+ assert(memory_name);
+ assert(memory_device_pointer);
+ KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
+ oneapi_check_usm(queue_, memory_device_pointer);
+ oneapi_check_usm(queue_, kernel_globals, true);
+
+ std::string matched_name(memory_name);
+
+/* This macro sets the matching global pointer of KernelGlobals via name matching. */
+# define KERNEL_DATA_ARRAY(type, name) \
+ else if (#name == matched_name) \
+ { \
+ globals->__##name = (type *)memory_device_pointer; \
+ return; \
+ }
+ if (false) {
+ }
+ else if ("integrator_state" == matched_name) {
+ globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
+ return;
+ }
+ KERNEL_DATA_ARRAY(KernelData, data)
+# include "kernel/data_arrays.h"
+ else
+ {
+ std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
+ << std::endl;
+ assert(false);
+ }
+# undef KERNEL_DATA_ARRAY
+}
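
After preprocessing, the dispatch in oneapi_set_global_memory() is a plain if/else chain over
the allocation names; a sketch with one representative KERNEL_DATA_ARRAY arm (`texture_info`
is one of the entries in kernel/data_arrays.h):

if (false) {
}
else if ("integrator_state" == matched_name) {
  globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
  return;
}
else if ("data" == matched_name) {
  globals->__data = (KernelData *)memory_device_pointer;
  return;
}
else if ("texture_info" == matched_name) {
  globals->__texture_info = (TextureInfo *)memory_device_pointer;
  return;
}
/* ...one arm per entry in kernel/data_arrays.h... */
else {
  /* Unknown name: report the error and assert. */
}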
+
+/* TODO: Move device information to OneapiDevice initialized on creation and use it. */
+/* TODO: Move below function to oneapi/queue.cpp. */
+size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
+ const DeviceKernel kernel,
+ const size_t kernel_global_size)
+{
+ assert(queue_);
+ sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+ (void)kernel_global_size;
+ const static size_t preferred_work_group_size_intersect_shading = 32;
+ const static size_t preferred_work_group_size_technical = 1024;
+
+ size_t preferred_work_group_size = 0;
+ switch (kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ preferred_work_group_size = preferred_work_group_size_intersect_shading;
+ break;
+
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES:
+ case DEVICE_KERNEL_INTEGRATOR_RESET:
+ case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
+ preferred_work_group_size = preferred_work_group_size_technical;
+ break;
+
+ default:
+ preferred_work_group_size = 512;
+ }
+
+ const size_t limit_work_group_size =
+ queue->get_device().get_info<sycl::info::device::max_work_group_size>();
+ return std::min(limit_work_group_size, preferred_work_group_size);
+}
+
+bool oneapi_enqueue_kernel(KernelContext *kernel_context,
+ int kernel,
+ size_t global_size,
+ void **args)
+{
+ bool success = true;
+ ::DeviceKernel device_kernel = (::DeviceKernel)kernel;
+ KernelGlobalsGPU *kg = (KernelGlobalsGPU *)kernel_context->kernel_globals;
+ sycl::queue *queue = reinterpret_cast<sycl::queue *>(kernel_context->queue);
+ assert(queue);
+ if (!queue) {
+ return false;
+ }
+
+ size_t local_size = oneapi_kernel_preferred_local_size(
+ kernel_context->queue, device_kernel, global_size);
+ assert(global_size % local_size == 0);
+
+ /* Local size for DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY needs to be enforced so we
+ * overwrite it outside of oneapi_kernel_preferred_local_size. */
+ if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY) {
+ local_size = GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE;
+ }
+
+ /* Kernels listed below need a specific number of work groups. */
+ if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY ||
+ device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY ||
+ device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY ||
+ device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY ||
+ device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY ||
+ device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY ||
+ device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY) {
+ int num_states = *((int *)(args[0]));
+ /* Round up to the next work-group. */
+ size_t groups_count = (num_states + local_size - 1) / local_size;
+    /* NOTE(@nsirgien): For now non-uniform work-groups don't work on most oneAPI devices,
+     * so we extend the work size to satisfy the uniformity requirement. */
+ global_size = groups_count * local_size;
+
+# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+ if (queue->get_device().is_host()) {
+ global_size = 1;
+ local_size = 1;
+ }
+# endif
+ }
+
+ /* Let the compiler throw an error if there are any kernels missing in this implementation. */
+# if defined(_WIN32)
+# pragma warning(error : 4062)
+# elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic error "-Wswitch"
+# endif
+
+ try {
+ queue->submit([&](sycl::handler &cgh) {
+ switch (device_kernel) {
+ case DEVICE_KERNEL_INTEGRATOR_RESET: {
+ oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_reset);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_camera);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_bake);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_closest);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_shadow);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: {
+ oneapi_call(kg,
+ cgh,
+ global_size,
+ local_size,
+ args,
+ oneapi_kernel_integrator_intersect_subsurface);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
+ oneapi_call(kg,
+ cgh,
+ global_size,
+ local_size,
+ args,
+ oneapi_kernel_integrator_intersect_volume_stack);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_background);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_light);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_shadow);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: {
+ oneapi_call(kg,
+ cgh,
+ global_size,
+ local_size,
+ args,
+ oneapi_kernel_integrator_shade_surface_raytrace);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface_mnee);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_volume);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_queued_paths_array);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: {
+ oneapi_call(kg,
+ cgh,
+ global_size,
+ local_size,
+ args,
+ oneapi_kernel_integrator_queued_shadow_paths_array);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_active_paths_array);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: {
+ oneapi_call(kg,
+ cgh,
+ global_size,
+ local_size,
+ args,
+ oneapi_kernel_integrator_terminated_paths_array);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY: {
+ oneapi_call(kg,
+ cgh,
+ global_size,
+ local_size,
+ args,
+ oneapi_kernel_integrator_terminated_shadow_paths_array);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
+ oneapi_call(kg,
+ cgh,
+ global_size,
+ local_size,
+ args,
+ oneapi_kernel_integrator_compact_paths_array);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY: {
+ oneapi_call(kg,
+ cgh,
+ global_size,
+ local_size,
+ args,
+ oneapi_kernel_integrator_compact_shadow_paths_array);
+ break;
+ }
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK: {
+ oneapi_call(kg,
+ cgh,
+ global_size,
+ local_size,
+ args,
+ oneapi_kernel_adaptive_sampling_convergence_check);
+ break;
+ }
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_x);
+ break;
+ }
+ case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_y);
+ break;
+ }
+ case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: {
+ oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_displace);
+ break;
+ }
+ case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_background);
+ break;
+ }
+ case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY: {
+ oneapi_call(kg,
+ cgh,
+ global_size,
+ local_size,
+ args,
+ oneapi_kernel_shader_eval_curve_shadow_transparency);
+ break;
+ }
+ case DEVICE_KERNEL_PREFIX_SUM: {
+ oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_prefix_sum);
+ break;
+ }
+
+ /* clang-format off */
+ # define DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \
+ case DEVICE_KERNEL_FILM_CONVERT_##VARIANT: { \
+ oneapi_call(kg, cgh, \
+ global_size, \
+ local_size, \
+ args, \
+ oneapi_kernel_film_convert_##variant); \
+ break; \
+ }
+
+# define DEVICE_KERNEL_FILM_CONVERT(variant, VARIANT) \
+ DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \
+ DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT##_HALF_RGBA, variant##_half_rgba)
+
+ DEVICE_KERNEL_FILM_CONVERT(depth, DEPTH);
+ DEVICE_KERNEL_FILM_CONVERT(mist, MIST);
+ DEVICE_KERNEL_FILM_CONVERT(sample_count, SAMPLE_COUNT);
+ DEVICE_KERNEL_FILM_CONVERT(float, FLOAT);
+ DEVICE_KERNEL_FILM_CONVERT(light_path, LIGHT_PATH);
+ DEVICE_KERNEL_FILM_CONVERT(float3, FLOAT3);
+ DEVICE_KERNEL_FILM_CONVERT(motion, MOTION);
+ DEVICE_KERNEL_FILM_CONVERT(cryptomatte, CRYPTOMATTE);
+ DEVICE_KERNEL_FILM_CONVERT(shadow_catcher, SHADOW_CATCHER);
+ DEVICE_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow,
+ SHADOW_CATCHER_MATTE_WITH_SHADOW);
+ DEVICE_KERNEL_FILM_CONVERT(combined, COMBINED);
+ DEVICE_KERNEL_FILM_CONVERT(float4, FLOAT4);
+
+# undef DEVICE_KERNEL_FILM_CONVERT
+# undef DEVICE_KERNEL_FILM_CONVERT_PARTIAL
+ /* clang-format on */
+
+ case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_filter_guiding_preprocess);
+ break;
+ }
+ case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO: {
+ oneapi_call(kg,
+ cgh,
+ global_size,
+ local_size,
+ args,
+ oneapi_kernel_filter_guiding_set_fake_albedo);
+ break;
+ }
+ case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_preprocess);
+ break;
+ }
+ case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_postprocess);
+ break;
+ }
+ case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_cryptomatte_postprocess);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: {
+ oneapi_call(
+ kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_compact_states);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES: {
+ oneapi_call(kg,
+ cgh,
+ global_size,
+ local_size,
+ args,
+ oneapi_kernel_integrator_compact_shadow_states);
+ break;
+ }
+ case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: {
+ oneapi_call(kg,
+ cgh,
+ global_size,
+ local_size,
+ args,
+ oneapi_kernel_integrator_shadow_catcher_count_possible_splits);
+ break;
+ }
+ /* Unsupported kernels */
+ case DEVICE_KERNEL_NUM:
+ case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
+ kernel_assert(0);
+ break;
+ }
+ });
+ }
+  catch (sycl::exception const &e) {
+    if (s_error_cb) {
+      s_error_cb(e.what(), s_error_user_ptr);
+    }
+    success = false;
+  }
+
+# if defined(_WIN32)
+# pragma warning(default : 4062)
+# elif defined(__GNUC__)
+# pragma GCC diagnostic pop
+# endif
+ return success;
+}
+
+/* Compute-runtime (i.e. NEO) version is what gets returned by sycl/L0 on Linux.
+ * The same minimum compute-runtime version is currently required across Windows and Linux.
+ * For Windows driver 101.3268, the compute-runtime version is 23570. */
+static const int lowest_supported_driver_version_win = 1013268;
+static const int lowest_supported_driver_version_neo = 23570;
+
+static int parse_driver_build_version(const sycl::device &device)
+{
+ const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
+ int driver_build_version = 0;
+
+ size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
+ if (second_dot_position == std::string::npos) {
+ std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
+ << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
+ << " xx.xx.xxx.xxxx (Windows) for device \""
+ << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
+ }
+ else {
+ try {
+ size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
+ if (third_dot_position != std::string::npos) {
+        const std::string &third_number_substr = driver_version.substr(
+            second_dot_position + 1, third_dot_position - second_dot_position - 1);
+        const std::string &fourth_number_substr = driver_version.substr(third_dot_position + 1);
+        if (third_number_substr.length() == 3 && fourth_number_substr.length() == 4)
+          driver_build_version = std::stoi(third_number_substr) * 10000 +
+                                 std::stoi(fourth_number_substr);
+ }
+ else {
+ const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
+ driver_build_version = std::stoi(third_number_substr);
+ }
+ }
+ catch (std::invalid_argument &e) {
+ std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
+ << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
+ << " xx.xx.xxx.xxxx (Windows) for device \""
+ << device.get_info<sycl::info::device::name>() << "\"." << std::endl;
+ }
+ }
+
+ return driver_build_version;
+}
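
Worked examples of the two accepted formats against the constants above (the leading version
components are illustrative):

/* Windows-style "xx.xx.xxx.xxxx", e.g. "30.0.101.3268":
 *   third part "101" has 3 digits and fourth part "3268" has 4 digits, so
 *   driver_build_version = 101 * 10000 + 3268 = 1013268
 *   (checked against lowest_supported_driver_version_win).
 *
 * Linux/NEO-style "xx.xx.xxxxx", e.g. "22.26.23570":
 *   there is no third dot, so driver_build_version = std::stoi("23570") = 23570
 *   (checked against lowest_supported_driver_version_neo). */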
+
+static std::vector<sycl::device> oneapi_available_devices()
+{
+ bool allow_all_devices = false;
+ if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
+ allow_all_devices = true;
+
+  /* The host device is currently useful only for debugging,
+   * so we hide it with default build settings. */
+# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+ bool allow_host = true;
+# else
+ bool allow_host = false;
+# endif
+
+ const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
+
+ std::vector<sycl::device> available_devices;
+ for (const sycl::platform &platform : oneapi_platforms) {
+    /* Ignore OpenCL platforms to avoid using the same devices through both Level-Zero and
+     * OpenCL. */
+ if (platform.get_backend() == sycl::backend::opencl) {
+ continue;
+ }
+
+ const std::vector<sycl::device> &oneapi_devices =
+ (allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
+ platform.get_devices(sycl::info::device_type::gpu);
+
+ for (const sycl::device &device : oneapi_devices) {
+ if (allow_all_devices) {
+        /* Still filter out the host device if the build doesn't support it. */
+ if (allow_host || !device.is_host()) {
+ available_devices.push_back(device);
+ }
+ }
+ else {
+ bool filter_out = false;
+
+        /* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
+         * assuming they have either more than 96 Execution Units or not 7 threads per EU.
+         * Official support can be broadened to older and smaller GPUs once ready. */
+ if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
+          /* Defaults that cause the device to be filtered out, in case these values aren't
+           * available through a too-old L0 runtime. */
+ int number_of_eus = 96;
+ int threads_per_eu = 7;
+ if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
+ number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+ }
+ if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+ threads_per_eu =
+ device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+ }
+          /* This filters out all Level-Zero-supported GPUs from generations older than Arc. */
+ if (number_of_eus <= 96 && threads_per_eu == 7) {
+ filter_out = true;
+ }
+          /* If not already filtered out, check the driver version. */
+ if (!filter_out) {
+ int driver_build_version = parse_driver_build_version(device);
+ if ((driver_build_version > 100000 &&
+ driver_build_version < lowest_supported_driver_version_win) ||
+ driver_build_version < lowest_supported_driver_version_neo) {
+ filter_out = true;
+ }
+ }
+ }
+ else if (!allow_host && device.is_host()) {
+ filter_out = true;
+ }
+ else if (!allow_all_devices) {
+ filter_out = true;
+ }
+
+ if (!filter_out) {
+ available_devices.push_back(device);
+ }
+ }
+ }
+ }
+
+ return available_devices;
+}
+
+char *oneapi_device_capabilities()
+{
+ std::stringstream capabilities;
+
+ const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices();
+ for (const sycl::device &device : oneapi_devices) {
+ const std::string &name = device.get_info<sycl::info::device::name>();
+
+ capabilities << std::string("\t") << name << "\n";
+# define WRITE_ATTR(attribute_name, attribute_variable) \
+ capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
+ << "\n";
+# define GET_NUM_ATTR(attribute) \
+ { \
+      size_t attribute = (size_t)device.get_info<sycl::info::device::attribute>(); \
+ capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
+ }
+
+ GET_NUM_ATTR(vendor_id)
+ GET_NUM_ATTR(max_compute_units)
+ GET_NUM_ATTR(max_work_item_dimensions)
+
+ sycl::id<3> max_work_item_sizes =
+ device.get_info<sycl::info::device::max_work_item_sizes<3>>();
+ WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
+ WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
+ WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
+
+ GET_NUM_ATTR(max_work_group_size)
+ GET_NUM_ATTR(max_num_sub_groups)
+ GET_NUM_ATTR(sub_group_independent_forward_progress)
+
+ GET_NUM_ATTR(preferred_vector_width_char)
+ GET_NUM_ATTR(preferred_vector_width_short)
+ GET_NUM_ATTR(preferred_vector_width_int)
+ GET_NUM_ATTR(preferred_vector_width_long)
+ GET_NUM_ATTR(preferred_vector_width_float)
+ GET_NUM_ATTR(preferred_vector_width_double)
+ GET_NUM_ATTR(preferred_vector_width_half)
+
+ GET_NUM_ATTR(native_vector_width_char)
+ GET_NUM_ATTR(native_vector_width_short)
+ GET_NUM_ATTR(native_vector_width_int)
+ GET_NUM_ATTR(native_vector_width_long)
+ GET_NUM_ATTR(native_vector_width_float)
+ GET_NUM_ATTR(native_vector_width_double)
+ GET_NUM_ATTR(native_vector_width_half)
+
+ size_t max_clock_frequency =
+ (size_t)(device.is_host() ? (size_t)0 :
+ device.get_info<sycl::info::device::max_clock_frequency>());
+ WRITE_ATTR("max_clock_frequency", max_clock_frequency)
+
+ GET_NUM_ATTR(address_bits)
+ GET_NUM_ATTR(max_mem_alloc_size)
+
+    /* NOTE(@nsirgien): The implementation doesn't use image support, as bindless images aren't
+     * supported, so we always return false even if the device supports HW-accelerated texture
+     * usage. */
+ bool image_support = false;
+ WRITE_ATTR("image_support", (size_t)image_support)
+
+ GET_NUM_ATTR(max_parameter_size)
+ GET_NUM_ATTR(mem_base_addr_align)
+ GET_NUM_ATTR(global_mem_size)
+ GET_NUM_ATTR(local_mem_size)
+ GET_NUM_ATTR(error_correction_support)
+ GET_NUM_ATTR(profiling_timer_resolution)
+ GET_NUM_ATTR(is_available)
+
+# undef GET_NUM_ATTR
+# undef WRITE_ATTR
+ capabilities << "\n";
+ }
+
+ return ::strdup(capabilities.str().c_str());
+}
+
+void oneapi_free(void *p)
+{
+ if (p) {
+ ::free(p);
+ }
+}
+
+void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
+{
+ int num = 0;
+ std::vector<sycl::device> devices = oneapi_available_devices();
+ for (sycl::device &device : devices) {
+ const std::string &platform_name =
+ device.get_platform().get_info<sycl::info::platform::name>();
+ std::string name = device.get_info<sycl::info::device::name>();
+ std::string id = "ONEAPI_" + platform_name + "_" + name;
+ if (device.has(sycl::aspect::ext_intel_pci_address)) {
+ id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
+ }
+ (cb)(id.c_str(), name.c_str(), num, user_ptr);
+ num++;
+ }
+}
+
+size_t oneapi_get_memcapacity(SyclQueue *queue)
+{
+ return reinterpret_cast<sycl::queue *>(queue)
+ ->get_device()
+ .get_info<sycl::info::device::global_mem_size>();
+}
+
+int oneapi_get_num_multiprocessors(SyclQueue *queue)
+{
+ const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
+ if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
+ return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+ }
+ else
+ return 0;
+}
+
+int oneapi_get_max_num_threads_per_multiprocessor(SyclQueue *queue)
+{
+ const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
+ if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
+ device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+ return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
+ device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+ }
+ else
+ return 0;
+}
+
+#endif /* WITH_ONEAPI */
diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h
new file mode 100644
index 00000000000..c5f853742ed
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/kernel.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#pragma once
+
+#ifdef WITH_ONEAPI
+
+# include <stddef.h>
+
+/* NOTE(@nsirgien): Should match the underlying type of the declaration inside "kernel/types.h".
+ * TODO: use kernel/types.h directly. */
+enum DeviceKernel : int;
+
+# ifndef CYCLES_KERNEL_ONEAPI_EXPORT
+# ifdef _WIN32
+# if defined(ONEAPI_EXPORT)
+# define CYCLES_KERNEL_ONEAPI_EXPORT extern __declspec(dllexport)
+# else
+# define CYCLES_KERNEL_ONEAPI_EXPORT extern __declspec(dllimport)
+# endif
+# else
+# define CYCLES_KERNEL_ONEAPI_EXPORT
+# endif
+# endif
+
+class SyclQueue;
+
+typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
+ const char *name,
+ int num,
+ void *user_ptr);
+
+typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr);
+
+struct KernelContext {
+  /* Queue associated with the selected device. */
+  SyclQueue *queue;
+  /* Pointer to USM device memory holding all global/constant allocations on this device. */
+  void *kernel_globals;
+};
+
+/* Use extern "C" linking so that the symbols can easily be loaded from the dynamic library at
+ * runtime. */
+# ifdef __cplusplus
+extern "C" {
+# endif
+
+# define DLL_INTERFACE_CALL(function, return_type, ...) \
+ CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__);
+# include "kernel/device/oneapi/dll_interface_template.h"
+# undef DLL_INTERFACE_CALL
+
+# ifdef __cplusplus
+}
+# endif
+
+#endif /* WITH_ONEAPI */
diff --git a/intern/cycles/kernel/device/oneapi/kernel_templates.h b/intern/cycles/kernel/device/oneapi/kernel_templates.h
new file mode 100644
index 00000000000..0ae925cf748
--- /dev/null
+++ b/intern/cycles/kernel/device/oneapi/kernel_templates.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Intel Corporation */
+
+#pragma once
+
+/* Some macro magic to generate templates for kernel arguments.
+ * The resulting oneapi_call() template makes it possible to call a SYCL/C++ kernel
+ * with typed arguments while being given only the void `**args` array provided by Cycles.
+ * The template automatically casts each void* to the type the kernel expects. */
+
+/* When expanded by the preprocessor, the generated templates will look like this example: */
+#if 0
+template<typename T0, typename T1, typename T2>
+void oneapi_call(
+ KernelGlobalsGPU *kg,
+ sycl::handler &cgh,
+ size_t global_size,
+ size_t local_size,
+ void **args,
+    void (*func)(KernelGlobalsGPU *, size_t, size_t, sycl::handler &, T0, T1, T2))
+{
+ func(kg, global_size, local_size, cgh, *(T0 *)(args[0]), *(T1 *)(args[1]), *(T2 *)(args[2]));
+}
+#endif
+
+/* clang-format off */
+#define ONEAPI_TYP(x) typename T##x
+#define ONEAPI_CAST(x) *(T##x *)(args[x])
+#define ONEAPI_T(x) T##x
+
+#define ONEAPI_GET_NTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, N, ...) N
+#define ONEAPI_0(_call, ...)
+#define ONEAPI_1(_call, x) _call(x)
+#define ONEAPI_2(_call, x, ...) _call(x), ONEAPI_1(_call, __VA_ARGS__)
+#define ONEAPI_3(_call, x, ...) _call(x), ONEAPI_2(_call, __VA_ARGS__)
+#define ONEAPI_4(_call, x, ...) _call(x), ONEAPI_3(_call, __VA_ARGS__)
+#define ONEAPI_5(_call, x, ...) _call(x), ONEAPI_4(_call, __VA_ARGS__)
+#define ONEAPI_6(_call, x, ...) _call(x), ONEAPI_5(_call, __VA_ARGS__)
+#define ONEAPI_7(_call, x, ...) _call(x), ONEAPI_6(_call, __VA_ARGS__)
+#define ONEAPI_8(_call, x, ...) _call(x), ONEAPI_7(_call, __VA_ARGS__)
+#define ONEAPI_9(_call, x, ...) _call(x), ONEAPI_8(_call, __VA_ARGS__)
+#define ONEAPI_10(_call, x, ...) _call(x), ONEAPI_9(_call, __VA_ARGS__)
+#define ONEAPI_11(_call, x, ...) _call(x), ONEAPI_10(_call, __VA_ARGS__)
+#define ONEAPI_12(_call, x, ...) _call(x), ONEAPI_11(_call, __VA_ARGS__)
+#define ONEAPI_13(_call, x, ...) _call(x), ONEAPI_12(_call, __VA_ARGS__)
+#define ONEAPI_14(_call, x, ...) _call(x), ONEAPI_13(_call, __VA_ARGS__)
+#define ONEAPI_15(_call, x, ...) _call(x), ONEAPI_14(_call, __VA_ARGS__)
+#define ONEAPI_16(_call, x, ...) _call(x), ONEAPI_15(_call, __VA_ARGS__)
+#define ONEAPI_17(_call, x, ...) _call(x), ONEAPI_16(_call, __VA_ARGS__)
+#define ONEAPI_18(_call, x, ...) _call(x), ONEAPI_17(_call, __VA_ARGS__)
+#define ONEAPI_19(_call, x, ...) _call(x), ONEAPI_18(_call, __VA_ARGS__)
+#define ONEAPI_20(_call, x, ...) _call(x), ONEAPI_19(_call, __VA_ARGS__)
+#define ONEAPI_21(_call, x, ...) _call(x), ONEAPI_20(_call, __VA_ARGS__)
+
+#define ONEAPI_CALL_FOR(x, ...) \
+ ONEAPI_GET_NTH_ARG("ignored", \
+ ##__VA_ARGS__, \
+ ONEAPI_21, \
+ ONEAPI_20, \
+ ONEAPI_19, \
+ ONEAPI_18, \
+ ONEAPI_17, \
+ ONEAPI_16, \
+ ONEAPI_15, \
+ ONEAPI_14, \
+ ONEAPI_13, \
+ ONEAPI_12, \
+ ONEAPI_11, \
+ ONEAPI_10, \
+ ONEAPI_9, \
+ ONEAPI_8, \
+ ONEAPI_7, \
+ ONEAPI_6, \
+ ONEAPI_5, \
+ ONEAPI_4, \
+ ONEAPI_3, \
+ ONEAPI_2, \
+ ONEAPI_1, \
+ ONEAPI_0) \
+ (x, ##__VA_ARGS__)
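
The ONEAPI_GET_NTH_ARG call implements argument counting: the passed arguments shift the
ONEAPI_<N> names so that the one matching the argument count lands in the N slot. A sketch of
one expansion, for two kernel arguments:

/* ONEAPI_CALL_FOR(ONEAPI_TYP, 0, 1)
 *   -> ONEAPI_GET_NTH_ARG("ignored", 0, 1, ONEAPI_21, ..., ONEAPI_1, ONEAPI_0)(ONEAPI_TYP, 0, 1)
 *   -> ONEAPI_2(ONEAPI_TYP, 0, 1)   // ONEAPI_2 is the 23rd argument here
 *   -> ONEAPI_TYP(0), ONEAPI_1(ONEAPI_TYP, 1)
 *   -> typename T0, typename T1 */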
+
+/* This template automatically casts entries in the void **args array to the types requested by
+ * the kernel function. Since kernel parameters are passed as void ** to the device, this is the
+ * closest we have to type safety. */
+#define oneapi_template(...) \
+ template<ONEAPI_CALL_FOR(ONEAPI_TYP, __VA_ARGS__)> \
+ void oneapi_call( \
+ KernelGlobalsGPU *kg, \
+ sycl::handler &cgh, \
+ size_t global_size, \
+ size_t local_size, \
+ void **args, \
+ void (*func)(KernelGlobalsGPU*, size_t, size_t, sycl::handler &, ONEAPI_CALL_FOR(ONEAPI_T, __VA_ARGS__))) \
+ { \
+ func(kg, \
+ global_size, \
+ local_size, \
+ cgh, \
+ ONEAPI_CALL_FOR(ONEAPI_CAST, __VA_ARGS__)); \
+ }
+
+oneapi_template(0)
+oneapi_template(0, 1)
+oneapi_template(0, 1, 2)
+oneapi_template(0, 1, 2, 3)
+oneapi_template(0, 1, 2, 3, 4)
+oneapi_template(0, 1, 2, 3, 4, 5)
+oneapi_template(0, 1, 2, 3, 4, 5, 6)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
+oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
+
+ /* clang-format on */