diff options
author | Xavier Hallade <xavier.hallade@intel.com> | 2022-06-29 13:58:04 +0300 |
---|---|---|
committer | Xavier Hallade <xavier.hallade@intel.com> | 2022-06-29 13:58:04 +0300 |
commit | a02992f1313811c9905e44dc95a0aee31d707f67 (patch) | |
tree | 2d1f59524e2a298bb530ad578a2d2b9e2c4a1432 /intern | |
parent | 302b04a5a3fc0e767ac784424f78ce2edf5d2844 (diff) |
Cycles: Add support for rendering on Intel GPUs using oneAPI
This patch adds a new Cycles device with similar functionality to the
existing GPU devices. Kernel compilation and runtime interaction happen
via oneAPI DPC++ compiler and SYCL API.
This implementation is primarily focused on Intel® Arc™ GPUs and other
future Intel GPUs. The first supported drivers are 101.1660 on Windows
and 22.10.22597 on Linux.
The necessary tools for compilation are:
- A SYCL compiler such as oneAPI DPC++ compiler or
https://github.com/intel/llvm
- Intel® oneAPI Level Zero which is used for low level device queries:
https://github.com/oneapi-src/level-zero
- To optionally generate prebuilt graphics binaries: Intel® Graphics
Compiler. All are included in Linux precompiled libraries on svn:
https://svn.blender.org/svnroot/bf-blender/trunk/lib The same goes for
Windows precompiled binaries but for the graphics compiler, available
as "Intel® Graphics Offline Compiler for OpenCL™ Code" from
https://www.intel.com/content/www/us/en/developer/articles/tool/oneapi-standalone-components.html,
for which the path can be set via OCLOC_INSTALL_DIR.
Being based on the open SYCL standard, this implementation could also be
extended to run on other compatible non-Intel hardware in the future.
Reviewed By: sergey, brecht
Differential Revision: https://developer.blender.org/D15254
Co-authored-by: Nikita Sirgienko <nikita.sirgienko@intel.com>
Co-authored-by: Stefan Werner <stefan.werner@intel.com>
Diffstat (limited to 'intern')
61 files changed, 3379 insertions, 74 deletions
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt index f5d717e70fc..82fd81be262 100644 --- a/intern/cycles/CMakeLists.txt +++ b/intern/cycles/CMakeLists.txt @@ -263,6 +263,10 @@ if(WITH_CYCLES_DEVICE_OPTIX) endif() endif() +if (WITH_CYCLES_DEVICE_ONEAPI) + add_definitions(-DWITH_ONEAPI) +endif() + if(WITH_CYCLES_EMBREE) add_definitions(-DWITH_EMBREE) include_directories( diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py index 17f05f6da34..7d7ca78c15a 100644 --- a/intern/cycles/blender/addon/properties.py +++ b/intern/cycles/blender/addon/properties.py @@ -118,7 +118,8 @@ enum_device_type = ( ('CUDA', "CUDA", "CUDA", 1), ('OPTIX', "OptiX", "OptiX", 3), ('HIP', "HIP", "HIP", 4), - ('METAL', "Metal", "Metal", 5) + ('METAL', "Metal", "Metal", 5), + ('ONEAPI', "oneAPI", "oneAPI", 6) ) enum_texture_limit = ( @@ -1397,7 +1398,8 @@ class CyclesPreferences(bpy.types.AddonPreferences): def get_device_types(self, context): import _cycles - has_cuda, has_optix, has_hip, has_metal = _cycles.get_device_types() + has_cuda, has_optix, has_hip, has_metal, has_oneapi = _cycles.get_device_types() + list = [('NONE', "None", "Don't use compute device", 0)] if has_cuda: list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1)) @@ -1407,6 +1409,8 @@ class CyclesPreferences(bpy.types.AddonPreferences): list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4)) if has_metal: list.append(('METAL', "Metal", "Use Metal for GPU acceleration", 5)) + if has_oneapi: + list.append(('ONEAPI', "oneAPI", "Use oneAPI for GPU acceleration", 6)) return list @@ -1438,7 +1442,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): def update_device_entries(self, device_list): for device in device_list: - if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP', 'METAL'}: + if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP', 'METAL', 'ONEAPI'}: continue # Try to find existing Device entry entry = 
self.find_existing_device_entry(device) @@ -1482,7 +1486,7 @@ class CyclesPreferences(bpy.types.AddonPreferences): import _cycles # Ensure `self.devices` is not re-allocated when the second call to # get_devices_for_type is made, freeing items from the first list. - for device_type in ('CUDA', 'OPTIX', 'HIP', 'METAL'): + for device_type in ('CUDA', 'OPTIX', 'HIP', 'METAL', 'ONEAPI'): self.update_device_entries(_cycles.available_devices(device_type)) # Deprecated: use refresh_devices instead. @@ -1550,13 +1554,25 @@ class CyclesPreferences(bpy.types.AddonPreferences): elif sys.platform.startswith("linux"): col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1') col.label(text="and AMD driver version 22.10 or newer", icon='BLANK1') + elif device_type == 'ONEAPI': + import sys + col.label(text="Requires Intel GPU with Xe-HPG architecture", icon='BLANK1') + if sys.platform.startswith("win"): + col.label(text="and Windows driver version 101.1660 or newer", icon='BLANK1') + elif sys.platform.startswith("linux"): + col.label(text="and Linux driver version xx.xx.20066 or newer", icon='BLANK1') elif device_type == 'METAL': col.label(text="Requires Apple Silicon with macOS 12.2 or newer", icon='BLANK1') col.label(text="or AMD with macOS 12.3 or newer", icon='BLANK1') return for device in devices: - box.prop(device, "use", text=device.name) + import unicodedata + box.prop(device, "use", text=device.name + .replace('(TM)', unicodedata.lookup('TRADE MARK SIGN')) + .replace('(R)', unicodedata.lookup('REGISTERED SIGN')) + .replace('(C)', unicodedata.lookup('COPYRIGHT SIGN')) + ) def draw_impl(self, layout, context): row = layout.row() diff --git a/intern/cycles/blender/addon/ui.py b/intern/cycles/blender/addon/ui.py index 88be546746d..5b8c3960c82 100644 --- a/intern/cycles/blender/addon/ui.py +++ b/intern/cycles/blender/addon/ui.py @@ -110,6 +110,10 @@ def use_optix(context): return (get_device_type(context) == 'OPTIX' and cscene.device == 'GPU') +def 
use_oneapi(context): + cscene = context.scene.cycles + + return (get_device_type(context) == 'ONEAPI' and cscene.device == 'GPU') def use_multi_device(context): cscene = context.scene.cycles diff --git a/intern/cycles/blender/device.cpp b/intern/cycles/blender/device.cpp index 38effa329a5..22beca898f1 100644 --- a/intern/cycles/blender/device.cpp +++ b/intern/cycles/blender/device.cpp @@ -15,6 +15,7 @@ enum ComputeDevice { COMPUTE_DEVICE_OPTIX = 3, COMPUTE_DEVICE_HIP = 4, COMPUTE_DEVICE_METAL = 5, + COMPUTE_DEVICE_ONEAPI = 6, COMPUTE_DEVICE_NUM }; @@ -76,6 +77,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen else if (compute_device == COMPUTE_DEVICE_METAL) { mask |= DEVICE_MASK_METAL; } + else if (compute_device == COMPUTE_DEVICE_ONEAPI) { + mask |= DEVICE_MASK_ONEAPI; + } vector<DeviceInfo> devices = Device::available_devices(mask); /* Match device preferences and available devices. */ diff --git a/intern/cycles/blender/python.cpp b/intern/cycles/blender/python.cpp index 7bd1ad2cafe..8b2b331f73e 100644 --- a/intern/cycles/blender/python.cpp +++ b/intern/cycles/blender/python.cpp @@ -871,18 +871,20 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args* static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/) { vector<DeviceType> device_types = Device::available_types(); - bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false; + bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false, has_oneapi = false; foreach (DeviceType device_type, device_types) { has_cuda |= (device_type == DEVICE_CUDA); has_optix |= (device_type == DEVICE_OPTIX); has_hip |= (device_type == DEVICE_HIP); has_metal |= (device_type == DEVICE_METAL); + has_oneapi |= (device_type == DEVICE_ONEAPI); } - PyObject *list = PyTuple_New(4); + PyObject *list = PyTuple_New(5); PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda)); PyTuple_SET_ITEM(list, 1, 
PyBool_FromLong(has_optix)); PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip)); PyTuple_SET_ITEM(list, 3, PyBool_FromLong(has_metal)); + PyTuple_SET_ITEM(list, 4, PyBool_FromLong(has_oneapi)); return list; } @@ -914,6 +916,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg) else if (override == "METAL") { BlenderSession::device_override = DEVICE_MASK_METAL; } + else if (override == "ONEAPI") { + BlenderSession::device_override = DEVICE_MASK_ONEAPI; + } else { printf("\nError: %s is not a valid Cycles device.\n", override.c_str()); Py_RETURN_FALSE; diff --git a/intern/cycles/cmake/external_libs.cmake b/intern/cycles/cmake/external_libs.cmake index d2f30fe764b..51830250f2e 100644 --- a/intern/cycles/cmake/external_libs.cmake +++ b/intern/cycles/cmake/external_libs.cmake @@ -91,6 +91,8 @@ if(CYCLES_STANDALONE_REPOSITORY) _set_default(USD_ROOT_DIR "${_cycles_lib_dir}/usd") _set_default(WEBP_ROOT_DIR "${_cycles_lib_dir}/webp") _set_default(ZLIB_ROOT "${_cycles_lib_dir}/zlib") + _set_default(LEVEL_ZERO_ROOT_DIR "${_cycles_lib_dir}/level-zero") + _set_default(SYCL_ROOT_DIR "${_cycles_lib_dir}/dpcpp") # Ignore system libraries set(CMAKE_IGNORE_PATH "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES};${CMAKE_SYSTEM_INCLUDE_PATH};${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES};${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}") @@ -647,3 +649,22 @@ if(WITH_CYCLES_DEVICE_METAL) message(STATUS "Found Metal: ${METAL_LIBRARY}") endif() endif() + +########################################################################### +# oneAPI +########################################################################### + +if (WITH_CYCLES_DEVICE_ONEAPI) + find_package(SYCL) + find_package(LevelZero) + + if (SYCL_FOUND AND LEVEL_ZERO_FOUND) + message(STATUS "Found oneAPI: ${SYCL_LIBRARY}") + message(STATUS "Found Level Zero: ${LEVEL_ZERO_LIBRARY}") + else() + message(STATUS "oneAPI or Level Zero not found, disabling oneAPI device from Cycles") + set(WITH_CYCLES_DEVICE_ONEAPI OFF) + 
endif() +endif() + +unset(_cycles_lib_dir) diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt index 6205775260a..6418801c572 100644 --- a/intern/cycles/device/CMakeLists.txt +++ b/intern/cycles/device/CMakeLists.txt @@ -82,6 +82,15 @@ set(SRC_HIP hip/util.h ) +set(SRC_ONEAPI + oneapi/device_impl.cpp + oneapi/device_impl.h + oneapi/device.cpp + oneapi/device.h + oneapi/queue.cpp + oneapi/queue.h +) + set(SRC_DUMMY dummy/device.cpp dummy/device.h @@ -134,6 +143,7 @@ set(SRC ${SRC_DUMMY} ${SRC_MULTI} ${SRC_OPTIX} + ${SRC_ONEAPI} ${SRC_HEADERS} ) @@ -181,6 +191,9 @@ if(WITH_CYCLES_DEVICE_METAL) ${SRC_METAL} ) endif() +if (WITH_CYCLES_DEVICE_ONEAPI) + add_definitions(-DWITH_ONEAPI) +endif() if(WITH_OPENIMAGEDENOISE) list(APPEND LIB @@ -193,6 +206,11 @@ include_directories(SYSTEM ${INC_SYS}) cycles_add_library(cycles_device "${LIB}" ${SRC}) +if(WITH_CYCLES_DEVICE_ONEAPI) + # Need to have proper rebuilding in case of changes in cycles_kernel_oneapi due external project behaviour + add_dependencies(cycles_device cycles_kernel_oneapi) +endif() + source_group("cpu" FILES ${SRC_CPU}) source_group("cuda" FILES ${SRC_CUDA}) source_group("dummy" FILES ${SRC_DUMMY}) @@ -200,4 +218,5 @@ source_group("hip" FILES ${SRC_HIP}) source_group("multi" FILES ${SRC_MULTI}) source_group("metal" FILES ${SRC_METAL}) source_group("optix" FILES ${SRC_OPTIX}) +source_group("oneapi" FILES ${SRC_ONEAPI}) source_group("common" FILES ${SRC_BASE} ${SRC_HEADERS}) diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index 82c7881da5f..ace6ed517f5 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -16,6 +16,7 @@ #include "device/hip/device.h" #include "device/metal/device.h" #include "device/multi/device.h" +#include "device/oneapi/device.h" #include "device/optix/device.h" #include "util/foreach.h" @@ -39,6 +40,7 @@ vector<DeviceInfo> Device::optix_devices; vector<DeviceInfo> Device::cpu_devices; 
vector<DeviceInfo> Device::hip_devices; vector<DeviceInfo> Device::metal_devices; +vector<DeviceInfo> Device::oneapi_devices; uint Device::devices_initialized_mask = 0; /* Device */ @@ -101,6 +103,13 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler) device = device_metal_create(info, stats, profiler); break; #endif + +#ifdef WITH_ONEAPI + case DEVICE_ONEAPI: + device = device_oneapi_create(info, stats, profiler); + break; +#endif + default: break; } @@ -126,6 +135,8 @@ DeviceType Device::type_from_string(const char *name) return DEVICE_HIP; else if (strcmp(name, "METAL") == 0) return DEVICE_METAL; + else if (strcmp(name, "ONEAPI") == 0) + return DEVICE_ONEAPI; return DEVICE_NONE; } @@ -144,6 +155,8 @@ string Device::string_from_type(DeviceType type) return "HIP"; else if (type == DEVICE_METAL) return "METAL"; + else if (type == DEVICE_ONEAPI) + return "ONEAPI"; return ""; } @@ -164,6 +177,9 @@ vector<DeviceType> Device::available_types() #ifdef WITH_METAL types.push_back(DEVICE_METAL); #endif +#ifdef WITH_ONEAPI + types.push_back(DEVICE_ONEAPI); +#endif return types; } @@ -219,6 +235,20 @@ vector<DeviceInfo> Device::available_devices(uint mask) } #endif +#ifdef WITH_ONEAPI + if (mask & DEVICE_MASK_ONEAPI) { + if (!(devices_initialized_mask & DEVICE_MASK_ONEAPI)) { + if (device_oneapi_init()) { + device_oneapi_info(oneapi_devices); + } + devices_initialized_mask |= DEVICE_MASK_ONEAPI; + } + foreach (DeviceInfo &info, oneapi_devices) { + devices.push_back(info); + } + } +#endif + if (mask & DEVICE_MASK_CPU) { if (!(devices_initialized_mask & DEVICE_MASK_CPU)) { device_cpu_info(cpu_devices); @@ -282,6 +312,15 @@ string Device::device_capabilities(uint mask) } #endif +#ifdef WITH_ONEAPI + if (mask & DEVICE_MASK_ONEAPI) { + if (device_oneapi_init()) { + capabilities += "\noneAPI device capabilities:\n"; + capabilities += device_oneapi_capabilities(); + } + } +#endif + #ifdef WITH_METAL if (mask & DEVICE_MASK_METAL) { if 
(device_metal_init()) { @@ -380,6 +419,7 @@ void Device::free_memory() cuda_devices.free_memory(); optix_devices.free_memory(); hip_devices.free_memory(); + oneapi_devices.free_memory(); cpu_devices.free_memory(); metal_devices.free_memory(); } diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 927caae600c..340be85e853 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -40,6 +40,7 @@ enum DeviceType { DEVICE_OPTIX, DEVICE_HIP, DEVICE_METAL, + DEVICE_ONEAPI, DEVICE_DUMMY, }; @@ -49,6 +50,7 @@ enum DeviceTypeMask { DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX), DEVICE_MASK_HIP = (1 << DEVICE_HIP), DEVICE_MASK_METAL = (1 << DEVICE_METAL), + DEVICE_MASK_ONEAPI = (1 << DEVICE_ONEAPI), DEVICE_MASK_ALL = ~0 }; @@ -273,6 +275,7 @@ class Device { static vector<DeviceInfo> cpu_devices; static vector<DeviceInfo> hip_devices; static vector<DeviceInfo> metal_devices; + static vector<DeviceInfo> oneapi_devices; static uint devices_initialized_mask; }; diff --git a/intern/cycles/device/oneapi/device.cpp b/intern/cycles/device/oneapi/device.cpp new file mode 100644 index 00000000000..b6f0f0c2b42 --- /dev/null +++ b/intern/cycles/device/oneapi/device.cpp @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#include "device/oneapi/device.h" + +#include "util/log.h" + +#ifdef WITH_ONEAPI +# include "device/device.h" +# include "device/oneapi/device_impl.h" + +# include "util/path.h" +# include "util/string.h" + +# ifdef __linux__ +# include <dlfcn.h> +# endif +#endif /* WITH_ONEAPI */ + +CCL_NAMESPACE_BEGIN + +#ifdef WITH_ONEAPI +static OneAPIDLLInterface oneapi_dll; +#endif + +#ifdef _WIN32 +# define LOAD_ONEAPI_SHARED_LIBRARY(path) (void *)(LoadLibrary(path)) +# define FREE_SHARED_LIBRARY(handle) FreeLibrary((HMODULE)handle) +# define GET_SHARED_LIBRARY_SYMBOL(handle, name) GetProcAddress((HMODULE)handle, name) +#elif __linux__ +# define LOAD_ONEAPI_SHARED_LIBRARY(path) 
dlopen(path, RTLD_NOW) +# define FREE_SHARED_LIBRARY(handle) dlclose(handle) +# define GET_SHARED_LIBRARY_SYMBOL(handle, name) dlsym(handle, name) +#endif + +bool device_oneapi_init() +{ +#if !defined(WITH_ONEAPI) + return false; +#else + + string lib_path = path_get("lib"); +# ifdef _WIN32 + lib_path = path_join(lib_path, "cycles_kernel_oneapi.dll"); +# else + lib_path = path_join(lib_path, "cycles_kernel_oneapi.so"); +# endif + void *lib_handle = LOAD_ONEAPI_SHARED_LIBRARY(lib_path.c_str()); + + /* This shouldn't happen, but it still makes sense to have a branch for this. */ + if (lib_handle == NULL) { + LOG(ERROR) << "oneAPI kernel shared library cannot be loaded for some reason. This should not " + "happen, however, it occurs hence oneAPI rendering will be disabled"; + return false; + } + +# define DLL_INTERFACE_CALL(function, return_type, ...) \ + (oneapi_dll.function) = reinterpret_cast<decltype(oneapi_dll.function)>( \ + GET_SHARED_LIBRARY_SYMBOL(lib_handle, #function)); \ + if (oneapi_dll.function == NULL) { \ + LOG(ERROR) << "oneAPI shared library function \"" << #function \ + << "\" has not been loaded from kernel shared - disable oneAPI " \ + "library disable oneAPI implementation due to this"; \ + FREE_SHARED_LIBRARY(lib_handle); \ + return false; \ + } +# include "kernel/device/oneapi/dll_interface_template.h" +# undef DLL_INTERFACE_CALL + + VLOG_INFO << "oneAPI kernel shared library has been loaded successfully"; + + /* We need to have this oneapi kernel shared library during all life-span of the Blender. + * So it is not unloaded because of this. + * FREE_SHARED_LIBRARY(lib_handle); */ + + /* NOTE(@nsirgien): we need to enable JIT cache from here and + * right now this cache policy is controlled by env. variables. */ + /* NOTE(hallade) we also disable use of copy engine as it + * improves stability as of intel/llvm sycl-nightly/20220529. + * All these env variable can be set beforehand by end-users and + * will in that case -not- be overwritten. 
*/ +# ifdef _WIN32 + if (getenv("SYCL_CACHE_PERSISTENT") == nullptr) { + _putenv_s("SYCL_CACHE_PERSISTENT", "1"); + } + if (getenv("SYCL_CACHE_TRESHOLD") == nullptr) { + _putenv_s("SYCL_CACHE_THRESHOLD", "0"); + } + if (getenv("SYCL_DEVICE_FILTER") == nullptr) { + _putenv_s("SYCL_DEVICE_FILTER", "host,level_zero"); + } + if (getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE") == nullptr) { + _putenv_s("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0"); + } +# elif __linux__ + setenv("SYCL_CACHE_PERSISTENT", "1", false); + setenv("SYCL_CACHE_THRESHOLD", "0", false); + setenv("SYCL_DEVICE_FILTER", "host,level_zero", false); + setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false); +# endif + + return true; +#endif +} + +#if defined(_WIN32) || defined(__linux__) +# undef LOAD_SYCL_SHARED_LIBRARY +# undef LOAD_ONEAPI_SHARED_LIBRARY +# undef FREE_SHARED_LIBRARY +# undef GET_SHARED_LIBRARY_SYMBOL +#endif + +Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler) +{ +#ifdef WITH_ONEAPI + return new OneapiDevice(info, oneapi_dll, stats, profiler); +#else + (void)info; + (void)stats; + (void)profiler; + + LOG(FATAL) << "Requested to create oneAPI device while not enabled for this build."; + + return nullptr; +#endif +} + +#ifdef WITH_ONEAPI +static void device_iterator_cb(const char *id, const char *name, int num, void *user_ptr) +{ + vector<DeviceInfo> *devices = (vector<DeviceInfo> *)user_ptr; + + DeviceInfo info; + + info.type = DEVICE_ONEAPI; + info.description = name; + info.num = num; + + /* NOTE(@nsirgien): Should be unique at least on proper oneapi installation. */ + info.id = id; + + info.has_nanovdb = true; + info.denoisers = 0; + + info.has_gpu_queue = true; + + /* NOTE(@nsirgien): oneAPI right now is focused on one device usage. In future it maybe will + * change, but right now peer access from one device to another device is not supported. 
*/ + info.has_peer_memory = false; + + /* NOTE(@nsirgien): Seems not possible to know from SYCL/oneAPI or Level0. */ + info.display_device = false; + + devices->push_back(info); + VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\"."; +} +#endif + +void device_oneapi_info(vector<DeviceInfo> &devices) +{ +#ifdef WITH_ONEAPI + (oneapi_dll.oneapi_iterate_devices)(device_iterator_cb, &devices); +#else /* WITH_ONEAPI */ + (void)devices; +#endif /* WITH_ONEAPI */ +} + +string device_oneapi_capabilities() +{ + string capabilities; +#ifdef WITH_ONEAPI + char *c_capabilities = (oneapi_dll.oneapi_device_capabilities)(); + if (c_capabilities) { + capabilities = c_capabilities; + (oneapi_dll.oneapi_free)(c_capabilities); + } +#endif + return capabilities; +} + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/oneapi/device.h b/intern/cycles/device/oneapi/device.h new file mode 100644 index 00000000000..db8c985d4d5 --- /dev/null +++ b/intern/cycles/device/oneapi/device.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +#include "util/string.h" +#include "util/vector.h" + +CCL_NAMESPACE_BEGIN + +class Device; +class DeviceInfo; +class Profiler; +class Stats; + +bool device_oneapi_init(); + +Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler); + +void device_oneapi_info(vector<DeviceInfo> &devices); + +string device_oneapi_capabilities(); + +CCL_NAMESPACE_END diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp new file mode 100644 index 00000000000..8c8ab522b47 --- /dev/null +++ b/intern/cycles/device/oneapi/device_impl.cpp @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_ONEAPI + +# include "device/oneapi/device_impl.h" + +# include "util/debug.h" +# include "util/log.h" + +# include "kernel/device/oneapi/kernel.h" + 
+CCL_NAMESPACE_BEGIN + +static void queue_error_cb(const char *message, void *user_ptr) +{ + if (user_ptr) { + *reinterpret_cast<std::string *>(user_ptr) = message; + } +} + +OneapiDevice::OneapiDevice(const DeviceInfo &info, + OneAPIDLLInterface &oneapi_dll_object, + Stats &stats, + Profiler &profiler) + : Device(info, stats, profiler), + device_queue_(nullptr), + texture_info_(this, "texture_info", MEM_GLOBAL), + kg_memory_(nullptr), + kg_memory_device_(nullptr), + kg_memory_size_(0), + oneapi_dll_(oneapi_dll_object) +{ + need_texture_info_ = false; + + oneapi_dll_.oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_); + + /* Oneapi calls should be initialised on this moment. */ + assert(oneapi_dll_.oneapi_create_queue != nullptr); + + bool is_finished_ok = oneapi_dll_.oneapi_create_queue(device_queue_, info.num); + if (is_finished_ok == false) { + set_error("oneAPI queue initialization error: got runtime exception \"" + + oneapi_error_string_ + "\""); + } + else { + VLOG_DEBUG << "oneAPI queue has been successfully created for the device \"" + << info.description << "\""; + assert(device_queue_); + } + + size_t globals_segment_size; + is_finished_ok = oneapi_dll_.oneapi_kernel_globals_size(device_queue_, globals_segment_size); + if (is_finished_ok == false) { + set_error("oneAPI constant memory initialization got runtime exception \"" + + oneapi_error_string_ + "\""); + } + else { + VLOG_DEBUG << "Successfully created global/constant memory segment (kernel globals object)"; + } + + kg_memory_ = oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, globals_segment_size, 16); + oneapi_dll_.oneapi_usm_memset(device_queue_, kg_memory_, 0, globals_segment_size); + + kg_memory_device_ = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, globals_segment_size); + + kg_memory_size_ = globals_segment_size; +} + +OneapiDevice::~OneapiDevice() +{ + texture_info_.free(); + oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_); + 
oneapi_dll_.oneapi_usm_free(device_queue_, kg_memory_device_); + + for (ConstMemMap::iterator mt = const_mem_map_.begin(); mt != const_mem_map_.end(); mt++) + delete mt->second; + + if (device_queue_) + oneapi_dll_.oneapi_free_queue(device_queue_); +} + +bool OneapiDevice::check_peer_access(Device * /*peer_device*/) +{ + return false; +} + +BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const +{ + return BVH_LAYOUT_BVH2; +} + +bool OneapiDevice::load_kernels(const uint requested_features) +{ + assert(device_queue_); + /* NOTE(@nsirgien): oneAPI can support compilation of kernel code with sertain feature set + * with specialization constants, but it hasn't been implemented yet. */ + (void)requested_features; + + bool is_finished_ok = oneapi_dll_.oneapi_run_test_kernel(device_queue_); + if (is_finished_ok == false) { + set_error("oneAPI kernel load: got runtime exception \"" + oneapi_error_string_ + "\""); + } + else { + VLOG_INFO << "Runtime compilation done for \"" << info.description << "\""; + assert(device_queue_); + } + return is_finished_ok; +} + +void OneapiDevice::load_texture_info() +{ + if (need_texture_info_) { + need_texture_info_ = false; + texture_info_.copy_to_device(); + } +} + +void OneapiDevice::generic_alloc(device_memory &mem) +{ + size_t memory_size = mem.memory_size(); + + /* TODO(@nsirgien): In future, if scene doesn't fit into device memory, then + * we can use USM host memory. + * Because of the expected performance impact, implementation of this has had a low priority + * and is not implemented yet. */ + + assert(device_queue_); + /* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device + * and shared. For new project it maybe more beneficial to use USM shared memory, because it + * provides automatic migration mechanism in order to allow to use the same pointer on host and + * on device, without need to worry about explicit memory transfer operations. 
But for + * Blender/Cycles this type of memory is not very suitable in current application architecture, + * because Cycles already uses two different pointer for host activity and device activity, and + * also has to perform all needed memory transfer operations. So, USM device memory + * type has been used for oneAPI device in order to better fit in Cycles architecture. */ + void *device_pointer = oneapi_dll_.oneapi_usm_alloc_device(device_queue_, memory_size); + if (device_pointer == nullptr) { + size_t max_memory_on_device = oneapi_dll_.oneapi_get_memcapacity(device_queue_); + set_error("oneAPI kernel - device memory allocation error for " + + string_human_readable_size(mem.memory_size()) + + ", possibly caused by lack of available memory space on the device: " + + string_human_readable_size(stats.mem_used) + " of " + + string_human_readable_size(max_memory_on_device) + " is already allocated"); + return; + } + assert(device_pointer); + + mem.device_pointer = reinterpret_cast<ccl::device_ptr>(device_pointer); + mem.device_size = memory_size; + + stats.mem_alloc(memory_size); +} + +void OneapiDevice::generic_copy_to(device_memory &mem) +{ + size_t memory_size = mem.memory_size(); + + /* Copy operation from host shouldn't be requested if there is no memory allocated on host. */ + assert(mem.host_pointer); + assert(device_queue_); + oneapi_dll_.oneapi_usm_memcpy( + device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size); +} + +/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. 
*/ +SyclQueue *OneapiDevice::sycl_queue() +{ + return device_queue_; +} + +string OneapiDevice::oneapi_error_message() +{ + return string(oneapi_error_string_); +} + +OneAPIDLLInterface OneapiDevice::oneapi_dll_object() +{ + return oneapi_dll_; +} + +void *OneapiDevice::kernel_globals_device_pointer() +{ + return kg_memory_device_; +} + +void OneapiDevice::generic_free(device_memory &mem) +{ + assert(mem.device_pointer); + stats.mem_free(mem.device_size); + mem.device_size = 0; + + assert(device_queue_); + oneapi_dll_.oneapi_usm_free(device_queue_, (void *)mem.device_pointer); + mem.device_pointer = 0; +} + +void OneapiDevice::mem_alloc(device_memory &mem) +{ + if (mem.type == MEM_TEXTURE) { + assert(!"mem_alloc not supported for textures."); + } + else if (mem.type == MEM_GLOBAL) { + assert(!"mem_alloc not supported for global memory."); + } + else { + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_alloc: \"" << mem.name << "\", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + generic_alloc(mem); + } +} + +void OneapiDevice::mem_copy_to(device_memory &mem) +{ + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_copy_to: \"" << mem.name << "\", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")"; + } + + if (mem.type == MEM_GLOBAL) { + global_free(mem); + global_alloc(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + tex_alloc((device_texture &)mem); + } + else { + if (!mem.device_pointer) + mem_alloc(mem); + + generic_copy_to(mem); + } +} + +void OneapiDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) +{ + if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) { + assert(!"mem_copy_from not supported for textures."); + } + else if (mem.host_pointer) { + const size_t size = (w > 0 || h > 0 || elem > 0) ? 
(elem * w * h) : mem.memory_size(); + const size_t offset = elem * y * w; + + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_copy_from: \"" << mem.name << "\" object of " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ") from offset " << offset + << " data " << size << " bytes"; + } + + assert(device_queue_); + + assert(size != 0); + assert(mem.device_pointer); + char *shifted_host = reinterpret_cast<char *>(mem.host_pointer) + offset; + char *shifted_device = reinterpret_cast<char *>(mem.device_pointer) + offset; + bool is_finished_ok = oneapi_dll_.oneapi_usm_memcpy( + device_queue_, shifted_host, shifted_device, size); + if (is_finished_ok == false) { + set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ + + "\""); + } + } +} + +void OneapiDevice::mem_zero(device_memory &mem) +{ + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_zero: \"" << mem.name << "\", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")\n"; + } + + if (!mem.device_pointer) { + mem_alloc(mem); + } + if (!mem.device_pointer) { + return; + } + + assert(device_queue_); + bool is_finished_ok = oneapi_dll_.oneapi_usm_memset( + device_queue_, (void *)mem.device_pointer, 0, mem.memory_size()); + if (is_finished_ok == false) { + set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ + + "\""); + } +} + +void OneapiDevice::mem_free(device_memory &mem) +{ + if (mem.name) { + VLOG_DEBUG << "OneapiDevice::mem_free: \"" << mem.name << "\", " + << string_human_readable_number(mem.device_size) << " bytes. 
(" + << string_human_readable_size(mem.device_size) << ")\n"; + } + + if (mem.type == MEM_GLOBAL) { + global_free(mem); + } + else if (mem.type == MEM_TEXTURE) { + tex_free((device_texture &)mem); + } + else { + generic_free(mem); + } +} + +device_ptr OneapiDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) +{ + return reinterpret_cast<device_ptr>(reinterpret_cast<char *>(mem.device_pointer) + + mem.memory_elements_size(offset)); +} + +void OneapiDevice::const_copy_to(const char *name, void *host, size_t size) +{ + assert(name); + + VLOG_DEBUG << "OneapiDevice::const_copy_to \"" << name << "\" object " + << string_human_readable_number(size) << " bytes. (" + << string_human_readable_size(size) << ")"; + + ConstMemMap::iterator i = const_mem_map_.find(name); + device_vector<uchar> *data; + + if (i == const_mem_map_.end()) { + data = new device_vector<uchar>(this, name, MEM_READ_ONLY); + data->alloc(size); + const_mem_map_.insert(ConstMemMap::value_type(name, data)); + } + else { + data = i->second; + } + + assert(data->memory_size() <= size); + memcpy(data->data(), host, size); + data->copy_to_device(); + + oneapi_dll_.oneapi_set_global_memory( + device_queue_, kg_memory_, name, (void *)data->device_pointer); + + oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_); +} + +void OneapiDevice::global_alloc(device_memory &mem) +{ + assert(mem.name); + + size_t size = mem.memory_size(); + VLOG_DEBUG << "OneapiDevice::global_alloc \"" << mem.name << "\" object " + << string_human_readable_number(size) << " bytes. 
(" + << string_human_readable_size(size) << ")"; + + generic_alloc(mem); + generic_copy_to(mem); + + oneapi_dll_.oneapi_set_global_memory( + device_queue_, kg_memory_, mem.name, (void *)mem.device_pointer); + + oneapi_dll_.oneapi_usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_); +} + +void OneapiDevice::global_free(device_memory &mem) +{ + if (mem.device_pointer) { + generic_free(mem); + } +} + +void OneapiDevice::tex_alloc(device_texture &mem) +{ + generic_alloc(mem); + generic_copy_to(mem); + + /* Resize if needed. Also, in case of resize - allocate in advance for future allocs. */ + const uint slot = mem.slot; + if (slot >= texture_info_.size()) { + texture_info_.resize(slot + 128); + } + + texture_info_[slot] = mem.info; + need_texture_info_ = true; + + texture_info_[slot].data = (uint64_t)mem.device_pointer; +} + +void OneapiDevice::tex_free(device_texture &mem) +{ + /* There is no texture memory in SYCL. */ + if (mem.device_pointer) { + generic_free(mem); + } +} + +unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create() +{ + return make_unique<OneapiDeviceQueue>(this); +} + +bool OneapiDevice::should_use_graphics_interop() +{ + /* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so + * return false. 
*/ + return false; +} + +void *OneapiDevice::usm_aligned_alloc_host(size_t memory_size, size_t alignment) +{ + assert(device_queue_); + return oneapi_dll_.oneapi_usm_aligned_alloc_host(device_queue_, memory_size, alignment); +} + +void OneapiDevice::usm_free(void *usm_ptr) +{ + assert(device_queue_); + return oneapi_dll_.oneapi_usm_free(device_queue_, usm_ptr); +} + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/oneapi/device_impl.h b/intern/cycles/device/oneapi/device_impl.h new file mode 100644 index 00000000000..f925687ebe9 --- /dev/null +++ b/intern/cycles/device/oneapi/device_impl.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_ONEAPI + +# include "device/device.h" +# include "device/oneapi/device.h" +# include "device/oneapi/queue.h" + +# include "util/map.h" + +CCL_NAMESPACE_BEGIN + +class DeviceQueue; + +class OneapiDevice : public Device { + private: + SyclQueue *device_queue_; + + using ConstMemMap = map<string, device_vector<uchar> *>; + ConstMemMap const_mem_map_; + device_vector<TextureInfo> texture_info_; + bool need_texture_info_; + void *kg_memory_; + void *kg_memory_device_; + size_t kg_memory_size_ = (size_t)0; + OneAPIDLLInterface oneapi_dll_; + std::string oneapi_error_string_; + + public: + virtual BVHLayoutMask get_bvh_layout_mask() const override; + + OneapiDevice(const DeviceInfo &info, + OneAPIDLLInterface &oneapi_dll_object, + Stats &stats, + Profiler &profiler); + + virtual ~OneapiDevice(); + + bool check_peer_access(Device *peer_device) override; + + bool load_kernels(const uint requested_features) override; + + void load_texture_info(); + + void generic_alloc(device_memory &mem); + + void generic_copy_to(device_memory &mem); + + void generic_free(device_memory &mem); + + SyclQueue *sycl_queue(); + + string oneapi_error_message(); + + OneAPIDLLInterface oneapi_dll_object(); + + void *kernel_globals_device_pointer(); + + void mem_alloc(device_memory 
&mem) override; + + void mem_copy_to(device_memory &mem) override; + + void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override; + + void mem_copy_from(device_memory &mem) + { + mem_copy_from(mem, 0, 0, 0, 0); + } + + void mem_zero(device_memory &mem) override; + + void mem_free(device_memory &mem) override; + + device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override; + + virtual void const_copy_to(const char *name, void *host, size_t size) override; + + void global_alloc(device_memory &mem); + + void global_free(device_memory &mem); + + void tex_alloc(device_texture &mem); + + void tex_free(device_texture &mem); + + /* Graphics resources interoperability. */ + virtual bool should_use_graphics_interop() override; + + virtual unique_ptr<DeviceQueue> gpu_queue_create() override; + + /* NOTE(@nsirgien): Create this methods to avoid some compilation problems on Windows with host + * side compilation (MSVC). */ + void *usm_aligned_alloc_host(size_t memory_size, size_t alignment); + void usm_free(void *usm_ptr); +}; + +CCL_NAMESPACE_END + +#endif diff --git a/intern/cycles/device/oneapi/dll_interface.h b/intern/cycles/device/oneapi/dll_interface.h new file mode 100644 index 00000000000..bc681ff8f64 --- /dev/null +++ b/intern/cycles/device/oneapi/dll_interface.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2011-2022 Blender Foundation */ + +#pragma once + +/* Include kernel header to get access to sycl-specific types, like SyclQueue and + * OneAPIDeviceIteratorCallback. */ +#include "kernel/device/oneapi/kernel.h" + +#ifdef WITH_ONEAPI +struct OneAPIDLLInterface { +# define DLL_INTERFACE_CALL(function, return_type, ...) 
\ + return_type (*function)(__VA_ARGS__) = nullptr; +# include "kernel/device/oneapi/dll_interface_template.h" +# undef DLL_INTERFACE_CALL +}; +#endif diff --git a/intern/cycles/device/oneapi/queue.cpp b/intern/cycles/device/oneapi/queue.cpp new file mode 100644 index 00000000000..42e2408ee7a --- /dev/null +++ b/intern/cycles/device/oneapi/queue.cpp @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_ONEAPI + +# include "device/oneapi/queue.h" +# include "device/oneapi/device_impl.h" +# include "util/log.h" +# include "util/time.h" +# include <iomanip> +# include <vector> + +# include "kernel/device/oneapi/kernel.h" + +CCL_NAMESPACE_BEGIN + +struct KernelExecutionInfo { + double elapsed_summary = 0.0; + int enqueue_count = 0; +}; + +/* OneapiDeviceQueue */ + +OneapiDeviceQueue::OneapiDeviceQueue(OneapiDevice *device) + : DeviceQueue(device), + oneapi_device_(device), + oneapi_dll_(device->oneapi_dll_object()), + kernel_context_(nullptr) +{ +} + +OneapiDeviceQueue::~OneapiDeviceQueue() +{ + delete kernel_context_; +} + +int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const +{ + int num_states; + + /* TODO: implement and use get_num_multiprocessors and get_max_num_threads_per_multiprocessor. */ + const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount( + oneapi_device_->sycl_queue()); + if (compute_units >= 128) { + /* dGPU path, make sense to allocate more states, because it will be dedicated GPU memory. */ + int base = 1024 * 1024; + /* linear dependency (with coefficient less that 1) from amount of compute units. */ + num_states = (base * (compute_units / 128)) * 3 / 4; + + /* Limit amount of integrator states by one quarter of device memory, because + * other allocations will need some space as well + * TODO: base this calculation on the how many states what the GPU is actually capable of + * running, with some headroom to improve occupancy. 
If the texture don't fit, offload into + * unified memory. */ + size_t states_memory_size = num_states * state_size; + size_t device_memory_amount = + (oneapi_dll_.oneapi_get_memcapacity)(oneapi_device_->sycl_queue()); + if (states_memory_size >= device_memory_amount / 4) { + num_states = device_memory_amount / 4 / state_size; + } + } + else { + /* iGPU path - no real need to allocate a lot of integrator states because it is shared GPU + * memory. */ + num_states = 1024 * 512; + } + + VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to " + << string_human_readable_size(num_states * state_size); + + return num_states; +} + +int OneapiDeviceQueue::num_concurrent_busy_states() const +{ + const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount( + oneapi_device_->sycl_queue()); + if (compute_units >= 128) { + return 1024 * 1024; + } + else { + return 1024 * 512; + } +} + +void OneapiDeviceQueue::init_execution() +{ + oneapi_device_->load_texture_info(); + + SyclQueue *device_queue = oneapi_device_->sycl_queue(); + void *kg_dptr = (void *)oneapi_device_->kernel_globals_device_pointer(); + assert(device_queue); + assert(kg_dptr); + kernel_context_ = new KernelContext{device_queue, kg_dptr}; + + debug_init_execution(); +} + +bool OneapiDeviceQueue::enqueue(DeviceKernel kernel, + const int signed_kernel_work_size, + DeviceKernelArguments const &_args) +{ + if (oneapi_device_->have_error()) { + return false; + } + + void **args = const_cast<void **>(_args.values); + + debug_enqueue(kernel, signed_kernel_work_size); + assert(signed_kernel_work_size >= 0); + size_t kernel_work_size = (size_t)signed_kernel_work_size; + + size_t kernel_local_size = oneapi_dll_.oneapi_kernel_preferred_local_size( + kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size); + size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size); + + assert(kernel_context_); + + /* Call the oneAPI kernel DLL to launch the 
requested kernel. */ + bool is_finished_ok = oneapi_dll_.oneapi_enqueue_kernel( + kernel_context_, kernel, uniformed_kernel_work_size, args); + + if (is_finished_ok == false) { + oneapi_device_->set_error("oneAPI kernel \"" + std::string(device_kernel_as_string(kernel)) + + "\" execution error: got runtime exception \"" + + oneapi_device_->oneapi_error_message() + "\""); + } + + return is_finished_ok; +} + +bool OneapiDeviceQueue::synchronize() +{ + if (oneapi_device_->have_error()) { + return false; + } + + bool is_finished_ok = oneapi_dll_.oneapi_queue_synchronize(oneapi_device_->sycl_queue()); + if (is_finished_ok == false) + oneapi_device_->set_error("oneAPI unknown kernel execution error: got runtime exception \"" + + oneapi_device_->oneapi_error_message() + "\""); + + debug_synchronize(); + + return !(oneapi_device_->have_error()); +} + +void OneapiDeviceQueue::zero_to_device(device_memory &mem) +{ + oneapi_device_->mem_zero(mem); +} + +void OneapiDeviceQueue::copy_to_device(device_memory &mem) +{ + oneapi_device_->mem_copy_to(mem); +} + +void OneapiDeviceQueue::copy_from_device(device_memory &mem) +{ + oneapi_device_->mem_copy_from(mem); +} + +CCL_NAMESPACE_END + +#endif /* WITH_ONEAPI */ diff --git a/intern/cycles/device/oneapi/queue.h b/intern/cycles/device/oneapi/queue.h new file mode 100644 index 00000000000..09a015303b6 --- /dev/null +++ b/intern/cycles/device/oneapi/queue.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +#ifdef WITH_ONEAPI + +# include "device/kernel.h" +# include "device/memory.h" +# include "device/queue.h" + +# include "device/oneapi/device.h" +# include "device/oneapi/dll_interface.h" + +CCL_NAMESPACE_BEGIN + +class OneapiDevice; +class device_memory; + +/* Base class for Oneapi queues. 
*/ +class OneapiDeviceQueue : public DeviceQueue { + public: + explicit OneapiDeviceQueue(OneapiDevice *device); + ~OneapiDeviceQueue(); + + virtual int num_concurrent_states(const size_t state_size) const override; + + virtual int num_concurrent_busy_states() const override; + + virtual void init_execution() override; + + virtual bool enqueue(DeviceKernel kernel, + const int kernel_work_size, + DeviceKernelArguments const &args) override; + + virtual bool synchronize() override; + + virtual void zero_to_device(device_memory &mem) override; + virtual void copy_to_device(device_memory &mem) override; + virtual void copy_from_device(device_memory &mem) override; + + protected: + OneapiDevice *oneapi_device_; + OneAPIDLLInterface oneapi_dll_; + KernelContext *kernel_context_; + bool with_kernel_statistics_; +}; + +CCL_NAMESPACE_END + +#endif /* WITH_ONEAPI */ diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp index 9ad1c465725..6912bf928cd 100644 --- a/intern/cycles/integrator/path_trace.cpp +++ b/intern/cycles/integrator/path_trace.cpp @@ -1103,6 +1103,8 @@ static const char *device_type_for_description(const DeviceType type) return "OptiX"; case DEVICE_HIP: return "HIP"; + case DEVICE_ONEAPI: + return "oneAPI"; case DEVICE_DUMMY: return "Dummy"; case DEVICE_MULTI: diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index a07d7852211..ccd694dfdfd 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -37,6 +37,10 @@ set(SRC_KERNEL_DEVICE_OPTIX device/optix/kernel_shader_raytrace.cu ) +set(SRC_KERNEL_DEVICE_ONEAPI + device/oneapi/kernel.cpp +) + set(SRC_KERNEL_DEVICE_CPU_HEADERS device/cpu/compat.h device/cpu/image.h @@ -78,6 +82,17 @@ set(SRC_KERNEL_DEVICE_METAL_HEADERS device/metal/globals.h ) +set(SRC_KERNEL_DEVICE_ONEAPI_HEADERS + device/oneapi/compat.h + device/oneapi/context_begin.h + device/oneapi/context_end.h + device/oneapi/device_id.h + 
device/oneapi/globals.h + device/oneapi/image.h + device/oneapi/kernel.h + device/oneapi/kernel_templates.h +) + set(SRC_KERNEL_CLOSURE_HEADERS closure/alloc.h closure/bsdf.h @@ -687,6 +702,212 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES) cycles_set_solution_folder(cycles_kernel_optix) endif() +if(WITH_CYCLES_DEVICE_ONEAPI) + if(WIN32) + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.dll) + else() + set(cycles_kernel_oneapi_lib ${CMAKE_CURRENT_BINARY_DIR}/cycles_kernel_oneapi.so) + endif() + + set(cycles_oneapi_kernel_sources + ${SRC_KERNEL_DEVICE_ONEAPI} + ${SRC_KERNEL_HEADERS} + ${SRC_KERNEL_DEVICE_GPU_HEADERS} + ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS} + ${SRC_UTIL_HEADERS} + ) + + # SYCL_CPP_FLAGS is a variable that the user can set to pass extra compiler options + set(sycl_compiler_flags + ${CMAKE_CURRENT_SOURCE_DIR}/${SRC_KERNEL_DEVICE_ONEAPI} + -fsycl + -fsycl-unnamed-lambda + -fdelayed-template-parsing + -mllvm -inlinedefault-threshold=300 + -mllvm -inlinehint-threshold=400 + -shared + -DWITH_ONEAPI + -ffast-math + -DNDEBUG + -O2 + -o ${cycles_kernel_oneapi_lib} + -I${CMAKE_CURRENT_SOURCE_DIR}/.. 
+ -I${LEVEL_ZERO_INCLUDE_DIR} + ${LEVEL_ZERO_LIBRARY} + ${SYCL_CPP_FLAGS} + ) + + + if (WITH_CYCLES_ONEAPI_SYCL_HOST_ENABLED) + list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_ENABLED) + endif() + + # Set defaults for spir64 and spir64_gen options + if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64) + set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'") + endif() + if (NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen) + SET (CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}" CACHE STRING "Extra build options for spir64_gen target") + endif() + # enabling zebin (graphics binary format with improved compatibility) on Windows only while support on Linux isn't available yet + if(WIN32) + string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "--format zebin ") + endif() + string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "-device ${CYCLES_ONEAPI_SPIR64_GEN_DEVICES} ") + + if (WITH_CYCLES_ONEAPI_BINARIES) + # Iterate over all targest and their options + list (JOIN CYCLES_ONEAPI_SYCL_TARGETS "," targets_string) + list (APPEND sycl_compiler_flags -fsycl-targets=${targets_string}) + foreach(target ${CYCLES_ONEAPI_SYCL_TARGETS}) + if(DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_${target}) + list (APPEND sycl_compiler_flags -Xsycl-target-backend=${target} "${CYCLES_ONEAPI_SYCL_OPTIONS_${target}}") + endif() + endforeach() + else() + # If AOT is disabled, build for spir64 + list(APPEND sycl_compiler_flags + -fsycl-targets=spir64 + -Xsycl-target-backend=spir64 "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}") + endif() + + if(WITH_NANOVDB) + list(APPEND sycl_compiler_flags + -DWITH_NANOVDB + -I"${NANOVDB_INCLUDE_DIR}") + endif() + + if(WITH_CYCLES_DEBUG) + list(APPEND sycl_compiler_flags -DWITH_CYCLES_DEBUG) + endif() + + get_filename_component(sycl_compiler_root ${SYCL_COMPILER} DIRECTORY) + get_filename_component(sycl_compiler_compiler_name ${SYCL_COMPILER} NAME_WE) + + if(NOT 
OCLOC_INSTALL_DIR) + get_filename_component(OCLOC_INSTALL_DIR "${sycl_compiler_root}/../lib/ocloc" ABSOLUTE) + endif() + if(WITH_CYCLES_ONEAPI_BINARIES AND NOT EXISTS ${OCLOC_INSTALL_DIR}) + message(FATAL_ERROR "WITH_CYCLES_ONEAPI_BINARIES requires ocloc but ${OCLOC_INSTALL_DIR} directory doesn't exist." + " A different ocloc directory can be set using OCLOC_INSTALL_DIR cmake variable.") + endif() + + if(UNIX AND NOT APPLE) + if(NOT WITH_CXX11_ABI) + check_library_exists(sycl + _ZN2cl4sycl7handler22verifyUsedKernelBundleERKSs ${sycl_compiler_root}/../lib SYCL_NO_CXX11_ABI) + if(SYCL_NO_CXX11_ABI) + list(APPEND sycl_compiler_flags -D_GLIBCXX_USE_CXX11_ABI=0) + endif() + endif() + endif() + + if(WIN32) + list(APPEND sycl_compiler_flags + -fms-extensions + -fms-compatibility + -D_WINDLL + -D_MBCS + -DWIN32 + -D_WINDOWS + -D_CRT_NONSTDC_NO_DEPRECATE + -D_CRT_SECURE_NO_DEPRECATE + -DONEAPI_EXPORT) + + if(sycl_compiler_compiler_name MATCHES "dpcpp") + # The oneAPI distribution calls the compiler "dpcpp" and comes with a script that sets environment variables. + add_custom_command( + OUTPUT ${cycles_kernel_oneapi_lib} + COMMAND "${sycl_compiler_root}/../../env/vars.bat" + COMMAND ${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags} + DEPENDS ${cycles_oneapi_kernel_sources}) + else() + # The open source SYCL compiler just goes by clang++ and does not have such a script. + # Set the variables manually. 
+ string(REPLACE /Redist/ /Tools/ MSVC_TOOLS_DIR ${MSVC_REDIST_DIR}) + if(NOT CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION) # case for Ninja on Windows + get_filename_component(cmake_mt_dir ${CMAKE_MT} DIRECTORY) + string(REPLACE /bin/ /Lib/ WINDOWS_KIT_DIR ${cmake_mt_dir}) + get_filename_component(WINDOWS_KIT_DIR "${WINDOWS_KIT_DIR}/../" ABSOLUTE) + else() + set(WINDOWS_KIT_DIR ${WINDOWS_KITS_DIR}/Lib/${CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION}) + endif() + list(APPEND sycl_compiler_flags + -L "${MSVC_TOOLS_DIR}/lib/x64" + -L "${WINDOWS_KIT_DIR}/um/x64" + -L "${WINDOWS_KIT_DIR}/ucrt/x64") + add_custom_command( + OUTPUT ${cycles_kernel_oneapi_lib} + COMMAND ${CMAKE_COMMAND} -E env + "LIB=${sycl_compiler_root}/../lib" # for compiler to find sycl.lib + "PATH=${OCLOC_INSTALL_DIR};${sycl_compiler_root}" + ${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags} + DEPENDS ${cycles_oneapi_kernel_sources}) + endif() + else() + list(APPEND sycl_compiler_flags -fPIC) + + # avoid getting __FAST_MATH__ to be defined for the graphics compiler on CentOS 7 until the compile-time issue it triggers gets fixed. + if(WITH_CYCLES_ONEAPI_BINARIES) + list(APPEND sycl_compiler_flags -fhonor-nans) + endif() + + # add $ORIGIN to cycles_kernel_oneapi.so rpath so libsycl.so and + # libpi_level_zero.so can be placed next to it and get found. + list(APPEND sycl_compiler_flags -Wl,-rpath,'$$ORIGIN') + + # The oneAPI distribution calls the compiler "dpcpp" and comes with a script that sets environment variables. + if(sycl_compiler_compiler_name MATCHES "dpcpp") + add_custom_command( + OUTPUT ${cycles_kernel_oneapi_lib} + COMMAND bash -c \"source ${sycl_compiler_root}/../../env/vars.sh&&${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags}\" + DEPENDS ${cycles_oneapi_kernel_sources}) + else() + # The open source SYCL compiler just goes by clang++ and does not have such a script. + # Set the variables manually. 
+ if(NOT IGC_INSTALL_DIR) + get_filename_component(IGC_INSTALL_DIR "${sycl_compiler_root}/../lib/igc" ABSOLUTE) + endif() + add_custom_command( + OUTPUT ${cycles_kernel_oneapi_lib} + COMMAND ${CMAKE_COMMAND} -E env + "LD_LIBRARY_PATH=${sycl_compiler_root}/../lib:${OCLOC_INSTALL_DIR}/lib:${IGC_INSTALL_DIR}/lib" + "PATH=${OCLOC_INSTALL_DIR}/bin:${sycl_compiler_root}:$ENV{PATH}" # env PATH is for compiler to find ld + ${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags} + DEPENDS ${cycles_oneapi_kernel_sources}) + endif() + endif() + + # install dynamic libraries required at runtime + if(WIN32) + set(SYCL_RUNTIME_DEPENDENCIES + sycl.dll + pi_level_zero.dll + ) + if(NOT WITH_BLENDER) + # For the Cycles standalone put libraries next to the Cycles application. + delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH}) + else() + # For Blender put the libraries next to the Blender executable. + # + # Note that the installation path in the delayed_install is relative to the versioned folder, + # which means we need to go one level up. 
+ delayed_install("${sycl_compiler_root}" "${SYCL_RUNTIME_DEPENDENCIES}" "../") + endif() + elseif(UNIX AND NOT APPLE) + file(GLOB SYCL_RUNTIME_DEPENDENCIES + ${sycl_compiler_root}/../lib/libsycl.so + ${sycl_compiler_root}/../lib/libsycl.so.[0-9] + ${sycl_compiler_root}/../lib/libsycl.so.[0-9].[0-9].[0-9]-[0-9] + ) + list(APPEND SYCL_RUNTIME_DEPENDENCIES ${sycl_compiler_root}/../lib/libpi_level_zero.so) + delayed_install("" "${SYCL_RUNTIME_DEPENDENCIES}" ${CYCLES_INSTALL_PATH}/lib) + endif() + + delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cycles_kernel_oneapi_lib}" ${CYCLES_INSTALL_PATH}/lib) + add_custom_target(cycles_kernel_oneapi ALL DEPENDS ${cycles_kernel_oneapi_lib}) +endif() + # OSL module if(WITH_CYCLES_OSL) @@ -752,6 +973,7 @@ cycles_add_library(cycles_kernel "${LIB}" ${SRC_KERNEL_DEVICE_HIP_HEADERS} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS} ${SRC_KERNEL_DEVICE_METAL_HEADERS} + ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS} ) source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS}) @@ -764,6 +986,7 @@ source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS}) source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS}) source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}) source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS}) +source_group("device\\oneapi" FILES ${SRC_KERNEL_DEVICE_ONEAPI} ${SRC_KERNEL_DEVICE_ONEAPI_HEADERS}) source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS}) source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS}) source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS}) @@ -782,6 +1005,9 @@ endif() if(WITH_CYCLES_HIP) add_dependencies(cycles_kernel cycles_kernel_hip) endif() +if(WITH_CYCLES_DEVICE_ONEAPI) + add_dependencies(cycles_kernel cycles_kernel_oneapi) +endif() # Install kernel source for runtime compilation diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 
d657571a5fa..b9a44ccad02 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -14,6 +14,8 @@ #ifdef __KERNEL_METAL__ # include "kernel/device/metal/context_begin.h" +#elif defined(__KERNEL_ONEAPI__) +# include "kernel/device/oneapi/context_begin.h" #endif #include "kernel/device/gpu/work_stealing.h" @@ -40,6 +42,8 @@ #ifdef __KERNEL_METAL__ # include "kernel/device/metal/context_end.h" +#elif defined(__KERNEL_ONEAPI__) +# include "kernel/device/oneapi/context_end.h" #endif #include "kernel/film/read.h" diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index 7d7266d5edf..c1df49c4f49 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -18,15 +18,68 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 #endif -#ifndef __KERNEL_METAL__ +/* TODO: abstract more device differences, define ccl_gpu_local_syncthreads, + * ccl_gpu_thread_warp, ccl_gpu_warp_index, ccl_gpu_num_warps for all devices + * and keep device specific code in compat.h */ + +#ifdef __KERNEL_ONEAPI__ +# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED +template<typename IsActiveOp> +void cpu_serial_active_index_array_impl(const uint num_states, + ccl_global int *ccl_restrict indices, + ccl_global int *ccl_restrict num_indices, + IsActiveOp is_active_op) +{ + int write_index = 0; + for (int state_index = 0; state_index < num_states; state_index++) { + if (is_active_op(state_index)) + indices[write_index++] = state_index; + } + *num_indices = write_index; + return; +} +# endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */ + +template<typename IsActiveOp> +void gpu_parallel_active_index_array_impl(const uint num_states, + ccl_global int *ccl_restrict indices, + ccl_global int *ccl_restrict num_indices, + IsActiveOp is_active_op) +{ + const sycl::nd_item<1> &item_id = 
sycl::ext::oneapi::experimental::this_nd_item<1>(); + const uint blocksize = item_id.get_local_range(0); + + sycl::multi_ptr<int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1], + sycl::access::address_space::local_space> + ptr = sycl::ext::oneapi::group_local_memory< + int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group()); + int *warp_offset = *ptr; + + /* NOTE(@nsirgien): Here we calculate the same value as below but + * faster for DPC++ : seems CUDA converting "%", "/", "*" based calculations below into + * something faster already but DPC++ doesn't, so it's better to use + * direct request of needed parameters - switching from this computation to computation below + * will cause 2.5x performance slowdown. */ + const uint thread_index = item_id.get_local_id(0); + const uint thread_warp = item_id.get_sub_group().get_local_id(); + + const uint warp_index = item_id.get_sub_group().get_group_id(); + const uint num_warps = item_id.get_sub_group().get_group_range()[0]; + + const uint state_index = item_id.get_global_id(0); + + /* Test if state corresponding to this thread is active. */ + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; +#else /* !__KERNEL__ONEAPI__ */ +# ifndef __KERNEL_METAL__ template<uint blocksize, typename IsActiveOp> __device__ -#endif +# endif void gpu_parallel_active_index_array_impl(const uint num_states, ccl_global int *indices, ccl_global int *num_indices, -#ifdef __KERNEL_METAL__ +# ifdef __KERNEL_METAL__ const uint is_active, const uint blocksize, const int thread_index, @@ -37,7 +90,7 @@ __device__ const int num_warps, threadgroup int *warp_offset) { -#else +# else IsActiveOp is_active_op) { extern ccl_gpu_shared int warp_offset[]; @@ -52,18 +105,33 @@ __device__ /* Test if state corresponding to this thread is active. */ const uint is_active = (state_index < num_states) ? 
is_active_op(state_index) : 0; -#endif - +# endif +#endif /* !__KERNEL_ONEAPI__ */ /* For each thread within a warp compute how many other active states precede it. */ +#ifdef __KERNEL_ONEAPI__ + const uint thread_offset = sycl::exclusive_scan_over_group( + item_id.get_sub_group(), is_active, std::plus<>()); +#else const uint thread_offset = popcount(ccl_gpu_ballot(is_active) & ccl_gpu_thread_mask(thread_warp)); +#endif /* Last thread in warp stores number of active states for each warp. */ +#ifdef __KERNEL_ONEAPI__ + if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) { +#else if (thread_warp == ccl_gpu_warp_size - 1) { +#endif warp_offset[warp_index] = thread_offset + is_active; } +#ifdef __KERNEL_ONEAPI__ + /* NOTE(@nsirgien): For us here only local memory writing (warp_offset) is important, + * so faster local barriers can be used. */ + ccl_gpu_local_syncthreads(); +#else ccl_gpu_syncthreads(); +#endif /* Last thread in block converts per-warp sizes to offsets, increments global size of * index array and gets offset to write to. */ @@ -80,7 +148,13 @@ __device__ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); } +#ifdef __KERNEL_ONEAPI__ + /* NOTE(@nsirgien): For us here only important local memory writing (warp_offset), + * so faster local barriers can be used. */ + ccl_gpu_local_syncthreads(); +#else ccl_gpu_syncthreads(); +#endif /* Write to index array. 
*/ if (is_active) { @@ -107,7 +181,19 @@ __device__ simd_group_index, \ num_simd_groups, \ simdgroup_offset) - +#elif defined(__KERNEL_ONEAPI__) +# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED +# define gpu_parallel_active_index_array( \ + blocksize, num_states, indices, num_indices, is_active_op) \ + if (ccl_gpu_global_size_x() == 1) \ + cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \ + else \ + gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op); +# else +# define gpu_parallel_active_index_array( \ + blocksize, num_states, indices, num_indices, is_active_op) \ + gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op) +# endif #else # define gpu_parallel_active_index_array( \ diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h new file mode 100644 index 00000000000..30b0f088ede --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/compat.h @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +#define __KERNEL_GPU__ +#define __KERNEL_ONEAPI__ + +#define CCL_NAMESPACE_BEGIN +#define CCL_NAMESPACE_END + +#include <cstdint> + +#ifndef __NODES_MAX_GROUP__ +# define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX +#endif +#ifndef __NODES_FEATURES__ +# define __NODES_FEATURES__ NODE_FEATURE_ALL +#endif + +/* This one does not have an abstraction. + * It's used by other devices directly. 
+ */ + +#define __device__ + +/* Qualifier wrappers for different names on different devices */ + +#define ccl_device +#define ccl_global +#define ccl_always_inline __attribute__((always_inline)) +#define ccl_device_inline inline +#define ccl_noinline +#define ccl_inline_constant const constexpr +#define ccl_static_constant const +#define ccl_device_forceinline __attribute__((always_inline)) +#define ccl_device_noinline ccl_device ccl_noinline +#define ccl_device_noinline_cpu ccl_device +#define ccl_device_inline_method ccl_device +#define ccl_restrict __restrict__ +#define ccl_loop_no_unroll +#define ccl_optional_struct_init +#define ccl_private +#define ATTR_FALLTHROUGH __attribute__((fallthrough)) +#define ccl_constant const +#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__))) +#define ccl_align(n) __attribute__((aligned(n))) +#define kernel_assert(cond) +#define ccl_may_alias + +/* clang-format off */ + +/* kernel.h adapters */ +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) +#define ccl_gpu_kernel_threads(block_num_threads) + +#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED +# define KG_ND_ITEMS \ + kg->nd_item_local_id_0 = item.get_local_id(0); \ + kg->nd_item_local_range_0 = item.get_local_range(0); \ + kg->nd_item_group_0 = item.get_group(0); \ + kg->nd_item_group_range_0 = item.get_group_range(0); \ + kg->nd_item_global_id_0 = item.get_global_id(0); \ + kg->nd_item_global_range_0 = item.get_global_range(0); +#else +# define KG_ND_ITEMS +#endif + +#define ccl_gpu_kernel_signature(name, ...) 
\ +void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \ + size_t kernel_global_size, \ + size_t kernel_local_size, \ + sycl::handler &cgh, \ + __VA_ARGS__) { \ + (kg); \ + cgh.parallel_for<class kernel_##name>( \ + sycl::nd_range<1>(kernel_global_size, kernel_local_size), \ + [=](sycl::nd_item<1> item) { \ + KG_ND_ITEMS + +#define ccl_gpu_kernel_postfix \ + }); \ + } + +#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x + +#define ccl_gpu_kernel_lambda(func, ...) \ + struct KernelLambda \ + { \ + KernelLambda(const ONEAPIKernelContext *_kg) : kg(_kg) {} \ + ccl_private const ONEAPIKernelContext *kg; \ + __VA_ARGS__; \ + int operator()(const int state) const { return (func); } \ + } ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg) + +/* GPU thread, block, grid size and index */ +#ifndef WITH_ONEAPI_SYCL_HOST_ENABLED +# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0)) +# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0)) +# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0)) +# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0)) +# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) +# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) + +# define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0)) +# define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0)) +#else +# define ccl_gpu_thread_idx_x (kg->nd_item_local_id_0) +# define ccl_gpu_block_dim_x (kg->nd_item_local_range_0) +# define ccl_gpu_block_idx_x (kg->nd_item_group_0) +# define ccl_gpu_grid_dim_x (kg->nd_item_group_range_0) +# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0]) +# 
define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) + +# define ccl_gpu_global_id_x() (kg->nd_item_global_id_0) +# define ccl_gpu_global_size_x() (kg->nd_item_global_range_0) +#endif + + +/* GPU warp synchronization */ + +#define ccl_gpu_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier() +#define ccl_gpu_local_syncthreads() sycl::ext::oneapi::experimental::this_nd_item<1>().barrier(sycl::access::fence_space::local_space) +#ifdef __SYCL_DEVICE_ONLY__ + #define ccl_gpu_ballot(predicate) (sycl::ext::oneapi::group_ballot(sycl::ext::oneapi::experimental::this_sub_group(), predicate).count()) +#else + #define ccl_gpu_ballot(predicate) (predicate ? 1 : 0) +#endif + +/* Debug defines */ +#if defined(__SYCL_DEVICE_ONLY__) +# define CONSTANT __attribute__((opencl_constant)) +#else +# define CONSTANT +#endif + +#define sycl_printf(format, ...) { \ + static const CONSTANT char fmt[] = format; \ + sycl::ext::oneapi::experimental::printf(fmt, __VA_ARGS__ ); \ + } + +#define sycl_printf_(format) { \ + static const CONSTANT char fmt[] = format; \ + sycl::ext::oneapi::experimental::printf(fmt); \ + } + +/* GPU texture objects */ + +/* clang-format on */ + +/* Types */ +/* It's not possible to use sycl types like sycl::float3, sycl::int3, etc + * because these types have different interfaces from blender version */ + +using uchar = unsigned char; +using sycl::half; + +struct float3 { + float x, y, z; +}; + +ccl_always_inline float3 make_float3(float x, float y, float z) +{ + return {x, y, z}; +} +ccl_always_inline float3 make_float3(float x) +{ + return {x, x, x}; +} + +/* math functions */ +#define fabsf(x) sycl::fabs((x)) +#define copysignf(x, y) sycl::copysign((x), (y)) +#define asinf(x) sycl::asin((x)) +#define acosf(x) sycl::acos((x)) +#define atanf(x) sycl::atan((x)) +#define floorf(x) sycl::floor((x)) +#define ceilf(x) sycl::ceil((x)) +#define sinhf(x) sycl::sinh((x)) +#define coshf(x) sycl::cosh((x)) +#define 
tanhf(x) sycl::tanh((x)) +#define hypotf(x, y) sycl::hypot((x), (y)) +#define atan2f(x, y) sycl::atan2((x), (y)) +#define fmaxf(x, y) sycl::fmax((x), (y)) +#define fminf(x, y) sycl::fmin((x), (y)) +#define fmodf(x, y) sycl::fmod((x), (y)) +#define lgammaf(x) sycl::lgamma((x)) + +#define __forceinline __attribute__((always_inline)) + +/* Types */ +#include "util/half.h" +#include "util/types.h" + +/* NOTE(@nsirgien): Declaring these functions after types headers is very important because they + * include oneAPI headers, which transitively include math.h headers which will cause redefintions + * of the math defines because math.h also uses them and having them defined before math.h include + * is actually UB. */ +/* Use fast math functions - get them from sycl::native namespace for native math function + * implementations */ +#define cosf(x) sycl::native::cos(((float)(x))) +#define sinf(x) sycl::native::sin(((float)(x))) +#define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y))) +#define tanf(x) sycl::native::tan(((float)(x))) +#define logf(x) sycl::native::log(((float)(x))) +#define expf(x) sycl::native::exp(((float)(x))) diff --git a/intern/cycles/kernel/device/oneapi/context_begin.h b/intern/cycles/kernel/device/oneapi/context_begin.h new file mode 100644 index 00000000000..6d6f8cec4ca --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/context_begin.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_NANOVDB +# include <nanovdb/NanoVDB.h> +# include <nanovdb/util/SampleFromVoxels.h> +#endif + +/* clang-format off */ +struct ONEAPIKernelContext : public KernelGlobalsGPU { + public: +# include "kernel/device/oneapi/image.h" + /* clang-format on */ diff --git a/intern/cycles/kernel/device/oneapi/context_end.h b/intern/cycles/kernel/device/oneapi/context_end.h new file mode 100644 index 00000000000..ddf0d1f1712 --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/context_end.h @@ -0,0 
+1,7 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ +} +; /* end of ONEAPIKernelContext class definition */ + +#undef kernel_integrator_state +#define kernel_integrator_state (*(kg->integrator_state)) diff --git a/intern/cycles/kernel/device/oneapi/device_id.h b/intern/cycles/kernel/device/oneapi/device_id.h new file mode 100644 index 00000000000..b4c94ac27a2 --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/device_id.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +/* from public source : + * https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/include/pci_ids/iris_pci_ids.h */ +const static std::set<uint32_t> intel_arc_alchemist_device_ids = { + 0x4f80, 0x4f81, 0x4f82, 0x4f83, 0x4f84, 0x4f87, 0x4f88, 0x5690, 0x5691, + 0x5692, 0x5693, 0x5694, 0x5695, 0x5696, 0x5697, 0x56a0, 0x56a1, 0x56a2, + 0x56a3, 0x56a4, 0x56a5, 0x56a6, 0x56b0, 0x56b1, 0x56b2, 0x56b3}; diff --git a/intern/cycles/kernel/device/oneapi/dll_interface_template.h b/intern/cycles/kernel/device/oneapi/dll_interface_template.h new file mode 100644 index 00000000000..2d740b4c64a --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/dll_interface_template.h @@ -0,0 +1,50 @@ +/* device_capabilities() returns a C string that must be free'd with oneapi_free(). 
*/ +DLL_INTERFACE_CALL(oneapi_device_capabilities, char *) +DLL_INTERFACE_CALL(oneapi_free, void, void *) +DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue) + +DLL_INTERFACE_CALL(oneapi_get_compute_units_amount, size_t, SyclQueue *queue) +DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr) +DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr) + +DLL_INTERFACE_CALL(oneapi_create_queue, bool, SyclQueue *&external_queue, int device_index) +DLL_INTERFACE_CALL(oneapi_free_queue, void, SyclQueue *queue) +DLL_INTERFACE_CALL( + oneapi_usm_aligned_alloc_host, void *, SyclQueue *queue, size_t memory_size, size_t alignment) +DLL_INTERFACE_CALL(oneapi_usm_alloc_device, void *, SyclQueue *queue, size_t memory_size) +DLL_INTERFACE_CALL(oneapi_usm_free, void, SyclQueue *queue, void *usm_ptr) + +DLL_INTERFACE_CALL( + oneapi_usm_memcpy, bool, SyclQueue *queue, void *dest, void *src, size_t num_bytes) +DLL_INTERFACE_CALL(oneapi_queue_synchronize, bool, SyclQueue *queue) +DLL_INTERFACE_CALL(oneapi_usm_memset, + bool, + SyclQueue *queue, + void *usm_ptr, + unsigned char value, + size_t num_bytes) + +DLL_INTERFACE_CALL(oneapi_run_test_kernel, bool, SyclQueue *queue) + +/* Operation with Kernel globals structure - map of global/constant allocation - filled before + * render/kernel execution As we don't know in cycles sizeof this - Cycles will manage just as + * pointer. 
*/ +DLL_INTERFACE_CALL(oneapi_kernel_globals_size, bool, SyclQueue *queue, size_t &kernel_global_size) +DLL_INTERFACE_CALL(oneapi_set_global_memory, + void, + SyclQueue *queue, + void *kernel_globals, + const char *memory_name, + void *memory_device_pointer) + +DLL_INTERFACE_CALL(oneapi_kernel_preferred_local_size, + size_t, + SyclQueue *queue, + const DeviceKernel kernel, + const size_t kernel_global_size) +DLL_INTERFACE_CALL(oneapi_enqueue_kernel, + bool, + KernelContext *context, + int kernel, + size_t global_size, + void **args) diff --git a/intern/cycles/kernel/device/oneapi/globals.h b/intern/cycles/kernel/device/oneapi/globals.h new file mode 100644 index 00000000000..d60f4f135ba --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/globals.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +#include "kernel/integrator/state.h" +#include "kernel/types.h" +#include "kernel/util/profiling.h" + +CCL_NAMESPACE_BEGIN + +/* NOTE(@nsirgien): With SYCL we can't declare __constant__ global variable, which will be + * accessible from device code, like it has been done for Cycles CUDA backend. 
So, the backend will + * allocate this "constant" memory regions and store pointers to them in oneAPI context class */ + +struct IntegratorStateGPU; +struct IntegratorQueueCounter; + +typedef struct KernelGlobalsGPU { + +#define KERNEL_DATA_ARRAY(type, name) const type *__##name = nullptr; +#include "kernel/data_arrays.h" +#undef KERNEL_DATA_ARRAY + IntegratorStateGPU *integrator_state; + const KernelData *__data; +#ifdef WITH_ONEAPI_SYCL_HOST_ENABLED + size_t nd_item_local_id_0; + size_t nd_item_local_range_0; + size_t nd_item_group_0; + size_t nd_item_group_range_0; + + size_t nd_item_global_id_0; + size_t nd_item_global_range_0; +#endif +} KernelGlobalsGPU; + +typedef ccl_global KernelGlobalsGPU *ccl_restrict KernelGlobals; + +#define kernel_data (*(__data)) +#define kernel_integrator_state (*(integrator_state)) + +/* data lookup defines */ + +#define kernel_data_fetch(name, index) __##name[index] +#define kernel_data_array(name) __##name + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/oneapi/image.h b/intern/cycles/kernel/device/oneapi/image.h new file mode 100644 index 00000000000..892558d40bf --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/image.h @@ -0,0 +1,385 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +CCL_NAMESPACE_BEGIN + +/* For oneAPI implementation we do manual lookup and interpolation. */ +/* TODO: share implementation with ../cpu/image.h. 
*/ + +template<typename T> ccl_device_forceinline T tex_fetch(const TextureInfo &info, int index) +{ + return reinterpret_cast<ccl_global T *>(info.data)[index]; +} + +ccl_device_inline int svm_image_texture_wrap_periodic(int x, int width) +{ + x %= width; + if (x < 0) + x += width; + return x; +} + +ccl_device_inline int svm_image_texture_wrap_clamp(int x, int width) +{ + return clamp(x, 0, width - 1); +} + +ccl_device_inline float4 svm_image_texture_read(const TextureInfo &info, int x, int y, int z) +{ + const int data_offset = x + info.width * y + info.width * info.height * z; + const int texture_type = info.data_type; + + /* Float4 */ + if (texture_type == IMAGE_DATA_TYPE_FLOAT4) { + return tex_fetch<float4>(info, data_offset); + } + /* Byte4 */ + else if (texture_type == IMAGE_DATA_TYPE_BYTE4) { + uchar4 r = tex_fetch<uchar4>(info, data_offset); + float f = 1.0f / 255.0f; + return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); + } + /* Ushort4 */ + else if (texture_type == IMAGE_DATA_TYPE_USHORT4) { + ushort4 r = tex_fetch<ushort4>(info, data_offset); + float f = 1.0f / 65535.f; + return make_float4(r.x * f, r.y * f, r.z * f, r.w * f); + } + /* Float */ + else if (texture_type == IMAGE_DATA_TYPE_FLOAT) { + float f = tex_fetch<float>(info, data_offset); + return make_float4(f, f, f, 1.0f); + } + /* UShort */ + else if (texture_type == IMAGE_DATA_TYPE_USHORT) { + ushort r = tex_fetch<ushort>(info, data_offset); + float f = r * (1.0f / 65535.0f); + return make_float4(f, f, f, 1.0f); + } + else if (texture_type == IMAGE_DATA_TYPE_HALF) { + float f = tex_fetch<half>(info, data_offset); + return make_float4(f, f, f, 1.0f); + } + else if (texture_type == IMAGE_DATA_TYPE_HALF4) { + half4 r = tex_fetch<half4>(info, data_offset); + return make_float4(r.x, r.y, r.z, r.w); + } + /* Byte */ + else { + uchar r = tex_fetch<uchar>(info, data_offset); + float f = r * (1.0f / 255.0f); + return make_float4(f, f, f, 1.0f); + } +} + +ccl_device_inline float4 
svm_image_texture_read_2d(int id, int x, int y) +{ + const TextureInfo &info = kernel_data_fetch(texture_info, id); + + /* Wrap */ + if (info.extension == EXTENSION_REPEAT) { + x = svm_image_texture_wrap_periodic(x, info.width); + y = svm_image_texture_wrap_periodic(y, info.height); + } + else { + x = svm_image_texture_wrap_clamp(x, info.width); + y = svm_image_texture_wrap_clamp(y, info.height); + } + + return svm_image_texture_read(info, x, y, 0); +} + +ccl_device_inline float4 svm_image_texture_read_3d(int id, int x, int y, int z) +{ + const TextureInfo &info = kernel_data_fetch(texture_info, id); + + /* Wrap */ + if (info.extension == EXTENSION_REPEAT) { + x = svm_image_texture_wrap_periodic(x, info.width); + y = svm_image_texture_wrap_periodic(y, info.height); + z = svm_image_texture_wrap_periodic(z, info.depth); + } + else { + x = svm_image_texture_wrap_clamp(x, info.width); + y = svm_image_texture_wrap_clamp(y, info.height); + z = svm_image_texture_wrap_clamp(z, info.depth); + } + + return svm_image_texture_read(info, x, y, z); +} + +static float svm_image_texture_frac(float x, int *ix) +{ + int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0); + *ix = i; + return x - (float)i; +} + +#define SET_CUBIC_SPLINE_WEIGHTS(u, t) \ + { \ + u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \ + u[1] = ((0.5f * t - 1.0f) * t) * t + (2.0f / 3.0f); \ + u[2] = ((-0.5f * t + 0.5f) * t + 0.5f) * t + (1.0f / 6.0f); \ + u[3] = (1.0f / 6.0f) * t * t * t; \ + } \ + (void)0 + +ccl_device float4 kernel_tex_image_interp(KernelGlobals, int id, float x, float y) +{ + const TextureInfo &info = kernel_data_fetch(texture_info, id); + + if (info.extension == EXTENSION_CLIP) { + if (x < 0.0f || y < 0.0f || x > 1.0f || y > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + + if (info.interpolation == INTERPOLATION_CLOSEST) { + /* Closest interpolation. 
*/ + int ix, iy; + svm_image_texture_frac(x * info.width, &ix); + svm_image_texture_frac(y * info.height, &iy); + + return svm_image_texture_read_2d(id, ix, iy); + } + else if (info.interpolation == INTERPOLATION_LINEAR) { + /* Bilinear interpolation. */ + int ix, iy; + float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix); + float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy); + + float4 r; + r = (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy); + r += (1.0f - ty) * tx * svm_image_texture_read_2d(id, ix + 1, iy); + r += ty * (1.0f - tx) * svm_image_texture_read_2d(id, ix, iy + 1); + r += ty * tx * svm_image_texture_read_2d(id, ix + 1, iy + 1); + return r; + } + else { + /* Bicubic interpolation. */ + int ix, iy; + float tx = svm_image_texture_frac(x * info.width - 0.5f, &ix); + float ty = svm_image_texture_frac(y * info.height - 0.5f, &iy); + + float u[4], v[4]; + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + float weight = u[x] * v[y]; + r += weight * svm_image_texture_read_2d(id, ix + x - 1, iy + y - 1); + } + } + return r; + } +} + +#ifdef WITH_NANOVDB +template<typename T> struct NanoVDBInterpolator { + + typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType; + + static ccl_always_inline float4 read(float r) + { + return make_float4(r, r, r, 1.0f); + } + + static ccl_always_inline float4 read(nanovdb::Vec3f r) + { + return make_float4(r[0], r[1], r[2], 1.0f); + } + + static ccl_always_inline float4 interp_3d_closest(const AccessorType &acc, + float x, + float y, + float z) + { + const nanovdb::Vec3f xyz(x, y, z); + return read(nanovdb::SampleFromVoxels<AccessorType, 0, false>(acc)(xyz)); + } + + static ccl_always_inline float4 interp_3d_linear(const AccessorType &acc, + float x, + float y, + float z) + { + const nanovdb::Vec3f xyz(x - 0.5f, y - 0.5f, z - 0.5f); + return 
read(nanovdb::SampleFromVoxels<AccessorType, 1, false>(acc)(xyz)); + } + + static float4 interp_3d_cubic(const AccessorType &acc, float x, float y, float z) + { + int ix, iy, iz; + int nix, niy, niz; + int pix, piy, piz; + int nnix, nniy, nniz; + /* Tricubic b-spline interpolation. */ + const float tx = svm_image_texture_frac(x - 0.5f, &ix); + const float ty = svm_image_texture_frac(y - 0.5f, &iy); + const float tz = svm_image_texture_frac(z - 0.5f, &iz); + pix = ix - 1; + piy = iy - 1; + piz = iz - 1; + nix = ix + 1; + niy = iy + 1; + niz = iz + 1; + nnix = ix + 2; + nniy = iy + 2; + nniz = iz + 2; + + const int xc[4] = {pix, ix, nix, nnix}; + const int yc[4] = {piy, iy, niy, nniy}; + const int zc[4] = {piz, iz, niz, nniz}; + float u[4], v[4], w[4]; + + /* Some helper macro to keep code reasonable size, + * let compiler to inline all the matrix multiplications. + */ +# define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z])))) +# define COL_TERM(col, row) \ + (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \ + u[3] * DATA(3, col, row))) +# define ROW_TERM(row) \ + (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row))) + + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + /* Actual interpolation. */ + return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3); + +# undef COL_TERM +# undef ROW_TERM +# undef DATA + } + + static ccl_always_inline float4 + interp_3d(const TextureInfo &info, float x, float y, float z, int interp) + { + using namespace nanovdb; + + NanoGrid<T> *const grid = (NanoGrid<T> *)info.data; + AccessorType acc = grid->getAccessor(); + + switch ((interp == INTERPOLATION_NONE) ? 
info.interpolation : interp) { + case INTERPOLATION_CLOSEST: + return interp_3d_closest(acc, x, y, z); + case INTERPOLATION_LINEAR: + return interp_3d_linear(acc, x, y, z); + default: + return interp_3d_cubic(acc, x, y, z); + } + } +}; +#endif /* WITH_NANOVDB */ + +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals, int id, float3 P, int interp) +{ + const TextureInfo &info = kernel_data_fetch(texture_info, id); + + if (info.use_transform_3d) { + Transform tfm = info.transform_3d; + P = transform_point(&tfm, P); + } + + float x = P.x; + float y = P.y; + float z = P.z; + + uint interpolation = (interp == INTERPOLATION_NONE) ? info.interpolation : interp; + +#ifdef WITH_NANOVDB + if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT) { + return NanoVDBInterpolator<float>::interp_3d(info, x, y, z, interpolation); + } + else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3) { + return NanoVDBInterpolator<nanovdb::Vec3f>::interp_3d(info, x, y, z, interpolation); + } + else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN) { + return NanoVDBInterpolator<nanovdb::FpN>::interp_3d(info, x, y, z, interpolation); + } + else if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16) { + return NanoVDBInterpolator<nanovdb::Fp16>::interp_3d(info, x, y, z, interpolation); + } +#else + if (info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT || + info.data_type == IMAGE_DATA_TYPE_NANOVDB_FLOAT3 || + info.data_type == IMAGE_DATA_TYPE_NANOVDB_FPN || + info.data_type == IMAGE_DATA_TYPE_NANOVDB_FP16) { + return make_float4( + TEX_IMAGE_MISSING_R, TEX_IMAGE_MISSING_G, TEX_IMAGE_MISSING_B, TEX_IMAGE_MISSING_A); + } +#endif + else { + if (info.extension == EXTENSION_CLIP) { + if (x < 0.0f || y < 0.0f || z < 0.0f || x > 1.0f || y > 1.0f || z > 1.0f) { + return make_float4(0.0f, 0.0f, 0.0f, 0.0f); + } + } + + x *= info.width; + y *= info.height; + z *= info.depth; + } + + if (interpolation == INTERPOLATION_CLOSEST) { + /* Closest interpolation. 
*/ + int ix, iy, iz; + svm_image_texture_frac(x, &ix); + svm_image_texture_frac(y, &iy); + svm_image_texture_frac(z, &iz); + + return svm_image_texture_read_3d(id, ix, iy, iz); + } + else if (interpolation == INTERPOLATION_LINEAR) { + /* Trilinear interpolation. */ + int ix, iy, iz; + float tx = svm_image_texture_frac(x - 0.5f, &ix); + float ty = svm_image_texture_frac(y - 0.5f, &iy); + float tz = svm_image_texture_frac(z - 0.5f, &iz); + + float4 r; + r = (1.0f - tz) * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz); + r += (1.0f - tz) * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz); + r += (1.0f - tz) * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz); + r += (1.0f - tz) * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz); + + r += tz * (1.0f - ty) * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy, iz + 1); + r += tz * (1.0f - ty) * tx * svm_image_texture_read_3d(id, ix + 1, iy, iz + 1); + r += tz * ty * (1.0f - tx) * svm_image_texture_read_3d(id, ix, iy + 1, iz + 1); + r += tz * ty * tx * svm_image_texture_read_3d(id, ix + 1, iy + 1, iz + 1); + return r; + } + else { + /* Tricubic interpolation. 
*/ + int ix, iy, iz; + float tx = svm_image_texture_frac(x - 0.5f, &ix); + float ty = svm_image_texture_frac(y - 0.5f, &iy); + float tz = svm_image_texture_frac(z - 0.5f, &iz); + + float u[4], v[4], w[4]; + SET_CUBIC_SPLINE_WEIGHTS(u, tx); + SET_CUBIC_SPLINE_WEIGHTS(v, ty); + SET_CUBIC_SPLINE_WEIGHTS(w, tz); + + float4 r = make_float4(0.0f, 0.0f, 0.0f, 0.0f); + + for (int z = 0; z < 4; z++) { + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + float weight = u[x] * v[y] * w[z]; + r += weight * svm_image_texture_read_3d(id, ix + x - 1, iy + y - 1, iz + z - 1); + } + } + } + return r; + } +} + +#undef SET_CUBIC_SPLINE_WEIGHTS + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/oneapi/kernel.cpp b/intern/cycles/kernel/device/oneapi/kernel.cpp new file mode 100644 index 00000000000..62affe6e58e --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/kernel.cpp @@ -0,0 +1,884 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#ifdef WITH_ONEAPI + +/* clang-format off */ +# include "kernel.h" +# include <iostream> +# include <map> +# include <set> + +# include <level_zero/ze_api.h> +# include <CL/sycl.hpp> +# include <ext/oneapi/backend/level_zero.hpp> + +# include "kernel/device/oneapi/compat.h" +# include "kernel/device/oneapi/device_id.h" +# include "kernel/device/oneapi/globals.h" +# include "kernel/device/oneapi/kernel_templates.h" + +# include "kernel/device/gpu/kernel.h" +/* clang-format on */ + +static OneAPIErrorCallback s_error_cb = nullptr; +static void *s_error_user_ptr = nullptr; + +static std::vector<sycl::device> oneapi_available_devices(); + +void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr) +{ + s_error_cb = cb; + s_error_user_ptr = user_ptr; +} + +void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false) +{ +# ifdef _DEBUG + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + sycl::info::device_type device_type = + 
queue->get_device().get_info<sycl::info::device::device_type>(); + sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context()); + (void)usm_type; + assert(usm_type == sycl::usm::alloc::device || + ((device_type == sycl::info::device_type::host || + device_type == sycl::info::device_type::is_cpu || allow_host) && + usm_type == sycl::usm::alloc::host)); +# endif +} + +bool oneapi_create_queue(SyclQueue *&external_queue, int device_index) +{ + bool finished_correct = true; + try { + std::vector<sycl::device> devices = oneapi_available_devices(); + if (device_index < 0 || device_index >= devices.size()) { + return false; + } + sycl::queue *created_queue = new sycl::queue(devices[device_index], + sycl::property::queue::in_order()); + external_queue = reinterpret_cast<SyclQueue *>(created_queue); + } + catch (sycl::exception const &e) { + finished_correct = false; + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + } + return finished_correct; +} + +void oneapi_free_queue(SyclQueue *queue_) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + delete queue; +} + +void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + return sycl::aligned_alloc_host(alignment, memory_size, *queue); +} + +void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + return sycl::malloc_device(memory_size, *queue); +} + +void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + oneapi_check_usm(queue_, usm_ptr, true); + sycl::free(usm_ptr, *queue); +} + +bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + 
oneapi_check_usm(queue_, dest, true); + oneapi_check_usm(queue_, src, true); + try { + sycl::event mem_event = queue->memcpy(dest, src, num_bytes); + mem_event.wait_and_throw(); + return true; + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +} + +bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + oneapi_check_usm(queue_, usm_ptr, true); + try { + sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes); + mem_event.wait_and_throw(); + return true; + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +} + +bool oneapi_queue_synchronize(SyclQueue *queue_) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + try { + queue->wait_and_throw(); + return true; + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } +} + +/* NOTE(@nsirgien): Execution of this simple kernel will check basic functionality and + * also trigger runtime compilation of all existing oneAPI kernels */ +bool oneapi_run_test_kernel(SyclQueue *queue_) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + size_t N = 8; + sycl::buffer<float, 1> A(N); + sycl::buffer<float, 1> B(N); + + { + sycl::host_accessor A_host_acc(A, sycl::write_only); + for (size_t i = (size_t)0; i < N; i++) + A_host_acc[i] = rand() % 32; + } + + try { + queue->submit([&](sycl::handler &cgh) { + sycl::accessor A_acc(A, cgh, sycl::read_only); + sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init); + + cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); }); + }); + queue->wait_and_throw(); + + sycl::host_accessor A_host_acc(A, sycl::read_only); + sycl::host_accessor 
B_host_acc(B, sycl::read_only); + + for (size_t i = (size_t)0; i < N; i++) { + float result = A_host_acc[i] + B_host_acc[i]; + (void)result; + } + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + } + return false; + } + + return true; +} + +bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size) +{ + kernel_global_size = sizeof(KernelGlobalsGPU); + + return true; +} + +void oneapi_set_global_memory(SyclQueue *queue_, + void *kernel_globals, + const char *memory_name, + void *memory_device_pointer) +{ + assert(queue_); + assert(kernel_globals); + assert(memory_name); + assert(memory_device_pointer); + KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals; + oneapi_check_usm(queue_, memory_device_pointer); + oneapi_check_usm(queue_, kernel_globals, true); + + std::string matched_name(memory_name); + +/* This macro will change global ptr of KernelGlobals via name matching. */ +# define KERNEL_DATA_ARRAY(type, name) \ + else if (#name == matched_name) \ + { \ + globals->__##name = (type *)memory_device_pointer; \ + return; \ + } + if (false) { + } + else if ("integrator_state" == matched_name) { + globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer; + return; + } + KERNEL_DATA_ARRAY(KernelData, data) +# include "kernel/data_arrays.h" + else + { + std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!" + << std::endl; + assert(false); + } +# undef KERNEL_DATA_ARRAY +} + +/* TODO: Move device information to OneapiDevice initialized on creation and use it. */ +/* TODO: Move below function to oneapi/queue.cpp. 
*/ +size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_, + const DeviceKernel kernel, + const size_t kernel_global_size) +{ + assert(queue_); + sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_); + (void)kernel_global_size; + const static size_t preferred_work_group_size_intersect_shading = 32; + const static size_t preferred_work_group_size_technical = 1024; + + size_t preferred_work_group_size = 0; + switch (kernel) { + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: + preferred_work_group_size = preferred_work_group_size_intersect_shading; + break; + + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY: + case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES: + case DEVICE_KERNEL_INTEGRATOR_RESET: + case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: + preferred_work_group_size = preferred_work_group_size_technical; + break; + + default: + preferred_work_group_size = 512; + } + + const size_t 
limit_work_group_size = + queue->get_device().get_info<sycl::info::device::max_work_group_size>(); + return std::min(limit_work_group_size, preferred_work_group_size); +} + +bool oneapi_enqueue_kernel(KernelContext *kernel_context, + int kernel, + size_t global_size, + void **args) +{ + bool success = true; + ::DeviceKernel device_kernel = (::DeviceKernel)kernel; + KernelGlobalsGPU *kg = (KernelGlobalsGPU *)kernel_context->kernel_globals; + sycl::queue *queue = reinterpret_cast<sycl::queue *>(kernel_context->queue); + assert(queue); + if (!queue) { + return false; + } + + size_t local_size = oneapi_kernel_preferred_local_size( + kernel_context->queue, device_kernel, global_size); + assert(global_size % local_size == 0); + + /* Local size for DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY needs to be enforced so we + * overwrite it outside of oneapi_kernel_preferred_local_size. */ + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY) { + local_size = GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE; + } + + /* Kernels listed below need a specific number of work groups. */ + if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY || + device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY) { + int num_states = *((int *)(args[0])); + /* Round up to the next work-group. */ + size_t groups_count = (num_states + local_size - 1) / local_size; + /* NOTE(@nsirgien): As for now non-uniform workgroups don't work on most oneAPI devices, we + * extend work size to fit uniformity requirements. 
*/ + global_size = groups_count * local_size; + +# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED + if (queue->get_device().is_host()) { + global_size = 1; + local_size = 1; + } +# endif + } + + /* Let the compiler throw an error if there are any kernels missing in this implementation. */ +# if defined(_WIN32) +# pragma warning(error : 4062) +# elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic error "-Wswitch" +# endif + + try { + queue->submit([&](sycl::handler &cgh) { + switch (device_kernel) { + case DEVICE_KERNEL_INTEGRATOR_RESET: { + oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_reset); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_camera); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_bake); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_closest); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_shadow); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_intersect_subsurface); + break; + } + case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_intersect_volume_stack); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_background); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_light); + 
break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_shadow); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_shade_surface_raytrace); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface_mnee); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_volume); + break; + } + case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_queued_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_queued_shadow_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_active_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_terminated_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_terminated_shadow_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: { + oneapi_call(kg, 
+ cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_compact_paths_array); + break; + } + case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_compact_shadow_paths_array); + break; + } + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_adaptive_sampling_convergence_check); + break; + } + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_x); + break; + } + case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_y); + break; + } + case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: { + oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_displace); + break; + } + case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_background); + break; + } + case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_shader_eval_curve_shadow_transparency); + break; + } + case DEVICE_KERNEL_PREFIX_SUM: { + oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_prefix_sum); + break; + } + + /* clang-format off */ + # define DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \ + case DEVICE_KERNEL_FILM_CONVERT_##VARIANT: { \ + oneapi_call(kg, cgh, \ + global_size, \ + local_size, \ + args, \ + oneapi_kernel_film_convert_##variant); \ + break; \ + } + +# define DEVICE_KERNEL_FILM_CONVERT(variant, VARIANT) \ + DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \ + DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT##_HALF_RGBA, variant##_half_rgba) + + DEVICE_KERNEL_FILM_CONVERT(depth, DEPTH); + 
DEVICE_KERNEL_FILM_CONVERT(mist, MIST); + DEVICE_KERNEL_FILM_CONVERT(sample_count, SAMPLE_COUNT); + DEVICE_KERNEL_FILM_CONVERT(float, FLOAT); + DEVICE_KERNEL_FILM_CONVERT(light_path, LIGHT_PATH); + DEVICE_KERNEL_FILM_CONVERT(float3, FLOAT3); + DEVICE_KERNEL_FILM_CONVERT(motion, MOTION); + DEVICE_KERNEL_FILM_CONVERT(cryptomatte, CRYPTOMATTE); + DEVICE_KERNEL_FILM_CONVERT(shadow_catcher, SHADOW_CATCHER); + DEVICE_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow, + SHADOW_CATCHER_MATTE_WITH_SHADOW); + DEVICE_KERNEL_FILM_CONVERT(combined, COMBINED); + DEVICE_KERNEL_FILM_CONVERT(float4, FLOAT4); + +# undef DEVICE_KERNEL_FILM_CONVERT +# undef DEVICE_KERNEL_FILM_CONVERT_PARTIAL + /* clang-format on */ + + case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_filter_guiding_preprocess); + break; + } + case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_filter_guiding_set_fake_albedo); + break; + } + case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_preprocess); + break; + } + case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_postprocess); + break; + } + case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_cryptomatte_postprocess); + break; + } + case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: { + oneapi_call( + kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_compact_states); + break; + } + case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES: { + oneapi_call(kg, + cgh, + global_size, + local_size, + args, + oneapi_kernel_integrator_compact_shadow_states); + break; + } + case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: { + oneapi_call(kg, + cgh, + global_size, + local_size, + 
args, + oneapi_kernel_integrator_shadow_catcher_count_possible_splits); + break; + } + /* Unsupported kernels */ + case DEVICE_KERNEL_NUM: + case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL: + assert(0); + return false; + } + + /* Unknown kernel. */ + assert(0); + return false; + }); + } + catch (sycl::exception const &e) { + if (s_error_cb) { + s_error_cb(e.what(), s_error_user_ptr); + success = false; + } + } + +# if defined(_WIN32) +# pragma warning(default : 4062) +# elif defined(__GNUC__) +# pragma GCC diagnostic pop +# endif + return success; +} + +static const int lowest_supported_driver_version_win = 1011660; +static const int lowest_supported_driver_version_neo = 20066; + +static int parse_driver_build_version(const sycl::device &device) +{ + const std::string &driver_version = device.get_info<sycl::info::device::driver_version>(); + int driver_build_version = 0; + + size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1); + if (second_dot_position == std::string::npos) { + std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version + << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," + << " xx.xx.xxx.xxxx (Windows) for device \"" + << device.get_info<sycl::info::device::name>() << "\"." 
<< std::endl; + } + else { + try { + size_t third_dot_position = driver_version.find('.', second_dot_position + 1); + if (third_dot_position != std::string::npos) { + const std::string &third_number_substr = driver_version.substr( + second_dot_position + 1, third_dot_position - second_dot_position - 1); + const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1); + if (third_number_substr.length() == 3 && forth_number_substr.length() == 4) + driver_build_version = std::stoi(third_number_substr) * 10000 + + std::stoi(forth_number_substr); + } + else { + const std::string &third_number_substr = driver_version.substr(second_dot_position + 1); + driver_build_version = std::stoi(third_number_substr); + } + } + catch (std::invalid_argument &e) { + std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version + << "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0)," + << " xx.xx.xxx.xxxx (Windows) for device \"" + << device.get_info<sycl::info::device::name>() << "\"." << std::endl; + } + } + + return driver_build_version; +} + +static std::vector<sycl::device> oneapi_available_devices() +{ + bool allow_all_devices = false; + if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr) + allow_all_devices = true; + + /* Host device is useful only for debugging at the moment + * so we hide this device with default build settings. */ +# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED + bool allow_host = true; +# else + bool allow_host = false; +# endif + + const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms(); + + std::vector<sycl::device> available_devices; + for (const sycl::platform &platform : oneapi_platforms) { + /* ignore OpenCL platforms to avoid using the same devices through both Level-Zero and OpenCL. + */ + if (platform.get_backend() == sycl::backend::opencl) { + continue; + } + + const std::vector<sycl::device> &oneapi_devices = + (allow_all_devices || allow_host) ? 
platform.get_devices(sycl::info::device_type::all) : + platform.get_devices(sycl::info::device_type::gpu); + + for (const sycl::device &device : oneapi_devices) { + if (allow_all_devices) { + /* still filter out host device if build doesn't support it. */ + if (allow_host || !device.is_host()) { + available_devices.push_back(device); + } + } + else { + bool filter_out = false; + + /* For now we support all Intel(R) Arc(TM) devices + * and any future GPU with more than 128 execution units + * official support can be broaden to older and smaller GPUs once ready. */ + if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) { + ze_device_handle_t ze_device = sycl::get_native<sycl::backend::ext_oneapi_level_zero>( + device); + ze_device_properties_t props = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES}; + zeDeviceGetProperties(ze_device, &props); + bool is_dg2 = (intel_arc_alchemist_device_ids.find(props.deviceId) != + intel_arc_alchemist_device_ids.end()); + int number_of_eus = props.numEUsPerSubslice * props.numSubslicesPerSlice * + props.numSlices; + if (!is_dg2 || number_of_eus < 128) + filter_out = true; + + /* if not already filtered out, check driver version. 
*/ + if (!filter_out) { + int driver_build_version = parse_driver_build_version(device); + if ((driver_build_version > 100000 && + driver_build_version < lowest_supported_driver_version_win) || + (driver_build_version > 0 && + driver_build_version < lowest_supported_driver_version_neo)) { + filter_out = true; + } + } + } + else if (!allow_host && device.is_host()) { + filter_out = true; + } + else if (!allow_all_devices) { + filter_out = true; + } + + if (!filter_out) { + available_devices.push_back(device); + } + } + } + } + + return available_devices; +} + +char *oneapi_device_capabilities() +{ + std::stringstream capabilities; + + const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices(); + for (const sycl::device &device : oneapi_devices) { + const std::string &name = device.get_info<sycl::info::device::name>(); + + capabilities << std::string("\t") << name << "\n"; +# define WRITE_ATTR(attribute_name, attribute_variable) \ + capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \ + << "\n"; +# define GET_NUM_ATTR(attribute) \ + { \ + size_t attribute = (size_t)device.get_info<sycl::info::device ::attribute>(); \ + capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \ + } + + GET_NUM_ATTR(vendor_id) + GET_NUM_ATTR(max_compute_units) + GET_NUM_ATTR(max_work_item_dimensions) + + sycl::id<3> max_work_item_sizes = device.get_info<sycl::info::device::max_work_item_sizes>(); + WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0))) + WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1))) + WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2))) + + GET_NUM_ATTR(max_work_group_size) + GET_NUM_ATTR(max_num_sub_groups) + GET_NUM_ATTR(sub_group_independent_forward_progress) + + GET_NUM_ATTR(preferred_vector_width_char) + GET_NUM_ATTR(preferred_vector_width_short) + GET_NUM_ATTR(preferred_vector_width_int) + 
GET_NUM_ATTR(preferred_vector_width_long) + GET_NUM_ATTR(preferred_vector_width_float) + GET_NUM_ATTR(preferred_vector_width_double) + GET_NUM_ATTR(preferred_vector_width_half) + + GET_NUM_ATTR(native_vector_width_char) + GET_NUM_ATTR(native_vector_width_short) + GET_NUM_ATTR(native_vector_width_int) + GET_NUM_ATTR(native_vector_width_long) + GET_NUM_ATTR(native_vector_width_float) + GET_NUM_ATTR(native_vector_width_double) + GET_NUM_ATTR(native_vector_width_half) + + size_t max_clock_frequency = + (size_t)(device.is_host() ? (size_t)0 : + device.get_info<sycl::info::device::max_clock_frequency>()); + WRITE_ATTR("max_clock_frequency", max_clock_frequency) + + GET_NUM_ATTR(address_bits) + GET_NUM_ATTR(max_mem_alloc_size) + + /* NOTE(@nsirgien): Implementation doesn't use image support as bindless images aren't + * supported so we always return false, even if device supports HW texture usage acceleration. + */ + bool image_support = false; + WRITE_ATTR("image_support", (size_t)image_support) + + GET_NUM_ATTR(max_parameter_size) + GET_NUM_ATTR(mem_base_addr_align) + GET_NUM_ATTR(global_mem_size) + GET_NUM_ATTR(local_mem_size) + GET_NUM_ATTR(error_correction_support) + GET_NUM_ATTR(profiling_timer_resolution) + GET_NUM_ATTR(is_available) + +# undef GET_NUM_ATTR +# undef WRITE_ATTR + capabilities << "\n"; + } + + return ::strdup(capabilities.str().c_str()); +} + +void oneapi_free(void *p) +{ + if (p) { + ::free(p); + } +} + +void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr) +{ + int num = 0; + std::vector<sycl::device> devices = oneapi_available_devices(); + for (sycl::device &device : devices) { + const std::string &platform_name = + device.get_platform().get_info<sycl::info::platform::name>(); + std::string name = device.get_info<sycl::info::device::name>(); + std::string id = "ONEAPI_" + platform_name + "_" + name; + (cb)(id.c_str(), name.c_str(), num, user_ptr); + num++; + } +} + +size_t oneapi_get_memcapacity(SyclQueue *queue) +{ + return 
reinterpret_cast<sycl::queue *>(queue) + ->get_device() + .get_info<sycl::info::device::global_mem_size>(); +} + +size_t oneapi_get_compute_units_amount(SyclQueue *queue) +{ + return reinterpret_cast<sycl::queue *>(queue) + ->get_device() + .get_info<sycl::info::device::max_compute_units>(); +} + +#endif /* WITH_ONEAPI */ diff --git a/intern/cycles/kernel/device/oneapi/kernel.h b/intern/cycles/kernel/device/oneapi/kernel.h new file mode 100644 index 00000000000..c5f853742ed --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/kernel.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: Apache-2.0 + * Copyright 2021-2022 Intel Corporation */ + +#pragma once + +#ifdef WITH_ONEAPI + +# include <stddef.h> + +/* NOTE(@nsirgien): Should match underlying type in the declaration inside "kernel/types.h" + * TODO: use kernel/types.h directly. */ +enum DeviceKernel : int; + +# ifndef CYCLES_KERNEL_ONEAPI_EXPORT +# ifdef _WIN32 +# if defined(ONEAPI_EXPORT) +# define CYCLES_KERNEL_ONEAPI_EXPORT extern __declspec(dllexport) +# else +# define CYCLES_KERNEL_ONEAPI_EXPORT extern __declspec(dllimport) +# endif +# else +# define CYCLES_KERNEL_ONEAPI_EXPORT +# endif +# endif + +class SyclQueue; + +typedef void (*OneAPIDeviceIteratorCallback)(const char *id, + const char *name, + int num, + void *user_ptr); + +typedef void (*OneAPIErrorCallback)(const char *error, void *user_ptr); + +struct KernelContext { + /* Queue, associated with selected device */ + SyclQueue *queue; + /* Pointer to USM device memory with all global/constant allocation on this device */ + void *kernel_globals; +}; + +/* Use extern C linking so that the symbols can be easily load from the dynamic library at runtime. + */ +# ifdef __cplusplus +extern "C" { +# endif + +# define DLL_INTERFACE_CALL(function, return_type, ...) 
\ + CYCLES_KERNEL_ONEAPI_EXPORT return_type function(__VA_ARGS__); +# include "kernel/device/oneapi/dll_interface_template.h" +# undef DLL_INTERFACE_CALL + +# ifdef __cplusplus +} +# endif + +#endif /* WITH_ONEAPI */ diff --git a/intern/cycles/kernel/device/oneapi/kernel_templates.h b/intern/cycles/kernel/device/oneapi/kernel_templates.h new file mode 100644 index 00000000000..2dfc96292ed --- /dev/null +++ b/intern/cycles/kernel/device/oneapi/kernel_templates.h @@ -0,0 +1,121 @@ +#pragma once + +/* Some macro magic to generate templates for kernel arguments. + The resulting oneapi_call() template allows to call a SYCL/C++ kernel + with typed arguments by only giving it a void **args as given by Cycles. + The template will automatically cast from void* to the expectd type. + */ + +/* When expanded by the preprocessor, the generated templates will look like this example: */ +#if 0 +template<typename T0, typename T1, typename T2> +void oneapi_call( + KernelGlobalsGPU *kg, + sycl::handler &cgh, + size_t global_size, + size_t local_size, + void **args, + void (*func)(const KernelGlobalsGPU *, size_t, size_t, sycl::handler &, T0, T1, T2)) +{ + func(kg, global_size, local_size, cgh, *(T0 *)(args[0]), *(T1 *)(args[1]), *(T2 *)(args[2])); +} +#endif + +/* clang-format off */ +#define ONEAPI_TYP(x) typename T##x +#define ONEAPI_CAST(x) *(T##x *)(args[x]) +#define ONEAPI_T(x) T##x + +#define ONEAPI_GET_NTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, N, ...) N +#define ONEAPI_0(_call, ...) +#define ONEAPI_1(_call, x) _call(x) +#define ONEAPI_2(_call, x, ...) _call(x), ONEAPI_1(_call, __VA_ARGS__) +#define ONEAPI_3(_call, x, ...) _call(x), ONEAPI_2(_call, __VA_ARGS__) +#define ONEAPI_4(_call, x, ...) _call(x), ONEAPI_3(_call, __VA_ARGS__) +#define ONEAPI_5(_call, x, ...) _call(x), ONEAPI_4(_call, __VA_ARGS__) +#define ONEAPI_6(_call, x, ...) _call(x), ONEAPI_5(_call, __VA_ARGS__) +#define ONEAPI_7(_call, x, ...) 
_call(x), ONEAPI_6(_call, __VA_ARGS__) +#define ONEAPI_8(_call, x, ...) _call(x), ONEAPI_7(_call, __VA_ARGS__) +#define ONEAPI_9(_call, x, ...) _call(x), ONEAPI_8(_call, __VA_ARGS__) +#define ONEAPI_10(_call, x, ...) _call(x), ONEAPI_9(_call, __VA_ARGS__) +#define ONEAPI_11(_call, x, ...) _call(x), ONEAPI_10(_call, __VA_ARGS__) +#define ONEAPI_12(_call, x, ...) _call(x), ONEAPI_11(_call, __VA_ARGS__) +#define ONEAPI_13(_call, x, ...) _call(x), ONEAPI_12(_call, __VA_ARGS__) +#define ONEAPI_14(_call, x, ...) _call(x), ONEAPI_13(_call, __VA_ARGS__) +#define ONEAPI_15(_call, x, ...) _call(x), ONEAPI_14(_call, __VA_ARGS__) +#define ONEAPI_16(_call, x, ...) _call(x), ONEAPI_15(_call, __VA_ARGS__) +#define ONEAPI_17(_call, x, ...) _call(x), ONEAPI_16(_call, __VA_ARGS__) +#define ONEAPI_18(_call, x, ...) _call(x), ONEAPI_17(_call, __VA_ARGS__) +#define ONEAPI_19(_call, x, ...) _call(x), ONEAPI_18(_call, __VA_ARGS__) +#define ONEAPI_20(_call, x, ...) _call(x), ONEAPI_19(_call, __VA_ARGS__) +#define ONEAPI_21(_call, x, ...) _call(x), ONEAPI_20(_call, __VA_ARGS__) + +#define ONEAPI_CALL_FOR(x, ...) \ + ONEAPI_GET_NTH_ARG("ignored", \ + ##__VA_ARGS__, \ + ONEAPI_21, \ + ONEAPI_20, \ + ONEAPI_19, \ + ONEAPI_18, \ + ONEAPI_17, \ + ONEAPI_16, \ + ONEAPI_15, \ + ONEAPI_14, \ + ONEAPI_13, \ + ONEAPI_12, \ + ONEAPI_11, \ + ONEAPI_10, \ + ONEAPI_9, \ + ONEAPI_8, \ + ONEAPI_7, \ + ONEAPI_6, \ + ONEAPI_5, \ + ONEAPI_4, \ + ONEAPI_3, \ + ONEAPI_2, \ + ONEAPI_1, \ + ONEAPI_0) \ + (x, ##__VA_ARGS__) + +/* This template automatically casts entries in the void **args array to the types requested by the kernel func. + Since kernel parameters are passed as void ** to the device, this is the closest that we have to type safety. */ +#define oneapi_template(...) 
\ + template<ONEAPI_CALL_FOR(ONEAPI_TYP, __VA_ARGS__)> \ + void oneapi_call( \ + KernelGlobalsGPU *kg, \ + sycl::handler &cgh, \ + size_t global_size, \ + size_t local_size, \ + void **args, \ + void (*func)(KernelGlobalsGPU*, size_t, size_t, sycl::handler &, ONEAPI_CALL_FOR(ONEAPI_T, __VA_ARGS__))) \ + { \ + func(kg, \ + global_size, \ + local_size, \ + cgh, \ + ONEAPI_CALL_FOR(ONEAPI_CAST, __VA_ARGS__)); \ + } + +oneapi_template(0) +oneapi_template(0, 1) +oneapi_template(0, 1, 2) +oneapi_template(0, 1, 2, 3) +oneapi_template(0, 1, 2, 3, 4) +oneapi_template(0, 1, 2, 3, 4, 5) +oneapi_template(0, 1, 2, 3, 4, 5, 6) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19) +oneapi_template(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20) + + /* clang-format on */ diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h index ad022716207..f2e61d25002 100644 --- a/intern/cycles/kernel/types.h +++ b/intern/cycles/kernel/types.h @@ -1571,7 +1571,7 @@ static_assert_align(KernelShaderEvalInput, 16); * If the kernel uses shared CUDA memory, `CUDADeviceQueue::enqueue` is to be modified. * The path iteration kernels are handled in `PathTraceWorkGPU::enqueue_path_iteration`. 
*/ -typedef enum DeviceKernel { +typedef enum DeviceKernel : int { DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA = 0, DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST, diff --git a/intern/cycles/util/atomic.h b/intern/cycles/util/atomic.h index f89eb28b0b7..1ebf085ae13 100644 --- a/intern/cycles/util/atomic.h +++ b/intern/cycles/util/atomic.h @@ -106,6 +106,116 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float # endif /* __KERNEL_METAL__ */ +# ifdef __KERNEL_ONEAPI__ + +ccl_device_inline float atomic_add_and_fetch_float(ccl_global float *p, float x) +{ + sycl::atomic_ref<float, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_add(x); +} + +ccl_device_inline float atomic_compare_and_swap_float(ccl_global float *source, + float old_val, + float new_val) +{ + sycl::atomic_ref<float, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*source); + atomic.compare_exchange_weak(old_val, new_val); + return old_val; +} + +ccl_device_inline unsigned int atomic_fetch_and_add_uint32(ccl_global unsigned int *p, + unsigned int x) +{ + sycl::atomic_ref<unsigned int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_add(x); +} + +ccl_device_inline int atomic_fetch_and_add_uint32(ccl_global int *p, int x) +{ + sycl::atomic_ref<int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_add(x); +} + +ccl_device_inline unsigned int atomic_fetch_and_sub_uint32(ccl_global unsigned int *p, + unsigned int x) +{ + sycl::atomic_ref<unsigned int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + 
sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_sub(x); +} + +ccl_device_inline int atomic_fetch_and_sub_uint32(ccl_global int *p, int x) +{ + sycl::atomic_ref<int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_sub(x); +} + +ccl_device_inline unsigned int atomic_fetch_and_inc_uint32(ccl_global unsigned int *p) +{ + return atomic_fetch_and_add_uint32(p, 1); +} + +ccl_device_inline int atomic_fetch_and_inc_uint32(ccl_global int *p) +{ + return atomic_fetch_and_add_uint32(p, 1); +} + +ccl_device_inline unsigned int atomic_fetch_and_dec_uint32(ccl_global unsigned int *p) +{ + return atomic_fetch_and_sub_uint32(p, 1); +} + +ccl_device_inline int atomic_fetch_and_dec_uint32(ccl_global int *p) +{ + return atomic_fetch_and_sub_uint32(p, 1); +} + +ccl_device_inline unsigned int atomic_fetch_and_or_uint32(ccl_global unsigned int *p, + unsigned int x) +{ + sycl::atomic_ref<unsigned int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_or(x); +} + +ccl_device_inline int atomic_fetch_and_or_uint32(ccl_global int *p, int x) +{ + sycl::atomic_ref<int, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + atomic(*p); + return atomic.fetch_or(x); +} + +# endif /* __KERNEL_ONEAPI__ */ + #endif /* __KERNEL_GPU__ */ #endif /* __UTIL_ATOMIC_H__ */ diff --git a/intern/cycles/util/half.h b/intern/cycles/util/half.h index 434bc12d670..c668638eb02 100644 --- a/intern/cycles/util/half.h +++ b/intern/cycles/util/half.h @@ -35,7 +35,7 @@ ccl_device_inline float half_to_float(half h_in) #else /* CUDA has its own half data type, no need to define then */ -# if !defined(__KERNEL_CUDA__) && !defined(__KERNEL_HIP__) +# if !defined(__KERNEL_CUDA__) && 
!defined(__KERNEL_HIP__) && !defined(__KERNEL_ONEAPI__) /* Implementing this as a class rather than a typedef so that the compiler can tell it apart from * unsigned shorts. */ class half { @@ -73,7 +73,7 @@ struct half4 { ccl_device_inline half float_to_half_image(float f) { -#if defined(__KERNEL_METAL__) +#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__) return half(min(f, 65504.0f)); #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __float2half(min(f, 65504.0f)); @@ -103,6 +103,8 @@ ccl_device_inline float half_to_float_image(half h) { #if defined(__KERNEL_METAL__) return half_to_float(h); +#elif defined(__KERNEL_ONEAPI__) + return float(h); #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __half2float(h); #else @@ -136,7 +138,7 @@ ccl_device_inline float4 half4_to_float4_image(const half4 h) ccl_device_inline half float_to_half_display(const float f) { -#if defined(__KERNEL_METAL__) +#if defined(__KERNEL_METAL__) || defined(__KERNEL_ONEAPI__) return half(min(f, 65504.0f)); #elif defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__) return __float2half(min(f, 65504.0f)); diff --git a/intern/cycles/util/math.h b/intern/cycles/util/math.h index f1f627588c5..af2f1ea092d 100644 --- a/intern/cycles/util/math.h +++ b/intern/cycles/util/math.h @@ -79,7 +79,7 @@ CCL_NAMESPACE_BEGIN /* Scalar */ -#ifndef __HIP__ +#if !defined(__HIP__) && !defined(__KERNEL_ONEAPI__) # ifdef _WIN32 ccl_device_inline float fmaxf(float a, float b) { @@ -92,12 +92,18 @@ ccl_device_inline float fminf(float a, float b) } # endif /* _WIN32 */ -#endif /* __HIP__ */ +#endif /* __HIP__, __KERNEL_ONEAPI__ */ -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) +# ifndef __KERNEL_ONEAPI__ using std::isfinite; using std::isnan; using std::sqrt; +# else +using sycl::sqrt; +# define isfinite(x) sycl::isfinite((x)) +# define isnan(x) sycl::isnan((x)) +# endif ccl_device_inline int abs(int x) { @@ -793,6 +799,8 @@ ccl_device_inline uint 
popcount(uint x) return i & 1; } # endif +#elif defined(__KERNEL_ONEAPI__) +# define popcount(x) sycl::popcount(x) #elif defined(__KERNEL_HIP__) /* Use popcll to support 64-bit wave for pre-RDNA AMD GPUs */ # define popcount(x) __popcll(x) @@ -806,6 +814,8 @@ ccl_device_inline uint count_leading_zeros(uint x) return __clz(x); #elif defined(__KERNEL_METAL__) return clz(x); +#elif defined(__KERNEL_ONEAPI__) + return sycl::clz(x); #else assert(x != 0); # ifdef _MSC_VER @@ -824,6 +834,8 @@ ccl_device_inline uint count_trailing_zeros(uint x) return (__ffs(x) - 1); #elif defined(__KERNEL_METAL__) return ctz(x); +#elif defined(__KERNEL_ONEAPI__) + return sycl::ctz(x); #else assert(x != 0); # ifdef _MSC_VER diff --git a/intern/cycles/util/types_float2.h b/intern/cycles/util/types_float2.h index d8b2efb7b4b..07b9ec0986b 100644 --- a/intern/cycles/util/types_float2.h +++ b/intern/cycles/util/types_float2.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct float2 { float x, y; @@ -20,7 +20,7 @@ struct float2 { ccl_device_inline float2 make_float2(float x, float y); ccl_device_inline void print_float2(const char *label, const float2 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float2_impl.h b/intern/cycles/util/types_float2_impl.h index d67ec946b79..45fc90c52bd 100644 --- a/intern/cycles/util/types_float2_impl.h +++ b/intern/cycles/util/types_float2_impl.h @@ -14,7 +14,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) __forceinline float float2::operator[](int i) const { util_assert(i >= 0); @@ -39,7 +39,7 @@ ccl_device_inline void print_float2(const char *label, const float2 &a) { printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y); } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || 
defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float3.h b/intern/cycles/util/types_float3.h index 060c2ac4152..c7900acaa69 100644 --- a/intern/cycles/util/types_float3.h +++ b/intern/cycles/util/types_float3.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) struct ccl_try_align(16) float3 { # ifdef __KERNEL_SSE__ @@ -40,7 +40,7 @@ struct ccl_try_align(16) float3 ccl_device_inline float3 make_float3(float f); ccl_device_inline float3 make_float3(float x, float y, float z); ccl_device_inline void print_float3(const char *label, const float3 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) */ /* Smaller float3 for storage. For math operations this must be converted to float3, so that on the * CPU SIMD instructions can be used. */ diff --git a/intern/cycles/util/types_float3_impl.h b/intern/cycles/util/types_float3_impl.h index f5ffc48c1be..2e6e864c8ea 100644 --- a/intern/cycles/util/types_float3_impl.h +++ b/intern/cycles/util/types_float3_impl.h @@ -14,7 +14,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) # ifdef __KERNEL_SSE__ __forceinline float3::float3() { @@ -83,7 +83,7 @@ ccl_device_inline void print_float3(const char *label, const float3 &a) { printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z); } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float4.h b/intern/cycles/util/types_float4.h index 68ba787dac0..27453bf39e4 100644 --- a/intern/cycles/util/types_float4.h +++ b/intern/cycles/util/types_float4.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct int4; struct ccl_try_align(16) float4 @@ -43,7 +43,7 @@ ccl_device_inline float4 make_float4(float f); ccl_device_inline float4 make_float4(float x, float y, float z, float w); ccl_device_inline float4 
make_float4(const int4 &i); ccl_device_inline void print_float4(const char *label, const float4 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float4_impl.h b/intern/cycles/util/types_float4_impl.h index de2e7cb7061..d7858f744e3 100644 --- a/intern/cycles/util/types_float4_impl.h +++ b/intern/cycles/util/types_float4_impl.h @@ -14,7 +14,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) # ifdef __KERNEL_SSE__ __forceinline float4::float4() { @@ -89,7 +89,7 @@ ccl_device_inline void print_float4(const char *label, const float4 &a) { printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w); } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float8.h b/intern/cycles/util/types_float8.h index 99f9ec9b867..d71149946f7 100644 --- a/intern/cycles/util/types_float8.h +++ b/intern/cycles/util/types_float8.h @@ -11,7 +11,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct ccl_try_align(32) float8 { @@ -43,7 +43,7 @@ struct ccl_try_align(32) float8 ccl_device_inline float8 make_float8(float f); ccl_device_inline float8 make_float8(float a, float b, float c, float d, float e, float f, float g, float h); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_float8_impl.h b/intern/cycles/util/types_float8_impl.h index 19818976b50..0694f5205a5 100644 --- a/intern/cycles/util/types_float8_impl.h +++ b/intern/cycles/util/types_float8_impl.h @@ -15,7 +15,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) # ifdef __KERNEL_AVX2__ __forceinline float8::float8() 
{ @@ -81,7 +81,7 @@ make_float8(float a, float b, float c, float d, float e, float f, float g, float return r; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int2.h b/intern/cycles/util/types_int2.h index 4daf387d9cf..bf69cddc653 100644 --- a/intern/cycles/util/types_int2.h +++ b/intern/cycles/util/types_int2.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct int2 { int x, y; @@ -19,7 +19,7 @@ struct int2 { }; ccl_device_inline int2 make_int2(int x, int y); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int2_impl.h b/intern/cycles/util/types_int2_impl.h index 7989c4d5506..7bdc77369ee 100644 --- a/intern/cycles/util/types_int2_impl.h +++ b/intern/cycles/util/types_int2_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) int int2::operator[](int i) const { util_assert(i >= 0); @@ -30,7 +30,7 @@ ccl_device_inline int2 make_int2(int x, int y) int2 a = {x, y}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int3.h b/intern/cycles/util/types_int3.h index ad9bcb39bbe..f88ff22ac35 100644 --- a/intern/cycles/util/types_int3.h +++ b/intern/cycles/util/types_int3.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct ccl_try_align(16) int3 { # ifdef __KERNEL_SSE__ @@ -40,7 +40,7 @@ struct ccl_try_align(16) int3 ccl_device_inline int3 make_int3(int i); ccl_device_inline int3 make_int3(int x, int y, int z); ccl_device_inline void print_int3(const char *label, const int3 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* 
!defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int3_impl.h b/intern/cycles/util/types_int3_impl.h index 4cfc1cf2987..1c49e97ad32 100644 --- a/intern/cycles/util/types_int3_impl.h +++ b/intern/cycles/util/types_int3_impl.h @@ -14,7 +14,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) # ifdef __KERNEL_SSE__ __forceinline int3::int3() { @@ -84,7 +84,7 @@ ccl_device_inline void print_int3(const char *label, const int3 &a) { printf("%s: %d %d %d\n", label, a.x, a.y, a.z); } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int4.h b/intern/cycles/util/types_int4.h index f35632fb52f..9d557c01344 100644 --- a/intern/cycles/util/types_int4.h +++ b/intern/cycles/util/types_int4.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct float3; struct float4; @@ -46,7 +46,7 @@ ccl_device_inline int4 make_int4(int x, int y, int z, int w); ccl_device_inline int4 make_int4(const float3 &f); ccl_device_inline int4 make_int4(const float4 &f); ccl_device_inline void print_int4(const char *label, const int4 &a); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_int4_impl.h b/intern/cycles/util/types_int4_impl.h index adb4a4cebac..11e1ede6705 100644 --- a/intern/cycles/util/types_int4_impl.h +++ b/intern/cycles/util/types_int4_impl.h @@ -14,7 +14,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) # ifdef __KERNEL_SSE__ __forceinline int4::int4() { @@ -83,6 +83,8 @@ ccl_device_inline int4 make_int4(const float3 &f) { # ifdef __KERNEL_SSE__ int4 a(_mm_cvtps_epi32(f.m128)); +# elif defined(__KERNEL_ONEAPI__) + int4 a = {(int)f.x, 
(int)f.y, (int)f.z, 0}; # else int4 a = {(int)f.x, (int)f.y, (int)f.z, (int)f.w}; # endif @@ -103,7 +105,7 @@ ccl_device_inline void print_int4(const char *label, const int4 &a) { printf("%s: %d %d %d %d\n", label, a.x, a.y, a.z, a.w); } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar2.h b/intern/cycles/util/types_uchar2.h index 445fa8dd703..0b3c9bd0331 100644 --- a/intern/cycles/util/types_uchar2.h +++ b/intern/cycles/util/types_uchar2.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct uchar2 { uchar x, y; @@ -19,7 +19,7 @@ struct uchar2 { }; ccl_device_inline uchar2 make_uchar2(uchar x, uchar y); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar2_impl.h b/intern/cycles/util/types_uchar2_impl.h index cec1c679050..a7254d5eaf2 100644 --- a/intern/cycles/util/types_uchar2_impl.h +++ b/intern/cycles/util/types_uchar2_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) uchar uchar2::operator[](int i) const { util_assert(i >= 0); @@ -30,7 +30,7 @@ ccl_device_inline uchar2 make_uchar2(uchar x, uchar y) uchar2 a = {x, y}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar3.h b/intern/cycles/util/types_uchar3.h index 1ebd86441c3..fc213502ada 100644 --- a/intern/cycles/util/types_uchar3.h +++ b/intern/cycles/util/types_uchar3.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct uchar3 { uchar x, y, z; @@ -19,7 +19,7 @@ struct uchar3 { }; ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z); 
-#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar3_impl.h b/intern/cycles/util/types_uchar3_impl.h index 0656baa3da4..0c24ffb488a 100644 --- a/intern/cycles/util/types_uchar3_impl.h +++ b/intern/cycles/util/types_uchar3_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) uchar uchar3::operator[](int i) const { util_assert(i >= 0); @@ -30,7 +30,7 @@ ccl_device_inline uchar3 make_uchar3(uchar x, uchar y, uchar z) uchar3 a = {x, y, z}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar4.h b/intern/cycles/util/types_uchar4.h index 2ac4fb56cbb..a2a2c945aaa 100644 --- a/intern/cycles/util/types_uchar4.h +++ b/intern/cycles/util/types_uchar4.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct uchar4 { uchar x, y, z, w; @@ -19,7 +19,7 @@ struct uchar4 { }; ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uchar4_impl.h b/intern/cycles/util/types_uchar4_impl.h index b3e8abfe873..8ec6213a37d 100644 --- a/intern/cycles/util/types_uchar4_impl.h +++ b/intern/cycles/util/types_uchar4_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) uchar uchar4::operator[](int i) const { util_assert(i >= 0); @@ -30,7 +30,7 @@ ccl_device_inline uchar4 make_uchar4(uchar x, uchar y, uchar z, uchar w) uchar4 a = {x, y, z, w}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git 
a/intern/cycles/util/types_uint2.h b/intern/cycles/util/types_uint2.h index e3254b9f0e1..faa0955f903 100644 --- a/intern/cycles/util/types_uint2.h +++ b/intern/cycles/util/types_uint2.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct uint2 { uint x, y; @@ -19,7 +19,7 @@ struct uint2 { }; ccl_device_inline uint2 make_uint2(uint x, uint y); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uint2_impl.h b/intern/cycles/util/types_uint2_impl.h index e67134a011e..cac0ba6b531 100644 --- a/intern/cycles/util/types_uint2_impl.h +++ b/intern/cycles/util/types_uint2_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) __forceinline uint uint2::operator[](uint i) const { util_assert(i < 2); @@ -28,7 +28,7 @@ ccl_device_inline uint2 make_uint2(uint x, uint y) uint2 a = {x, y}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uint3.h b/intern/cycles/util/types_uint3.h index 885a8fb84ce..3ff87bfc791 100644 --- a/intern/cycles/util/types_uint3.h +++ b/intern/cycles/util/types_uint3.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct uint3 { uint x, y, z; @@ -19,7 +19,7 @@ struct uint3 { }; ccl_device_inline uint3 make_uint3(uint x, uint y, uint z); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uint3_impl.h b/intern/cycles/util/types_uint3_impl.h index f4d3d72469c..221883a1adb 100644 --- a/intern/cycles/util/types_uint3_impl.h +++ b/intern/cycles/util/types_uint3_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef 
__KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) __forceinline uint uint3::operator[](uint i) const { util_assert(i < 3); @@ -28,7 +28,7 @@ ccl_device_inline uint3 make_uint3(uint x, uint y, uint z) uint3 a = {x, y, z}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uint4.h b/intern/cycles/util/types_uint4.h index d582b91d2a0..504095b2383 100644 --- a/intern/cycles/util/types_uint4.h +++ b/intern/cycles/util/types_uint4.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct uint4 { uint x, y, z, w; @@ -19,7 +19,7 @@ struct uint4 { }; ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w); -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_uint4_impl.h b/intern/cycles/util/types_uint4_impl.h index 98a4c5e9fe9..d78db944a1f 100644 --- a/intern/cycles/util/types_uint4_impl.h +++ b/intern/cycles/util/types_uint4_impl.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) __forceinline uint uint4::operator[](uint i) const { util_assert(i < 3); @@ -28,7 +28,7 @@ ccl_device_inline uint4 make_uint4(uint x, uint y, uint z, uint w) uint4 a = {x, y, z, w}; return a; } -#endif /* __KERNEL_GPU__ */ +#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */ CCL_NAMESPACE_END diff --git a/intern/cycles/util/types_ushort4.h b/intern/cycles/util/types_ushort4.h index 1766c6bf734..9a6e12095ba 100644 --- a/intern/cycles/util/types_ushort4.h +++ b/intern/cycles/util/types_ushort4.h @@ -10,7 +10,7 @@ CCL_NAMESPACE_BEGIN -#ifndef __KERNEL_GPU__ +#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) struct ushort4 { uint16_t x, y, z, w; |