Diffstat (limited to 'intern/cycles/kernel/device/gpu')
-rw-r--r--  intern/cycles/kernel/device/gpu/image.h                 |   4
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h                |  14
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h | 100
3 files changed, 104 insertions, 14 deletions
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
index 29d851ae478..a8c72645569 100644
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -181,7 +181,7 @@ ccl_device_noinline typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_in
 
 ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
 {
-  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_data_fetch(texture_info, id);
 
   /* float4, byte4, ushort4 and half4 */
   const int texture_type = info.data_type;
@@ -216,7 +216,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
                                              float3 P,
                                              InterpolationType interp)
 {
-  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_data_fetch(texture_info, id);
 
   if (info.use_transform_3d) {
     P = transform_point(&info.transform_3d, P);
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index d657571a5fa..d7d2000775f 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -14,6 +14,8 @@
 
 #ifdef __KERNEL_METAL__
 #  include "kernel/device/metal/context_begin.h"
+#elif defined(__KERNEL_ONEAPI__)
+#  include "kernel/device/oneapi/context_begin.h"
 #endif
 
 #include "kernel/device/gpu/work_stealing.h"
@@ -40,6 +42,8 @@
 
 #ifdef __KERNEL_METAL__
 #  include "kernel/device/metal/context_end.h"
+#elif defined(__KERNEL_ONEAPI__)
+#  include "kernel/device/oneapi/context_end.h"
 #endif
 
 #include "kernel/film/read.h"
@@ -242,7 +246,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 ccl_gpu_kernel_postfix
 
 #if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
-constant int __dummy_constant [[function_constant(0)]];
+constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
 #endif
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
@@ -522,7 +526,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 
   bool converged = true;
   if (x < sw && y < sh) {
-    converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check(
+    converged = ccl_gpu_kernel_call(film_adaptive_sampling_convergence_check(
         nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
   }
 
@@ -549,7 +553,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 
   if (y < sh) {
     ccl_gpu_kernel_call(
-        kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
+        film_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
   }
 }
 ccl_gpu_kernel_postfix
@@ -568,7 +572,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 
   if (x < sw) {
     ccl_gpu_kernel_call(
-        kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
+        film_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
  }
 }
 ccl_gpu_kernel_postfix
@@ -585,7 +589,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   const int pixel_index = ccl_gpu_global_id_x();
 
   if (pixel_index < num_pixels) {
-    ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index));
+    ccl_gpu_kernel_call(film_cryptomatte_post(nullptr, render_buffer, pixel_index));
   }
 }
 ccl_gpu_kernel_postfix
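The new oneapi/context_begin.h and context_end.h includes above hook into the same mechanism the Metal backend already uses: the shared kernel sources are #included between a pair of headers that open and close a wrapper scope, so per-dispatch state can be carried without true device globals. The following is a minimal host-compilable sketch of that wrapper pattern; every name in it is hypothetical and the real Cycles headers differ per backend.

#include <cstdio>

/* context_begin.h (sketch): open a wrapper struct so that kernel functions
 * #included after this point become member functions with access to
 * per-dispatch state instead of global variables. */
struct ExampleKernelContext {
  const int *launch_params; /* stand-in for the real per-device parameters */

  /* A "kernel" function that would normally be #included between the two
   * headers; it reads state through the context rather than a global. */
  int fetch_param(int id) const
  {
    return launch_params[id];
  }

/* context_end.h (sketch): close the wrapper struct. */
};

int main()
{
  const int params[3] = {10, 20, 30};
  ExampleKernelContext ctx = {params};
  printf("%d\n", ctx.fetch_param(1)); /* prints 20 */
  return 0;
}

Metal needs this because the shading language forbids mutable program-scope variables; the oneAPI headers presumably slot into the same include points to inject their own per-kernel state.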
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index 7d7266d5edf..c1df49c4f49 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -18,15 +18,68 @@ CCL_NAMESPACE_BEGIN
 
 #  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
 #endif
 
-#ifndef __KERNEL_METAL__
+/* TODO: abstract more device differences, define ccl_gpu_local_syncthreads,
+ * ccl_gpu_thread_warp, ccl_gpu_warp_index, ccl_gpu_num_warps for all devices
+ * and keep device specific code in compat.h */
+
+#ifdef __KERNEL_ONEAPI__
+#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+template<typename IsActiveOp>
+void cpu_serial_active_index_array_impl(const uint num_states,
+                                        ccl_global int *ccl_restrict indices,
+                                        ccl_global int *ccl_restrict num_indices,
+                                        IsActiveOp is_active_op)
+{
+  int write_index = 0;
+  for (int state_index = 0; state_index < num_states; state_index++) {
+    if (is_active_op(state_index))
+      indices[write_index++] = state_index;
+  }
+  *num_indices = write_index;
+  return;
+}
+#  endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
+
+template<typename IsActiveOp>
+void gpu_parallel_active_index_array_impl(const uint num_states,
+                                          ccl_global int *ccl_restrict indices,
+                                          ccl_global int *ccl_restrict num_indices,
+                                          IsActiveOp is_active_op)
+{
+  const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
+  const uint blocksize = item_id.get_local_range(0);
+
+  sycl::multi_ptr<int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1],
+                  sycl::access::address_space::local_space>
+      ptr = sycl::ext::oneapi::group_local_memory<
+          int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group());
+  int *warp_offset = *ptr;
+
+  /* NOTE(@nsirgien): Here we calculate the same value as below but
+   * faster for DPC++ : seems CUDA converting "%", "/", "*" based calculations below into
+   * something faster already but DPC++ doesn't, so it's better to use
+   * direct request of needed parameters - switching from this computation to computation below
+   * will cause 2.5x performance slowdown. */
+  const uint thread_index = item_id.get_local_id(0);
+  const uint thread_warp = item_id.get_sub_group().get_local_id();
+
+  const uint warp_index = item_id.get_sub_group().get_group_id();
+  const uint num_warps = item_id.get_sub_group().get_group_range()[0];
+
+  const uint state_index = item_id.get_global_id(0);
+
+  /* Test if state corresponding to this thread is active. */
+  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#else /* !__KERNEL__ONEAPI__ */
+#  ifndef __KERNEL_METAL__
 template<uint blocksize, typename IsActiveOp>
 __device__
-#endif
+#  endif
 void gpu_parallel_active_index_array_impl(const uint num_states,
                                           ccl_global int *indices,
                                           ccl_global int *num_indices,
-#ifdef __KERNEL_METAL__
+#  ifdef __KERNEL_METAL__
                                           const uint is_active,
                                           const uint blocksize,
                                           const int thread_index,
@@ -37,7 +90,7 @@ __device__
                                           const int num_warps,
                                           threadgroup int *warp_offset)
 {
-#else
+#  else
                                           IsActiveOp is_active_op)
 {
   extern ccl_gpu_shared int warp_offset[];
@@ -52,18 +105,33 @@ __device__
 
   /* Test if state corresponding to this thread is active. */
   const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
-#endif
-
+#  endif
+#endif /* !__KERNEL_ONEAPI__ */
   /* For each thread within a warp compute how many other active states precede it. */
+#ifdef __KERNEL_ONEAPI__
+  const uint thread_offset = sycl::exclusive_scan_over_group(
+      item_id.get_sub_group(), is_active, std::plus<>());
+#else
   const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
                                       ccl_gpu_thread_mask(thread_warp));
+#endif
 
   /* Last thread in warp stores number of active states for each warp. */
+#ifdef __KERNEL_ONEAPI__
+  if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) {
+#else
   if (thread_warp == ccl_gpu_warp_size - 1) {
+#endif
     warp_offset[warp_index] = thread_offset + is_active;
   }
 
+#ifdef __KERNEL_ONEAPI__
+  /* NOTE(@nsirgien): For us here only local memory writing (warp_offset) is important,
+   * so faster local barriers can be used. */
+  ccl_gpu_local_syncthreads();
+#else
   ccl_gpu_syncthreads();
+#endif
 
   /* Last thread in block converts per-warp sizes to offsets, increments global size of
    * index array and gets offset to write to. */
@@ -80,7 +148,13 @@ __device__
     warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
   }
 
+#ifdef __KERNEL_ONEAPI__
+  /* NOTE(@nsirgien): For us here only important local memory writing (warp_offset),
+   * so faster local barriers can be used. */
+  ccl_gpu_local_syncthreads();
+#else
   ccl_gpu_syncthreads();
+#endif
 
   /* Write to index array. */
   if (is_active) {
@@ -107,7 +181,19 @@ __device__
                                        simd_group_index, \
                                        num_simd_groups, \
                                        simdgroup_offset)
-
+#elif defined(__KERNEL_ONEAPI__)
+#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
+#    define gpu_parallel_active_index_array( \
+        blocksize, num_states, indices, num_indices, is_active_op) \
+      if (ccl_gpu_global_size_x() == 1) \
+        cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
+      else \
+        gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
+#  else
+#    define gpu_parallel_active_index_array( \
+        blocksize, num_states, indices, num_indices, is_active_op) \
+      gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
+#  endif
 #else
 #  define gpu_parallel_active_index_array( \
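For reference, the compaction that gpu_parallel_active_index_array_impl performs above is: a per-warp exclusive scan of the active flags (popcount over a masked ballot on CUDA/HIP/Metal, sycl::exclusive_scan_over_group on oneAPI), warp totals converted to block-level offsets, one atomic add to reserve space in the output array, then a scattered write per active state. The same scheme can be emulated serially on the host; the sketch below is illustrative only, not Cycles code, and the warp/block sizes and all names in it are assumptions.

#include <cstdint>
#include <cstdio>
#include <vector>

static constexpr uint32_t kWarpSize = 32;   /* assumed warp width */
static constexpr uint32_t kBlockSize = 512; /* mirrors GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE */

template<typename IsActiveOp>
void serial_emulation_active_index_array(const uint32_t num_states,
                                         std::vector<int> &indices,
                                         int *num_indices,
                                         IsActiveOp is_active_op)
{
  const uint32_t num_warps = kBlockSize / kWarpSize;

  /* Process one "block" of states at a time, as a GPU thread block would. */
  for (uint32_t block_start = 0; block_start < num_states; block_start += kBlockSize) {
    std::vector<int> thread_offset(kBlockSize, 0);
    std::vector<int> warp_offset(num_warps, 0);

    /* Phase 1: per-warp exclusive scan of active flags (the ballot/popcount
     * or sub-group scan step), plus the per-warp total. */
    for (uint32_t w = 0; w < num_warps; w++) {
      int running = 0;
      for (uint32_t lane = 0; lane < kWarpSize; lane++) {
        const uint32_t state_index = block_start + w * kWarpSize + lane;
        const int is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
        thread_offset[w * kWarpSize + lane] = running; /* exclusive scan value */
        running += is_active;
      }
      warp_offset[w] = running; /* number of active states in this warp */
    }

    /* Phase 2: "last thread in block" converts warp sizes to offsets and
     * reserves room in the global index array (an atomic add on the GPU). */
    int block_num_active = 0;
    for (uint32_t w = 0; w < num_warps; w++) {
      const int size = warp_offset[w];
      warp_offset[w] = block_num_active;
      block_num_active += size;
    }
    const int write_base = *num_indices;
    *num_indices += block_num_active;

    /* Phase 3: every active thread writes its state index to the array. */
    for (uint32_t t = 0; t < kBlockSize; t++) {
      const uint32_t state_index = block_start + t;
      if (state_index < num_states && is_active_op(state_index)) {
        indices[write_base + warp_offset[t / kWarpSize] + thread_offset[t]] = state_index;
      }
    }
  }
}

int main()
{
  const uint32_t num_states = 100;
  std::vector<int> indices(num_states, -1);
  int num_indices = 0;
  /* Example predicate: states with an odd index are "active". */
  serial_emulation_active_index_array(
      num_states, indices, &num_indices, [](uint32_t i) { return int(i & 1); });
  printf("%d active states, first is %d\n", num_indices, indices[0]); /* 50, 1 */
  return 0;
}

On the device the three phases are separated by barriers (ccl_gpu_syncthreads, or the cheaper ccl_gpu_local_syncthreads on oneAPI since only local memory is involved); the serial loops above make those ordering points explicit.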