From 6e268a749fee16b442bcb3fba6cb6e08850d8389 Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Tue, 21 Sep 2021 17:03:22 +0200 Subject: Fix adaptive sampling artifacts on tile boundaries Implement an overscan support for tiles, so that adaptive sampling can rely on the pixels neighbourhood. Differential Revision: https://developer.blender.org/D12599 --- intern/cycles/kernel/device/gpu/kernel.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 7b79c0aedfa..3379114fc62 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -424,8 +424,12 @@ ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *k return; } - const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; - ccl_global const float *buffer = render_buffer + render_buffer_offset; + const int x = render_pixel_index % width; + const int y = render_pixel_index / width; + + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride + + y * stride * kfilm_convert->pass_stride; + ccl_global float *pixel = pixels + (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride; @@ -451,17 +455,17 @@ ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( return; } - const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; - ccl_global const float *buffer = render_buffer + render_buffer_offset; + const int x = render_pixel_index % width; + const int y = render_pixel_index / width; + + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride + + y * stride * kfilm_convert->pass_stride; float pixel[4]; processor(kfilm_convert, buffer, pixel); film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); - const int x = render_pixel_index % width; - const int y = render_pixel_index / width; - ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); } -- cgit v1.2.3 From 2ba7c3aa650c3c795d903a24998204f67c75b017 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Wed, 13 Oct 2021 19:13:35 +0200 Subject: Cleanup: refactor to make number of channels for shader evaluation variable --- intern/cycles/kernel/device/gpu/kernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 3379114fc62..21901215757 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -615,7 +615,7 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba) ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset, const int work_size) { @@ -629,7 +629,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) kernel_gpu_shader_eval_background(KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset, const int work_size) { -- cgit v1.2.3 From 1df3b51988852fa8ee6b530a64aa23346db9acd4 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel 
Date: Sun, 17 Oct 2021 16:10:10 +0200 Subject: Cycles: replace integrator state argument macros * Rename struct KernelGlobals to struct KernelGlobalsCPU * Add KernelGlobals, IntegratorState and ConstIntegratorState typedefs that every device can define in its own way. * Remove INTEGRATOR_STATE_ARGS and INTEGRATOR_STATE_PASS macros and replace with these new typedefs. * Add explicit state argument to INTEGRATOR_STATE and similar macros In preparation for decoupling main and shadow paths. Differential Revision: https://developer.blender.org/D12888 --- intern/cycles/kernel/device/gpu/image.h | 4 ++-- intern/cycles/kernel/device/gpu/kernel.h | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h index b015c78a8f5..95a37c693ae 100644 --- a/intern/cycles/kernel/device/gpu/image.h +++ b/intern/cycles/kernel/device/gpu/image.h @@ -189,7 +189,7 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb( } #endif -ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y) +ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y) { const TextureInfo &info = kernel_tex_fetch(__texture_info, id); @@ -221,7 +221,7 @@ ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float } } -ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg, +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, int id, float3 P, InterpolationType interp) diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 21901215757..56beaf1fd91 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -51,8 +51,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int state = ccl_gpu_global_id_x(); if (state < num_states) { - INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; - INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; } } @@ -244,7 +244,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) == kernel); + return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel); }); } @@ -256,7 +256,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel); + return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel); }); } @@ -265,8 +265,8 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices, num_indices, [](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) != 0) || - (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0); + return (INTEGRATOR_STATE(state, path, queued_kernel) != 0) || + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0); }); } @@ -278,8 +278,8 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices + indices_offset, num_indices, [](const int state) 
{ - return (INTEGRATOR_STATE(path, queued_kernel) == 0) && - (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0); + return (INTEGRATOR_STATE(state, path, queued_kernel) == 0) && + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); }); } @@ -289,8 +289,8 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B { gpu_parallel_sorted_index_array( num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ? - INTEGRATOR_STATE(path, shader_sort_key) : + return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ? + INTEGRATOR_STATE(state, path, shader_sort_key) : GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; }); } @@ -304,8 +304,8 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B gpu_parallel_active_index_array( num_states, indices, num_indices, [num_active_paths](const int state) { return (state >= num_active_paths) && - ((INTEGRATOR_STATE(path, queued_kernel) != 0) || - (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0)); + ((INTEGRATOR_STATE(state, path, queued_kernel) != 0) || + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0)); }); } -- cgit v1.2.3 From 3065d2609700d14100490a16c91152a6e71790e8 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sun, 17 Oct 2021 20:43:06 +0200 Subject: Cycles: optimize volume stack copying for shadow catcher/compaction Only copy the number of items used instead of the max items. Ref D12889 --- intern/cycles/kernel/device/gpu/kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 56beaf1fd91..b5ecab2a4db 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -321,7 +321,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; - integrator_state_move(to_state, from_state); + integrator_state_move(NULL, to_state, from_state); } } -- cgit v1.2.3 From 943e73b07e26d64c04ccb7d8f656e3818a57cca0 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sun, 17 Oct 2021 16:22:20 +0200 Subject: Cycles: decouple shadow paths from main path on GPU The motivation for this is twofold. It improves performance (5-10% on most benchmark scenes), and will help to bring back transparency support for the ambient occlusion pass. * Duplicate some members from the main path state in the shadow path state. * Add shadow paths incrementally to the array similar to what we do for the shadow catchers. * For the scheduling, allow running shade surface and shade volume kernels as long as there is enough space in the shadow paths array. If not, execute shadow kernels until it is empty. * Add IntegratorShadowState and ConstIntegratorShadowState typedefs that can be different between CPU and GPU. For GPU both main and shadow paths juse have an integer for SoA access. Bt with CPU it's a different pointer type so we get type safety checks in code shared between CPU and GPU. * For CPU, add a separate IntegratorShadowStateCPU struct embedded in IntegratorShadowState. * Update various functions to take the shadow state, and make SVM take either type of state using templates. 
Differential Revision: https://developer.blender.org/D12889 --- intern/cycles/kernel/device/gpu/kernel.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index b5ecab2a4db..6b4d79ed5b7 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -265,8 +265,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices, num_indices, [](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) != 0) || - (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0); + return (INTEGRATOR_STATE(state, path, queued_kernel) != 0); }); } @@ -278,8 +277,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices + indices_offset, num_indices, [](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) == 0) && - (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); + return (INTEGRATOR_STATE(state, path, queued_kernel) == 0); }); } @@ -303,9 +301,7 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B { gpu_parallel_active_index_array( num_states, indices, num_indices, [num_active_paths](const int state) { - return (state >= num_active_paths) && - ((INTEGRATOR_STATE(state, path, queued_kernel) != 0) || - (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0)); + return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0); }); } -- cgit v1.2.3 From fd77a28031daff3122ded3a1cb37a7fb44feedf6 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Mon, 20 Sep 2021 16:16:11 +0200 Subject: Cycles: bake transparent shadows for hair These transparent shadows can be expansive to evaluate. Especially on the GPU they can lead to poor occupancy when only some pixels require many kernel launches to trace and evaluate many layers of transparency. Baked transparency allows tracing a single ray in many cases by accumulating the throughput directly in the intersection program without recording hits or evaluating shaders. Transparency is baked at curve vertices and interpolated, for most shaders this will look practically the same as actual shader evaluation. Fixes T91428, performance regression with spring demo file due to transparent hair, and makes it render significantly faster than Blender 2.93. 
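The baking idea can be sketched as follows; the helper name and the linear interpolation along the segment are assumptions for the example, not the actual intersection code in this patch:

    /* Sketch: transparency baked at the two curve key vertices is interpolated
     * at the hit parameter u and folded straight into the shadow throughput,
     * so no hit record and no shader evaluation is needed for this segment. */
    ccl_device_inline float curve_baked_shadow_transparency(const float transparency_v0,
                                                            const float transparency_v1,
                                                            const float u)
    {
      return (1.0f - u) * transparency_v0 + u * transparency_v1;
    }

    /* In the intersection program the accumulation step is conceptually:
     *   throughput *= curve_baked_shadow_transparency(t0, t1, u);
     *   if (throughput becomes negligible) treat the shadow ray as blocked. */
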
Differential Revision: https://developer.blender.org/D12880 --- intern/cycles/kernel/device/gpu/kernel.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 6b4d79ed5b7..b6df74e835a 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -621,7 +621,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } } -/* Background Shader Evaluation */ +/* Background */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) kernel_gpu_shader_eval_background(KernelShaderEvalInput *input, @@ -635,6 +635,20 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } } +/* Curve Shadow Transparency */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + kernel_gpu_shader_eval_curve_shadow_transparency(KernelShaderEvalInput *input, + float *output, + const int offset, + const int work_size) +{ + int i = ccl_gpu_global_id_x(); + if (i < work_size) { + kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i); + } +} + /* -------------------------------------------------------------------- * Denoising. */ -- cgit v1.2.3 From cccfa597ba69944817e0913944cf3c3d0a6e1165 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sun, 17 Oct 2021 18:08:00 +0200 Subject: Cycles: make ambient occlusion pass take into account transparency again Taking advantage of the new decoupled main and shadow paths. For CPU we just store two nested structs in the integrator state, one for direct light shadows and one for AO. For the GPU we restrict the number of shade surface states to be executed based on available space in the shadow paths queue. This also helps improve performance in benchmark scenes with an AO pass, since it is no longer needed to use the shader raytracing kernel there, which has worse performance. Differential Revision: https://developer.blender.org/D12900 --- intern/cycles/kernel/device/gpu/kernel.h | 22 +++++++++++++++++----- .../cycles/kernel/device/gpu/parallel_prefix_sum.h | 8 +++++--- .../kernel/device/gpu/parallel_sorted_index.h | 12 +++++++++++- 3 files changed, 33 insertions(+), 9 deletions(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index b6df74e835a..fcb398f7e6d 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -282,11 +282,22 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B } extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_sorted_paths_array( - int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel) + kernel_gpu_integrator_sorted_paths_array(int num_states, + int num_states_limit, + int *indices, + int *num_indices, + int *key_counter, + int *key_prefix_sum, + int kernel) { gpu_parallel_sorted_index_array( - num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) { + num_states, + num_states_limit, + indices, + num_indices, + key_counter, + key_prefix_sum, + [kernel](const int state) { return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ? 
INTEGRATOR_STATE(state, path, shader_sort_key) : GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; @@ -322,9 +333,10 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B } extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) - kernel_gpu_prefix_sum(int *values, int num_values) + kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values) { - gpu_parallel_prefix_sum(values, num_values); + gpu_parallel_prefix_sum( + counter, prefix_sum, num_values); } /* -------------------------------------------------------------------- diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index a1349e82efb..aabe6e2e27a 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -33,7 +33,8 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 #endif -template __device__ void gpu_parallel_prefix_sum(int *values, const int num_values) +template +__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values) { if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) { return; @@ -41,8 +42,9 @@ template __device__ void gpu_parallel_prefix_sum(int *values, co int offset = 0; for (int i = 0; i < num_values; i++) { - const int new_offset = offset + values[i]; - values[i] = offset; + const int new_offset = offset + counter[i]; + prefix_sum[i] = offset; + counter[i] = 0; offset = new_offset; } } diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index 9bca1fad22f..7570c5a6bbd 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -35,8 +35,10 @@ CCL_NAMESPACE_BEGIN template __device__ void gpu_parallel_sorted_index_array(const uint num_states, + const int num_states_limit, int *indices, int *num_indices, + int *key_counter, int *key_prefix_sum, GetKeyOp get_key_op) { @@ -46,7 +48,15 @@ __device__ void gpu_parallel_sorted_index_array(const uint num_states, if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) { const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1); - indices[index] = state_index; + if (index < num_states_limit) { + /* Assign state index. */ + indices[index] = state_index; + } + else { + /* Can't process this state now, increase the counter again so that + * it will be handled in another iteration. */ + atomic_fetch_and_add_uint32(&key_counter[key], 1); + } } } -- cgit v1.2.3 From df004637643241136a3294a63c7d4ca865cdea98 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Thu, 21 Oct 2021 15:14:30 +0200 Subject: Cycles: add shadow path compaction for GPU rendering Similar to main path compaction that happens before adding work tiles, this compacts shadow paths before launching kernels that may add shadow paths. Only do it when more than 50% of space is wasted. It's not a clear win in all scenes, some are up to 1.5% slower. Likely caused by different order of scheduling kernels having an unpredictable performance impact. Still feels like compaction is just the right thing to avoid cases where a few shadow paths can hold up a lot of main paths. 
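The 50% threshold mentioned above amounts to a simple host-side check, roughly as sketched here; the function and parameter names are placeholders for the scheduling code, not the actual implementation:

    /* Sketch of the trigger only: compact the shadow path array when the
     * number of live shadow paths is less than half of the used range,
     * i.e. when more than 50% of the space up to the highest used index is
     * wasted on terminated paths. */
    static bool shadow_compaction_worthwhile(const int num_active_shadow_paths,
                                             const int max_used_shadow_path_index)
    {
      return num_active_shadow_paths < max_used_shadow_path_index / 2;
    }

The compaction itself then reuses the active/terminated index arrays together with the kernel_gpu_integrator_compact_shadow_states entry point added in this patch to move states from the tail of the array into the terminated slots.
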
Differential Revision: https://developer.blender.org/D12944 --- intern/cycles/kernel/device/gpu/kernel.h | 41 ++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index fcb398f7e6d..eeac09d4b29 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -281,6 +281,18 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_B }); } +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_terminated_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int indices_offset) +{ + gpu_parallel_active_index_array( + num_states, indices + indices_offset, num_indices, [](const int state) { + return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); + }); +} + extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) kernel_gpu_integrator_sorted_paths_array(int num_states, int num_states_limit, @@ -332,6 +344,35 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B } } +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_shadow_paths_array(int num_states, + int *indices, + int *num_indices, + int num_active_paths) +{ + gpu_parallel_active_index_array( + num_states, indices, num_indices, [num_active_paths](const int state) { + return (state >= num_active_paths) && + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0); + }); +} + +extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int from_state = active_terminated_states[active_states_offset + global_index]; + const int to_state = active_terminated_states[terminated_states_offset + global_index]; + + integrator_shadow_state_move(NULL, to_state, from_state); + } +} + extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values) { -- cgit v1.2.3 From 282516e53eba9bb3aaddd67b2b099fea98bd4c1f Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Thu, 21 Oct 2021 19:25:38 +0200 Subject: Cleanup: refactor float/half conversions for clarity --- intern/cycles/kernel/device/gpu/kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index eeac09d4b29..335cb1ec0c0 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -516,7 +516,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; - float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); + *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); } /* Common implementation for half4 destination and 3-channel input pass. 
*/ -- cgit v1.2.3 From d7d40745fa09061a3117bd3669c5a46bbf611eae Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sun, 24 Oct 2021 14:19:19 +0200 Subject: Cycles: changes to source code folders structure * Split render/ into scene/ and session/. The scene/ folder now contains the scene and its nodes. The session/ folder contains the render session and associated data structures like drivers and render buffers. * Move top level kernel headers into new folders kernel/camera/, kernel/film/, kernel/light/, kernel/sample/, kernel/util/ * Move integrator related kernel headers into kernel/integrator/ * Move OSL shaders from kernel/shaders/ to kernel/osl/shaders/ For patches and branches, git merge and rebase should be able to detect the renames and move over code to the right file. --- intern/cycles/kernel/device/gpu/kernel.h | 9 +++-- intern/cycles/kernel/device/gpu/work_stealing.h | 52 +++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 4 deletions(-) create mode 100644 intern/cycles/kernel/device/gpu/work_stealing.h (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 335cb1ec0c0..aa360b3016a 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -19,6 +19,7 @@ #include "kernel/device/gpu/parallel_active_index.h" #include "kernel/device/gpu/parallel_prefix_sum.h" #include "kernel/device/gpu/parallel_sorted_index.h" +#include "kernel/device/gpu/work_stealing.h" #include "kernel/integrator/integrator_state.h" #include "kernel/integrator/integrator_state_flow.h" @@ -36,10 +37,10 @@ #include "kernel/integrator/integrator_shade_surface.h" #include "kernel/integrator/integrator_shade_volume.h" -#include "kernel/kernel_adaptive_sampling.h" -#include "kernel/kernel_bake.h" -#include "kernel/kernel_film.h" -#include "kernel/kernel_work_stealing.h" +#include "kernel/bake/bake.h" + +#include "kernel/film/film_adaptive_sampling.h" +#include "kernel/film/film_read.h" /* -------------------------------------------------------------------- * Integrator. diff --git a/intern/cycles/kernel/device/gpu/work_stealing.h b/intern/cycles/kernel/device/gpu/work_stealing.h new file mode 100644 index 00000000000..fab0915c38e --- /dev/null +++ b/intern/cycles/kernel/device/gpu/work_stealing.h @@ -0,0 +1,52 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +CCL_NAMESPACE_BEGIN + +/* + * Utility functions for work stealing + */ + +/* Map global work index to tile, pixel X/Y and sample. */ +ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile, + uint global_work_index, + ccl_private uint *x, + ccl_private uint *y, + ccl_private uint *sample) +{ +#if 0 + /* Keep threads for the same sample together. 
*/ + uint tile_pixels = tile->w * tile->h; + uint sample_offset = global_work_index / tile_pixels; + uint pixel_offset = global_work_index - sample_offset * tile_pixels; +#else + /* Keeping threads for the same pixel together. + * Appears to improve performance by a few % on CUDA and OptiX. */ + uint sample_offset = global_work_index % tile->num_samples; + uint pixel_offset = global_work_index / tile->num_samples; +#endif + + uint y_offset = pixel_offset / tile->w; + uint x_offset = pixel_offset - y_offset * tile->w; + + *x = tile->x + x_offset; + *y = tile->y + y_offset; + *sample = tile->start_sample + sample_offset; +} + +CCL_NAMESPACE_END -- cgit v1.2.3 From fd25e883e2807a151f673b87c152a59701a0df80 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Sun, 24 Oct 2021 14:19:19 +0200 Subject: Cycles: remove prefix from source code file names Remove prefix of filenames that is the same as the folder name. This used to help when #includes were using individual files, but now they are always relative to the cycles root directory and so the prefixes are redundant. For patches and branches, git merge and rebase should be able to detect the renames and move over code to the right file. --- intern/cycles/kernel/device/gpu/kernel.h | 34 +++++++++++----------- .../kernel/device/gpu/parallel_active_index.h | 2 +- .../cycles/kernel/device/gpu/parallel_prefix_sum.h | 2 +- .../kernel/device/gpu/parallel_sorted_index.h | 2 +- 4 files changed, 20 insertions(+), 20 deletions(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index aa360b3016a..f86a8c692aa 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -21,26 +21,26 @@ #include "kernel/device/gpu/parallel_sorted_index.h" #include "kernel/device/gpu/work_stealing.h" -#include "kernel/integrator/integrator_state.h" -#include "kernel/integrator/integrator_state_flow.h" -#include "kernel/integrator/integrator_state_util.h" - -#include "kernel/integrator/integrator_init_from_bake.h" -#include "kernel/integrator/integrator_init_from_camera.h" -#include "kernel/integrator/integrator_intersect_closest.h" -#include "kernel/integrator/integrator_intersect_shadow.h" -#include "kernel/integrator/integrator_intersect_subsurface.h" -#include "kernel/integrator/integrator_intersect_volume_stack.h" -#include "kernel/integrator/integrator_shade_background.h" -#include "kernel/integrator/integrator_shade_light.h" -#include "kernel/integrator/integrator_shade_shadow.h" -#include "kernel/integrator/integrator_shade_surface.h" -#include "kernel/integrator/integrator_shade_volume.h" +#include "kernel/integrator/state.h" +#include "kernel/integrator/state_flow.h" +#include "kernel/integrator/state_util.h" + +#include "kernel/integrator/init_from_bake.h" +#include "kernel/integrator/init_from_camera.h" +#include "kernel/integrator/intersect_closest.h" +#include "kernel/integrator/intersect_shadow.h" +#include "kernel/integrator/intersect_subsurface.h" +#include "kernel/integrator/intersect_volume_stack.h" +#include "kernel/integrator/shade_background.h" +#include "kernel/integrator/shade_light.h" +#include "kernel/integrator/shade_shadow.h" +#include "kernel/integrator/shade_surface.h" +#include "kernel/integrator/shade_volume.h" #include "kernel/bake/bake.h" -#include "kernel/film/film_adaptive_sampling.h" -#include "kernel/film/film_read.h" +#include "kernel/film/adaptive_sampling.h" +#include "kernel/film/read.h" /* 
-------------------------------------------------------------------- * Integrator. diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index db4a4bf71e0..d7416beb783 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN * * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */ -#include "util/util_atomic.h" +#include "util/atomic.h" #ifdef __HIP__ # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index aabe6e2e27a..6de3a022569 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN * This is used for an array the size of the number of shaders in the scene * which is not usually huge, so might not be a significant bottleneck. */ -#include "util/util_atomic.h" +#include "util/atomic.h" #ifdef __HIP__ # define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024 diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index 7570c5a6bbd..c06d7be444f 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -24,7 +24,7 @@ CCL_NAMESPACE_BEGIN * * TODO: there may be ways to optimize this to avoid this many atomic ops? */ -#include "util/util_atomic.h" +#include "util/atomic.h" #ifdef __HIP__ # define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024 -- cgit v1.2.3 From 440a3475b8f5410e5c41bfbed5ce82771b41356f Mon Sep 17 00:00:00 2001 From: Patrick Mours Date: Tue, 9 Nov 2021 12:17:09 +0100 Subject: Cycles: Improve OptiX denoising with dark images and fix crash when denoiser is destroyed Adds a pass before denoising that calculates the intensity of the image, which can be passed into the OptiX denoiser for more optimal results for very dark or very bright images. In addition this also fixes a crash that sometimes occurred on exit. The OptiX denoiser object has to be destroyed before the OptiX device context object (since it references that). But in C++ the destructor function of a class is called before its fields are destructed, so "~OptiXDevice" was always called before "OptiXDevice::~Denoiser" and therefore "optixDeviceContextDestroy" was called before "optixDenoiserDestroy", hence the crash. Differential Revision: https://developer.blender.org/D13160 --- intern/cycles/kernel/device/gpu/kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index f86a8c692aa..5848ba5df9d 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -788,7 +788,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } /* Normal pass. 
*/ - if (render_pass_denoising_normal != PASS_UNUSED) { + if (guiding_pass_normal != PASS_UNUSED) { kernel_assert(render_pass_denoising_normal != PASS_UNUSED); const float *normal_in = buffer + render_pass_denoising_normal; -- cgit v1.2.3 From 3a4c8f406a3a3bf0627477c6183a594fa707a6e2 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Tue, 9 Nov 2021 21:30:46 +0000 Subject: Cycles: Adapt shared kernel/device/gpu layer for MSL This patch adapts the shared kernel entrypoints so that they can be compiled as MSL (Metal Shading Language). Where possible, the adaptations avoid changes in common code. In MSL, kernel function inputs are explicitly bound to resources. In the case of argument buffers, we declare a struct containing the kernel arguments, accessible via device pointer. This differs from CUDA and HIP where kernel function arguments are declared as traditional C-style function parameters. This patch adapts the entrypoints declared in kernel.h so that they can be translated via a new `ccl_gpu_kernel_signature` macro into the required parameter struct + kernel entrypoint pairing for MSL. MSL buffer attribution must be applied to function parameters or non-static class data members. To allow universal access to the integrator state, kernel data, and texture fetch adapters, we wrap all of the shared kernel code in a `MetalKernelContext` class. This is achieved by bracketing the appropriate kernel headers with "context_begin.h" and "context_end.h" on Metal. When calling deeper into the kernel code, we must reference the context class (e.g. `context.integrator_init_from_camera`). This extra prefixing is performed by a set of defines in "context_end.h". These will require explicit maintenance if entrypoints change. We invite discussion on more maintainable ways to enforce correctness. Lambda expressions are not supported on MSL, so a new `ccl_gpu_kernel_lambda` macro generates an inline function object and optionally capturing any required state. This yields the same behaviour. This approach is applied to all parallel_... implementations which are templated by operation. The lambda expressions in the film_convert... kernels don't adapt cleanly to use function objects. However, these entrypoints can be macro-generated more concisely to avoid lambda expressions entirely, instead relying on constant folding to handle the pixel/channel conversions. A separate implementation of `gpu_parallel_active_index_array` is provided for Metal to workaround some subtle differences in SIMD width, and also to encapsulate some required thread parameters which must be declared as explicit entrypoint function parameters. Ref T92212 Reviewed By: brecht Maniphest Tasks: T92212 Differential Revision: https://developer.blender.org/D13109 --- intern/cycles/kernel/device/gpu/image.h | 12 +- intern/cycles/kernel/device/gpu/kernel.h | 837 ++++++++++----------- .../kernel/device/gpu/parallel_active_index.h | 114 ++- .../cycles/kernel/device/gpu/parallel_prefix_sum.h | 8 +- .../kernel/device/gpu/parallel_sorted_index.h | 14 +- 5 files changed, 505 insertions(+), 480 deletions(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h index 95a37c693ae..0900a45c83d 100644 --- a/intern/cycles/kernel/device/gpu/image.h +++ b/intern/cycles/kernel/device/gpu/image.h @@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a) /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. 
*/ template -ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y) +ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info, + float x, + float y) { ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; @@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f /* Fast tricubic texture lookup using 8 trilinear lookups. */ template ccl_device_noinline T -kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z) +kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z) { ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; @@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl template ccl_device_noinline T kernel_tex_image_interp_nanovdb( - const TextureInfo &info, float x, float y, float z, uint interpolation) + ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation) { using namespace nanovdb; @@ -191,7 +193,7 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb( ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y) { - const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id); /* float4, byte4, ushort4 and half4 */ const int texture_type = info.data_type; @@ -226,7 +228,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, float3 P, InterpolationType interp) { - const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id); if (info.use_transform_3d) { P = transform_point(&info.transform_3d, P); diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 5848ba5df9d..2ec6a49ec7b 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -21,6 +21,10 @@ #include "kernel/device/gpu/parallel_sorted_index.h" #include "kernel/device/gpu/work_stealing.h" +#ifdef __KERNEL_METAL__ +# include "kernel/device/metal/context_begin.h" +#endif + #include "kernel/integrator/state.h" #include "kernel/integrator/state_flow.h" #include "kernel/integrator/state_util.h" @@ -40,6 +44,11 @@ #include "kernel/bake/bake.h" #include "kernel/film/adaptive_sampling.h" + +#ifdef __KERNEL_METAL__ +# include "kernel/device/metal/context_end.h" +#endif + #include "kernel/film/read.h" /* -------------------------------------------------------------------- @@ -47,7 +56,8 @@ */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_reset(int num_states) +ccl_gpu_kernel_signature(integrator_reset, + int num_states) { const int state = ccl_gpu_global_id_x(); @@ -58,10 +68,11 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles, - const int num_tiles, - float *render_buffer, - const int max_tile_work_size) + ccl_gpu_kernel_signature(integrator_init_from_camera, + ccl_global KernelWorkTile *tiles, + const int num_tiles, + ccl_global float *render_buffer, + const int max_tile_work_size) { const int work_index = ccl_gpu_global_id_x(); @@ -72,7 +83,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int tile_index = work_index / max_tile_work_size; const int 
tile_work_index = work_index - tile_index * max_tile_work_size; - const KernelWorkTile *tile = &tiles[tile_index]; + ccl_global const KernelWorkTile *tile = &tiles[tile_index]; if (tile_work_index >= tile->work_size) { return; @@ -83,14 +94,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) uint x, y, sample; get_work_pixel(tile, tile_work_index, &x, &y, &sample); - integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample); + ccl_gpu_kernel_call( + integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample)); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles, - const int num_tiles, - float *render_buffer, - const int max_tile_work_size) + ccl_gpu_kernel_signature(integrator_init_from_bake, + ccl_global KernelWorkTile *tiles, + const int num_tiles, + ccl_global float *render_buffer, + const int max_tile_work_size) { const int work_index = ccl_gpu_global_id_x(); @@ -101,7 +114,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int tile_index = work_index / max_tile_work_size; const int tile_work_index = work_index - tile_index * max_tile_work_size; - const KernelWorkTile *tile = &tiles[tile_index]; + ccl_global const KernelWorkTile *tile = &tiles[tile_index]; if (tile_work_index >= tile->work_size) { return; @@ -112,228 +125,260 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) uint x, y, sample; get_work_pixel(tile, tile_work_index, &x, &y, &sample); - integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample); + ccl_gpu_kernel_call( + integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample)); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_closest, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_closest(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_shadow, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_shadow(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_subsurface, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; - integrator_intersect_subsurface(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_volume_stack, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_volume_stack(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_background(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_background, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_background(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_light(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_light, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_light(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_shadow(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_shadow, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_shadow(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_surface(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_surface, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; - integrator_shade_surface(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_surface_raytrace, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_surface_raytrace(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_volume(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_volume, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_volume(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer)); } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_queued_paths_array(int num_states, - int *indices, - int *num_indices, - int kernel) +ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_queued_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int kernel_index) { - gpu_parallel_active_index_array( - num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel); - }); -} + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index, + int kernel_index) + .kernel_index = kernel_index; -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_queued_shadow_paths_array(int num_states, - int *indices, - int *num_indices, - int kernel) -{ gpu_parallel_active_index_array( - num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices) +ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int kernel_index) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index, + int kernel_index) + .kernel_index = kernel_index; + gpu_parallel_active_index_array( - num_states, indices, num_indices, [](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) != 0); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void 
__launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_terminated_paths_array(int num_states, - int *indices, - int *num_indices, - int indices_offset) +ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_active_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0); gpu_parallel_active_index_array( - num_states, indices + indices_offset, num_indices, [](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) == 0); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_terminated_shadow_paths_array(int num_states, - int *indices, - int *num_indices, - int indices_offset) +ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_terminated_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int indices_offset) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0); gpu_parallel_active_index_array( - num_states, indices + indices_offset, num_indices, [](const int state) { - return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); - }); + num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_sorted_paths_array(int num_states, - int num_states_limit, - int *indices, - int *num_indices, - int *key_counter, - int *key_prefix_sum, - int kernel) +ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int indices_offset) { - gpu_parallel_sorted_index_array( + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); + gpu_parallel_active_index_array( + num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); +} + +ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_sorted_paths_array, + int num_states, + int num_states_limit, + ccl_global int *indices, + ccl_global int *num_indices, + ccl_global int *key_counter, + ccl_global int *key_prefix_sum, + int kernel_index) +{ + ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ? + INTEGRATOR_STATE(state, path, shader_sort_key) : + GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY, + int kernel_index) + .kernel_index = kernel_index; + + const uint state_index = ccl_gpu_global_id_x(); + gpu_parallel_sorted_index_array( + state_index, num_states, num_states_limit, indices, - num_indices, - key_counter, - key_prefix_sum, - [kernel](const int state) { - return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ? 
- INTEGRATOR_STATE(state, path, shader_sort_key) : - GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; - }); -} - -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_paths_array(int num_states, - int *indices, - int *num_indices, - int num_active_paths) -{ + num_indices, + key_counter, + key_prefix_sum, + ccl_gpu_kernel_lambda_pass); +} + +ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int num_active_paths) +{ + ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0), + int num_active_paths) + .num_active_paths = num_active_paths; + gpu_parallel_active_index_array( - num_states, indices, num_indices, [num_active_paths](const int state) { - return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_states(const int *active_terminated_states, - const int active_states_offset, - const int terminated_states_offset, - const int work_size) +ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_states, + ccl_global const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); @@ -341,28 +386,31 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; - integrator_state_move(NULL, to_state, from_state); + ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state)); } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_shadow_paths_array(int num_states, - int *indices, - int *num_indices, - int num_active_paths) +ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int num_active_paths) { + ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0), + int num_active_paths) + .num_active_paths = num_active_paths; + gpu_parallel_active_index_array( - num_states, indices, num_indices, [num_active_paths](const int state) { - return (state >= num_active_paths) && - (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states, - const int active_states_offset, - const int terminated_states_offset, - const int work_size) +ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_shadow_states, + ccl_global const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) { const int global_index = 
ccl_gpu_global_id_x(); @@ -370,15 +418,14 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; - integrator_shadow_state_move(NULL, to_state, from_state); + ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state)); } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) - kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values) +ccl_gpu_kernel(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature( + prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values) { - gpu_parallel_prefix_sum( - counter, prefix_sum, num_values); + gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values); } /* -------------------------------------------------------------------- @@ -386,16 +433,17 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLO */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer, - int sx, - int sy, - int sw, - int sh, - float threshold, - bool reset, - int offset, - int stride, - uint *num_active_pixels) + ccl_gpu_kernel_signature(adaptive_sampling_convergence_check, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + float threshold, + bool reset, + int offset, + int stride, + ccl_global uint *num_active_pixels) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / sw; @@ -404,37 +452,51 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) bool converged = true; if (x < sw && y < sh) { - converged = kernel_adaptive_sampling_convergence_check( - nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride); + converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check( + nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride)); } /* NOTE: All threads specified in the mask must execute the intrinsic. 
*/ - const uint num_active_pixels_mask = ccl_gpu_ballot(!converged); + const auto num_active_pixels_mask = ccl_gpu_ballot(!converged); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask)); + atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_filter_x( - float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) + ccl_gpu_kernel_signature(adaptive_sampling_filter_x, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + int offset, + int stride) { const int y = ccl_gpu_global_id_x(); if (y < sh) { - kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride); + ccl_gpu_kernel_call( + kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_filter_y( - float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) + ccl_gpu_kernel_signature(adaptive_sampling_filter_y, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + int offset, + int stride) { const int x = ccl_gpu_global_id_x(); if (x < sw) { - kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride); + ccl_gpu_kernel_call( + kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride)); } } @@ -443,12 +505,14 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels) + ccl_gpu_kernel_signature(cryptomatte_postprocess, + ccl_global float *render_buffer, + int num_pixels) { const int pixel_index = ccl_gpu_global_id_x(); if (pixel_index < num_pixels) { - kernel_cryptomatte_post(nullptr, render_buffer, pixel_index); + ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index)); } } @@ -456,206 +520,102 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) * Film. */ -/* Common implementation for float destination. */ -template -ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert, - float *pixels, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int dst_offset, - int dst_stride, - const Processor &processor) -{ - const int render_pixel_index = ccl_gpu_global_id_x(); - if (render_pixel_index >= num_pixels) { - return; - } - - const int x = render_pixel_index % width; - const int y = render_pixel_index / width; - - ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride + - y * stride * kfilm_convert->pass_stride; - - ccl_global float *pixel = pixels + - (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride; - - processor(kfilm_convert, buffer, pixel); -} - -/* Common implementation for half4 destination and 4-channel input pass. 
*/ -template -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - const int render_pixel_index = ccl_gpu_global_id_x(); - if (render_pixel_index >= num_pixels) { - return; - } - - const int x = render_pixel_index % width; - const int y = render_pixel_index / width; - - ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride + - y * stride * kfilm_convert->pass_stride; - - float pixel[4]; - processor(kfilm_convert, buffer, pixel); - - film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); - - ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; - *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); -} - -/* Common implementation for half4 destination and 3-channel input pass. */ -template -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - kernel_gpu_film_convert_half_rgba_common_rgba( - kfilm_convert, - rgba, - render_buffer, - num_pixels, - width, - offset, - stride, - rgba_offset, - rgba_stride, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - processor(kfilm_convert, buffer, pixel_rgba); - pixel_rgba[3] = 1.0f; - }); -} - -/* Common implementation for half4 destination and single channel input pass. */ -template -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - kernel_gpu_film_convert_half_rgba_common_rgba( - kfilm_convert, - rgba, - render_buffer, - num_pixels, - width, - offset, - stride, - rgba_offset, - rgba_stride, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - float value; - processor(kfilm_convert, buffer, &value); - - pixel_rgba[0] = value; - pixel_rgba[1] = value; - pixel_rgba[2] = value; - pixel_rgba[3] = 1.0f; - }); -} - -#define KERNEL_FILM_CONVERT_PROC(name) \ - ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name - -#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \ - KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \ - (const KernelFilmConvert kfilm_convert, \ - float *pixels, \ - float *render_buffer, \ - int num_pixels, \ - int width, \ - int offset, \ - int stride, \ - int rgba_offset, \ - int rgba_stride) \ +#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ + ccl_gpu_kernel_signature(film_convert_##variant, \ + const KernelFilmConvert kfilm_convert, \ + ccl_global uchar4 *rgba, \ + ccl_global float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ { \ - kernel_gpu_film_convert_common(&kfilm_convert, \ - pixels, \ - render_buffer, \ - num_pixels, \ - width, \ - offset, \ - stride, \ - rgba_offset, \ - rgba_stride, \ - 
film_get_pass_pixel_##variant); \ + const int render_pixel_index = ccl_gpu_global_id_x(); \ + if (render_pixel_index >= num_pixels) { \ + return; \ + } \ +\ + const int x = render_pixel_index % width; \ + const int y = render_pixel_index / width; \ +\ + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \ + y * stride * kfilm_convert.pass_stride; \ +\ + float pixel[4]; \ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ +\ + film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \ +\ + if (input_channel_count == 1) { \ + pixel[1] = pixel[2] = pixel[0]; \ + } \ + if (input_channel_count <= 3) { \ + pixel[3] = 1.0f; \ + } \ +\ + ccl_global float *out = ((ccl_global float *)rgba) + rgba_offset + y * rgba_stride + x; \ + *(ccl_global float4 *)out = make_float4(pixel[0], pixel[1], pixel[2], pixel[3]); \ } \ - KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \ - (const KernelFilmConvert kfilm_convert, \ - uchar4 *rgba, \ - float *render_buffer, \ - int num_pixels, \ - int width, \ - int offset, \ - int stride, \ - int rgba_offset, \ - int rgba_stride) \ +\ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ + ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \ + const KernelFilmConvert kfilm_convert, \ + ccl_global uchar4 *rgba, \ + ccl_global float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ { \ - kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \ - rgba, \ - render_buffer, \ - num_pixels, \ - width, \ - offset, \ - stride, \ - rgba_offset, \ - rgba_stride, \ - film_get_pass_pixel_##variant); \ - } - -KERNEL_FILM_CONVERT_DEFINE(depth, value) -KERNEL_FILM_CONVERT_DEFINE(mist, value) -KERNEL_FILM_CONVERT_DEFINE(sample_count, value) -KERNEL_FILM_CONVERT_DEFINE(float, value) - -KERNEL_FILM_CONVERT_DEFINE(light_path, rgb) -KERNEL_FILM_CONVERT_DEFINE(float3, rgb) - -KERNEL_FILM_CONVERT_DEFINE(motion, rgba) -KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba) -KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba) -KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba) -KERNEL_FILM_CONVERT_DEFINE(combined, rgba) -KERNEL_FILM_CONVERT_DEFINE(float4, rgba) - -#undef KERNEL_FILM_CONVERT_DEFINE -#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE -#undef KERNEL_FILM_CONVERT_PROC + const int render_pixel_index = ccl_gpu_global_id_x(); \ + if (render_pixel_index >= num_pixels) { \ + return; \ + } \ +\ + const int x = render_pixel_index % width; \ + const int y = render_pixel_index / width; \ +\ + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \ + y * stride * kfilm_convert.pass_stride; \ +\ + float pixel[4]; \ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ +\ + film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \ +\ + if (input_channel_count == 1) { \ + pixel[1] = pixel[2] = pixel[0]; \ + } \ + if (input_channel_count <= 3) { \ + pixel[3] = 1.0f; \ + } \ +\ + ccl_global half4 *out = ((ccl_global half4 *)rgba) + (rgba_offset + y * rgba_stride + x); \ + *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \ + } + +/* 1 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(depth, 1) +KERNEL_FILM_CONVERT_VARIANT(mist, 1) +KERNEL_FILM_CONVERT_VARIANT(sample_count, 1) +KERNEL_FILM_CONVERT_VARIANT(float, 1) + +/* 3 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(light_path, 3) 
+KERNEL_FILM_CONVERT_VARIANT(float3, 3) + +/* 4 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(motion, 4) +KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4) +KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4) +KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4) +KERNEL_FILM_CONVERT_VARIANT(combined, 4) +KERNEL_FILM_CONVERT_VARIANT(float4, 4) /* -------------------------------------------------------------------- * Shader evaluation. @@ -664,42 +624,46 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba) /* Displacement */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input, - float *output, - const int offset, - const int work_size) + ccl_gpu_kernel_signature(shader_eval_displace, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) { int i = ccl_gpu_global_id_x(); if (i < work_size) { - kernel_displace_evaluate(NULL, input, output, offset + i); + ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i)); } } /* Background */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_shader_eval_background(KernelShaderEvalInput *input, - float *output, - const int offset, - const int work_size) + ccl_gpu_kernel_signature(shader_eval_background, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) { int i = ccl_gpu_global_id_x(); if (i < work_size) { - kernel_background_evaluate(NULL, input, output, offset + i); + ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i)); } } /* Curve Shadow Transparency */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_shader_eval_curve_shadow_transparency(KernelShaderEvalInput *input, - float *output, - const int offset, - const int work_size) + ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) { int i = ccl_gpu_global_id_x(); if (i < work_size) { - kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i); + ccl_gpu_kernel_call( + kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i)); } } @@ -708,15 +672,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_color_preprocess(float *render_buffer, - int full_x, - int full_y, - int width, - int height, - int offset, - int stride, - int pass_stride, - int pass_denoised) + ccl_gpu_kernel_signature(filter_color_preprocess, + ccl_global float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int pass_denoised) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / width; @@ -727,31 +692,32 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; - float *buffer = render_buffer + render_pixel_index * pass_stride; + ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride; - float *color_out = buffer + pass_denoised; + ccl_global float *color_out = buffer + pass_denoised; color_out[0] = clamp(color_out[0], 0.0f, 10000.0f); color_out[1] = clamp(color_out[1], 0.0f, 10000.0f); color_out[2] = clamp(color_out[2], 0.0f, 10000.0f); } 
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_guiding_preprocess(float *guiding_buffer, - int guiding_pass_stride, - int guiding_pass_albedo, - int guiding_pass_normal, - const float *render_buffer, - int render_offset, - int render_stride, - int render_pass_stride, - int render_pass_sample_count, - int render_pass_denoising_albedo, - int render_pass_denoising_normal, - int full_x, - int full_y, - int width, - int height, - int num_samples) + ccl_gpu_kernel_signature(filter_guiding_preprocess, + ccl_global float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int guiding_pass_normal, + ccl_global const float *render_buffer, + int render_offset, + int render_stride, + int render_pass_stride, + int render_pass_sample_count, + int render_pass_denoising_albedo, + int render_pass_denoising_normal, + int full_x, + int full_y, + int width, + int height, + int num_samples) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / width; @@ -762,10 +728,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t guiding_pixel_index = x + y * width; - float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride; - const float *buffer = render_buffer + render_pixel_index * render_pass_stride; + ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride; float pixel_scale; if (render_pass_sample_count == PASS_UNUSED) { @@ -779,8 +745,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (guiding_pass_albedo != PASS_UNUSED) { kernel_assert(render_pass_denoising_albedo != PASS_UNUSED); - const float *aledo_in = buffer + render_pass_denoising_albedo; - float *albedo_out = guiding_pixel + guiding_pass_albedo; + ccl_global const float *aledo_in = buffer + render_pass_denoising_albedo; + ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo; albedo_out[0] = aledo_in[0] * pixel_scale; albedo_out[1] = aledo_in[1] * pixel_scale; @@ -791,8 +757,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) if (guiding_pass_normal != PASS_UNUSED) { kernel_assert(render_pass_denoising_normal != PASS_UNUSED); - const float *normal_in = buffer + render_pass_denoising_normal; - float *normal_out = guiding_pixel + guiding_pass_normal; + ccl_global const float *normal_in = buffer + render_pass_denoising_normal; + ccl_global float *normal_out = guiding_pixel + guiding_pass_normal; normal_out[0] = normal_in[0] * pixel_scale; normal_out[1] = normal_in[1] * pixel_scale; @@ -801,11 +767,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer, - int guiding_pass_stride, - int guiding_pass_albedo, - int width, - int height) + ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo, + ccl_global float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int width, + int height) { kernel_assert(guiding_pass_albedo != PASS_UNUSED); @@ -818,9 +785,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t guiding_pixel_index = x + y * width; - float *guiding_pixel = guiding_buffer + guiding_pixel_index * 
guiding_pass_stride; + ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; - float *albedo_out = guiding_pixel + guiding_pass_albedo; + ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo; albedo_out[0] = 0.5f; albedo_out[1] = 0.5f; @@ -828,20 +795,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_color_postprocess(float *render_buffer, - int full_x, - int full_y, - int width, - int height, - int offset, - int stride, - int pass_stride, - int num_samples, - int pass_noisy, - int pass_denoised, - int pass_sample_count, - int num_components, - bool use_compositing) + ccl_gpu_kernel_signature(filter_color_postprocess, + ccl_global float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int num_samples, + int pass_noisy, + int pass_denoised, + int pass_sample_count, + int num_components, + bool use_compositing) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / width; @@ -852,7 +820,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; - float *buffer = render_buffer + render_pixel_index * pass_stride; + ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride; float pixel_scale; if (pass_sample_count == PASS_UNUSED) { @@ -862,7 +830,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) pixel_scale = __float_as_uint(buffer[pass_sample_count]); } - float *denoised_pixel = buffer + pass_denoised; + ccl_global float *denoised_pixel = buffer + pass_denoised; denoised_pixel[0] *= pixel_scale; denoised_pixel[1] *= pixel_scale; @@ -875,7 +843,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) /* Currently compositing passes are either 3-component (derived by dividing light passes) * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it * simplifies logic and avoids extra memory allocation. */ - const float *noisy_pixel = buffer + pass_noisy; + ccl_global const float *noisy_pixel = buffer + pass_noisy; denoised_pixel[3] = noisy_pixel[3]; } else { @@ -891,21 +859,22 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states, - uint *num_possible_splits) + ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits, + int num_states, + ccl_global uint *num_possible_splits) { const int state = ccl_gpu_global_id_x(); bool can_split = false; if (state < num_states) { - can_split = kernel_shadow_catcher_path_can_split(nullptr, state); + can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state)); } /* NOTE: All threads specified in the mask must execute the intrinsic. 
*/ - const uint can_split_mask = ccl_gpu_ballot(can_split); + const auto can_split_mask = ccl_gpu_ballot(can_split); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask)); + atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask)); } } diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index d7416beb783..f667ede2712 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 #endif +#ifdef __KERNEL_METAL__ +struct ActiveIndexContext { + ActiveIndexContext(int _thread_index, + int _global_index, + int _threadgroup_size, + int _simdgroup_size, + int _simd_lane_index, + int _simd_group_index, + int _num_simd_groups, + threadgroup int *_simdgroup_offset) + : thread_index(_thread_index), + global_index(_global_index), + blocksize(_threadgroup_size), + ccl_gpu_warp_size(_simdgroup_size), + thread_warp(_simd_lane_index), + warp_index(_simd_group_index), + num_warps(_num_simd_groups), + warp_offset(_simdgroup_offset) + { + } + + const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index, + num_warps; + threadgroup int *warp_offset; + + template + void active_index_array(const uint num_states, + ccl_global int *indices, + ccl_global int *num_indices, + IsActiveOp is_active_op) + { + const uint state_index = global_index; +#else template __device__ void gpu_parallel_active_index_array(const uint num_states, - int *indices, - int *num_indices, + ccl_global int *indices, + ccl_global int *num_indices, IsActiveOp is_active_op) { extern ccl_gpu_shared int warp_offset[]; @@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states, const uint warp_index = thread_index / ccl_gpu_warp_size; const uint num_warps = blocksize / ccl_gpu_warp_size; - /* Test if state corresponding to this thread is active. */ const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index; - const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; +#endif - /* For each thread within a warp compute how many other active states precede it. */ - const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp); - const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask); + /* Test if state corresponding to this thread is active. */ + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; - /* Last thread in warp stores number of active states for each warp. */ - if (thread_warp == ccl_gpu_warp_size - 1) { - warp_offset[warp_index] = thread_offset + is_active; - } + /* For each thread within a warp compute how many other active states precede it. */ + const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & + ccl_gpu_thread_mask(thread_warp)); - ccl_gpu_syncthreads(); - - /* Last thread in block converts per-warp sizes to offsets, increments global size of - * index array and gets offset to write to. */ - if (thread_index == blocksize - 1) { - /* TODO: parallelize this. */ - int offset = 0; - for (int i = 0; i < num_warps; i++) { - int num_active = warp_offset[i]; - warp_offset[i] = offset; - offset += num_active; + /* Last thread in warp stores number of active states for each warp. 
*/ + if (thread_warp == ccl_gpu_warp_size - 1) { + warp_offset[warp_index] = thread_offset + is_active; } - const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; - warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); - } + ccl_gpu_syncthreads(); + + /* Last thread in block converts per-warp sizes to offsets, increments global size of + * index array and gets offset to write to. */ + if (thread_index == blocksize - 1) { + /* TODO: parallelize this. */ + int offset = 0; + for (int i = 0; i < num_warps; i++) { + int num_active = warp_offset[i]; + warp_offset[i] = offset; + offset += num_active; + } + + const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; + warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); + } - ccl_gpu_syncthreads(); + ccl_gpu_syncthreads(); - /* Write to index array. */ - if (is_active) { - const uint block_offset = warp_offset[num_warps]; - indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + /* Write to index array. */ + if (is_active) { + const uint block_offset = warp_offset[num_warps]; + indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + } } -} + +#ifdef __KERNEL_METAL__ +}; /* end class ActiveIndexContext */ + +/* inject the required thread params into a struct, and redirect to its templated member function + */ +# define gpu_parallel_active_index_array \ + ActiveIndexContext(metal_local_id, \ + metal_global_id, \ + metal_local_size, \ + simdgroup_size, \ + simd_lane_index, \ + simd_group_index, \ + num_simd_groups, \ + simdgroup_offset) \ + .active_index_array +#endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index 6de3a022569..4bd002c27e4 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -33,10 +33,12 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 #endif -template -__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values) +__device__ void gpu_parallel_prefix_sum(const int global_id, + ccl_global int *counter, + ccl_global int *prefix_sum, + const int num_values) { - if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) { + if (global_id != 0) { return; } diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index c06d7be444f..c092e2a21ee 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -33,16 +33,16 @@ CCL_NAMESPACE_BEGIN #endif #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) -template -__device__ void gpu_parallel_sorted_index_array(const uint num_states, +template +__device__ void gpu_parallel_sorted_index_array(const uint state_index, + const uint num_states, const int num_states_limit, - int *indices, - int *num_indices, - int *key_counter, - int *key_prefix_sum, + ccl_global int *indices, + ccl_global int *num_indices, + ccl_global int *key_counter, + ccl_global int *key_prefix_sum, GetKeyOp get_key_op) { - const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x; const int key = (state_index < num_states) ? 
get_key_op(state_index) : GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; -- cgit v1.2.3 From f56562043521a5c160585aea3f28167b4d3bc77d Mon Sep 17 00:00:00 2001 From: Patrick Mours Date: Wed, 10 Nov 2021 14:37:15 +0100 Subject: Fix T92985: CUDA errors with Cycles film convert kernels rB3a4c8f406a3a3bf0627477c6183a594fa707a6e2 changed the macros that create the film convert kernel entry points, but in the process accidentally changed the parameter definition to one of those (which caused CUDA launch and misaligned address errors) and changed the implementation as well. This restores the correct implementation from before. In addition, the `ccl_gpu_kernel_threads` macro did not work as intended and caused the generated launch bounds to end up with an incorrect input for the second parameter (it was set to "thread_num_registers", rather than the result of the block number calculation). I'm not entirely sure why, as the macro definition looked sound to me. Decided to simply go with two separate macros instead, to simplify and solve this. Also changed how state is captured with the `ccl_gpu_kernel_lambda` macro slightly, to avoid a compiler warning (expression has no effect) that otherwise occurred. Maniphest Tasks: T92985 Differential Revision: https://developer.blender.org/D13175 --- intern/cycles/kernel/device/gpu/kernel.h | 101 +++++++++++++++---------------- 1 file changed, 48 insertions(+), 53 deletions(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 2ec6a49ec7b..e954178ec63 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -56,8 +56,7 @@ */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) -ccl_gpu_kernel_signature(integrator_reset, - int num_states) + ccl_gpu_kernel_signature(integrator_reset, int num_states) { const int state = ccl_gpu_global_id_x(); @@ -265,7 +264,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } } -ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(integrator_queued_paths_array, int num_states, ccl_global int *indices, @@ -273,14 +272,14 @@ ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) int kernel_index) { ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index, - int kernel_index) - .kernel_index = kernel_index; + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; gpu_parallel_active_index_array( num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array, int num_states, ccl_global int *indices, @@ -288,25 +287,26 @@ ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) int kernel_index) { ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index, - int kernel_index) - .kernel_index = kernel_index; + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; gpu_parallel_active_index_array( num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(integrator_active_paths_array, int 
num_states, ccl_global int *indices, ccl_global int *num_indices) { ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0); + gpu_parallel_active_index_array( num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(integrator_terminated_paths_array, int num_states, ccl_global int *indices, @@ -314,11 +314,12 @@ ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) int indices_offset) { ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0); + gpu_parallel_active_index_array( num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); } -ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array, int num_states, ccl_global int *indices, @@ -326,11 +327,12 @@ ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) int indices_offset) { ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); + gpu_parallel_active_index_array( num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); } -ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(integrator_sorted_paths_array, int num_states, int num_states_limit, @@ -343,37 +345,37 @@ ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ? INTEGRATOR_STATE(state, path, shader_sort_key) : GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY, - int kernel_index) - .kernel_index = kernel_index; - + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; + const uint state_index = ccl_gpu_global_id_x(); - gpu_parallel_sorted_index_array( - state_index, - num_states, - num_states_limit, - indices, + gpu_parallel_sorted_index_array(state_index, + num_states, + num_states_limit, + indices, num_indices, key_counter, key_prefix_sum, ccl_gpu_kernel_lambda_pass); } -ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(integrator_compact_paths_array, int num_states, ccl_global int *indices, - ccl_global int *num_indices, - int num_active_paths) + ccl_global int *num_indices, + int num_active_paths) { - ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0), - int num_active_paths) - .num_active_paths = num_active_paths; - + ccl_gpu_kernel_lambda((state >= num_active_paths) && + (INTEGRATOR_STATE(state, path, queued_kernel) != 0), + int num_active_paths); + ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths; + gpu_parallel_active_index_array( num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(integrator_compact_states, ccl_global const int *active_terminated_states, const int active_states_offset, @@ -390,22 +392,23 @@ ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) } } -ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) 
ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array, int num_states, ccl_global int *indices, - ccl_global int *num_indices, - int num_active_paths) + ccl_global int *num_indices, + int num_active_paths) { - ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0), - int num_active_paths) - .num_active_paths = num_active_paths; + ccl_gpu_kernel_lambda((state >= num_active_paths) && + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0), + int num_active_paths); + ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths; gpu_parallel_active_index_array( num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(integrator_compact_shadow_states, ccl_global const int *active_terminated_states, const int active_states_offset, @@ -422,7 +425,7 @@ ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) } } -ccl_gpu_kernel(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature( +ccl_gpu_kernel_threads(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature( prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values) { gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values); @@ -524,7 +527,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ ccl_gpu_kernel_signature(film_convert_##variant, \ const KernelFilmConvert kfilm_convert, \ - ccl_global uchar4 *rgba, \ + ccl_global float *pixels, \ ccl_global float *render_buffer, \ int num_pixels, \ int width, \ @@ -544,20 +547,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \ y * stride * kfilm_convert.pass_stride; \ \ - float pixel[4]; \ - film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ -\ - film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \ + ccl_global float *pixel = pixels + \ + (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \ \ - if (input_channel_count == 1) { \ - pixel[1] = pixel[2] = pixel[0]; \ - } \ - if (input_channel_count <= 3) { \ - pixel[3] = 1.0f; \ - } \ -\ - ccl_global float *out = ((ccl_global float *)rgba) + rgba_offset + y * rgba_stride + x; \ - *(ccl_global float4 *)out = make_float4(pixel[0], pixel[1], pixel[2], pixel[3]); \ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ } \ \ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ @@ -585,8 +578,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ float pixel[4]; \ film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ -\ - film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \ \ if (input_channel_count == 1) { \ pixel[1] = pixel[2] = pixel[0]; \ @@ -595,7 +586,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) pixel[3] = 1.0f; \ } \ \ - ccl_global half4 *out = ((ccl_global half4 *)rgba) + (rgba_offset + y * rgba_stride + x); \ + film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \ +\ + ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; \ *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \ } @@ -617,6 +610,8 @@ 
KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4) KERNEL_FILM_CONVERT_VARIANT(combined, 4) KERNEL_FILM_CONVERT_VARIANT(float4, 4) +#undef KERNEL_FILM_CONVERT_VARIANT + /* -------------------------------------------------------------------- * Shader evaluation. */ -- cgit v1.2.3 From 6b0008129e6370866808bd937161579a2cb5cb51 Mon Sep 17 00:00:00 2001 From: Brecht Van Lommel Date: Wed, 10 Nov 2021 19:43:19 +0100 Subject: Fix T92972: Cycles HIP wrong render display after a recent refactor It's unclear why this fails. Maybe the size of half4 is not the expected 8 bytes and adjacent pixels are overwritten. Or there is some bug in the HIP compiler writing a struct into global memory, which we probably don't do elsewhere in the kernel. Thanks to Thomas, William and Jeroen for helping investigate this. --- intern/cycles/kernel/device/gpu/kernel.h | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 5848ba5df9d..844bbf90f67 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -486,6 +486,26 @@ ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *k processor(kfilm_convert, buffer, pixel); } +ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba, + const int rgba_offset, + const int rgba_stride, + const int x, + const int y, + const half4 half_pixel) +{ + /* Work around HIP issue with half float display, see T92972. */ +#ifdef __KERNEL_HIP__ + ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4; + out[0] = half_pixel.x; + out[1] = half_pixel.y; + out[2] = half_pixel.z; + out[3] = half_pixel.w; +#else + ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; + *out = half_pixel; +#endif +} + /* Common implementation for half4 destination and 4-channel input pass. */ template ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( @@ -516,8 +536,9 @@ ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); - ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; - *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); + const half4 half_pixel = float4_to_half4_display( + make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); + kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); } /* Common implementation for half4 destination and 3-channel input pass. */ -- cgit v1.2.3 From d26d3cfe193793728cac77be9b44463a84a0f57e Mon Sep 17 00:00:00 2001 From: Sergey Sharybin Date: Wed, 10 Nov 2021 17:18:55 +0100 Subject: Fix T92868: Cycles catcher with transparency crashes The issue was caused by splitting happening twice. Fixed by checking for split flag which is assigned to the both states during split. The tricky part was to write catcher data at the moment of split: the transparency and shadow catcher sample count is to be accumulated at that point. Now it is happening in the `intersect_closest` kernel. The downside is that render buffer is to be passed to the kernel, but the benefit is that extra split bounce check is not needed now. 
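As a minimal sketch of the split-flag check described above (the flag name here is an illustrative assumption, not the actual constant from the shadow catcher code):

    /* Hypothetical guard: a state that already carries the flag assigned during
     * a previous split must not be split a second time. */
    if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SHADOW_CATCHER_SPLIT) {
      return false;
    }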
Had to move the passes write to shadow catcher header, since include of `film/passes.h` causes all the fun of requirement to have BSDF data structures available. Differential Revision: https://developer.blender.org/D13177 --- intern/cycles/kernel/device/gpu/kernel.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 844bbf90f67..56fcc38b907 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -116,13 +116,15 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size) + kernel_gpu_integrator_intersect_closest(const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_closest(NULL, state); + integrator_intersect_closest(NULL, state, render_buffer); } } -- cgit v1.2.3 From d19e35873f67c90b251ca38e007a83aa1eada211 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Thu, 18 Nov 2021 14:25:05 +0100 Subject: Cycles: several small fixes and additions for MSL This patch contains many small leftover fixes and additions that are required for Metal-enablement: - Address space fixes and a few other small compile fixes - Addition of missing functionality to the Metal adapter headers - Addition of various scattered `__KERNEL_METAL__` blocks (e.g. for atomic support & maths functions) Ref T92212 Differential Revision: https://developer.blender.org/D13263 --- intern/cycles/kernel/device/gpu/kernel.h | 4 ++-- intern/cycles/kernel/device/gpu/parallel_active_index.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index dd0c6dd6893..60332af752c 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -464,7 +464,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const auto num_active_pixels_mask = ccl_gpu_ballot(!converged); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask)); + atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask)); } } @@ -892,6 +892,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const auto can_split_mask = ccl_gpu_ballot(can_split); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask)); + atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask)); } } diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index f667ede2712..a5320edcb3c 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -85,8 +85,8 @@ __device__ void gpu_parallel_active_index_array(const uint num_states, const uint is_active = (state_index < num_states) ? 
is_active_op(state_index) : 0; /* For each thread within a warp compute how many other active states precede it. */ - const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & - ccl_gpu_thread_mask(thread_warp)); + const uint thread_offset = popcount(ccl_gpu_ballot(is_active) & + ccl_gpu_thread_mask(thread_warp)); /* Last thread in warp stores number of active states for each warp. */ if (thread_warp == ccl_gpu_warp_size - 1) { -- cgit v1.2.3 From d1f944c18634f215c3da0484ac3b80e994118680 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Thu, 18 Nov 2021 14:25:30 +0100 Subject: Cycles: declare constants at program scope on Metal MSL requires that constant address space literals be declared at program scope. This patch moves the `blackbody_table_r/g/b` and `cie_colour_match` constants into separate files so they can be declared at the appropriate scope. Ref T92212 Differential Revision: https://developer.blender.org/D13241 --- intern/cycles/kernel/device/gpu/kernel.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 60332af752c..22e2a61a06d 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -21,6 +21,9 @@ #include "kernel/device/gpu/parallel_sorted_index.h" #include "kernel/device/gpu/work_stealing.h" +/* Include constant tables before entering Metal's context class scope (context_begin.h) */ +#include "kernel/tables.h" + #ifdef __KERNEL_METAL__ # include "kernel/device/metal/context_begin.h" #endif -- cgit v1.2.3 From b41c72b710d4013fd6d67dc49a8ebb2a416b4462 Mon Sep 17 00:00:00 2001 From: Alaska Date: Thu, 25 Nov 2021 09:20:28 +0100 Subject: Fix performance decrease with Scrambling Distance on With the current code in master, scrambling distance is enabled on non-hardware accelerated ray tracing devices see a measurable performance decrease when compared scrambling distance on vs off. From testing, this performance decrease comes from the large tile sizes scheduled in `tile.cpp`. This patch attempts to address the performance decrease by using different algorithms to calculate the tile size for devices with hardware accelerated ray traversal and devices without. Large tile sizes for hardware accelerated devices and small tile sizes for others. Most of this code is based on proposals from @brecht and @leesonw Reviewed By: brecht, leesonw Differential Revision: https://developer.blender.org/D13042 --- intern/cycles/kernel/device/gpu/work_stealing.h | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/work_stealing.h b/intern/cycles/kernel/device/gpu/work_stealing.h index fab0915c38e..c3083948057 100644 --- a/intern/cycles/kernel/device/gpu/work_stealing.h +++ b/intern/cycles/kernel/device/gpu/work_stealing.h @@ -29,17 +29,20 @@ ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile, ccl_private uint *y, ccl_private uint *sample) { -#if 0 - /* Keep threads for the same sample together. */ - uint tile_pixels = tile->w * tile->h; - uint sample_offset = global_work_index / tile_pixels; - uint pixel_offset = global_work_index - sample_offset * tile_pixels; -#else - /* Keeping threads for the same pixel together. - * Appears to improve performance by a few % on CUDA and OptiX. 
*/ - uint sample_offset = global_work_index % tile->num_samples; - uint pixel_offset = global_work_index / tile->num_samples; -#endif + uint sample_offset, pixel_offset; + + if (kernel_data.integrator.scrambling_distance < 0.9f) { + /* Keep threads for the same sample together. */ + uint tile_pixels = tile->w * tile->h; + sample_offset = global_work_index / tile_pixels; + pixel_offset = global_work_index - sample_offset * tile_pixels; + } + else { + /* Keeping threads for the same pixel together. + * Appears to improve performance by a few % on CUDA and OptiX. */ + sample_offset = global_work_index % tile->num_samples; + pixel_offset = global_work_index / tile->num_samples; + } uint y_offset = pixel_offset / tile->w; uint x_offset = pixel_offset - y_offset * tile->w; -- cgit v1.2.3 From eb7827e7970cca8e3fb0e0bf39e8742e69f0b2b6 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Wed, 24 Nov 2021 20:34:27 +0000 Subject: Cycles: Fix film convert address space mismatch on Metal This patch fixes an address space mismatch in the film convert kernels on Metal. The `film_get_pass_pixel_...` functions take a `ccl_private` result pointer, but the film convert kernels pass a `ccl_global` memory pointer. Specialising the pass-fetch functions with templates results in compilation errors on Visual Studio, so instead this patch just adds an intermediate local on Metal. Reviewed By: brecht Differential Revision: https://developer.blender.org/D13350 --- intern/cycles/kernel/device/gpu/kernel.h | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 22e2a61a06d..24702de496c 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -547,6 +547,33 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb #endif } +#ifdef __KERNEL_METAL__ + +/* Fetch into a local variable on Metal - there is minimal overhead. Templating the + * film_get_pass_pixel_... functions works on MSL, but not on other compilers. 
*/ +# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \ + float local_pixel[4]; \ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, local_pixel); \ + if (input_channel_count >= 1) { \ + pixel[0] = local_pixel[0]; \ + } \ + if (input_channel_count >= 2) { \ + pixel[1] = local_pixel[1]; \ + } \ + if (input_channel_count >= 3) { \ + pixel[2] = local_pixel[2]; \ + } \ + if (input_channel_count >= 4) { \ + pixel[3] = local_pixel[3]; \ + } + +#else + +# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); + +#endif + #define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ ccl_gpu_kernel_signature(film_convert_##variant, \ @@ -574,7 +601,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb ccl_global float *pixel = pixels + \ (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \ \ - film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ + FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \ } \ \ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ -- cgit v1.2.3 From f613c4c0953ebaf993ecd55b12bab9cf2196dac4 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Mon, 29 Nov 2021 15:06:22 +0000 Subject: Cycles: MetalRT support (kernel side) This patch adds MetalRT support to Cycles kernel code. It is mostly additive in nature or confined to Metal-specific code, however there are a few areas where this interacts with other code: - MetalRT closely follows the Optix implementation, and in some cases (notably handling of transforms) it makes sense to extend Optix special-casing to MetalRT. For these generalisations we now have `__KERNEL_GPU_RAYTRACING__` instead of `__KERNEL_OPTIX__`. - MetalRT doesn't support primitive offsetting (as with `primitiveIndexOffset` in Optix), so we define and populate a new kernel texture, `__object_prim_offset`, containing per-object primitive / curve-segment offsets. This is referenced and applied in MetalRT intersection handlers. - Two new BVH layout enum values have been added: `BVH_LAYOUT_METAL` and `BVH_LAYOUT_MULTI_METAL_EMBREE` for XPU mode). Some host-side enum case handling has been updated where it is trivial to do so. 
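As a rough sketch of the primitive-offset remapping described above (illustrative only: the helper and the kernel_tex_fetch accessor are assumptions, not code from this patch), a MetalRT intersection handler adds the per-object offset to the hardware-reported primitive index before using it to index Cycles' global primitive arrays:

    /* Hypothetical helper; the real lookup lives in the MetalRT intersection
     * handlers added by this patch. */
    ccl_device_inline int prim_index_with_object_offset(KernelGlobals kg,
                                                        const int object,
                                                        const int local_prim)
    {
      /* __object_prim_offset is the new per-object offset table; the accessor
       * name is assumed here. */
      return local_prim + kernel_tex_fetch(__object_prim_offset, object);
    }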
Ref T92212 Reviewed By: brecht Maniphest Tasks: T92212 Differential Revision: https://developer.blender.org/D13353 --- intern/cycles/kernel/device/gpu/kernel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 24702de496c..0f88063e3b7 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -21,6 +21,8 @@ #include "kernel/device/gpu/parallel_sorted_index.h" #include "kernel/device/gpu/work_stealing.h" +#include "kernel/sample/lcg.h" + /* Include constant tables before entering Metal's context class scope (context_begin.h) */ #include "kernel/tables.h" -- cgit v1.2.3 From 9558fa5196033390111a2348caa66ab18b8a4f89 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Tue, 7 Dec 2021 15:11:35 +0000 Subject: Cycles: Metal host-side code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds the Metal host-side code: - Add all core host-side Metal backend files (device_impl, queue, etc) - Add MetalRT BVH setup files - Integrate with Cycles device enumeration code - Revive `path_source_replace_includes` in util/path (required for MSL compilation) This patch also includes a couple of small kernel-side fixes: - Add an implementation of `lgammaf` for Metal [Nemes, Gergő (2010), "New asymptotic expansion for the Gamma function", Archiv der Mathematik](https://users.renyi.hu/~gergonemes/) - include "work_stealing.h" inside the Metal context class because it accesses state now Ref T92212 Reviewed By: brecht Maniphest Tasks: T92212 Differential Revision: https://developer.blender.org/D13423 --- intern/cycles/kernel/device/gpu/kernel.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'intern/cycles/kernel/device/gpu') diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 0f88063e3b7..b50f492e8c7 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -19,7 +19,6 @@ #include "kernel/device/gpu/parallel_active_index.h" #include "kernel/device/gpu/parallel_prefix_sum.h" #include "kernel/device/gpu/parallel_sorted_index.h" -#include "kernel/device/gpu/work_stealing.h" #include "kernel/sample/lcg.h" @@ -30,6 +29,8 @@ # include "kernel/device/metal/context_begin.h" #endif +#include "kernel/device/gpu/work_stealing.h" + #include "kernel/integrator/state.h" #include "kernel/integrator/state_flow.h" #include "kernel/integrator/state_util.h" @@ -96,7 +97,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int state = tile->path_index_offset + tile_work_index; uint x, y, sample; - get_work_pixel(tile, tile_work_index, &x, &y, &sample); + ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample)); ccl_gpu_kernel_call( integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample)); @@ -127,7 +128,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int state = tile->path_index_offset + tile_work_index; uint x, y, sample; - get_work_pixel(tile, tile_work_index, &x, &y, &sample); + ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample)); ccl_gpu_kernel_call( integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample)); -- cgit v1.2.3 From 8393ccd07634b3152b18d4d527b1460dab9dbe06 Mon Sep 17 00:00:00 2001 From: Patrick Mours Date: Tue, 4 Jan 2022 
From 8393ccd07634b3152b18d4d527b1460dab9dbe06 Mon Sep 17 00:00:00 2001
From: Patrick Mours
Date: Tue, 4 Jan 2022 21:39:54 +0100
Subject: Cycles: Add OptiX temporal denoising support

Enables the `bpy.ops.cycles.denoise_animation()` operator again and modifies
it to support temporal denoising with OptiX. This requires renders that were
done with both the "Vector" and "Denoising Data" passes.

Differential Revision: https://developer.blender.org/D11442
---
 intern/cycles/kernel/device/gpu/kernel.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'intern/cycles/kernel/device/gpu')

diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index b50f492e8c7..027b2a7a8c7 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -756,6 +756,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
                             int guiding_pass_stride,
                             int guiding_pass_albedo,
                             int guiding_pass_normal,
+                            int guiding_pass_flow,
                             ccl_global const float *render_buffer,
                             int render_offset,
                             int render_stride,
@@ -763,6 +764,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
                             int render_pass_sample_count,
                             int render_pass_denoising_albedo,
                             int render_pass_denoising_normal,
+                            int render_pass_motion,
                             int full_x,
                             int full_y,
                             int width,
@@ -814,6 +816,17 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
     normal_out[1] = normal_in[1] * pixel_scale;
     normal_out[2] = normal_in[2] * pixel_scale;
   }
+
+  /* Flow pass. */
+  if (guiding_pass_flow != PASS_UNUSED) {
+    kernel_assert(render_pass_motion != PASS_UNUSED);
+
+    const float *motion_in = buffer + render_pass_motion;
+    float *flow_out = guiding_pixel + guiding_pass_flow;
+
+    flow_out[0] = -motion_in[0] * pixel_scale;
+    flow_out[1] = -motion_in[1] * pixel_scale;
+  }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
@@ -899,7 +912,6 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   else {
     /* Assigning to zero since this is a default alpha value for 3-component passes, and it
      * is an opaque pixel for 4 component passes. */
-    denoised_pixel[3] = 0;
   }
 }
--
cgit v1.2.3

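For context on the flow pass added above: the render buffer holds motion vectors
accumulated over all samples, so the guiding pass is filled by scaling the per-pixel
motion by pixel_scale and negating it to match the flow convention the temporal denoiser
expects. A standalone sketch of that per-pixel step follows; it is illustrative only,
`motion_to_flow` is an invented helper, and treating pixel_scale as 1/num_samples is an
assumption based on the surrounding guiding-preprocess kernel.

    /* Mirrors flow_out[i] = -motion_in[i] * pixel_scale from the hunk above,
     * assuming pixel_scale averages the sample-accumulated motion vector. */
    static void motion_to_flow(const float motion_accum[2], int num_samples, float flow[2])
    {
      const float pixel_scale = 1.0f / (float)num_samples;
      flow[0] = -motion_accum[0] * pixel_scale;
      flow[1] = -motion_accum[1] * pixel_scale;
    }
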
From efe3d60a2c8306aefd41bc304548da35b67c252c Mon Sep 17 00:00:00 2001
From: Michael Jones
Date: Fri, 7 Jan 2022 15:28:43 +0000
Subject: Cycles: Fix Metal build

This patch fixes a couple of new Metal kernel compilation errors:
1) a kernel parameter count overflow, and 2) missing address space qualifiers.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D13763
---
 intern/cycles/kernel/device/gpu/kernel.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'intern/cycles/kernel/device/gpu')

diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 027b2a7a8c7..00c727b48cb 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -821,8 +821,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   if (guiding_pass_flow != PASS_UNUSED) {
     kernel_assert(render_pass_motion != PASS_UNUSED);
 
-    const float *motion_in = buffer + render_pass_motion;
-    float *flow_out = guiding_pixel + guiding_pass_flow;
+    ccl_global const float *motion_in = buffer + render_pass_motion;
+    ccl_global float *flow_out = guiding_pixel + guiding_pass_flow;
 
     flow_out[0] = -motion_in[0] * pixel_scale;
     flow_out[1] = -motion_in[1] * pixel_scale;
--
cgit v1.2.3


From 0cf2fafd81442518927a594e422ad2b26b54527a Mon Sep 17 00:00:00 2001
From: Brecht Van Lommel
Date: Thu, 13 Jan 2022 00:23:00 +0100
Subject: Fix T94050, T94570, T94527: Cycles Bevel and AO nodes not working with Metal

Workaround what may be a compiler bug, solution found by Michael Jones.
---
 intern/cycles/kernel/device/gpu/kernel.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'intern/cycles/kernel/device/gpu')

diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 00c727b48cb..eed005803e2 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -243,6 +243,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 }
 
+#ifdef __KERNEL_METAL__
+constant int __dummy_constant [[function_constant(0)]];
+#endif
+
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
     ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
                              ccl_global const int *path_index_array,
@@ -253,7 +257,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
+
+#ifdef __KERNEL_METAL__
+    KernelGlobals kg = NULL;
+    /* Workaround Ambient Occlusion and Bevel nodes not working with Metal.
+     * Dummy offset should not affect result, but somehow fixes bug! */
+    kg += __dummy_constant;
+    ccl_gpu_kernel_call(integrator_shade_surface_raytrace(kg, state, render_buffer));
+#else
    ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer));
+#endif
  }
 }
--
cgit v1.2.3
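A note on the workaround above: `__dummy_constant` is a Metal function constant, a value
that is only supplied by the host (via MTLFunctionConstantValues) when the pipeline is
specialized, which presumably keeps the compiler from folding `kg` down to a literal null;
the commit itself only observes that the dummy offset "somehow fixes" the bug. A minimal
sketch of the same pattern outside the Cycles macros, in Metal Shading Language, follows;
the kernel and buffer names are illustrative, not from the sources, and the host is assumed
to bind the value 0 at constant index 0.

    #include <metal_stdlib>
    using namespace metal;

    /* Unknown at MSL compile time; supplied by the host at pipeline creation. */
    constant int dummy_offset [[function_constant(0)]];

    kernel void copy_with_opaque_offset(device const float *in [[buffer(0)]],
                                        device float *out [[buffer(1)]],
                                        uint i [[thread_position_in_grid]])
    {
      /* Adding an opaque zero prevents the compiler from specializing on the
       * pointer value, matching the kg += __dummy_constant trick above. */
      device const float *src = in + dummy_offset;
      out[i] = src[i];
    }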