git.blender.org/blender.git
author     Michael Jones <michael_jones>                2021-11-10 00:30:46 +0300
committer  Michael Jones <michael_p_jones@apple.com>    2021-11-10 00:43:10 +0300
commit     3a4c8f406a3a3bf0627477c6183a594fa707a6e2 (patch)
tree       1fed34727a2bd1538e9dddce9089159342a16ab2 /intern/cycles/kernel/device/gpu
parent     4648c4990cd590dd0f4201cbccc2b5616856984e (diff)
Cycles: Adapt shared kernel/device/gpu layer for MSL
This patch adapts the shared kernel entrypoints so that they can be compiled as MSL (Metal Shading Language). Where possible, the adaptations avoid changes in common code.

In MSL, kernel function inputs are explicitly bound to resources. In the case of argument buffers, we declare a struct containing the kernel arguments, accessible via device pointer. This differs from CUDA and HIP, where kernel function arguments are declared as traditional C-style function parameters. This patch adapts the entrypoints declared in kernel.h so that they can be translated via a new `ccl_gpu_kernel_signature` macro into the required parameter struct + kernel entrypoint pairing for MSL.

MSL buffer attribution must be applied to function parameters or non-static class data members. To allow universal access to the integrator state, kernel data, and texture fetch adapters, we wrap all of the shared kernel code in a `MetalKernelContext` class. This is achieved by bracketing the appropriate kernel headers with "context_begin.h" and "context_end.h" on Metal. When calling deeper into the kernel code, we must reference the context class (e.g. `context.integrator_init_from_camera`). This extra prefixing is performed by a set of defines in "context_end.h". These will require explicit maintenance if entrypoints change; we invite discussion on more maintainable ways to enforce correctness.

Lambda expressions are not supported in MSL, so a new `ccl_gpu_kernel_lambda` macro generates an inline function object, optionally capturing any required state. This yields the same behaviour and is applied to all parallel_... implementations that are templated by operation.

The lambda expressions in the film_convert... kernels don't adapt cleanly to function objects. However, these entrypoints can be macro-generated more concisely to avoid lambda expressions entirely, relying instead on constant folding to handle the pixel/channel conversions.

A separate implementation of `gpu_parallel_active_index_array` is provided for Metal to work around some subtle differences in SIMD width, and also to encapsulate some required thread parameters that must be declared as explicit entrypoint function parameters.

Ref T92212

Reviewed By: brecht

Maniphest Tasks: T92212

Differential Revision: https://developer.blender.org/D13109
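For reviewers unfamiliar with the two mechanisms above, the following is a short, self-contained C++ sketch of the ideas. It is not the code added by this patch (the real macro definitions live outside the files in this diff); names such as KernelParamsIntegratorReset, KernelLambda and queued_kernel are placeholders chosen only to mirror the usage visible in kernel.h below.

/* Binding model: on CUDA/HIP an entrypoint keeps C-style parameters, e.g.
 *   extern "C" __global__ void kernel_gpu_integrator_reset(int num_states);
 * whereas on MSL the same arguments are gathered into a struct that the kernel
 * receives through a device pointer (argument buffer), conceptually:
 *   struct KernelParamsIntegratorReset { int num_states; };
 *   kernel void kernel_metal_integrator_reset(
 *       device const KernelParamsIntegratorReset &args [[buffer(0)]], ...);
 * ccl_gpu_kernel_signature hides this difference behind one declaration site. */

/* Lambda replacement: a plain-C++ approximation of what ccl_gpu_kernel_lambda
 * could expand to; the generated function object carries its captured state in
 * a named member instead of a lambda capture list. */
#include <cstdio>

#define ccl_gpu_kernel_lambda(expr, ...) \
  struct KernelLambda { \
    __VA_ARGS__; /* optional captured state, e.g. "int kernel_index" */ \
    int operator()(const int state) const { return (expr); } \
  } ccl_gpu_kernel_lambda_pass; \
  ccl_gpu_kernel_lambda_pass

static int queued_kernel(int state) { return state % 4; } /* stand-in for INTEGRATOR_STATE(...) */

int main()
{
  const int kernel_index = 1;
  ccl_gpu_kernel_lambda(queued_kernel(state) == kernel_index, int kernel_index)
      .kernel_index = kernel_index;
  /* The named object is handed to the parallel_... helpers in place of a lambda. */
  printf("state 5 queued for kernel 1? %d\n", ccl_gpu_kernel_lambda_pass(5));
  return 0;
}

Compiled as ordinary C++, the sketch prints 1, showing that the generated function object behaves like the lambda expression it replaces.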
Diffstat (limited to 'intern/cycles/kernel/device/gpu')
-rw-r--r--  intern/cycles/kernel/device/gpu/image.h                   12
-rw-r--r--  intern/cycles/kernel/device/gpu/kernel.h                 837
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_active_index.h  114
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_prefix_sum.h      8
-rw-r--r--  intern/cycles/kernel/device/gpu/parallel_sorted_index.h   14
5 files changed, 505 insertions(+), 480 deletions(-)
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
index 95a37c693ae..0900a45c83d 100644
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a)
/* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
template<typename T>
-ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info,
+ float x,
+ float y)
{
ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
@@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f
/* Fast tricubic texture lookup using 8 trilinear lookups. */
template<typename T>
ccl_device_noinline T
-kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z)
{
ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
@@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl
template<typename T>
ccl_device_noinline T kernel_tex_image_interp_nanovdb(
- const TextureInfo &info, float x, float y, float z, uint interpolation)
+ ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
{
using namespace nanovdb;
@@ -191,7 +193,7 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb(
ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
{
- const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+ ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
/* float4, byte4, ushort4 and half4 */
const int texture_type = info.data_type;
@@ -226,7 +228,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
float3 P,
InterpolationType interp)
{
- const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+ ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
if (info.use_transform_3d) {
P = transform_point(&info.transform_3d, P);
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 5848ba5df9d..2ec6a49ec7b 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -21,6 +21,10 @@
#include "kernel/device/gpu/parallel_sorted_index.h"
#include "kernel/device/gpu/work_stealing.h"
+#ifdef __KERNEL_METAL__
+# include "kernel/device/metal/context_begin.h"
+#endif
+
#include "kernel/integrator/state.h"
#include "kernel/integrator/state_flow.h"
#include "kernel/integrator/state_util.h"
@@ -40,6 +44,11 @@
#include "kernel/bake/bake.h"
#include "kernel/film/adaptive_sampling.h"
+
+#ifdef __KERNEL_METAL__
+# include "kernel/device/metal/context_end.h"
+#endif
+
#include "kernel/film/read.h"
/* --------------------------------------------------------------------
@@ -47,7 +56,8 @@
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_reset(int num_states)
+ccl_gpu_kernel_signature(integrator_reset,
+ int num_states)
{
const int state = ccl_gpu_global_id_x();
@@ -58,10 +68,11 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
- const int num_tiles,
- float *render_buffer,
- const int max_tile_work_size)
+ ccl_gpu_kernel_signature(integrator_init_from_camera,
+ ccl_global KernelWorkTile *tiles,
+ const int num_tiles,
+ ccl_global float *render_buffer,
+ const int max_tile_work_size)
{
const int work_index = ccl_gpu_global_id_x();
@@ -72,7 +83,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int tile_index = work_index / max_tile_work_size;
const int tile_work_index = work_index - tile_index * max_tile_work_size;
- const KernelWorkTile *tile = &tiles[tile_index];
+ ccl_global const KernelWorkTile *tile = &tiles[tile_index];
if (tile_work_index >= tile->work_size) {
return;
@@ -83,14 +94,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
uint x, y, sample;
get_work_pixel(tile, tile_work_index, &x, &y, &sample);
- integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample);
+ ccl_gpu_kernel_call(
+ integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample));
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles,
- const int num_tiles,
- float *render_buffer,
- const int max_tile_work_size)
+ ccl_gpu_kernel_signature(integrator_init_from_bake,
+ ccl_global KernelWorkTile *tiles,
+ const int num_tiles,
+ ccl_global float *render_buffer,
+ const int max_tile_work_size)
{
const int work_index = ccl_gpu_global_id_x();
@@ -101,7 +114,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int tile_index = work_index / max_tile_work_size;
const int tile_work_index = work_index - tile_index * max_tile_work_size;
- const KernelWorkTile *tile = &tiles[tile_index];
+ ccl_global const KernelWorkTile *tile = &tiles[tile_index];
if (tile_work_index >= tile->work_size) {
return;
@@ -112,228 +125,260 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
uint x, y, sample;
get_work_pixel(tile, tile_work_index, &x, &y, &sample);
- integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample);
+ ccl_gpu_kernel_call(
+ integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample));
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_closest,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_closest(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_shadow,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_shadow(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_subsurface,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_subsurface(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size)
+ ccl_gpu_kernel_signature(integrator_intersect_volume_stack,
+ ccl_global const int *path_index_array,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_intersect_volume_stack(NULL, state);
+ ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_background(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_background,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_background(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_light(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_light,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_light(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_shadow(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_shadow,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_shadow(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_surface(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_surface,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_surface(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_surface_raytrace(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shade_volume(const int *path_index_array,
- float *render_buffer,
- const int work_size)
+ ccl_gpu_kernel_signature(integrator_shade_volume,
+ ccl_global const int *path_index_array,
+ ccl_global float *render_buffer,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
if (global_index < work_size) {
const int state = (path_index_array) ? path_index_array[global_index] : global_index;
- integrator_shade_volume(NULL, state, render_buffer);
+ ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer));
}
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_queued_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int kernel)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_queued_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int kernel_index)
{
- gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [kernel](const int state) {
- return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel);
- });
-}
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
+ int kernel_index)
+ .kernel_index = kernel_index;
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_queued_shadow_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int kernel)
-{
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [kernel](const int state) {
- return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int kernel_index)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
+ int kernel_index)
+ .kernel_index = kernel_index;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [](const int state) {
- return (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_terminated_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int indices_offset)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_active_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices + indices_offset, num_indices, [](const int state) {
- return (INTEGRATOR_STATE(state, path, queued_kernel) == 0);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_terminated_shadow_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int indices_offset)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_terminated_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int indices_offset)
{
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices + indices_offset, num_indices, [](const int state) {
- return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
- });
+ num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_sorted_paths_array(int num_states,
- int num_states_limit,
- int *indices,
- int *num_indices,
- int *key_counter,
- int *key_prefix_sum,
- int kernel)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int indices_offset)
{
- gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
+ ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+ gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+ num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_sorted_paths_array,
+ int num_states,
+ int num_states_limit,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ ccl_global int *key_counter,
+ ccl_global int *key_prefix_sum,
+ int kernel_index)
+{
+ ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
+ INTEGRATOR_STATE(state, path, shader_sort_key) :
+ GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY,
+ int kernel_index)
+ .kernel_index = kernel_index;
+
+ const uint state_index = ccl_gpu_global_id_x();
+ gpu_parallel_sorted_index_array(
+ state_index,
num_states,
num_states_limit,
indices,
- num_indices,
- key_counter,
- key_prefix_sum,
- [kernel](const int state) {
- return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ?
- INTEGRATOR_STATE(state, path, shader_sort_key) :
- GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
- });
-}
-
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int num_active_paths)
-{
+ num_indices,
+ key_counter,
+ key_prefix_sum,
+ ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int num_active_paths)
+{
+ ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
+ int num_active_paths)
+ .num_active_paths = num_active_paths;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [num_active_paths](const int state) {
- return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_states(const int *active_terminated_states,
- const int active_states_offset,
- const int terminated_states_offset,
- const int work_size)
+ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_states,
+ ccl_global const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
@@ -341,28 +386,31 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
const int from_state = active_terminated_states[active_states_offset + global_index];
const int to_state = active_terminated_states[terminated_states_offset + global_index];
- integrator_state_move(NULL, to_state, from_state);
+ ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state));
}
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_shadow_paths_array(int num_states,
- int *indices,
- int *num_indices,
- int num_active_paths)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
+ int num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ int num_active_paths)
{
+ ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
+ int num_active_paths)
+ .num_active_paths = num_active_paths;
+
gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
- num_states, indices, num_indices, [num_active_paths](const int state) {
- return (state >= num_active_paths) &&
- (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0);
- });
+ num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
- kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states,
- const int active_states_offset,
- const int terminated_states_offset,
- const int work_size)
+ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+ ccl_gpu_kernel_signature(integrator_compact_shadow_states,
+ ccl_global const int *active_terminated_states,
+ const int active_states_offset,
+ const int terminated_states_offset,
+ const int work_size)
{
const int global_index = ccl_gpu_global_id_x();
@@ -370,15 +418,14 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
const int from_state = active_terminated_states[active_states_offset + global_index];
const int to_state = active_terminated_states[terminated_states_offset + global_index];
- integrator_shadow_state_move(NULL, to_state, from_state);
+ ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state));
}
}
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
- kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
+ccl_gpu_kernel(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(
+ prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values)
{
- gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(
- counter, prefix_sum, num_values);
+ gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values);
}
/* --------------------------------------------------------------------
@@ -386,16 +433,17 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLO
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer,
- int sx,
- int sy,
- int sw,
- int sh,
- float threshold,
- bool reset,
- int offset,
- int stride,
- uint *num_active_pixels)
+ ccl_gpu_kernel_signature(adaptive_sampling_convergence_check,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ float threshold,
+ bool reset,
+ int offset,
+ int stride,
+ ccl_global uint *num_active_pixels)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / sw;
@@ -404,37 +452,51 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
bool converged = true;
if (x < sw && y < sh) {
- converged = kernel_adaptive_sampling_convergence_check(
- nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride);
+ converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check(
+ nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
}
/* NOTE: All threads specified in the mask must execute the intrinsic. */
- const uint num_active_pixels_mask = ccl_gpu_ballot(!converged);
+ const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
if (lane_id == 0) {
- atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
+ atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_filter_x(
- float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+ ccl_gpu_kernel_signature(adaptive_sampling_filter_x,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ int offset,
+ int stride)
{
const int y = ccl_gpu_global_id_x();
if (y < sh) {
- kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride);
+ ccl_gpu_kernel_call(
+ kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
}
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_adaptive_sampling_filter_y(
- float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+ ccl_gpu_kernel_signature(adaptive_sampling_filter_y,
+ ccl_global float *render_buffer,
+ int sx,
+ int sy,
+ int sw,
+ int sh,
+ int offset,
+ int stride)
{
const int x = ccl_gpu_global_id_x();
if (x < sw) {
- kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride);
+ ccl_gpu_kernel_call(
+ kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
}
}
@@ -443,12 +505,14 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels)
+ ccl_gpu_kernel_signature(cryptomatte_postprocess,
+ ccl_global float *render_buffer,
+ int num_pixels)
{
const int pixel_index = ccl_gpu_global_id_x();
if (pixel_index < num_pixels) {
- kernel_cryptomatte_post(nullptr, render_buffer, pixel_index);
+ ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index));
}
}
@@ -456,206 +520,102 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
* Film.
*/
-/* Common implementation for float destination. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert,
- float *pixels,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int dst_offset,
- int dst_stride,
- const Processor &processor)
-{
- const int render_pixel_index = ccl_gpu_global_id_x();
- if (render_pixel_index >= num_pixels) {
- return;
- }
-
- const int x = render_pixel_index % width;
- const int y = render_pixel_index / width;
-
- ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride +
- y * stride * kfilm_convert->pass_stride;
-
- ccl_global float *pixel = pixels +
- (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride;
-
- processor(kfilm_convert, buffer, pixel);
-}
-
-/* Common implementation for half4 destination and 4-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- const int render_pixel_index = ccl_gpu_global_id_x();
- if (render_pixel_index >= num_pixels) {
- return;
- }
-
- const int x = render_pixel_index % width;
- const int y = render_pixel_index / width;
-
- ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride +
- y * stride * kfilm_convert->pass_stride;
-
- float pixel[4];
- processor(kfilm_convert, buffer, pixel);
-
- film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
-
- ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
- *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
-}
-
-/* Common implementation for half4 destination and 3-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- kernel_gpu_film_convert_half_rgba_common_rgba(
- kfilm_convert,
- rgba,
- render_buffer,
- num_pixels,
- width,
- offset,
- stride,
- rgba_offset,
- rgba_stride,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
- processor(kfilm_convert, buffer, pixel_rgba);
- pixel_rgba[3] = 1.0f;
- });
-}
-
-/* Common implementation for half4 destination and single channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value(
- const KernelFilmConvert *kfilm_convert,
- uchar4 *rgba,
- float *render_buffer,
- int num_pixels,
- int width,
- int offset,
- int stride,
- int rgba_offset,
- int rgba_stride,
- const Processor &processor)
-{
- kernel_gpu_film_convert_half_rgba_common_rgba(
- kfilm_convert,
- rgba,
- render_buffer,
- num_pixels,
- width,
- offset,
- stride,
- rgba_offset,
- rgba_stride,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
- float value;
- processor(kfilm_convert, buffer, &value);
-
- pixel_rgba[0] = value;
- pixel_rgba[1] = value;
- pixel_rgba[2] = value;
- pixel_rgba[3] = 1.0f;
- });
-}
-
-#define KERNEL_FILM_CONVERT_PROC(name) \
- ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name
-
-#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \
- KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \
- (const KernelFilmConvert kfilm_convert, \
- float *pixels, \
- float *render_buffer, \
- int num_pixels, \
- int width, \
- int offset, \
- int stride, \
- int rgba_offset, \
- int rgba_stride) \
+#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+ ccl_gpu_kernel_signature(film_convert_##variant, \
+ const KernelFilmConvert kfilm_convert, \
+ ccl_global uchar4 *rgba, \
+ ccl_global float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
{ \
- kernel_gpu_film_convert_common(&kfilm_convert, \
- pixels, \
- render_buffer, \
- num_pixels, \
- width, \
- offset, \
- stride, \
- rgba_offset, \
- rgba_stride, \
- film_get_pass_pixel_##variant); \
+ const int render_pixel_index = ccl_gpu_global_id_x(); \
+ if (render_pixel_index >= num_pixels) { \
+ return; \
+ } \
+\
+ const int x = render_pixel_index % width; \
+ const int y = render_pixel_index / width; \
+\
+ ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+ y * stride * kfilm_convert.pass_stride; \
+\
+ float pixel[4]; \
+ film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
+\
+ film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
+\
+ if (input_channel_count == 1) { \
+ pixel[1] = pixel[2] = pixel[0]; \
+ } \
+ if (input_channel_count <= 3) { \
+ pixel[3] = 1.0f; \
+ } \
+\
+ ccl_global float *out = ((ccl_global float *)rgba) + rgba_offset + y * rgba_stride + x; \
+ *(ccl_global float4 *)out = make_float4(pixel[0], pixel[1], pixel[2], pixel[3]); \
} \
- KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \
- (const KernelFilmConvert kfilm_convert, \
- uchar4 *rgba, \
- float *render_buffer, \
- int num_pixels, \
- int width, \
- int offset, \
- int stride, \
- int rgba_offset, \
- int rgba_stride) \
+\
+ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+ ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
+ const KernelFilmConvert kfilm_convert, \
+ ccl_global uchar4 *rgba, \
+ ccl_global float *render_buffer, \
+ int num_pixels, \
+ int width, \
+ int offset, \
+ int stride, \
+ int rgba_offset, \
+ int rgba_stride) \
{ \
- kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \
- rgba, \
- render_buffer, \
- num_pixels, \
- width, \
- offset, \
- stride, \
- rgba_offset, \
- rgba_stride, \
- film_get_pass_pixel_##variant); \
- }
-
-KERNEL_FILM_CONVERT_DEFINE(depth, value)
-KERNEL_FILM_CONVERT_DEFINE(mist, value)
-KERNEL_FILM_CONVERT_DEFINE(sample_count, value)
-KERNEL_FILM_CONVERT_DEFINE(float, value)
-
-KERNEL_FILM_CONVERT_DEFINE(light_path, rgb)
-KERNEL_FILM_CONVERT_DEFINE(float3, rgb)
-
-KERNEL_FILM_CONVERT_DEFINE(motion, rgba)
-KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba)
-KERNEL_FILM_CONVERT_DEFINE(combined, rgba)
-KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
-
-#undef KERNEL_FILM_CONVERT_DEFINE
-#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE
-#undef KERNEL_FILM_CONVERT_PROC
+ const int render_pixel_index = ccl_gpu_global_id_x(); \
+ if (render_pixel_index >= num_pixels) { \
+ return; \
+ } \
+\
+ const int x = render_pixel_index % width; \
+ const int y = render_pixel_index / width; \
+\
+ ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+ y * stride * kfilm_convert.pass_stride; \
+\
+ float pixel[4]; \
+ film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
+\
+ film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
+\
+ if (input_channel_count == 1) { \
+ pixel[1] = pixel[2] = pixel[0]; \
+ } \
+ if (input_channel_count <= 3) { \
+ pixel[3] = 1.0f; \
+ } \
+\
+ ccl_global half4 *out = ((ccl_global half4 *)rgba) + (rgba_offset + y * rgba_stride + x); \
+ *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
+ }
+
+/* 1 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(depth, 1)
+KERNEL_FILM_CONVERT_VARIANT(mist, 1)
+KERNEL_FILM_CONVERT_VARIANT(sample_count, 1)
+KERNEL_FILM_CONVERT_VARIANT(float, 1)
+
+/* 3 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(light_path, 3)
+KERNEL_FILM_CONVERT_VARIANT(float3, 3)
+
+/* 4 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(motion, 4)
+KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4)
+KERNEL_FILM_CONVERT_VARIANT(combined, 4)
+KERNEL_FILM_CONVERT_VARIANT(float4, 4)
/* --------------------------------------------------------------------
* Shader evaluation.
@@ -664,42 +624,46 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
/* Displacement */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
- float *output,
- const int offset,
- const int work_size)
+ ccl_gpu_kernel_signature(shader_eval_displace,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
{
int i = ccl_gpu_global_id_x();
if (i < work_size) {
- kernel_displace_evaluate(NULL, input, output, offset + i);
+ ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i));
}
}
/* Background */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
- float *output,
- const int offset,
- const int work_size)
+ ccl_gpu_kernel_signature(shader_eval_background,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
{
int i = ccl_gpu_global_id_x();
if (i < work_size) {
- kernel_background_evaluate(NULL, input, output, offset + i);
+ ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i));
}
}
/* Curve Shadow Transparency */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_shader_eval_curve_shadow_transparency(KernelShaderEvalInput *input,
- float *output,
- const int offset,
- const int work_size)
+ ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency,
+ ccl_global KernelShaderEvalInput *input,
+ ccl_global float *output,
+ const int offset,
+ const int work_size)
{
int i = ccl_gpu_global_id_x();
if (i < work_size) {
- kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i);
+ ccl_gpu_kernel_call(
+ kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i));
}
}
@@ -708,15 +672,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_color_preprocess(float *render_buffer,
- int full_x,
- int full_y,
- int width,
- int height,
- int offset,
- int stride,
- int pass_stride,
- int pass_denoised)
+ ccl_gpu_kernel_signature(filter_color_preprocess,
+ ccl_global float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int pass_denoised)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -727,31 +692,32 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
- float *buffer = render_buffer + render_pixel_index * pass_stride;
+ ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
- float *color_out = buffer + pass_denoised;
+ ccl_global float *color_out = buffer + pass_denoised;
color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_guiding_preprocess(float *guiding_buffer,
- int guiding_pass_stride,
- int guiding_pass_albedo,
- int guiding_pass_normal,
- const float *render_buffer,
- int render_offset,
- int render_stride,
- int render_pass_stride,
- int render_pass_sample_count,
- int render_pass_denoising_albedo,
- int render_pass_denoising_normal,
- int full_x,
- int full_y,
- int width,
- int height,
- int num_samples)
+ ccl_gpu_kernel_signature(filter_guiding_preprocess,
+ ccl_global float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int guiding_pass_normal,
+ ccl_global const float *render_buffer,
+ int render_offset,
+ int render_stride,
+ int render_pass_stride,
+ int render_pass_sample_count,
+ int render_pass_denoising_albedo,
+ int render_pass_denoising_normal,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int num_samples)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -762,10 +728,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t guiding_pixel_index = x + y * width;
- float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+ ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
- const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+ ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
float pixel_scale;
if (render_pass_sample_count == PASS_UNUSED) {
@@ -779,8 +745,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
if (guiding_pass_albedo != PASS_UNUSED) {
kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
- const float *aledo_in = buffer + render_pass_denoising_albedo;
- float *albedo_out = guiding_pixel + guiding_pass_albedo;
+ ccl_global const float *aledo_in = buffer + render_pass_denoising_albedo;
+ ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
albedo_out[0] = aledo_in[0] * pixel_scale;
albedo_out[1] = aledo_in[1] * pixel_scale;
@@ -791,8 +757,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
if (guiding_pass_normal != PASS_UNUSED) {
kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
- const float *normal_in = buffer + render_pass_denoising_normal;
- float *normal_out = guiding_pixel + guiding_pass_normal;
+ ccl_global const float *normal_in = buffer + render_pass_denoising_normal;
+ ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;
normal_out[0] = normal_in[0] * pixel_scale;
normal_out[1] = normal_in[1] * pixel_scale;
@@ -801,11 +767,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
- int guiding_pass_stride,
- int guiding_pass_albedo,
- int width,
- int height)
+ ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo,
+ ccl_global float *guiding_buffer,
+ int guiding_pass_stride,
+ int guiding_pass_albedo,
+ int width,
+ int height)
{
kernel_assert(guiding_pass_albedo != PASS_UNUSED);
@@ -818,9 +785,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t guiding_pixel_index = x + y * width;
- float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+ ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
- float *albedo_out = guiding_pixel + guiding_pass_albedo;
+ ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
albedo_out[0] = 0.5f;
albedo_out[1] = 0.5f;
@@ -828,20 +795,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_filter_color_postprocess(float *render_buffer,
- int full_x,
- int full_y,
- int width,
- int height,
- int offset,
- int stride,
- int pass_stride,
- int num_samples,
- int pass_noisy,
- int pass_denoised,
- int pass_sample_count,
- int num_components,
- bool use_compositing)
+ ccl_gpu_kernel_signature(filter_color_postprocess,
+ ccl_global float *render_buffer,
+ int full_x,
+ int full_y,
+ int width,
+ int height,
+ int offset,
+ int stride,
+ int pass_stride,
+ int num_samples,
+ int pass_noisy,
+ int pass_denoised,
+ int pass_sample_count,
+ int num_components,
+ bool use_compositing)
{
const int work_index = ccl_gpu_global_id_x();
const int y = work_index / width;
@@ -852,7 +820,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
- float *buffer = render_buffer + render_pixel_index * pass_stride;
+ ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
float pixel_scale;
if (pass_sample_count == PASS_UNUSED) {
@@ -862,7 +830,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
pixel_scale = __float_as_uint(buffer[pass_sample_count]);
}
- float *denoised_pixel = buffer + pass_denoised;
+ ccl_global float *denoised_pixel = buffer + pass_denoised;
denoised_pixel[0] *= pixel_scale;
denoised_pixel[1] *= pixel_scale;
@@ -875,7 +843,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
/* Currently compositing passes are either 3-component (derived by dividing light passes)
* or do not have transparency (shadow catcher). Implicitly rely on this logic, as it
* simplifies logic and avoids extra memory allocation. */
- const float *noisy_pixel = buffer + pass_noisy;
+ ccl_global const float *noisy_pixel = buffer + pass_noisy;
denoised_pixel[3] = noisy_pixel[3];
}
else {
@@ -891,21 +859,22 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
*/
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
- kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states,
- uint *num_possible_splits)
+ ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits,
+ int num_states,
+ ccl_global uint *num_possible_splits)
{
const int state = ccl_gpu_global_id_x();
bool can_split = false;
if (state < num_states) {
- can_split = kernel_shadow_catcher_path_can_split(nullptr, state);
+ can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state));
}
/* NOTE: All threads specified in the mask must execute the intrinsic. */
- const uint can_split_mask = ccl_gpu_ballot(can_split);
+ const auto can_split_mask = ccl_gpu_ballot(can_split);
const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
if (lane_id == 0) {
- atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask));
+ atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask));
}
}
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index d7416beb783..f667ede2712 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
+#ifdef __KERNEL_METAL__
+struct ActiveIndexContext {
+ ActiveIndexContext(int _thread_index,
+ int _global_index,
+ int _threadgroup_size,
+ int _simdgroup_size,
+ int _simd_lane_index,
+ int _simd_group_index,
+ int _num_simd_groups,
+ threadgroup int *_simdgroup_offset)
+ : thread_index(_thread_index),
+ global_index(_global_index),
+ blocksize(_threadgroup_size),
+ ccl_gpu_warp_size(_simdgroup_size),
+ thread_warp(_simd_lane_index),
+ warp_index(_simd_group_index),
+ num_warps(_num_simd_groups),
+ warp_offset(_simdgroup_offset)
+ {
+ }
+
+ const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
+ num_warps;
+ threadgroup int *warp_offset;
+
+ template<uint blocksizeDummy, typename IsActiveOp>
+ void active_index_array(const uint num_states,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ IsActiveOp is_active_op)
+ {
+ const uint state_index = global_index;
+#else
template<uint blocksize, typename IsActiveOp>
__device__ void gpu_parallel_active_index_array(const uint num_states,
- int *indices,
- int *num_indices,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
IsActiveOp is_active_op)
{
extern ccl_gpu_shared int warp_offset[];
@@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
const uint warp_index = thread_index / ccl_gpu_warp_size;
const uint num_warps = blocksize / ccl_gpu_warp_size;
- /* Test if state corresponding to this thread is active. */
const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
- const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#endif
- /* For each thread within a warp compute how many other active states precede it. */
- const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
- const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+ /* Test if state corresponding to this thread is active. */
+ const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
- /* Last thread in warp stores number of active states for each warp. */
- if (thread_warp == ccl_gpu_warp_size - 1) {
- warp_offset[warp_index] = thread_offset + is_active;
- }
+ /* For each thread within a warp compute how many other active states precede it. */
+ const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) &
+ ccl_gpu_thread_mask(thread_warp));
- ccl_gpu_syncthreads();
-
- /* Last thread in block converts per-warp sizes to offsets, increments global size of
- * index array and gets offset to write to. */
- if (thread_index == blocksize - 1) {
- /* TODO: parallelize this. */
- int offset = 0;
- for (int i = 0; i < num_warps; i++) {
- int num_active = warp_offset[i];
- warp_offset[i] = offset;
- offset += num_active;
+ /* Last thread in warp stores number of active states for each warp. */
+ if (thread_warp == ccl_gpu_warp_size - 1) {
+ warp_offset[warp_index] = thread_offset + is_active;
}
- const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
- warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
- }
+ ccl_gpu_syncthreads();
+
+ /* Last thread in block converts per-warp sizes to offsets, increments global size of
+ * index array and gets offset to write to. */
+ if (thread_index == blocksize - 1) {
+ /* TODO: parallelize this. */
+ int offset = 0;
+ for (int i = 0; i < num_warps; i++) {
+ int num_active = warp_offset[i];
+ warp_offset[i] = offset;
+ offset += num_active;
+ }
+
+ const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+ warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+ }
- ccl_gpu_syncthreads();
+ ccl_gpu_syncthreads();
- /* Write to index array. */
- if (is_active) {
- const uint block_offset = warp_offset[num_warps];
- indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ /* Write to index array. */
+ if (is_active) {
+ const uint block_offset = warp_offset[num_warps];
+ indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+ }
}
-}
+
+#ifdef __KERNEL_METAL__
+}; /* end class ActiveIndexContext */
+
+/* inject the required thread params into a struct, and redirect to its templated member function
+ */
+# define gpu_parallel_active_index_array \
+ ActiveIndexContext(metal_local_id, \
+ metal_global_id, \
+ metal_local_size, \
+ simdgroup_size, \
+ simd_lane_index, \
+ simd_group_index, \
+ num_simd_groups, \
+ simdgroup_offset) \
+ .active_index_array
+#endif
CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index 6de3a022569..4bd002c27e4 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -33,10 +33,12 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
#endif
-template<uint blocksize>
-__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
+__device__ void gpu_parallel_prefix_sum(const int global_id,
+ ccl_global int *counter,
+ ccl_global int *prefix_sum,
+ const int num_values)
{
- if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+ if (global_id != 0) {
return;
}
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index c06d7be444f..c092e2a21ee 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -33,16 +33,16 @@ CCL_NAMESPACE_BEGIN
#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
-template<uint blocksize, typename GetKeyOp>
-__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+template<typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint state_index,
+ const uint num_states,
const int num_states_limit,
- int *indices,
- int *num_indices,
- int *key_counter,
- int *key_prefix_sum,
+ ccl_global int *indices,
+ ccl_global int *num_indices,
+ ccl_global int *key_counter,
+ ccl_global int *key_prefix_sum,
GetKeyOp get_key_op)
{
- const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
const int key = (state_index < num_states) ? get_key_op(state_index) :
GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;