16 files changed, 877 insertions, 482 deletions
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 29ff69df864..f311b0e74bb 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -39,6 +39,10 @@ set(SRC_KERNEL_DEVICE_HIP
   device/hip/kernel.cpp
 )
 
+set(SRC_KERNEL_DEVICE_METAL
+  device/metal/kernel.metal
+)
+
 set(SRC_KERNEL_DEVICE_OPTIX
   device/optix/kernel.cu
   device/optix/kernel_shader_raytrace.cu
@@ -79,6 +83,13 @@ set(SRC_KERNEL_DEVICE_OPTIX_HEADERS
   device/optix/globals.h
 )
 
+set(SRC_KERNEL_DEVICE_METAL_HEADERS
+  device/metal/compat.h
+  device/metal/context_begin.h
+  device/metal/context_end.h
+  device/metal/globals.h
+)
+
 set(SRC_KERNEL_CLOSURE_HEADERS
   closure/alloc.h
   closure/bsdf.h
@@ -368,6 +379,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
     ${SRC_KERNEL_HEADERS}
     ${SRC_KERNEL_DEVICE_GPU_HEADERS}
     ${SRC_KERNEL_DEVICE_CUDA_HEADERS}
+    ${SRC_KERNEL_DEVICE_METAL_HEADERS}
     ${SRC_UTIL_HEADERS}
   )
   set(cuda_cubins)
@@ -723,12 +735,14 @@ cycles_add_library(cycles_kernel "${LIB}"
   ${SRC_KERNEL_DEVICE_CUDA}
   ${SRC_KERNEL_DEVICE_HIP}
   ${SRC_KERNEL_DEVICE_OPTIX}
+  ${SRC_KERNEL_DEVICE_METAL}
   ${SRC_KERNEL_HEADERS}
   ${SRC_KERNEL_DEVICE_CPU_HEADERS}
   ${SRC_KERNEL_DEVICE_GPU_HEADERS}
   ${SRC_KERNEL_DEVICE_CUDA_HEADERS}
   ${SRC_KERNEL_DEVICE_HIP_HEADERS}
   ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}
+  ${SRC_KERNEL_DEVICE_METAL_HEADERS}
 )
 
 source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS})
@@ -740,6 +754,7 @@ source_group("device\\cuda" FILES ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_
 source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS})
 source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS})
 source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS})
+source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS})
 source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS})
 source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
 source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
@@ -772,6 +787,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP}" ${CYCLES_
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/film)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h
index 1ee82e6eb7c..2feebad074f 100644
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -75,6 +75,10 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+/* Mask with the low `thread_warp` bits set when ccl_gpu_warp_size is 32. The argument is
+ * parenthesized so expression arguments (e.g. `a + b`) expand correctly.
+ * NOTE(review): thread_warp == 0 shifts by the full warp size; confirm that yields 0 here. */
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - (thread_warp)))
 
 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h
index 46196dcdb51..e333fe90332 100644
--- a/intern/cycles/kernel/device/cuda/config.h
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -93,11 +93,35 @@
 /* Compute number of threads per block and minimum blocks per multiprocessor
  * given the maximum number of registers per thread. */
 
-#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+#define ccl_gpu_kernel_threads(block_num_threads) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads)
+
+#define ccl_gpu_kernel_threads_registers(block_num_threads, thread_num_registers) \
   extern "C" __global__ void __launch_bounds__(block_num_threads, \
                                                GPU_MULTIPRESSOR_MAX_REGISTERS / \
                                                    (block_num_threads * thread_num_registers))
 
+/* Allow ccl_gpu_kernel to accept either 1 or 2 parameters. */
+#define SELECT_MACRO(_1, _2, NAME, ...) NAME
+#define ccl_gpu_kernel(...) \
+  SELECT_MACRO(__VA_ARGS__, ccl_gpu_kernel_threads_registers, ccl_gpu_kernel_threads)(__VA_ARGS__)
+
+#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
+
+#define ccl_gpu_kernel_call(x) x
+
+/* Define a function object where "func" is the lambda body; any additional parameters declare
+ * the captured state as struct members. */
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda { \
+    __VA_ARGS__; \
+    __device__ int operator()(const int state) \
+    { \
+      return (func); \
+    } \
+  } ccl_gpu_kernel_lambda_pass; \
+  ccl_gpu_kernel_lambda_pass
+
 /* sanity checks */
 
 #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h
index 95a37c693ae..0900a45c83d 100644
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a)
 
 /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
 template<typename T>
-ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info,
+                                                      float x,
+                                                      float y)
 {
   ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
 
@@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f
 /* Fast tricubic texture lookup using 8 trilinear lookups. */
 template<typename T>
 ccl_device_noinline T
-kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z)
 {
   ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;
 
@@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl
 
 template<typename T>
 ccl_device_noinline T kernel_tex_image_interp_nanovdb(
-    const TextureInfo &info, float x, float y, float z, uint interpolation)
+    ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
 {
   using namespace nanovdb;
 
@@ -191,7 +193,7 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb(
 
 ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
 {
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
 
   /* float4, byte4, ushort4 and half4 */
   const int texture_type = info.data_type;
@@ -226,7 +228,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
                                              float3 P,
                                              InterpolationType interp)
 {
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
 
   if (info.use_transform_3d) {
     P = transform_point(&info.transform_3d, P);
diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index 5848ba5df9d..2ec6a49ec7b 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -21,6 +21,10 @@
 #include "kernel/device/gpu/parallel_sorted_index.h"
 #include "kernel/device/gpu/work_stealing.h"
 
+#ifdef __KERNEL_METAL__
+#  include "kernel/device/metal/context_begin.h"
+#endif
+
 #include "kernel/integrator/state.h"
 #include "kernel/integrator/state_flow.h"
 #include "kernel/integrator/state_util.h"
@@ -40,6 +44,11 @@
 #include "kernel/bake/bake.h"
 
 #include "kernel/film/adaptive_sampling.h"
+
+#ifdef __KERNEL_METAL__
+#  include "kernel/device/metal/context_end.h"
+#endif
+
 #include "kernel/film/read.h"
 
 /* --------------------------------------------------------------------
@@ -47,7 +56,8 @@
  */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_reset(int num_states)
+    ccl_gpu_kernel_signature(integrator_reset,
+                             int num_states)
 {
   const int state = ccl_gpu_global_id_x();
 
@@ -58,10 +68,11 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles,
-                                           const int num_tiles,
-                                           float *render_buffer,
-                                           const int max_tile_work_size)
+    ccl_gpu_kernel_signature(integrator_init_from_camera,
+                             ccl_global KernelWorkTile *tiles,
+                             const int num_tiles,
+                             ccl_global float *render_buffer,
+                             const int max_tile_work_size)
 {
   const int work_index = ccl_gpu_global_id_x();
 
@@ -72,7 +83,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   const int tile_index = work_index / max_tile_work_size;
   const int tile_work_index = work_index - tile_index * max_tile_work_size;
 
-  const KernelWorkTile *tile = &tiles[tile_index];
+  ccl_global const KernelWorkTile *tile = &tiles[tile_index];
 
   if (tile_work_index >= tile->work_size) {
     return;
@@ -83,14 +94,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   uint x, y, sample;
   get_work_pixel(tile, tile_work_index, &x, &y, &sample);
 
-  integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample);
+  ccl_gpu_kernel_call(
+      integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample));
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles,
-                                         const int num_tiles,
-                                         float *render_buffer,
-                                         const int max_tile_work_size)
+    ccl_gpu_kernel_signature(integrator_init_from_bake,
+                             ccl_global KernelWorkTile *tiles,
+                             const int num_tiles,
+                             ccl_global float *render_buffer,
+                             const int max_tile_work_size)
 {
   const int work_index = ccl_gpu_global_id_x();
 
@@ -101,7 +114,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   const int tile_index = work_index / max_tile_work_size;
   const int tile_work_index = work_index - tile_index * max_tile_work_size;
 
-  const KernelWorkTile *tile = &tiles[tile_index];
+  ccl_global const KernelWorkTile *tile = &tiles[tile_index];
 
   if (tile_work_index >= tile->work_size) {
     return;
@@ -112,228 +125,260 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   uint x, y, sample;
   get_work_pixel(tile, tile_work_index, &x, &y, &sample);
 
-  integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample);
+  ccl_gpu_kernel_call(
+      integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample));
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size)
+    ccl_gpu_kernel_signature(integrator_intersect_closest,
+                             ccl_global const int *path_index_array,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_intersect_closest(NULL, state);
+    ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size)
+    ccl_gpu_kernel_signature(integrator_intersect_shadow,
+                             ccl_global const int *path_index_array,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_intersect_shadow(NULL, state);
+    ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size)
+    ccl_gpu_kernel_signature(integrator_intersect_subsurface,
+                             ccl_global const int *path_index_array,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_intersect_subsurface(NULL, state);
+    ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size)
+    ccl_gpu_kernel_signature(integrator_intersect_volume_stack,
+                             ccl_global const int *path_index_array,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_intersect_volume_stack(NULL, state);
+    ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shade_background(const int *path_index_array,
-                                           float *render_buffer,
-                                           const int work_size)
+    ccl_gpu_kernel_signature(integrator_shade_background,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_shade_background(NULL, state, render_buffer);
+    ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shade_light(const int *path_index_array,
-                                      float *render_buffer,
-                                      const int work_size)
+    ccl_gpu_kernel_signature(integrator_shade_light,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_shade_light(NULL, state, render_buffer);
+    ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shade_shadow(const int *path_index_array,
-                                       float *render_buffer,
-                                       const int work_size)
+    ccl_gpu_kernel_signature(integrator_shade_shadow,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_shade_shadow(NULL, state, render_buffer);
+    ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shade_surface(const int *path_index_array,
-                                        float *render_buffer,
-                                        const int work_size)
+    ccl_gpu_kernel_signature(integrator_shade_surface,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_shade_surface(NULL, state, render_buffer);
+    ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array,
-                                                 float *render_buffer,
-                                                 const int work_size)
+    ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_shade_surface_raytrace(NULL, state, render_buffer);
+    ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shade_volume(const int *path_index_array,
-                                       float *render_buffer,
-                                       const int work_size)
+    ccl_gpu_kernel_signature(integrator_shade_volume,
+                             ccl_global const int *path_index_array,
+                             ccl_global float *render_buffer,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
   if (global_index < work_size) {
     const int state = (path_index_array) ? path_index_array[global_index] : global_index;
-    integrator_shade_volume(NULL, state, render_buffer);
+    ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer));
   }
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_queued_paths_array(int num_states,
-                                             int *indices,
-                                             int *num_indices,
-                                             int kernel)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_queued_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             int kernel_index)
 {
-  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices, num_indices, [kernel](const int state) {
-        return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel);
-      });
-}
+  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
+                        int kernel_index)
+      .kernel_index = kernel_index;
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_queued_shadow_paths_array(int num_states,
-                                                    int *indices,
-                                                    int *num_indices,
-                                                    int kernel)
-{
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices, num_indices, [kernel](const int state) {
-        return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel);
-      });
+      num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             int kernel_index)
 {
+  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
+                        int kernel_index)
+      .kernel_index = kernel_index;
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices, num_indices, [](const int state) {
-        return (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
-      });
+      num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_terminated_paths_array(int num_states,
-                                                 int *indices,
-                                                 int *num_indices,
-                                                 int indices_offset)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_active_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices)
 {
+  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices + indices_offset, num_indices, [](const int state) {
-        return (INTEGRATOR_STATE(state, path, queued_kernel) == 0);
-      });
+      num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_terminated_shadow_paths_array(int num_states,
-                                                        int *indices,
-                                                        int *num_indices,
-                                                        int indices_offset)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_terminated_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             int indices_offset)
 {
+  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices + indices_offset, num_indices, [](const int state) {
-        return (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
-      });
+      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_sorted_paths_array(int num_states,
-                                             int num_states_limit,
-                                             int *indices,
-                                             int *num_indices,
-                                             int *key_counter,
-                                             int *key_prefix_sum,
-                                             int kernel)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             int indices_offset)
 {
-  gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>(
+  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
+  gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
+      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_sorted_paths_array,
+                             int num_states,
+                             int num_states_limit,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             ccl_global int *key_counter,
+                             ccl_global int *key_prefix_sum,
+                             int kernel_index)
+{
+  ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
+                            INTEGRATOR_STATE(state, path, shader_sort_key) :
+                            GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY,
+                        int kernel_index)
+      .kernel_index = kernel_index;
+
+  const uint state_index = ccl_gpu_global_id_x();
+  gpu_parallel_sorted_index_array(
+      state_index,
       num_states,
       num_states_limit,
       indices,
-      num_indices,
-      key_counter,
-      key_prefix_sum,
-      [kernel](const int state) {
-        return (INTEGRATOR_STATE(state, path, queued_kernel) == kernel) ?
-                   INTEGRATOR_STATE(state, path, shader_sort_key) :
-                   GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
-      });
-}
-
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_compact_paths_array(int num_states,
-                                              int *indices,
-                                              int *num_indices,
-                                              int num_active_paths)
-{
+      num_indices,
+      key_counter,
+      key_prefix_sum,
+      ccl_gpu_kernel_lambda_pass);
+}
+
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_compact_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             int num_active_paths)
+{
+  ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
+                        int num_active_paths)
+      .num_active_paths = num_active_paths;
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices, num_indices, [num_active_paths](const int state) {
-        return (state >= num_active_paths) && (INTEGRATOR_STATE(state, path, queued_kernel) != 0);
-      });
+      num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_compact_states(const int *active_terminated_states,
-                                         const int active_states_offset,
-                                         const int terminated_states_offset,
-                                         const int work_size)
+ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_compact_states,
+                             ccl_global const int *active_terminated_states,
+                             const int active_states_offset,
+                             const int terminated_states_offset,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
@@ -341,28 +386,31 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
     const int from_state = active_terminated_states[active_states_offset + global_index];
     const int to_state = active_terminated_states[terminated_states_offset + global_index];
 
-    integrator_state_move(NULL, to_state, from_state);
+    ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state));
   }
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_compact_shadow_paths_array(int num_states,
-                                                     int *indices,
-                                                     int *num_indices,
-                                                     int num_active_paths)
+ccl_gpu_kernel(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
+                             int num_states,
+                             ccl_global int *indices,
+                             ccl_global int *num_indices,
+                             int num_active_paths)
 {
+  ccl_gpu_kernel_lambda((state >= num_active_paths) && (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
+                        int num_active_paths)
+      .num_active_paths = num_active_paths;
+
   gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>(
-      num_states, indices, num_indices, [num_active_paths](const int state) {
-        return (state >= num_active_paths) &&
-               (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0);
-      });
+      num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_integrator_compact_shadow_states(const int *active_terminated_states,
-                                                const int active_states_offset,
-                                                const int terminated_states_offset,
-                                                const int work_size)
+ccl_gpu_kernel(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_compact_shadow_states,
+                             ccl_global const int *active_terminated_states,
+                             const int active_states_offset,
+                             const int terminated_states_offset,
+                             const int work_size)
 {
   const int global_index = ccl_gpu_global_id_x();
 
@@ -370,15 +418,14 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B
     const int from_state = active_terminated_states[active_states_offset + global_index];
     const int to_state = active_terminated_states[terminated_states_offset + global_index];
 
-    integrator_shadow_state_move(NULL, to_state, from_state);
+    ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state));
   }
 }
 
-extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE)
-    kernel_gpu_prefix_sum(int *counter, int *prefix_sum, int num_values)
+ccl_gpu_kernel(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature(
+    prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values)
 {
-  gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(
-      counter, prefix_sum, num_values);
+  gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values);
 }
 
 /* --------------------------------------------------------------------
@@ -386,16 +433,17 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLO
  */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer,
-                                                   int sx,
-                                                   int sy,
-                                                   int sw,
-                                                   int sh,
-                                                   float threshold,
-                                                   bool reset,
-                                                   int offset,
-                                                   int stride,
-                                                   uint *num_active_pixels)
+    ccl_gpu_kernel_signature(adaptive_sampling_convergence_check,
+                             ccl_global float *render_buffer,
+                             int sx,
+                             int sy,
+                             int sw,
+                             int sh,
+                             float threshold,
+                             bool reset,
+                             int offset,
+                             int stride,
+                             ccl_global uint *num_active_pixels)
 {
   const int work_index = ccl_gpu_global_id_x();
   const int y = work_index / sw;
@@ -404,37 +452,51 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   bool converged = true;
 
   if (x < sw && y < sh) {
-    converged = kernel_adaptive_sampling_convergence_check(
-        nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride);
+    converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check(
+        nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
   }
 
   /* NOTE: All threads specified in the mask must execute the intrinsic. */
-  const uint num_active_pixels_mask = ccl_gpu_ballot(!converged);
+  const auto num_active_pixels_mask = ccl_gpu_ballot(!converged);
   const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
   if (lane_id == 0) {
-    atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask));
+    atomic_fetch_and_add_uint32(num_active_pixels, ccl_gpu_popc(num_active_pixels_mask));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_adaptive_sampling_filter_x(
-        float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+    ccl_gpu_kernel_signature(adaptive_sampling_filter_x,
+                             ccl_global float *render_buffer,
+                             int sx,
+                             int sy,
+                             int sw,
+                             int sh,
+                             int offset,
+                             int stride)
 {
   const int y = ccl_gpu_global_id_x();
 
   if (y < sh) {
-    kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride);
+    ccl_gpu_kernel_call(
+        kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride));
   }
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_adaptive_sampling_filter_y(
-        float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride)
+    ccl_gpu_kernel_signature(adaptive_sampling_filter_y,
+                             ccl_global float *render_buffer,
+                             int sx,
+                             int sy,
+                             int sw,
+                             int sh,
+                             int offset,
+                             int stride)
 {
   const int x = ccl_gpu_global_id_x();
 
   if (x < sw) {
-    kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride);
+    ccl_gpu_kernel_call(
+        kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride));
   }
 }
 
@@ -443,12 +505,14 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels)
+    ccl_gpu_kernel_signature(cryptomatte_postprocess,
+                             ccl_global float *render_buffer,
+                             int num_pixels)
 {
   const int pixel_index = ccl_gpu_global_id_x();
 
   if (pixel_index < num_pixels) {
-    kernel_cryptomatte_post(nullptr, render_buffer, pixel_index);
+    ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index));
   }
 }
 
@@ -456,206 +520,102 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  * Film.
  */
 
-/* Common implementation for float destination. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert,
-                                                      float *pixels,
-                                                      float *render_buffer,
-                                                      int num_pixels,
-                                                      int width,
-                                                      int offset,
-                                                      int stride,
-                                                      int dst_offset,
-                                                      int dst_stride,
-                                                      const Processor &processor)
-{
-  const int render_pixel_index = ccl_gpu_global_id_x();
-  if (render_pixel_index >= num_pixels) {
-    return;
-  }
-
-  const int x = render_pixel_index % width;
-  const int y = render_pixel_index / width;
-
-  ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride +
-                                   y * stride * kfilm_convert->pass_stride;
-
-  ccl_global float *pixel = pixels +
-                            (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride;
-
-  processor(kfilm_convert, buffer, pixel);
-}
-
-/* Common implementation for half4 destination and 4-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba(
-    const KernelFilmConvert *kfilm_convert,
-    uchar4 *rgba,
-    float *render_buffer,
-    int num_pixels,
-    int width,
-    int offset,
-    int stride,
-    int rgba_offset,
-    int rgba_stride,
-    const Processor &processor)
-{
-  const int render_pixel_index = ccl_gpu_global_id_x();
-  if (render_pixel_index >= num_pixels) {
-    return;
-  }
-
-  const int x = render_pixel_index % width;
-  const int y = render_pixel_index / width;
-
-  ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert->pass_stride +
-                                   y * stride * kfilm_convert->pass_stride;
-
-  float pixel[4];
-  processor(kfilm_convert, buffer, pixel);
-
-  film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
-
-  ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
-  *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
-}
-
-/* Common implementation for half4 destination and 3-channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb(
-    const KernelFilmConvert *kfilm_convert,
-    uchar4 *rgba,
-    float *render_buffer,
-    int num_pixels,
-    int width,
-    int offset,
-    int stride,
-    int rgba_offset,
-    int rgba_stride,
-    const Processor &processor)
-{
-  kernel_gpu_film_convert_half_rgba_common_rgba(
-      kfilm_convert,
-      rgba,
-      render_buffer,
-      num_pixels,
-      width,
-      offset,
-      stride,
-      rgba_offset,
-      rgba_stride,
-      [&processor](const KernelFilmConvert *kfilm_convert,
-                   ccl_global const float *buffer,
-                   float *pixel_rgba) {
-        processor(kfilm_convert, buffer, pixel_rgba);
-        pixel_rgba[3] = 1.0f;
-      });
-}
-
-/* Common implementation for half4 destination and single channel input pass. */
-template<typename Processor>
-ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value(
-    const KernelFilmConvert *kfilm_convert,
-    uchar4 *rgba,
-    float *render_buffer,
-    int num_pixels,
-    int width,
-    int offset,
-    int stride,
-    int rgba_offset,
-    int rgba_stride,
-    const Processor &processor)
-{
-  kernel_gpu_film_convert_half_rgba_common_rgba(
-      kfilm_convert,
-      rgba,
-      render_buffer,
-      num_pixels,
-      width,
-      offset,
-      stride,
-      rgba_offset,
-      rgba_stride,
-      [&processor](const KernelFilmConvert *kfilm_convert,
-                   ccl_global const float *buffer,
-                   float *pixel_rgba) {
-        float value;
-        processor(kfilm_convert, buffer, &value);
-
-        pixel_rgba[0] = value;
-        pixel_rgba[1] = value;
-        pixel_rgba[2] = value;
-        pixel_rgba[3] = 1.0f;
-      });
-}
-
-#define KERNEL_FILM_CONVERT_PROC(name) \
-  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name
-
-#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \
-  KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \
-  (const KernelFilmConvert kfilm_convert, \
-   float *pixels, \
-   float *render_buffer, \
-   int num_pixels, \
-   int width, \
-   int offset, \
-   int stride, \
-   int rgba_offset, \
-   int rgba_stride) \
+#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
+  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+      ccl_gpu_kernel_signature(film_convert_##variant, \
+                               const KernelFilmConvert kfilm_convert, \
+                               ccl_global float *pixels, \
+                               ccl_global float *render_buffer, \
+                               int num_pixels, \
+                               int width, \
+                               int offset, \
+                               int stride, \
+                               int rgba_offset, \
+                               int rgba_stride) \
   { \
-    kernel_gpu_film_convert_common(&kfilm_convert, \
-                                   pixels, \
-                                   render_buffer, \
-                                   num_pixels, \
-                                   width, \
-                                   offset, \
-                                   stride, \
-                                   rgba_offset, \
-                                   rgba_stride, \
-                                   film_get_pass_pixel_##variant); \
+    const int render_pixel_index = ccl_gpu_global_id_x(); \
+    if (render_pixel_index >= num_pixels) { \
+      return; \
+    } \
+\
+    const int x = render_pixel_index % width; \
+    const int y = render_pixel_index / width; \
+\
+    ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+                                     y * stride * kfilm_convert.pass_stride; \
+\
+    /* Float destination. \
+     * `pixels` is a plain float buffer in which every pixel owns `pixel_stride` \
+     * consecutive floats, addressed by the linear pixel index plus `rgba_offset` \
+     * (a destination offset in pixels; `rgba_stride` is unused by this kernel). \
+     * The pass accessor stores its channels straight into that slot, so no overlay \
+     * is applied and nothing is padded to four channels: `input_channel_count` only \
+     * matters for the half4 display kernel. Storing a float4 at a float-granular \
+     * index instead would overlap the slots of neighbouring pixels. \
+     * NOTE(review): assumes the accessor accepts a `ccl_global` destination pointer \
+     * on every backend - confirm for address-space aware compilers. */ \
+    ccl_global float *pixel = pixels + \
+                              (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
+\
+    film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
   } \
-  KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \
-  (const KernelFilmConvert kfilm_convert, \
-   uchar4 *rgba, \
-   float *render_buffer, \
-   int num_pixels, \
-   int width, \
-   int offset, \
-   int stride, \
-   int rgba_offset, \
-   int rgba_stride) \
+\
+  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
+      ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
+                               const KernelFilmConvert kfilm_convert, \
+                               ccl_global uchar4 *rgba, \
+                               ccl_global float *render_buffer, \
+                               int num_pixels, \
+                               int width, \
+                               int offset, \
+                               int stride, \
+                               int rgba_offset, \
+                               int rgba_stride) \
   { \
-    kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \
-                                                        rgba, \
-                                                        render_buffer, \
-                                                        num_pixels, \
-                                                        width, \
-                                                        offset, \
-                                                        stride, \
-                                                        rgba_offset, \
-                                                        rgba_stride, \
-                                                        film_get_pass_pixel_##variant); \
-  }
-
-KERNEL_FILM_CONVERT_DEFINE(depth, value)
-KERNEL_FILM_CONVERT_DEFINE(mist, value)
-KERNEL_FILM_CONVERT_DEFINE(sample_count, value)
-KERNEL_FILM_CONVERT_DEFINE(float, value)
-
-KERNEL_FILM_CONVERT_DEFINE(light_path, rgb)
-KERNEL_FILM_CONVERT_DEFINE(float3, rgb)
-
-KERNEL_FILM_CONVERT_DEFINE(motion, rgba)
-KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba)
-KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba)
-KERNEL_FILM_CONVERT_DEFINE(combined, rgba)
-KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
-
-#undef KERNEL_FILM_CONVERT_DEFINE
-#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE
-#undef KERNEL_FILM_CONVERT_PROC
+    const int render_pixel_index = ccl_gpu_global_id_x(); \
+    if (render_pixel_index >= num_pixels) { \
+      return; \
+    } \
+\
+    const int x = render_pixel_index % width; \
+    const int y = render_pixel_index / width; \
+\
+    ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \
+                                     y * stride * kfilm_convert.pass_stride; \
+\
+    float pixel[4]; \
+    film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
+\
+    film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
+\
+    if (input_channel_count == 1) { \
+      pixel[1] = pixel[2] = pixel[0]; \
+    } \
+    if (input_channel_count <= 3) { \
+      pixel[3] = 1.0f; \
+    } \
+\
+    ccl_global half4 *out = ((ccl_global half4 *)rgba) + (rgba_offset + y * rgba_stride + x); \
+    *out = float4_to_half4_display(make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
+  }
+
+/* 1 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(depth, 1)
+KERNEL_FILM_CONVERT_VARIANT(mist, 1)
+KERNEL_FILM_CONVERT_VARIANT(sample_count, 1)
+KERNEL_FILM_CONVERT_VARIANT(float, 1)
+
+/* 3 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(light_path, 3)
+KERNEL_FILM_CONVERT_VARIANT(float3, 3)
+
+/* 4 channel inputs */
+KERNEL_FILM_CONVERT_VARIANT(motion, 4)
+KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4)
+KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4)
+KERNEL_FILM_CONVERT_VARIANT(combined, 4)
+KERNEL_FILM_CONVERT_VARIANT(float4, 4)
 
 /* --------------------------------------------------------------------
  * Shader evaluation.
@@ -664,42 +624,46 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba)
 /* Displacement */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input,
-                                    float *output,
-                                    const int offset,
-                                    const int work_size)
+    ccl_gpu_kernel_signature(shader_eval_displace,
+                             ccl_global KernelShaderEvalInput *input,
+                             ccl_global float *output,
+                             const int offset,
+                             const int work_size)
 {
   int i = ccl_gpu_global_id_x();
   if (i < work_size) {
-    kernel_displace_evaluate(NULL, input, output, offset + i);
+    ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i));
   }
 }
 
 /* Background */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_shader_eval_background(KernelShaderEvalInput *input,
-                                      float *output,
-                                      const int offset,
-                                      const int work_size)
+    ccl_gpu_kernel_signature(shader_eval_background,
+                             ccl_global KernelShaderEvalInput *input,
+                             ccl_global float *output,
+                             const int offset,
+                             const int work_size)
 {
   int i = ccl_gpu_global_id_x();
   if (i < work_size) {
-    kernel_background_evaluate(NULL, input, output, offset + i);
+    ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i));
   }
 }
 
 /* Curve Shadow Transparency */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_shader_eval_curve_shadow_transparency(KernelShaderEvalInput *input,
-                                                     float *output,
-                                                     const int offset,
-                                                     const int work_size)
+    ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency,
+                             ccl_global KernelShaderEvalInput *input,
+                             ccl_global float *output,
+                             const int offset,
+                             const int work_size)
 {
   int i = ccl_gpu_global_id_x();
   if (i < work_size) {
-    kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i);
+    ccl_gpu_kernel_call(
+        kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i));
   }
 }
 
@@ -708,15 +672,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_filter_color_preprocess(float *render_buffer,
-                                       int full_x,
-                                       int full_y,
-                                       int width,
-                                       int height,
-                                       int offset,
-                                       int stride,
-                                       int pass_stride,
-                                       int pass_denoised)
+    ccl_gpu_kernel_signature(filter_color_preprocess,
+                             ccl_global float *render_buffer,
+                             int full_x,
+                             int full_y,
+                             int width,
+                             int height,
+                             int offset,
+                             int stride,
+                             int pass_stride,
+                             int pass_denoised)
 {
   const int work_index = ccl_gpu_global_id_x();
   const int y = work_index / width;
@@ -727,31 +692,32 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 
   const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
-  float *buffer = render_buffer + render_pixel_index * pass_stride;
+  ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
 
-  float *color_out = buffer + pass_denoised;
+  ccl_global float *color_out = buffer + pass_denoised;
   color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
   color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
   color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_filter_guiding_preprocess(float *guiding_buffer,
-                                         int guiding_pass_stride,
-                                         int guiding_pass_albedo,
-                                         int guiding_pass_normal,
-                                         const float *render_buffer,
-                                         int render_offset,
-                                         int render_stride,
-                                         int render_pass_stride,
-                                         int render_pass_sample_count,
-                                         int render_pass_denoising_albedo,
-                                         int render_pass_denoising_normal,
-                                         int full_x,
-                                         int full_y,
-                                         int width,
-                                         int height,
-                                         int num_samples)
+    ccl_gpu_kernel_signature(filter_guiding_preprocess,
+                             ccl_global float *guiding_buffer,
+                             int guiding_pass_stride,
+                             int guiding_pass_albedo,
+                             int guiding_pass_normal,
+                             ccl_global const float *render_buffer,
+                             int render_offset,
+                             int render_stride,
+                             int render_pass_stride,
+                             int render_pass_sample_count,
+                             int render_pass_denoising_albedo,
+                             int render_pass_denoising_normal,
+                             int full_x,
+                             int full_y,
+                             int width,
+                             int height,
+                             int num_samples)
 {
   const int work_index = ccl_gpu_global_id_x();
   const int y = work_index / width;
@@ -762,10 +728,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 
   const uint64_t guiding_pixel_index = x + y * width;
-  float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+  ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
 
   const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
-  const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+  ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
 
   float pixel_scale;
   if (render_pass_sample_count == PASS_UNUSED) {
@@ -779,8 +745,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   if (guiding_pass_albedo != PASS_UNUSED) {
     kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
 
-    const float *aledo_in = buffer + render_pass_denoising_albedo;
-    float *albedo_out = guiding_pixel + guiding_pass_albedo;
+    ccl_global const float *aledo_in = buffer + render_pass_denoising_albedo;
+    ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
 
     albedo_out[0] = aledo_in[0] * pixel_scale;
     albedo_out[1] = aledo_in[1] * pixel_scale;
@@ -791,8 +757,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   if (guiding_pass_normal != PASS_UNUSED) {
     kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
 
-    const float *normal_in = buffer + render_pass_denoising_normal;
-    float *normal_out = guiding_pixel + guiding_pass_normal;
+    ccl_global const float *normal_in = buffer + render_pass_denoising_normal;
+    ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;
 
     normal_out[0] = normal_in[0] * pixel_scale;
     normal_out[1] = normal_in[1] * pixel_scale;
@@ -801,11 +767,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer,
-                                              int guiding_pass_stride,
-                                              int guiding_pass_albedo,
-                                              int width,
-                                              int height)
+    ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo,
+                             ccl_global float *guiding_buffer,
+                             int guiding_pass_stride,
+                             int guiding_pass_albedo,
+                             int width,
+                             int height)
 {
   kernel_assert(guiding_pass_albedo != PASS_UNUSED);
 
@@ -818,9 +785,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 
   const uint64_t guiding_pixel_index = x + y * width;
-  float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+  ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
 
-  float *albedo_out = guiding_pixel + guiding_pass_albedo;
+  ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
 
   albedo_out[0] = 0.5f;
   albedo_out[1] = 0.5f;
@@ -828,20 +795,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 }
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_filter_color_postprocess(float *render_buffer,
-                                        int full_x,
-                                        int full_y,
-                                        int width,
-                                        int height,
-                                        int offset,
-                                        int stride,
-                                        int pass_stride,
-                                        int num_samples,
-                                        int pass_noisy,
-                                        int pass_denoised,
-                                        int pass_sample_count,
-                                        int num_components,
-                                        bool use_compositing)
+    ccl_gpu_kernel_signature(filter_color_postprocess,
+                             ccl_global float *render_buffer,
+                             int full_x,
+                             int full_y,
+                             int width,
+                             int height,
+                             int offset,
+                             int stride,
+                             int pass_stride,
+                             int num_samples,
+                             int pass_noisy,
+                             int pass_denoised,
+                             int pass_sample_count,
+                             int num_components,
+                             bool use_compositing)
 {
   const int work_index = ccl_gpu_global_id_x();
   const int y = work_index / width;
@@ -852,7 +820,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 
   const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;
-  float *buffer = render_buffer + render_pixel_index * pass_stride;
+  ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride;
 
   float pixel_scale;
   if (pass_sample_count == PASS_UNUSED) {
@@ -862,7 +830,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
     pixel_scale = __float_as_uint(buffer[pass_sample_count]);
   }
 
-  float *denoised_pixel = buffer + pass_denoised;
+  ccl_global float *denoised_pixel = buffer + pass_denoised;
 
   denoised_pixel[0] *= pixel_scale;
   denoised_pixel[1] *= pixel_scale;
@@ -875,7 +843,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
     /* Currently compositing passes are either 3-component (derived by dividing light passes)
      * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it
      * simplifies logic and avoids extra memory allocation. */
-    const float *noisy_pixel = buffer + pass_noisy;
+    ccl_global const float *noisy_pixel = buffer + pass_noisy;
     denoised_pixel[3] = noisy_pixel[3];
   }
   else {
@@ -891,21 +859,22 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  */
 
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
-    kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states,
-                                                               uint *num_possible_splits)
+    ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits,
+                             int num_states,
+                             ccl_global uint *num_possible_splits)
 {
   const int state = ccl_gpu_global_id_x();
 
   bool can_split = false;
 
   if (state < num_states) {
-    can_split = kernel_shadow_catcher_path_can_split(nullptr, state);
+    can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state));
   }
 
   /* NOTE: All threads specified in the mask must execute the intrinsic. */
-  const uint can_split_mask = ccl_gpu_ballot(can_split);
+  const auto can_split_mask = ccl_gpu_ballot(can_split);
   const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
   if (lane_id == 0) {
-    atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask));
+    atomic_fetch_and_add_uint32(num_possible_splits, ccl_gpu_popc(can_split_mask));
   }
 }
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index d7416beb783..f667ede2712 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
 #endif
 
+#ifdef __KERNEL_METAL__
+struct ActiveIndexContext {
+  ActiveIndexContext(int _thread_index,
+                     int _global_index,
+                     int _threadgroup_size,
+                     int _simdgroup_size,
+                     int _simd_lane_index,
+                     int _simd_group_index,
+                     int _num_simd_groups,
+                     threadgroup int *_simdgroup_offset)
+      : thread_index(_thread_index),
+        global_index(_global_index),
+        blocksize(_threadgroup_size),
+        ccl_gpu_warp_size(_simdgroup_size),
+        thread_warp(_simd_lane_index),
+        warp_index(_simd_group_index),
+        num_warps(_num_simd_groups),
+        warp_offset(_simdgroup_offset)
+  {
+  }
+
+  const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
+      num_warps;
+  threadgroup int *warp_offset;
+
+  template<uint blocksizeDummy, typename IsActiveOp>
+  void active_index_array(const uint num_states,
+                          ccl_global int *indices,
+                          ccl_global int *num_indices,
+                          IsActiveOp is_active_op)
+  {
+    const uint state_index = global_index;
+#else
 template<uint blocksize, typename IsActiveOp>
 __device__ void gpu_parallel_active_index_array(const uint num_states,
-                                                int *indices,
-                                                int *num_indices,
+                                                ccl_global int *indices,
+                                                ccl_global int *num_indices,
                                                 IsActiveOp is_active_op)
 {
   extern ccl_gpu_shared int warp_offset[];
@@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
   const uint warp_index = thread_index / ccl_gpu_warp_size;
   const uint num_warps = blocksize / ccl_gpu_warp_size;
 
-  /* Test if state corresponding to this thread is active. */
   const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
-  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#endif
 
-  /* For each thread within a warp compute how many other active states precede it. */
-  const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
-  const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+    /* Test if state corresponding to this thread is active. */
+    const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
 
-  /* Last thread in warp stores number of active states for each warp. */
-  if (thread_warp == ccl_gpu_warp_size - 1) {
-    warp_offset[warp_index] = thread_offset + is_active;
-  }
+    /* For each thread within a warp compute how many other active states precede it. */
+    const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) &
+                                            ccl_gpu_thread_mask(thread_warp));
 
-  ccl_gpu_syncthreads();
-
-  /* Last thread in block converts per-warp sizes to offsets, increments global size of
-   * index array and gets offset to write to. */
-  if (thread_index == blocksize - 1) {
-    /* TODO: parallelize this. */
-    int offset = 0;
-    for (int i = 0; i < num_warps; i++) {
-      int num_active = warp_offset[i];
-      warp_offset[i] = offset;
-      offset += num_active;
+    /* Last thread in warp stores number of active states for each warp. */
+    if (thread_warp == ccl_gpu_warp_size - 1) {
+      warp_offset[warp_index] = thread_offset + is_active;
     }
 
-    const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
-    warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
-  }
+    ccl_gpu_syncthreads();
+
+    /* Last thread in block converts per-warp sizes to offsets, increments global size of
+     * index array and gets offset to write to. */
+    if (thread_index == blocksize - 1) {
+      /* TODO: parallelize this. */
+      int offset = 0;
+      for (int i = 0; i < num_warps; i++) {
+        int num_active = warp_offset[i];
+        warp_offset[i] = offset;
+        offset += num_active;
+      }
+
+      const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+      warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+    }
 
-  ccl_gpu_syncthreads();
+    ccl_gpu_syncthreads();
 
-  /* Write to index array. */
-  if (is_active) {
-    const uint block_offset = warp_offset[num_warps];
-    indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+    /* Write to index array. */
+    if (is_active) {
+      const uint block_offset = warp_offset[num_warps];
+      indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+    }
   }
-}
+
+#ifdef __KERNEL_METAL__
+}; /* end struct ActiveIndexContext */
+
+/* Build a temporary ActiveIndexContext from the Metal thread parameters in scope at the call
+ * site, and redirect the call to its templated active_index_array member function. */
+#  define gpu_parallel_active_index_array \
+    ActiveIndexContext(metal_local_id, \
+                       metal_global_id, \
+                       metal_local_size, \
+                       simdgroup_size, \
+                       simd_lane_index, \
+                       simd_group_index, \
+                       num_simd_groups, \
+                       simdgroup_offset) \
+        .active_index_array
+#endif
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
index 6de3a022569..4bd002c27e4 100644
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -33,10 +33,12 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
 #endif
 
-template<uint blocksize>
-__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
+__device__ void gpu_parallel_prefix_sum(const int global_id,
+                                        ccl_global int *counter,
+                                        ccl_global int *prefix_sum,
+                                        const int num_values)
 {
-  if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+  if (global_id != 0) {
     return;
   }
 
diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
index c06d7be444f..c092e2a21ee 100644
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -33,16 +33,16 @@ CCL_NAMESPACE_BEGIN
 #endif
 #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
 
-template<uint blocksize, typename GetKeyOp>
-__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+template<typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint state_index,
+                                                const uint num_states,
                                                 const int num_states_limit,
-                                                int *indices,
-                                                int *num_indices,
-                                                int *key_counter,
-                                                int *key_prefix_sum,
+                                                ccl_global int *indices,
+                                                ccl_global int *num_indices,
+                                                ccl_global int *key_counter,
+                                                ccl_global int *key_prefix_sum,
                                                 GetKeyOp get_key_op)
 {
-  const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
   const int key = (state_index < num_states) ? get_key_op(state_index) :
                                                GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;
 
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
index 282c3eca641..fb07602539b 100644
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -74,6 +74,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint((1ull << (thread_warp)) - 1) /* Lower lanes. */
 
 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h
index 2fde0d46015..b9dbc2f7fa8 100644
--- a/intern/cycles/kernel/device/hip/config.h
+++ b/intern/cycles/kernel/device/hip/config.h
@@ -36,11 +36,35 @@
 /* Compute number of threads per block and minimum blocks per multiprocessor
  * given the maximum number of registers per thread. */
 
-#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+#define ccl_gpu_kernel_threads(block_num_threads) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads)
+
+#define ccl_gpu_kernel_threads_registers(block_num_threads, thread_num_registers) \
   extern "C" __global__ void __launch_bounds__(block_num_threads, \
                                                GPU_MULTIPRESSOR_MAX_REGISTERS / \
                                                    (block_num_threads * thread_num_registers))
 
+/* Allow ccl_gpu_kernel to accept 1 or 2 parameters by dispatching on the argument count. */
+#define SELECT_MACRO(_1, _2, NAME, ...) NAME
+#define ccl_gpu_kernel(...) \
+  SELECT_MACRO(__VA_ARGS__, ccl_gpu_kernel_threads_registers, ccl_gpu_kernel_threads)(__VA_ARGS__)
+
+#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
+
+#define ccl_gpu_kernel_call(x) x
+
+/* define a function object where "func" is the lambda body, and additional parameters are used to
+ * specify captured state  */
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda { \
+    __VA_ARGS__; \
+    __device__ int operator()(const int state) \
+    { \
+      return (func); \
+    } \
+  } ccl_gpu_kernel_lambda_pass; \
+  ccl_gpu_kernel_lambda_pass
+
 /* sanity checks */
 
 #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h
index 77cea30914c..57c6845e508 100644
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -58,6 +58,95 @@ using namespace metal;
 
 #define kernel_assert(cond)
 
+#define ccl_gpu_global_id_x() metal_global_id
+#define ccl_gpu_warp_size simdgroup_size
+#define ccl_gpu_thread_idx_x metal_local_id /* Thread index within the threadgroup. */
+#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << (thread_warp)) - 1)
+
+#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
+#define ccl_gpu_popc(x) popcount(x)
+
+// clang-format off
+
+/* kernel.h adapters */
+
+#define ccl_gpu_kernel(...)
+
+/* convert a comma-separated list of up to 16 parameters into a semicolon-separated list (so that we can generate a struct based on kernel entrypoint parameters) */
+#define FN0()
+#define FN1(p1) p1;
+#define FN2(p1, p2) p1; p2;
+#define FN3(p1, p2, p3) p1; p2; p3;
+#define FN4(p1, p2, p3, p4) p1; p2; p3; p4;
+#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5;
+#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6;
+#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7;
+#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8;
+#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9;
+#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10;
+#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11;
+#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12;
+#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13;
+#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14;
+#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15;
+#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16;
+#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, ...) p16
+#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)
+
+/* generate a struct containing the entrypoint parameters and a "run" method which can access them implicitly via this->, plus the kernel_metal_<name> entrypoint which builds a MetalKernelContext and forwards the thread parameters to "run"; the kernel body that follows the macro becomes the definition of "run" */
+#define ccl_gpu_kernel_signature(name, ...) \
+struct kernel_gpu_##name \
+{ \
+  PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
+  void run(thread MetalKernelContext& context, \
+           threadgroup int *simdgroup_offset, \
+           const uint metal_global_id, \
+           const ushort metal_local_id, \
+           const ushort metal_local_size, \
+           uint simdgroup_size, \
+           uint simd_lane_index, \
+           uint simd_group_index, \
+           uint num_simd_groups) ccl_global const; \
+}; \
+kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
+                                constant KernelParamsMetal &ccl_restrict   _launch_params_metal, \
+                                constant MetalAncillaries *_metal_ancillaries, \
+                                threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
+                                const uint metal_global_id [[thread_position_in_grid]], \
+                                const ushort metal_local_id   [[thread_position_in_threadgroup]], \
+                                const ushort metal_local_size [[threads_per_threadgroup]], \
+                                uint simdgroup_size [[threads_per_simdgroup]], \
+                                uint simd_lane_index [[thread_index_in_simdgroup]], \
+                                uint simd_group_index [[simdgroup_index_in_threadgroup]], \
+                                uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
+  MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
+  INIT_DEBUG_BUFFER \
+  params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
+} \
+void kernel_gpu_##name::run(thread MetalKernelContext& context, \
+                  threadgroup int *simdgroup_offset, \
+                  const uint metal_global_id, \
+                  const ushort metal_local_id, \
+                  const ushort metal_local_size, \
+                  uint simdgroup_size, \
+                  uint simd_lane_index, \
+                  uint simd_group_index, \
+                  uint num_simd_groups) ccl_global const
+
+#define ccl_gpu_kernel_call(x) context.x
+
+/* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state; the object also keeps a reference to the MetalKernelContext named "context", which ccl_gpu_kernel_call expansions inside "func" resolve to */
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda \
+  { \
+    KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \
+    ccl_private MetalKernelContext &context; \
+    __VA_ARGS__; \
+    int operator()(const int state) const { return (func); } \
+  }ccl_gpu_kernel_lambda_pass(context); ccl_gpu_kernel_lambda_pass
+
+// clang-format on
+
 /* make_type definitions with Metal style element initializers */
 #ifdef make_float2
 #  undef make_float2
@@ -124,3 +213,38 @@ using namespace metal;
 #define logf(x) trigmode::log(float(x))
 
 #define NULL 0
+
+/* texture bindings and sampler setup */
+
+struct Texture2DParamsMetal {
+  texture2d<float, access::sample> tex;
+};
+struct Texture3DParamsMetal {
+  texture3d<float, access::sample> tex;
+};
+
+struct MetalAncillaries {
+  device Texture2DParamsMetal *textures_2d;
+  device Texture3DParamsMetal *textures_3d;
+};
+
+enum SamplerType {
+  SamplerFilterNearest_AddressRepeat,
+  SamplerFilterNearest_AddressClampEdge,
+  SamplerFilterNearest_AddressClampZero,
+
+  SamplerFilterLinear_AddressRepeat,
+  SamplerFilterLinear_AddressClampEdge,
+  SamplerFilterLinear_AddressClampZero,
+
+  SamplerCount
+};
+
+constant constexpr array<sampler, SamplerCount> metal_samplers = {
+    sampler(address::repeat, filter::nearest), /* SamplerFilterNearest_AddressRepeat */
+    sampler(address::clamp_to_edge, filter::nearest), /* SamplerFilterNearest_AddressClampEdge */
+    sampler(address::clamp_to_zero, filter::nearest), /* SamplerFilterNearest_AddressClampZero */
+    sampler(address::repeat, filter::linear), /* SamplerFilterLinear_AddressRepeat */
+    sampler(address::clamp_to_edge, filter::linear), /* SamplerFilterLinear_AddressClampEdge */
+    sampler(address::clamp_to_zero, filter::linear), /* SamplerFilterLinear_AddressClampZero */
+};
+\ No newline at end of file
diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h
new file mode 100644
index 00000000000..3b5fcdd1f7f
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/context_begin.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format off
+
+/* Open the Metal kernel context class
+ * Necessary to access resource bindings */
+class MetalKernelContext {
+  public:
+    constant KernelParamsMetal &launch_params_metal;
+    constant MetalAncillaries *metal_ancillaries;
+
+    MetalKernelContext(constant KernelParamsMetal &_launch_params_metal, constant MetalAncillaries * _metal_ancillaries)
+    : launch_params_metal(_launch_params_metal), metal_ancillaries(_metal_ancillaries)
+    {}
+
+    /* texture fetch adapter functions: the low 32 bits of a ccl_gpu_tex_object select the texture, the high 32 bits select the entry of metal_samplers */
+    typedef uint64_t ccl_gpu_tex_object;
+
+    template<typename T>
+    inline __attribute__((__always_inline__))
+    T ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+      kernel_assert(0); /* fallback for types without a specialization below */
+      return 0;
+    }
+    template<typename T>
+    inline __attribute__((__always_inline__))
+    T ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+      kernel_assert(0); /* fallback for types without a specialization below */
+      return 0;
+    }
+
+    // texture2d: float4 returns the full sample, float returns its x component
+    template<>
+    inline __attribute__((__always_inline__))
+    float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
+    }
+    template<>
+    inline __attribute__((__always_inline__))
+    float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
+    }
+
+    // texture3d: float4 returns the full sample, float returns its x component
+    template<>
+    inline __attribute__((__always_inline__))
+    float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z));
+    }
+    template<>
+    inline __attribute__((__always_inline__))
+    float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x;
+    }
+#    include "kernel/device/gpu/image.h"
+
+  // clang-format on
+\ No newline at end of file
diff --git a/intern/cycles/kernel/device/metal/context_end.h b/intern/cycles/kernel/device/metal/context_end.h
new file mode 100644
index 00000000000..811abdec150
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/context_end.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+}
+; /* end of MetalKernelContext class definition */
+
+/* Silently redirect into the MetalKernelContext instance */
+/* NOTE: These macros will need maintaining as entrypoints change */
+
+#undef kernel_integrator_state
+#define kernel_integrator_state context.launch_params_metal.__integrator_state
diff --git a/intern/cycles/kernel/device/metal/globals.h b/intern/cycles/kernel/device/metal/globals.h
new file mode 100644
index 00000000000..b4963518b63
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/globals.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#include "kernel/types.h"
+#include "kernel/util/profiling.h"
+
+#include "kernel/integrator/state.h"
+
+CCL_NAMESPACE_BEGIN
+
+typedef struct KernelParamsMetal {
+
+#define KERNEL_TEX(type, name) ccl_constant type *name;
+#include "kernel/textures.h"
+#undef KERNEL_TEX
+
+  const IntegratorStateGPU __integrator_state;
+  const KernelData data;
+
+} KernelParamsMetal;
+
+typedef struct KernelGlobalsGPU {
+  int unused[1];
+} KernelGlobalsGPU;
+
+typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
+
+#define kernel_data launch_params_metal.data
+#define kernel_integrator_state launch_params_metal.__integrator_state
+
+/* data lookup defines */
+
+#define kernel_tex_fetch(tex, index) launch_params_metal.tex[index]
+#define kernel_tex_array(tex) launch_params_metal.tex
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal
new file mode 100644
index 00000000000..feca20ff475
--- /dev/null
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Metal kernel entry points */
+
+// clang-format off
+
+#include "kernel/device/metal/compat.h"
+#include "kernel/device/metal/globals.h"
+#include "kernel/device/gpu/kernel.h"
+
+// clang-format on
+\ No newline at end of file
diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h
index 835e4621d47..482b921a1a8 100644
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -76,6 +76,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint((1ull << (thread_warp)) - 1) /* Lower lanes. */
 
 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)