Diffstat (limited to 'intern/cycles/kernel/device')
33 files changed, 2480 insertions(+), 711 deletions(-)
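
The bulk of the CPU-side churn below comes from two mechanical changes: the util_*/kernel_* headers lose their prefixes (util/util_half.h becomes util/half.h, and so on), and the KernelGlobals struct is renamed to KernelGlobalsCPU, with KernelGlobals itself becoming a typedef for a restrict-qualified pointer to it. What follows is a minimal sketch, not from the Cycles sources, of why that typedef lets every kernel signature change from `const KernelGlobals *kg` to plain `KernelGlobals kg` while function bodies keep dereferencing `kg->` unchanged; the struct field and the interp function here are illustrative stand-ins for the real texture/KernelData members.

/* Minimal sketch (not the Cycles sources): KernelGlobals as an opaque
 * pointer typedef. The field and function body are illustrative stand-ins. */

#include <cstdio>

#define ccl_restrict __restrict__

struct KernelGlobalsCPU {
  int num_textures; /* stands in for texture slots, KernelData, OSL state */
};

/* The patch's CPU definition: the handle type itself carries the pointer. */
typedef const KernelGlobalsCPU *ccl_restrict KernelGlobals;

/* Signatures change from `const KernelGlobals *kg` to `KernelGlobals kg`... */
static float kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
{
  /* ...but bodies still use `kg->`, because the typedef is itself a pointer. */
  return (float)(kg->num_textures + id) + x + y;
}

int main()
{
  KernelGlobalsCPU kg_storage = {8};
  KernelGlobals kg = &kg_storage;
  printf("%.2f\n", kernel_tex_image_interp(kg, 1, 0.5f, 0.25f));
  return 0;
}

On the GPU side the same name binds to `ccl_global const KernelGlobalsGPU *ccl_restrict` (see cuda/globals.h below), so kernel code shared between backends compiles against either definition without per-device #ifdefs in the signatures.
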
diff --git a/intern/cycles/kernel/device/cpu/compat.h b/intern/cycles/kernel/device/cpu/compat.h index bfd936c7bbd..5ccca52255f 100644 --- a/intern/cycles/kernel/device/cpu/compat.h +++ b/intern/cycles/kernel/device/cpu/compat.h @@ -26,13 +26,11 @@ # pragma GCC diagnostic ignored "-Wuninitialized" #endif -#include "util/util_half.h" -#include "util/util_math.h" -#include "util/util_simd.h" -#include "util/util_texture.h" -#include "util/util_types.h" - -#define ccl_addr_space +#include "util/half.h" +#include "util/math.h" +#include "util/simd.h" +#include "util/texture.h" +#include "util/types.h" /* On x86_64, versions of glibc < 2.16 have an issue where expf is * much slower than the double version. This was fixed in glibc 2.16. diff --git a/intern/cycles/kernel/device/cpu/globals.h b/intern/cycles/kernel/device/cpu/globals.h index 98b036e269d..746e48b9880 100644 --- a/intern/cycles/kernel/device/cpu/globals.h +++ b/intern/cycles/kernel/device/cpu/globals.h @@ -18,8 +18,9 @@ #pragma once -#include "kernel/kernel_profiling.h" -#include "kernel/kernel_types.h" +#include "kernel/tables.h" +#include "kernel/types.h" +#include "kernel/util/profiling.h" CCL_NAMESPACE_BEGIN @@ -34,9 +35,9 @@ struct OSLThreadData; struct OSLShadingSystem; #endif -typedef struct KernelGlobals { +typedef struct KernelGlobalsCPU { #define KERNEL_TEX(type, name) texture<type> name; -#include "kernel/kernel_textures.h" +#include "kernel/textures.h" KernelData __data; @@ -51,7 +52,9 @@ typedef struct KernelGlobals { /* **** Run-time data **** */ ProfilingState profiler; -} KernelGlobals; +} KernelGlobalsCPU; + +typedef const KernelGlobalsCPU *ccl_restrict KernelGlobals; /* Abstraction macros */ #define kernel_tex_fetch(tex, index) (kg->tex.fetch(index)) diff --git a/intern/cycles/kernel/device/cpu/image.h b/intern/cycles/kernel/device/cpu/image.h index 57e81ab186d..93f956e354d 100644 --- a/intern/cycles/kernel/device/cpu/image.h +++ b/intern/cycles/kernel/device/cpu/image.h @@ -72,12 +72,12 @@ template<typename T> struct TextureInterpolator { static ccl_always_inline float4 read(half4 r) { - return half4_to_float4(r); + return half4_to_float4_image(r); } static ccl_always_inline float4 read(half r) { - float f = half_to_float(r); + float f = half_to_float_image(r); return make_float4(f, f, f, 1.0f); } @@ -583,7 +583,7 @@ template<typename T> struct NanoVDBInterpolator { #undef SET_CUBIC_SPLINE_WEIGHTS -ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y) +ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y) { const TextureInfo &info = kernel_tex_fetch(__texture_info, id); @@ -611,7 +611,7 @@ ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float } } -ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg, +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, int id, float3 P, InterpolationType interp) diff --git a/intern/cycles/kernel/device/cpu/kernel.cpp b/intern/cycles/kernel/device/cpu/kernel.cpp index ac1cdf5fffe..a16c637d5ac 100644 --- a/intern/cycles/kernel/device/cpu/kernel.cpp +++ b/intern/cycles/kernel/device/cpu/kernel.cpp @@ -64,7 +64,7 @@ CCL_NAMESPACE_BEGIN /* Memory Copy */ -void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t) +void kernel_const_copy(KernelGlobalsCPU *kg, const char *name, void *host, size_t) { if (strcmp(name, "__data") == 0) { kg->__data = *(KernelData *)host; @@ -74,7 +74,7 @@ void kernel_const_copy(KernelGlobals *kg, const char *name, 
void *host, size_t) } } -void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size) +void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem, size_t size) { if (0) { } @@ -85,7 +85,7 @@ void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, s kg->tname.data = (type *)mem; \ kg->tname.width = size; \ } -#include "kernel/kernel_textures.h" +#include "kernel/textures.h" else { assert(0); } diff --git a/intern/cycles/kernel/device/cpu/kernel.h b/intern/cycles/kernel/device/cpu/kernel.h index ae2a841835a..6af8094b1ea 100644 --- a/intern/cycles/kernel/device/cpu/kernel.h +++ b/intern/cycles/kernel/device/cpu/kernel.h @@ -18,9 +18,10 @@ /* CPU Kernel Interface */ -#include "util/util_types.h" +#include "util/half.h" +#include "util/types.h" -#include "kernel/kernel_types.h" +#include "kernel/types.h" CCL_NAMESPACE_BEGIN @@ -29,17 +30,17 @@ CCL_NAMESPACE_BEGIN #define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name) struct IntegratorStateCPU; -struct KernelGlobals; +struct KernelGlobalsCPU; struct KernelData; -KernelGlobals *kernel_globals_create(); -void kernel_globals_free(KernelGlobals *kg); +KernelGlobalsCPU *kernel_globals_create(); +void kernel_globals_free(KernelGlobalsCPU *kg); -void *kernel_osl_memory(const KernelGlobals *kg); -bool kernel_osl_use(const KernelGlobals *kg); +void *kernel_osl_memory(const KernelGlobalsCPU *kg); +bool kernel_osl_use(const KernelGlobalsCPU *kg); -void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size); -void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size); +void kernel_const_copy(KernelGlobalsCPU *kg, const char *name, void *host, size_t size); +void kernel_global_memory_copy(KernelGlobalsCPU *kg, const char *name, void *mem, size_t size); #define KERNEL_ARCH cpu #include "kernel/device/cpu/kernel_arch.h" diff --git a/intern/cycles/kernel/device/cpu/kernel_arch.h b/intern/cycles/kernel/device/cpu/kernel_arch.h index 81f328c710b..61f62f3136b 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch.h @@ -21,23 +21,23 @@ */ #define KERNEL_INTEGRATOR_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \ IntegratorStateCPU *state) #define KERNEL_INTEGRATOR_SHADE_FUNCTION(name) \ - void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \ IntegratorStateCPU *state, \ ccl_global float *render_buffer) #define KERNEL_INTEGRATOR_INIT_FUNCTION(name) \ - bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *ccl_restrict kg, \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *ccl_restrict kg, \ IntegratorStateCPU *state, \ KernelWorkTile *tile, \ ccl_global float *render_buffer) KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_camera); KERNEL_INTEGRATOR_INIT_FUNCTION(init_from_bake); -KERNEL_INTEGRATOR_FUNCTION(intersect_closest); +KERNEL_INTEGRATOR_SHADE_FUNCTION(intersect_closest); KERNEL_INTEGRATOR_FUNCTION(intersect_shadow); KERNEL_INTEGRATOR_FUNCTION(intersect_subsurface); KERNEL_INTEGRATOR_FUNCTION(intersect_volume_stack); @@ -52,25 +52,61 @@ KERNEL_INTEGRATOR_SHADE_FUNCTION(megakernel); #undef KERNEL_INTEGRATOR_INIT_FUNCTION #undef 
KERNEL_INTEGRATOR_SHADE_FUNCTION +#define KERNEL_FILM_CONVERT_FUNCTION(name) \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + float *pixel, \ + const int width, \ + const int buffer_stride, \ + const int pixel_stride); \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \ + const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + half4 *pixel, \ + const int width, \ + const int buffer_stride); + +KERNEL_FILM_CONVERT_FUNCTION(depth) +KERNEL_FILM_CONVERT_FUNCTION(mist) +KERNEL_FILM_CONVERT_FUNCTION(sample_count) +KERNEL_FILM_CONVERT_FUNCTION(float) + +KERNEL_FILM_CONVERT_FUNCTION(light_path) +KERNEL_FILM_CONVERT_FUNCTION(float3) + +KERNEL_FILM_CONVERT_FUNCTION(motion) +KERNEL_FILM_CONVERT_FUNCTION(cryptomatte) +KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher) +KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow) +KERNEL_FILM_CONVERT_FUNCTION(combined) +KERNEL_FILM_CONVERT_FUNCTION(float4) + +#undef KERNEL_FILM_CONVERT_FUNCTION + /* -------------------------------------------------------------------- * Shader evaluation. */ -void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset); -void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset); +void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)( + const KernelGlobalsCPU *kg, + const KernelShaderEvalInput *input, + float *output, + const int offset); /* -------------------------------------------------------------------- * Adaptive sampling. */ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( - const KernelGlobals *kg, + const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int y, @@ -79,14 +115,14 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( int offset, int stride); -void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int y, int start_x, int width, int offset, int stride); -void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int start_y, @@ -98,16 +134,8 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals * * Cryptomatte. */ -void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int pixel_index); -/* -------------------------------------------------------------------- - * Bake. - */ -/* TODO(sergey): Needs to be re-implemented. Or not? 
Brecht did it already :) */ - -void KERNEL_FUNCTION_FULL_NAME(bake)( - const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride); - #undef KERNEL_ARCH diff --git a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h index 1432abfd330..747c47c34c9 100644 --- a/intern/cycles/kernel/device/cpu/kernel_arch_impl.h +++ b/intern/cycles/kernel/device/cpu/kernel_arch_impl.h @@ -29,27 +29,28 @@ # include "kernel/device/cpu/globals.h" # include "kernel/device/cpu/image.h" -# include "kernel/integrator/integrator_state.h" -# include "kernel/integrator/integrator_state_flow.h" -# include "kernel/integrator/integrator_state_util.h" - -# include "kernel/integrator/integrator_init_from_camera.h" -# include "kernel/integrator/integrator_init_from_bake.h" -# include "kernel/integrator/integrator_intersect_closest.h" -# include "kernel/integrator/integrator_intersect_shadow.h" -# include "kernel/integrator/integrator_intersect_subsurface.h" -# include "kernel/integrator/integrator_intersect_volume_stack.h" -# include "kernel/integrator/integrator_shade_background.h" -# include "kernel/integrator/integrator_shade_light.h" -# include "kernel/integrator/integrator_shade_shadow.h" -# include "kernel/integrator/integrator_shade_surface.h" -# include "kernel/integrator/integrator_shade_volume.h" -# include "kernel/integrator/integrator_megakernel.h" - -# include "kernel/kernel_film.h" -# include "kernel/kernel_adaptive_sampling.h" -# include "kernel/kernel_bake.h" -# include "kernel/kernel_id_passes.h" +# include "kernel/integrator/state.h" +# include "kernel/integrator/state_flow.h" +# include "kernel/integrator/state_util.h" + +# include "kernel/integrator/init_from_camera.h" +# include "kernel/integrator/init_from_bake.h" +# include "kernel/integrator/intersect_closest.h" +# include "kernel/integrator/intersect_shadow.h" +# include "kernel/integrator/intersect_subsurface.h" +# include "kernel/integrator/intersect_volume_stack.h" +# include "kernel/integrator/shade_background.h" +# include "kernel/integrator/shade_light.h" +# include "kernel/integrator/shade_shadow.h" +# include "kernel/integrator/shade_surface.h" +# include "kernel/integrator/shade_volume.h" +# include "kernel/integrator/megakernel.h" + +# include "kernel/film/adaptive_sampling.h" +# include "kernel/film/id_passes.h" +# include "kernel/film/read.h" + +# include "kernel/bake/bake.h" #else # define STUB_ASSERT(arch, name) \ @@ -69,8 +70,20 @@ CCL_NAMESPACE_BEGIN # define KERNEL_INVOKE(name, ...) integrator_##name(__VA_ARGS__) #endif +/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so + * that it does not contain unused fields. 
*/ +#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \ + bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ + IntegratorStateCPU *state, \ + KernelWorkTile *tile, \ + ccl_global float *render_buffer) \ + { \ + return KERNEL_INVOKE( \ + name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \ + } + #define DEFINE_INTEGRATOR_KERNEL(name) \ - void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ IntegratorStateCPU *state) \ { \ KERNEL_INVOKE(name, kg, state); \ @@ -78,43 +91,45 @@ CCL_NAMESPACE_BEGIN #define DEFINE_INTEGRATOR_SHADE_KERNEL(name) \ void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \ - const KernelGlobals *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ + const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ { \ KERNEL_INVOKE(name, kg, state, render_buffer); \ } -/* TODO: Either use something like get_work_pixel(), or simplify tile which is passed here, so - * that it does not contain unused fields. */ -#define DEFINE_INTEGRATOR_INIT_KERNEL(name) \ - bool KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobals *kg, \ - IntegratorStateCPU *state, \ - KernelWorkTile *tile, \ - ccl_global float *render_buffer) \ +#define DEFINE_INTEGRATOR_SHADOW_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)(const KernelGlobalsCPU *kg, \ + IntegratorStateCPU *state) \ { \ - return KERNEL_INVOKE( \ - name, kg, state, tile, render_buffer, tile->x, tile->y, tile->start_sample); \ + KERNEL_INVOKE(name, kg, &state->shadow); \ + } + +#define DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(name) \ + void KERNEL_FUNCTION_FULL_NAME(integrator_##name)( \ + const KernelGlobalsCPU *kg, IntegratorStateCPU *state, ccl_global float *render_buffer) \ + { \ + KERNEL_INVOKE(name, kg, &state->shadow, render_buffer); \ } DEFINE_INTEGRATOR_INIT_KERNEL(init_from_camera) DEFINE_INTEGRATOR_INIT_KERNEL(init_from_bake) -DEFINE_INTEGRATOR_KERNEL(intersect_closest) -DEFINE_INTEGRATOR_KERNEL(intersect_shadow) +DEFINE_INTEGRATOR_SHADE_KERNEL(intersect_closest) DEFINE_INTEGRATOR_KERNEL(intersect_subsurface) DEFINE_INTEGRATOR_KERNEL(intersect_volume_stack) DEFINE_INTEGRATOR_SHADE_KERNEL(shade_background) DEFINE_INTEGRATOR_SHADE_KERNEL(shade_light) -DEFINE_INTEGRATOR_SHADE_KERNEL(shade_shadow) DEFINE_INTEGRATOR_SHADE_KERNEL(shade_surface) DEFINE_INTEGRATOR_SHADE_KERNEL(shade_volume) DEFINE_INTEGRATOR_SHADE_KERNEL(megakernel) +DEFINE_INTEGRATOR_SHADOW_KERNEL(intersect_shadow) +DEFINE_INTEGRATOR_SHADOW_SHADE_KERNEL(shade_shadow) /* -------------------------------------------------------------------- * Shader evaluation. 
*/ -void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset) { #ifdef KERNEL_STUB @@ -124,9 +139,9 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_displace)(const KernelGlobals *kg, #endif } -void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobalsCPU *kg, const KernelShaderEvalInput *input, - float4 *output, + float *output, const int offset) { #ifdef KERNEL_STUB @@ -136,12 +151,25 @@ void KERNEL_FUNCTION_FULL_NAME(shader_eval_background)(const KernelGlobals *kg, #endif } +void KERNEL_FUNCTION_FULL_NAME(shader_eval_curve_shadow_transparency)( + const KernelGlobalsCPU *kg, + const KernelShaderEvalInput *input, + float *output, + const int offset) +{ +#ifdef KERNEL_STUB + STUB_ASSERT(KERNEL_ARCH, shader_eval_curve_shadow_transparency); +#else + kernel_curve_shadow_transparency_evaluate(kg, input, output, offset); +#endif +} + /* -------------------------------------------------------------------- * Adaptive sampling. */ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( - const KernelGlobals *kg, + const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int y, @@ -159,7 +187,7 @@ bool KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_convergence_check)( #endif } -void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int y, int start_x, @@ -174,7 +202,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_x)(const KernelGlobals * #endif } -void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int x, int start_y, @@ -193,7 +221,7 @@ void KERNEL_FUNCTION_FULL_NAME(adaptive_sampling_filter_y)(const KernelGlobals * * Cryptomatte. */ -void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, +void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobalsCPU *kg, ccl_global float *render_buffer, int pixel_index) { @@ -205,23 +233,83 @@ void KERNEL_FUNCTION_FULL_NAME(cryptomatte_postprocess)(const KernelGlobals *kg, } /* -------------------------------------------------------------------- - * Bake. + * Film Convert. */ -/* TODO(sergey): Needs to be re-implemented. Or not? 
Brecht did it already :) */ -void KERNEL_FUNCTION_FULL_NAME(bake)( - const KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride) -{ -#if 0 -# ifdef KERNEL_STUB - STUB_ASSERT(KERNEL_ARCH, bake); -# else -# ifdef __BAKING__ - kernel_bake_evaluate(kg, buffer, sample, x, y, offset, stride); -# endif -# endif /* KERNEL_STUB */ +#ifdef KERNEL_STUB + +# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + float *pixel, \ + const int width, \ + const int buffer_stride, \ + const int pixel_stride) \ + { \ + STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \ + } \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \ + const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + half4 *pixel, \ + const int width, \ + const int buffer_stride) \ + { \ + STUB_ASSERT(KERNEL_ARCH, film_convert_##name); \ + } + +#else + +# define KERNEL_FILM_CONVERT_FUNCTION(name, is_float) \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_##name)(const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + float *pixel, \ + const int width, \ + const int buffer_stride, \ + const int pixel_stride) \ + { \ + for (int i = 0; i < width; i++, buffer += buffer_stride, pixel += pixel_stride) { \ + film_get_pass_pixel_##name(kfilm_convert, buffer, pixel); \ + } \ + } \ + void KERNEL_FUNCTION_FULL_NAME(film_convert_half_rgba_##name)( \ + const KernelFilmConvert *kfilm_convert, \ + const float *buffer, \ + half4 *pixel, \ + const int width, \ + const int buffer_stride) \ + { \ + for (int i = 0; i < width; i++, buffer += buffer_stride, pixel++) { \ + float pixel_rgba[4] = {0.0f, 0.0f, 0.0f, 1.0f}; \ + film_get_pass_pixel_##name(kfilm_convert, buffer, pixel_rgba); \ + if (is_float) { \ + pixel_rgba[1] = pixel_rgba[0]; \ + pixel_rgba[2] = pixel_rgba[0]; \ + } \ + film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel_rgba); \ + *pixel = float4_to_half4_display( \ + make_float4(pixel_rgba[0], pixel_rgba[1], pixel_rgba[2], pixel_rgba[3])); \ + } \ + } + #endif -} + +KERNEL_FILM_CONVERT_FUNCTION(depth, true) +KERNEL_FILM_CONVERT_FUNCTION(mist, true) +KERNEL_FILM_CONVERT_FUNCTION(sample_count, true) +KERNEL_FILM_CONVERT_FUNCTION(float, true) + +KERNEL_FILM_CONVERT_FUNCTION(light_path, false) +KERNEL_FILM_CONVERT_FUNCTION(float3, false) + +KERNEL_FILM_CONVERT_FUNCTION(motion, false) +KERNEL_FILM_CONVERT_FUNCTION(cryptomatte, false) +KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher, false) +KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow, false) +KERNEL_FILM_CONVERT_FUNCTION(combined, false) +KERNEL_FILM_CONVERT_FUNCTION(float4, false) + +#undef KERNEL_FILM_CONVERT_FUNCTION #undef KERNEL_INVOKE #undef DEFINE_INTEGRATOR_KERNEL diff --git a/intern/cycles/kernel/device/cpu/kernel_avx.cpp b/intern/cycles/kernel/device/cpu/kernel_avx.cpp index 220768036ab..cece750a255 100644 --- a/intern/cycles/kernel/device/cpu/kernel_avx.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_avx.cpp @@ -18,7 +18,7 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -#include "util/util_optimization.h" +#include "util/optimization.h" #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX # define KERNEL_STUB diff --git a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp index 90c05113cbe..fad4581236e 100644 --- a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp @@ -18,7 +18,7 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -#include "util/util_optimization.h" +#include "util/optimization.h" #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 # define KERNEL_STUB diff --git a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp index fb85ef5b0d0..5fb4849ac08 100644 --- a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp @@ -18,7 +18,7 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -#include "util/util_optimization.h" +#include "util/optimization.h" #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 # define KERNEL_STUB diff --git a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp index 87baf04258a..c9424682fd4 100644 --- a/intern/cycles/kernel/device/cpu/kernel_sse3.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_sse3.cpp @@ -18,7 +18,7 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. */ -#include "util/util_optimization.h" +#include "util/optimization.h" #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 # define KERNEL_STUB diff --git a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp index bb421d58815..849ebf51989 100644 --- a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp +++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp @@ -18,7 +18,7 @@ * optimization flags and nearly all functions inlined, while kernel.cpp * is compiled without for other CPU's. 
*/ -#include "util/util_optimization.h" +#include "util/optimization.h" #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 # define KERNEL_STUB diff --git a/intern/cycles/kernel/device/cuda/compat.h b/intern/cycles/kernel/device/cuda/compat.h index 3c85a8e7bd2..658dec102b1 100644 --- a/intern/cycles/kernel/device/cuda/compat.h +++ b/intern/cycles/kernel/device/cuda/compat.h @@ -52,14 +52,14 @@ typedef unsigned long long uint64_t; #endif #define ccl_device_noinline __device__ __noinline__ #define ccl_device_noinline_cpu ccl_device +#define ccl_device_inline_method ccl_device #define ccl_global -#define ccl_static_constant __constant__ +#define ccl_inline_constant __constant__ #define ccl_device_constant __constant__ __device__ #define ccl_constant const #define ccl_gpu_shared __shared__ #define ccl_private #define ccl_may_alias -#define ccl_addr_space #define ccl_restrict __restrict__ #define ccl_loop_no_unroll #define ccl_align(n) __align__(n) @@ -76,6 +76,7 @@ typedef unsigned long long uint64_t; #define ccl_gpu_block_idx_x (blockIdx.x) #define ccl_gpu_grid_dim_x (gridDim.x) #define ccl_gpu_warp_size (warpSize) +#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) @@ -85,7 +86,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla) -#define ccl_gpu_popc(x) __popc(x) /* GPU texture objects */ @@ -129,7 +129,14 @@ __device__ half __float2half(const float f) return val; } +__device__ float __half2float(const half h) +{ + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h)); + return val; +} + /* Types */ -#include "util/util_half.h" -#include "util/util_types.h" +#include "util/half.h" +#include "util/types.h" diff --git a/intern/cycles/kernel/device/cuda/config.h b/intern/cycles/kernel/device/cuda/config.h index 46196dcdb51..003881d7912 100644 --- a/intern/cycles/kernel/device/cuda/config.h +++ b/intern/cycles/kernel/device/cuda/config.h @@ -92,12 +92,29 @@ /* Compute number of threads per block and minimum blocks per multiprocessor * given the maximum number of registers per thread. */ - #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ extern "C" __global__ void __launch_bounds__(block_num_threads, \ GPU_MULTIPRESSOR_MAX_REGISTERS / \ (block_num_threads * thread_num_registers)) +#define ccl_gpu_kernel_threads(block_num_threads) \ + extern "C" __global__ void __launch_bounds__(block_num_threads) + +#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__) + +#define ccl_gpu_kernel_call(x) x + +/* Define a function object where "func" is the lambda body, and additional parameters are used to + * specify captured state */ +#define ccl_gpu_kernel_lambda(func, ...) 
\ + struct KernelLambda { \ + __VA_ARGS__; \ + __device__ int operator()(const int state) \ + { \ + return (func); \ + } \ + } ccl_gpu_kernel_lambda_pass + /* sanity checks */ #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS diff --git a/intern/cycles/kernel/device/cuda/globals.h b/intern/cycles/kernel/device/cuda/globals.h index 169047175f5..e5023fad40c 100644 --- a/intern/cycles/kernel/device/cuda/globals.h +++ b/intern/cycles/kernel/device/cuda/globals.h @@ -18,23 +18,25 @@ #pragma once -#include "kernel/kernel_profiling.h" -#include "kernel/kernel_types.h" +#include "kernel/types.h" -#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/state.h" + +#include "kernel/util/profiling.h" CCL_NAMESPACE_BEGIN /* Not actually used, just a NULL pointer that gets passed everywhere, which we * hope gets optimized out by the compiler. */ -struct KernelGlobals { +struct KernelGlobalsGPU { int unused[1]; }; +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; /* Global scene data and textures */ __constant__ KernelData __data; #define KERNEL_TEX(type, name) const __constant__ __device__ type *name; -#include "kernel/kernel_textures.h" +#include "kernel/textures.h" /* Integrator state */ __constant__ IntegratorStateGPU __integrator_state; diff --git a/intern/cycles/kernel/device/gpu/image.h b/intern/cycles/kernel/device/gpu/image.h index b015c78a8f5..0900a45c83d 100644 --- a/intern/cycles/kernel/device/gpu/image.h +++ b/intern/cycles/kernel/device/gpu/image.h @@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a) /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */ template<typename T> -ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y) +ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info, + float x, + float y) { ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; @@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f /* Fast tricubic texture lookup using 8 trilinear lookups. 
*/ template<typename T> ccl_device_noinline T -kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z) +kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z) { ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data; @@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl template<typename T> ccl_device_noinline T kernel_tex_image_interp_nanovdb( - const TextureInfo &info, float x, float y, float z, uint interpolation) + ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation) { using namespace nanovdb; @@ -189,9 +191,9 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb( } #endif -ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float x, float y) +ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y) { - const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id); /* float4, byte4, ushort4 and half4 */ const int texture_type = info.data_type; @@ -221,12 +223,12 @@ ccl_device float4 kernel_tex_image_interp(const KernelGlobals *kg, int id, float } } -ccl_device float4 kernel_tex_image_interp_3d(const KernelGlobals *kg, +ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg, int id, float3 P, InterpolationType interp) { - const TextureInfo &info = kernel_tex_fetch(__texture_info, id); + ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id); if (info.use_transform_3d) { P = transform_point(&info.transform_3d, P); diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h index 7b79c0aedfa..eed005803e2 100644 --- a/intern/cycles/kernel/device/gpu/kernel.h +++ b/intern/cycles/kernel/device/gpu/kernel.h @@ -20,47 +20,64 @@ #include "kernel/device/gpu/parallel_prefix_sum.h" #include "kernel/device/gpu/parallel_sorted_index.h" -#include "kernel/integrator/integrator_state.h" -#include "kernel/integrator/integrator_state_flow.h" -#include "kernel/integrator/integrator_state_util.h" - -#include "kernel/integrator/integrator_init_from_bake.h" -#include "kernel/integrator/integrator_init_from_camera.h" -#include "kernel/integrator/integrator_intersect_closest.h" -#include "kernel/integrator/integrator_intersect_shadow.h" -#include "kernel/integrator/integrator_intersect_subsurface.h" -#include "kernel/integrator/integrator_intersect_volume_stack.h" -#include "kernel/integrator/integrator_shade_background.h" -#include "kernel/integrator/integrator_shade_light.h" -#include "kernel/integrator/integrator_shade_shadow.h" -#include "kernel/integrator/integrator_shade_surface.h" -#include "kernel/integrator/integrator_shade_volume.h" - -#include "kernel/kernel_adaptive_sampling.h" -#include "kernel/kernel_bake.h" -#include "kernel/kernel_film.h" -#include "kernel/kernel_work_stealing.h" +#include "kernel/sample/lcg.h" + +/* Include constant tables before entering Metal's context class scope (context_begin.h) */ +#include "kernel/tables.h" + +#ifdef __KERNEL_METAL__ +# include "kernel/device/metal/context_begin.h" +#endif + +#include "kernel/device/gpu/work_stealing.h" + +#include "kernel/integrator/state.h" +#include "kernel/integrator/state_flow.h" +#include "kernel/integrator/state_util.h" + +#include "kernel/integrator/init_from_bake.h" +#include "kernel/integrator/init_from_camera.h" +#include "kernel/integrator/intersect_closest.h" +#include 
"kernel/integrator/intersect_shadow.h" +#include "kernel/integrator/intersect_subsurface.h" +#include "kernel/integrator/intersect_volume_stack.h" +#include "kernel/integrator/shade_background.h" +#include "kernel/integrator/shade_light.h" +#include "kernel/integrator/shade_shadow.h" +#include "kernel/integrator/shade_surface.h" +#include "kernel/integrator/shade_volume.h" + +#include "kernel/bake/bake.h" + +#include "kernel/film/adaptive_sampling.h" + +#ifdef __KERNEL_METAL__ +# include "kernel/device/metal/context_end.h" +#endif + +#include "kernel/film/read.h" /* -------------------------------------------------------------------- * Integrator. */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_reset(int num_states) + ccl_gpu_kernel_signature(integrator_reset, int num_states) { const int state = ccl_gpu_global_id_x(); if (state < num_states) { - INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0; - INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; + INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_init_from_camera(KernelWorkTile *tiles, - const int num_tiles, - float *render_buffer, - const int max_tile_work_size) + ccl_gpu_kernel_signature(integrator_init_from_camera, + ccl_global KernelWorkTile *tiles, + const int num_tiles, + ccl_global float *render_buffer, + const int max_tile_work_size) { const int work_index = ccl_gpu_global_id_x(); @@ -71,7 +88,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int tile_index = work_index / max_tile_work_size; const int tile_work_index = work_index - tile_index * max_tile_work_size; - const KernelWorkTile *tile = &tiles[tile_index]; + ccl_global const KernelWorkTile *tile = &tiles[tile_index]; if (tile_work_index >= tile->work_size) { return; @@ -80,16 +97,18 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int state = tile->path_index_offset + tile_work_index; uint x, y, sample; - get_work_pixel(tile, tile_work_index, &x, &y, &sample); + ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample)); - integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample); + ccl_gpu_kernel_call( + integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample)); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_init_from_bake(KernelWorkTile *tiles, - const int num_tiles, - float *render_buffer, - const int max_tile_work_size) + ccl_gpu_kernel_signature(integrator_init_from_bake, + ccl_global KernelWorkTile *tiles, + const int num_tiles, + ccl_global float *render_buffer, + const int max_tile_work_size) { const int work_index = ccl_gpu_global_id_x(); @@ -100,7 +119,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int tile_index = work_index / max_tile_work_size; const int tile_work_index = work_index - tile_index * max_tile_work_size; - const KernelWorkTile *tile = &tiles[tile_index]; + ccl_global const KernelWorkTile *tile = &tiles[tile_index]; if (tile_work_index >= tile->work_size) { return; @@ -109,211 +128,312 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) const int state = tile->path_index_offset + tile_work_index; uint x, y, sample; - get_work_pixel(tile, tile_work_index, &x, &y, &sample); + 
ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample)); - integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample); + ccl_gpu_kernel_call( + integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample)); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_closest(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_closest, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_closest(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_closest(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_shadow(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_shadow, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_shadow(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_shadow(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_subsurface(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_subsurface, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_subsurface(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_subsurface(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_intersect_volume_stack(const int *path_index_array, const int work_size) + ccl_gpu_kernel_signature(integrator_intersect_volume_stack, + ccl_global const int *path_index_array, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_intersect_volume_stack(NULL, state); + ccl_gpu_kernel_call(integrator_intersect_volume_stack(NULL, state)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_background(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_background, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? 
path_index_array[global_index] : global_index; - integrator_shade_background(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_background(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_light(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_light, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_light(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_light(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_shadow(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_shadow, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_shadow(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_shadow(NULL, state, render_buffer)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_surface(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_surface, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_surface(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_surface(NULL, state, render_buffer)); } } +#ifdef __KERNEL_METAL__ +constant int __dummy_constant [[function_constant(0)]]; +#endif + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_surface_raytrace(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_surface_raytrace, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_surface_raytrace(NULL, state, render_buffer); + +#ifdef __KERNEL_METAL__ + KernelGlobals kg = NULL; + /* Workaround Ambient Occlusion and Bevel nodes not working with Metal. + * Dummy offset should not affect result, but somehow fixes bug! 
*/ + kg += __dummy_constant; + ccl_gpu_kernel_call(integrator_shade_surface_raytrace(kg, state, render_buffer)); +#else + ccl_gpu_kernel_call(integrator_shade_surface_raytrace(NULL, state, render_buffer)); +#endif } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shade_volume(const int *path_index_array, - float *render_buffer, - const int work_size) + ccl_gpu_kernel_signature(integrator_shade_volume, + ccl_global const int *path_index_array, + ccl_global float *render_buffer, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); if (global_index < work_size) { const int state = (path_index_array) ? path_index_array[global_index] : global_index; - integrator_shade_volume(NULL, state, render_buffer); + ccl_gpu_kernel_call(integrator_shade_volume(NULL, state, render_buffer)); } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_queued_paths_array(int num_states, - int *indices, - int *num_indices, - int kernel) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_queued_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int kernel_index) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index, + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) == kernel); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_queued_shadow_paths_array(int num_states, - int *indices, - int *num_indices, - int kernel) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int kernel_index) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index, + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [kernel](const int state) { - return (INTEGRATOR_STATE(shadow_path, queued_kernel) == kernel); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_active_paths_array(int num_states, int *indices, int *num_indices) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_active_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0); + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) != 0) || - (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - 
kernel_gpu_integrator_terminated_paths_array(int num_states, - int *indices, - int *num_indices, - int indices_offset) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_terminated_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int indices_offset) { + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0); + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices + indices_offset, num_indices, [](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) == 0) && - (INTEGRATOR_STATE(shadow_path, queued_kernel) == 0); - }); + num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_sorted_paths_array( - int num_states, int *indices, int *num_indices, int *key_prefix_sum, int kernel) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int indices_offset) { - gpu_parallel_sorted_index_array<GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, key_prefix_sum, [kernel](const int state) { - return (INTEGRATOR_STATE(path, queued_kernel) == kernel) ? - INTEGRATOR_STATE(path, shader_sort_key) : - GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; - }); + ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0); + + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass); +} + +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_sorted_paths_array, + int num_states, + int num_states_limit, + ccl_global int *indices, + ccl_global int *num_indices, + ccl_global int *key_counter, + ccl_global int *key_prefix_sum, + int kernel_index) +{ + ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ? 
+ INTEGRATOR_STATE(state, path, shader_sort_key) : + GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY, + int kernel_index); + ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index; + + const uint state_index = ccl_gpu_global_id_x(); + gpu_parallel_sorted_index_array(state_index, + num_states, + num_states_limit, + indices, + num_indices, + key_counter, + key_prefix_sum, + ccl_gpu_kernel_lambda_pass); +} + +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int num_active_paths) +{ + ccl_gpu_kernel_lambda((state >= num_active_paths) && + (INTEGRATOR_STATE(state, path, queued_kernel) != 0), + int num_active_paths); + ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths; + + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); +} + +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_states, + ccl_global const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) +{ + const int global_index = ccl_gpu_global_id_x(); + + if (global_index < work_size) { + const int from_state = active_terminated_states[active_states_offset + global_index]; + const int to_state = active_terminated_states[terminated_states_offset + global_index]; + + ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state)); + } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_paths_array(int num_states, - int *indices, - int *num_indices, - int num_active_paths) +ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array, + int num_states, + ccl_global int *indices, + ccl_global int *num_indices, + int num_active_paths) { + ccl_gpu_kernel_lambda((state >= num_active_paths) && + (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0), + int num_active_paths); + ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths; + gpu_parallel_active_index_array<GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE>( - num_states, indices, num_indices, [num_active_paths](const int state) { - return (state >= num_active_paths) && - ((INTEGRATOR_STATE(path, queued_kernel) != 0) || - (INTEGRATOR_STATE(shadow_path, queued_kernel) != 0)); - }); + num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass); } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) - kernel_gpu_integrator_compact_states(const int *active_terminated_states, - const int active_states_offset, - const int terminated_states_offset, - const int work_size) +ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE) + ccl_gpu_kernel_signature(integrator_compact_shadow_states, + ccl_global const int *active_terminated_states, + const int active_states_offset, + const int terminated_states_offset, + const int work_size) { const int global_index = ccl_gpu_global_id_x(); @@ -321,14 +441,14 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_SORTED_INDEX_DEFAULT_B const int from_state = active_terminated_states[active_states_offset + global_index]; const int to_state = active_terminated_states[terminated_states_offset + global_index]; - integrator_state_move(to_state, from_state); + 
ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state)); } } -extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) - kernel_gpu_prefix_sum(int *values, int num_values) +ccl_gpu_kernel_threads(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE) ccl_gpu_kernel_signature( + prefix_sum, ccl_global int *counter, ccl_global int *prefix_sum, int num_values) { - gpu_parallel_prefix_sum<GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE>(values, num_values); + gpu_parallel_prefix_sum(ccl_gpu_global_id_x(), counter, prefix_sum, num_values); } /* -------------------------------------------------------------------- @@ -336,16 +456,17 @@ extern "C" __global__ void __launch_bounds__(GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLO */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_convergence_check(float *render_buffer, - int sx, - int sy, - int sw, - int sh, - float threshold, - bool reset, - int offset, - int stride, - uint *num_active_pixels) + ccl_gpu_kernel_signature(adaptive_sampling_convergence_check, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + float threshold, + bool reset, + int offset, + int stride, + ccl_global uint *num_active_pixels) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / sw; @@ -354,37 +475,51 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) bool converged = true; if (x < sw && y < sh) { - converged = kernel_adaptive_sampling_convergence_check( - nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride); + converged = ccl_gpu_kernel_call(kernel_adaptive_sampling_convergence_check( + nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride)); } /* NOTE: All threads specified in the mask must execute the intrinsic. 
*/ - const uint num_active_pixels_mask = ccl_gpu_ballot(!converged); + const auto num_active_pixels_mask = ccl_gpu_ballot(!converged); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_active_pixels, __popc(num_active_pixels_mask)); + atomic_fetch_and_add_uint32(num_active_pixels, popcount(num_active_pixels_mask)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_filter_x( - float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) + ccl_gpu_kernel_signature(adaptive_sampling_filter_x, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + int offset, + int stride) { const int y = ccl_gpu_global_id_x(); if (y < sh) { - kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride); + ccl_gpu_kernel_call( + kernel_adaptive_sampling_filter_x(NULL, render_buffer, sy + y, sx, sw, offset, stride)); } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_adaptive_sampling_filter_y( - float *render_buffer, int sx, int sy, int sw, int sh, int offset, int stride) + ccl_gpu_kernel_signature(adaptive_sampling_filter_y, + ccl_global float *render_buffer, + int sx, + int sy, + int sw, + int sh, + int offset, + int stride) { const int x = ccl_gpu_global_id_x(); if (x < sw) { - kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride); + ccl_gpu_kernel_call( + kernel_adaptive_sampling_filter_y(NULL, render_buffer, sx + x, sy, sh, offset, stride)); } } @@ -393,12 +528,14 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_cryptomatte_postprocess(float *render_buffer, int num_pixels) + ccl_gpu_kernel_signature(cryptomatte_postprocess, + ccl_global float *render_buffer, + int num_pixels) { const int pixel_index = ccl_gpu_global_id_x(); if (pixel_index < num_pixels) { - kernel_cryptomatte_post(nullptr, render_buffer, pixel_index); + ccl_gpu_kernel_call(kernel_cryptomatte_post(nullptr, render_buffer, pixel_index)); } } @@ -406,202 +543,142 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) * Film. */ -/* Common implementation for float destination. */ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_common(const KernelFilmConvert *kfilm_convert, - float *pixels, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int dst_offset, - int dst_stride, - const Processor &processor) -{ - const int render_pixel_index = ccl_gpu_global_id_x(); - if (render_pixel_index >= num_pixels) { - return; - } - - const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; - ccl_global const float *buffer = render_buffer + render_buffer_offset; - ccl_global float *pixel = pixels + - (render_pixel_index + dst_offset) * kfilm_convert->pixel_stride; - - processor(kfilm_convert, buffer, pixel); -} - -/* Common implementation for half4 destination and 4-channel input pass. 
*/ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgba( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - const int render_pixel_index = ccl_gpu_global_id_x(); - if (render_pixel_index >= num_pixels) { - return; - } - - const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride; - ccl_global const float *buffer = render_buffer + render_buffer_offset; - - float pixel[4]; - processor(kfilm_convert, buffer, pixel); - - film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel); - - const int x = render_pixel_index % width; - const int y = render_pixel_index / width; - +ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba, + const int rgba_offset, + const int rgba_stride, + const int x, + const int y, + const half4 half_pixel) +{ + /* Work around HIP issue with half float display, see T92972. */ +#ifdef __KERNEL_HIP__ + ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4; + out[0] = half_pixel.x; + out[1] = half_pixel.y; + out[2] = half_pixel.z; + out[3] = half_pixel.w; +#else ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x; - float4_store_half((ccl_global half *)out, make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); -} - -/* Common implementation for half4 destination and 3-channel input pass. */ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_rgb( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - kernel_gpu_film_convert_half_rgba_common_rgba( - kfilm_convert, - rgba, - render_buffer, - num_pixels, - width, - offset, - stride, - rgba_offset, - rgba_stride, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - processor(kfilm_convert, buffer, pixel_rgba); - pixel_rgba[3] = 1.0f; - }); -} - -/* Common implementation for half4 destination and single channel input pass. 
*/ -template<typename Processor> -ccl_device_inline void kernel_gpu_film_convert_half_rgba_common_value( - const KernelFilmConvert *kfilm_convert, - uchar4 *rgba, - float *render_buffer, - int num_pixels, - int width, - int offset, - int stride, - int rgba_offset, - int rgba_stride, - const Processor &processor) -{ - kernel_gpu_film_convert_half_rgba_common_rgba( - kfilm_convert, - rgba, - render_buffer, - num_pixels, - width, - offset, - stride, - rgba_offset, - rgba_stride, - [&processor](const KernelFilmConvert *kfilm_convert, - ccl_global const float *buffer, - float *pixel_rgba) { - float value; - processor(kfilm_convert, buffer, &value); - - pixel_rgba[0] = value; - pixel_rgba[1] = value; - pixel_rgba[2] = value; - pixel_rgba[3] = 1.0f; - }); -} - -#define KERNEL_FILM_CONVERT_PROC(name) \ - ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) name - -#define KERNEL_FILM_CONVERT_DEFINE(variant, channels) \ - KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant) \ - (const KernelFilmConvert kfilm_convert, \ - float *pixels, \ - float *render_buffer, \ - int num_pixels, \ - int width, \ - int offset, \ - int stride, \ - int rgba_offset, \ - int rgba_stride) \ + *out = half_pixel; +#endif +} + +#ifdef __KERNEL_METAL__ + +/* Fetch into a local variable on Metal - there is minimal overhead. Templating the + * film_get_pass_pixel_... functions works on MSL, but not on other compilers. */ +# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \ + float local_pixel[4]; \ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, local_pixel); \ + if (input_channel_count >= 1) { \ + pixel[0] = local_pixel[0]; \ + } \ + if (input_channel_count >= 2) { \ + pixel[1] = local_pixel[1]; \ + } \ + if (input_channel_count >= 3) { \ + pixel[2] = local_pixel[2]; \ + } \ + if (input_channel_count >= 4) { \ + pixel[3] = local_pixel[3]; \ + } + +#else + +# define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); + +#endif + +#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ + ccl_gpu_kernel_signature(film_convert_##variant, \ + const KernelFilmConvert kfilm_convert, \ + ccl_global float *pixels, \ + ccl_global float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ { \ - kernel_gpu_film_convert_common(&kfilm_convert, \ - pixels, \ - render_buffer, \ - num_pixels, \ - width, \ - offset, \ - stride, \ - rgba_offset, \ - rgba_stride, \ - film_get_pass_pixel_##variant); \ + const int render_pixel_index = ccl_gpu_global_id_x(); \ + if (render_pixel_index >= num_pixels) { \ + return; \ + } \ +\ + const int x = render_pixel_index % width; \ + const int y = render_pixel_index / width; \ +\ + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \ + y * stride * kfilm_convert.pass_stride; \ +\ + ccl_global float *pixel = pixels + \ + (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \ +\ + FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \ } \ - KERNEL_FILM_CONVERT_PROC(kernel_gpu_film_convert_##variant##_half_rgba) \ - (const KernelFilmConvert kfilm_convert, \ - uchar4 *rgba, \ - float *render_buffer, \ - int num_pixels, \ - int width, \ - int offset, \ - int stride, \ - int rgba_offset, \ - int rgba_stride) \ +\ + ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \ + 
ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \ + const KernelFilmConvert kfilm_convert, \ + ccl_global uchar4 *rgba, \ + ccl_global float *render_buffer, \ + int num_pixels, \ + int width, \ + int offset, \ + int stride, \ + int rgba_offset, \ + int rgba_stride) \ { \ - kernel_gpu_film_convert_half_rgba_common_##channels(&kfilm_convert, \ - rgba, \ - render_buffer, \ - num_pixels, \ - width, \ - offset, \ - stride, \ - rgba_offset, \ - rgba_stride, \ - film_get_pass_pixel_##variant); \ - } - -KERNEL_FILM_CONVERT_DEFINE(depth, value) -KERNEL_FILM_CONVERT_DEFINE(mist, value) -KERNEL_FILM_CONVERT_DEFINE(sample_count, value) -KERNEL_FILM_CONVERT_DEFINE(float, value) - -KERNEL_FILM_CONVERT_DEFINE(light_path, rgb) -KERNEL_FILM_CONVERT_DEFINE(float3, rgb) - -KERNEL_FILM_CONVERT_DEFINE(motion, rgba) -KERNEL_FILM_CONVERT_DEFINE(cryptomatte, rgba) -KERNEL_FILM_CONVERT_DEFINE(shadow_catcher, rgba) -KERNEL_FILM_CONVERT_DEFINE(shadow_catcher_matte_with_shadow, rgba) -KERNEL_FILM_CONVERT_DEFINE(combined, rgba) -KERNEL_FILM_CONVERT_DEFINE(float4, rgba) - -#undef KERNEL_FILM_CONVERT_DEFINE -#undef KERNEL_FILM_CONVERT_HALF_RGBA_DEFINE -#undef KERNEL_FILM_CONVERT_PROC + const int render_pixel_index = ccl_gpu_global_id_x(); \ + if (render_pixel_index >= num_pixels) { \ + return; \ + } \ +\ + const int x = render_pixel_index % width; \ + const int y = render_pixel_index / width; \ +\ + ccl_global const float *buffer = render_buffer + offset + x * kfilm_convert.pass_stride + \ + y * stride * kfilm_convert.pass_stride; \ +\ + float pixel[4]; \ + film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \ +\ + if (input_channel_count == 1) { \ + pixel[1] = pixel[2] = pixel[0]; \ + } \ + if (input_channel_count <= 3) { \ + pixel[3] = 1.0f; \ + } \ +\ + film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \ +\ + const half4 half_pixel = float4_to_half4_display( \ + make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \ + kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \ + } + +/* 1 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(depth, 1) +KERNEL_FILM_CONVERT_VARIANT(mist, 1) +KERNEL_FILM_CONVERT_VARIANT(sample_count, 1) +KERNEL_FILM_CONVERT_VARIANT(float, 1) + +/* 3 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(light_path, 3) +KERNEL_FILM_CONVERT_VARIANT(float3, 3) + +/* 4 channel inputs */ +KERNEL_FILM_CONVERT_VARIANT(motion, 4) +KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4) +KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4) +KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4) +KERNEL_FILM_CONVERT_VARIANT(combined, 4) +KERNEL_FILM_CONVERT_VARIANT(float4, 4) + +#undef KERNEL_FILM_CONVERT_VARIANT /* -------------------------------------------------------------------- * Shader evaluation. 
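 */

Before moving on to shader evaluation: the half_rgba variants above all funnel through the same widening rules, with one-channel passes replicated to RGB and anything narrower than four channels displayed with opaque alpha. A standalone restatement of those rules (plain C++, hypothetical helper name):

static void widen_to_rgba(float pixel[4], const int input_channel_count)
{
  /* Replicate a single-channel value across RGB. */
  if (input_channel_count == 1) {
    pixel[1] = pixel[2] = pixel[0];
  }
  /* 1- and 3-channel passes carry no alpha of their own; show them opaque. */
  if (input_channel_count <= 3) {
    pixel[3] = 1.0f;
  }
}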
@@ -610,28 +687,46 @@ KERNEL_FILM_CONVERT_DEFINE(float4, rgba) /* Displacement */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_shader_eval_displace(KernelShaderEvalInput *input, - float4 *output, - const int offset, - const int work_size) + ccl_gpu_kernel_signature(shader_eval_displace, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) +{ + int i = ccl_gpu_global_id_x(); + if (i < work_size) { + ccl_gpu_kernel_call(kernel_displace_evaluate(NULL, input, output, offset + i)); + } +} + +/* Background */ + +ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) + ccl_gpu_kernel_signature(shader_eval_background, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) { int i = ccl_gpu_global_id_x(); if (i < work_size) { - kernel_displace_evaluate(NULL, input, output, offset + i); + ccl_gpu_kernel_call(kernel_background_evaluate(NULL, input, output, offset + i)); } } -/* Background Shader Evaluation */ +/* Curve Shadow Transparency */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_shader_eval_background(KernelShaderEvalInput *input, - float4 *output, - const int offset, - const int work_size) + ccl_gpu_kernel_signature(shader_eval_curve_shadow_transparency, + ccl_global KernelShaderEvalInput *input, + ccl_global float *output, + const int offset, + const int work_size) { int i = ccl_gpu_global_id_x(); if (i < work_size) { - kernel_background_evaluate(NULL, input, output, offset + i); + ccl_gpu_kernel_call( + kernel_curve_shadow_transparency_evaluate(NULL, input, output, offset + i)); } } @@ -640,15 +735,16 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_color_preprocess(float *render_buffer, - int full_x, - int full_y, - int width, - int height, - int offset, - int stride, - int pass_stride, - int pass_denoised) + ccl_gpu_kernel_signature(filter_color_preprocess, + ccl_global float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int pass_denoised) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / width; @@ -659,31 +755,34 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; - float *buffer = render_buffer + render_pixel_index * pass_stride; + ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride; - float *color_out = buffer + pass_denoised; + ccl_global float *color_out = buffer + pass_denoised; color_out[0] = clamp(color_out[0], 0.0f, 10000.0f); color_out[1] = clamp(color_out[1], 0.0f, 10000.0f); color_out[2] = clamp(color_out[2], 0.0f, 10000.0f); } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_guiding_preprocess(float *guiding_buffer, - int guiding_pass_stride, - int guiding_pass_albedo, - int guiding_pass_normal, - const float *render_buffer, - int render_offset, - int render_stride, - int render_pass_stride, - int render_pass_sample_count, - int render_pass_denoising_albedo, - int render_pass_denoising_normal, - int full_x, - int full_y, - int width, - int height, - int num_samples) + ccl_gpu_kernel_signature(filter_guiding_preprocess, + ccl_global float *guiding_buffer, + int guiding_pass_stride, + 
int guiding_pass_albedo,
+                           int guiding_pass_normal,
+                           int guiding_pass_flow,
+                           ccl_global const float *render_buffer,
+                           int render_offset,
+                           int render_stride,
+                           int render_pass_stride,
+                           int render_pass_sample_count,
+                           int render_pass_denoising_albedo,
+                           int render_pass_denoising_normal,
+                           int render_pass_motion,
+                           int full_x,
+                           int full_y,
+                           int width,
+                           int height,
+                           int num_samples)
 {
   const int work_index = ccl_gpu_global_id_x();
   const int y = work_index / width;
@@ -694,10 +793,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 
   const uint64_t guiding_pixel_index = x + y * width;
-  float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
+  ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride;
 
   const uint64_t render_pixel_index = render_offset + (x + full_x) + (y + full_y) * render_stride;
-  const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
+  ccl_global const float *buffer = render_buffer + render_pixel_index * render_pass_stride;
 
   float pixel_scale;
   if (render_pass_sample_count == PASS_UNUSED) {
@@ -711,8 +810,8 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   if (guiding_pass_albedo != PASS_UNUSED) {
     kernel_assert(render_pass_denoising_albedo != PASS_UNUSED);
 
-    const float *albedo_in = buffer + render_pass_denoising_albedo;
-    float *albedo_out = guiding_pixel + guiding_pass_albedo;
+    ccl_global const float *albedo_in = buffer + render_pass_denoising_albedo;
+    ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo;
 
     albedo_out[0] = albedo_in[0] * pixel_scale;
     albedo_out[1] = albedo_in[1] * pixel_scale;
@@ -720,24 +819,36 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
   }
 
   /* Normal pass. */
-  if (render_pass_denoising_normal != PASS_UNUSED) {
+  if (guiding_pass_normal != PASS_UNUSED) {
     kernel_assert(render_pass_denoising_normal != PASS_UNUSED);
 
-    const float *normal_in = buffer + render_pass_denoising_normal;
-    float *normal_out = guiding_pixel + guiding_pass_normal;
+    ccl_global const float *normal_in = buffer + render_pass_denoising_normal;
+    ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;
 
     normal_out[0] = normal_in[0] * pixel_scale;
     normal_out[1] = normal_in[1] * pixel_scale;
     normal_out[2] = normal_in[2] * pixel_scale;
   }
+
+  /* Flow pass. 
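*/

The addressing above is the same for all three guiding passes: pixels are packed tightly at guiding_pass_stride floats each, and a pass is just a fixed float offset inside a pixel. A small sketch of that computation (hypothetical helper, plain C++, uint64_t as in the kernels):

static float *guiding_pass_ptr(float *guiding_buffer,
                               const int x,
                               const int y,
                               const int width,
                               const int guiding_pass_stride,
                               const int pass_offset)
{
  const uint64_t guiding_pixel_index = x + (uint64_t)y * width;
  return guiding_buffer + guiding_pixel_index * guiding_pass_stride + pass_offset;
}

/*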
*/ + if (guiding_pass_flow != PASS_UNUSED) { + kernel_assert(render_pass_motion != PASS_UNUSED); + + ccl_global const float *motion_in = buffer + render_pass_motion; + ccl_global float *flow_out = guiding_pixel + guiding_pass_flow; + + flow_out[0] = -motion_in[0] * pixel_scale; + flow_out[1] = -motion_in[1] * pixel_scale; + } } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_guiding_set_fake_albedo(float *guiding_buffer, - int guiding_pass_stride, - int guiding_pass_albedo, - int width, - int height) + ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo, + ccl_global float *guiding_buffer, + int guiding_pass_stride, + int guiding_pass_albedo, + int width, + int height) { kernel_assert(guiding_pass_albedo != PASS_UNUSED); @@ -750,9 +861,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t guiding_pixel_index = x + y * width; - float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; + ccl_global float *guiding_pixel = guiding_buffer + guiding_pixel_index * guiding_pass_stride; - float *albedo_out = guiding_pixel + guiding_pass_albedo; + ccl_global float *albedo_out = guiding_pixel + guiding_pass_albedo; albedo_out[0] = 0.5f; albedo_out[1] = 0.5f; @@ -760,20 +871,21 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_filter_color_postprocess(float *render_buffer, - int full_x, - int full_y, - int width, - int height, - int offset, - int stride, - int pass_stride, - int num_samples, - int pass_noisy, - int pass_denoised, - int pass_sample_count, - int num_components, - bool use_compositing) + ccl_gpu_kernel_signature(filter_color_postprocess, + ccl_global float *render_buffer, + int full_x, + int full_y, + int width, + int height, + int offset, + int stride, + int pass_stride, + int num_samples, + int pass_noisy, + int pass_denoised, + int pass_sample_count, + int num_components, + bool use_compositing) { const int work_index = ccl_gpu_global_id_x(); const int y = work_index / width; @@ -784,7 +896,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) } const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride; - float *buffer = render_buffer + render_pixel_index * pass_stride; + ccl_global float *buffer = render_buffer + render_pixel_index * pass_stride; float pixel_scale; if (pass_sample_count == PASS_UNUSED) { @@ -794,7 +906,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) pixel_scale = __float_as_uint(buffer[pass_sample_count]); } - float *denoised_pixel = buffer + pass_denoised; + ccl_global float *denoised_pixel = buffer + pass_denoised; denoised_pixel[0] *= pixel_scale; denoised_pixel[1] *= pixel_scale; @@ -807,13 +919,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) /* Currently compositing passes are either 3-component (derived by dividing light passes) * or do not have transparency (shadow catcher). Implicitly rely on this logic, as it * simplifies logic and avoids extra memory allocation. */ - const float *noisy_pixel = buffer + pass_noisy; + ccl_global const float *noisy_pixel = buffer + pass_noisy; denoised_pixel[3] = noisy_pixel[3]; } else { /* Assigning to zero since this is a default alpha value for 3-component passes, and it * is an opaque pixel for 4 component passes. 
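*/

A detail shared by the guiding preprocess and the color postprocess above: when adaptive sampling is active, the per-pixel sample count lives in a float render pass but holds raw integer bits, so the kernels read it back with a bit reinterpretation (__float_as_uint) rather than a numeric conversion. A host-side C++ sketch of that round trip, with hypothetical helper names:

#include <cstdint>
#include <cstring>

static float uint_as_float(const uint32_t value)
{
  float result;
  std::memcpy(&result, &value, sizeof(result)); /* reinterpret, do not convert */
  return result;
}

/* Mirrors __float_as_uint() in the kernels above. */
static uint32_t float_as_uint(const float value)
{
  uint32_t result;
  std::memcpy(&result, &value, sizeof(result));
  return result;
}

/*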
*/ - denoised_pixel[3] = 0; } } @@ -823,21 +934,22 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) */ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) - kernel_gpu_integrator_shadow_catcher_count_possible_splits(int num_states, - uint *num_possible_splits) + ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits, + int num_states, + ccl_global uint *num_possible_splits) { const int state = ccl_gpu_global_id_x(); bool can_split = false; if (state < num_states) { - can_split = kernel_shadow_catcher_path_can_split(nullptr, state); + can_split = ccl_gpu_kernel_call(kernel_shadow_catcher_path_can_split(nullptr, state)); } /* NOTE: All threads specified in the mask must execute the intrinsic. */ - const uint can_split_mask = ccl_gpu_ballot(can_split); + const auto can_split_mask = ccl_gpu_ballot(can_split); const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size; if (lane_id == 0) { - atomic_fetch_and_add_uint32(num_possible_splits, __popc(can_split_mask)); + atomic_fetch_and_add_uint32(num_possible_splits, popcount(can_split_mask)); } } diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h index db4a4bf71e0..a5320edcb3c 100644 --- a/intern/cycles/kernel/device/gpu/parallel_active_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h @@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN * * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */ -#include "util/util_atomic.h" +#include "util/atomic.h" #ifdef __HIP__ # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024 @@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512 #endif +#ifdef __KERNEL_METAL__ +struct ActiveIndexContext { + ActiveIndexContext(int _thread_index, + int _global_index, + int _threadgroup_size, + int _simdgroup_size, + int _simd_lane_index, + int _simd_group_index, + int _num_simd_groups, + threadgroup int *_simdgroup_offset) + : thread_index(_thread_index), + global_index(_global_index), + blocksize(_threadgroup_size), + ccl_gpu_warp_size(_simdgroup_size), + thread_warp(_simd_lane_index), + warp_index(_simd_group_index), + num_warps(_num_simd_groups), + warp_offset(_simdgroup_offset) + { + } + + const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index, + num_warps; + threadgroup int *warp_offset; + + template<uint blocksizeDummy, typename IsActiveOp> + void active_index_array(const uint num_states, + ccl_global int *indices, + ccl_global int *num_indices, + IsActiveOp is_active_op) + { + const uint state_index = global_index; +#else template<uint blocksize, typename IsActiveOp> __device__ void gpu_parallel_active_index_array(const uint num_states, - int *indices, - int *num_indices, + ccl_global int *indices, + ccl_global int *num_indices, IsActiveOp is_active_op) { extern ccl_gpu_shared int warp_offset[]; @@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states, const uint warp_index = thread_index / ccl_gpu_warp_size; const uint num_warps = blocksize / ccl_gpu_warp_size; - /* Test if state corresponding to this thread is active. */ const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index; - const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; +#endif - /* For each thread within a warp compute how many other active states precede it. 
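*/

This step, implemented just below with ccl_gpu_thread_mask, is the core of the compaction: restricting the ballot to the lanes strictly below the current one and counting the set bits gives each active thread its exclusive rank inside the warp. A CUDA-style sketch with illustrative names that are not part of this patch:

__device__ unsigned int warp_exclusive_rank(const bool is_active, const unsigned int lane)
{
  /* Bit i is set when lane i is active. */
  const unsigned int active_mask = __ballot_sync(0xFFFFFFFFu, is_active);

  /* Keep only lanes strictly below this one; the lane-0 guard avoids the
   * 32-bit shift that C++ leaves undefined. */
  const unsigned int lower_lanes = (lane == 0) ? 0u : (0xFFFFFFFFu >> (32 - lane));

  return __popc(active_mask & lower_lanes);
}

/*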
*/ - const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp); - const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask); + /* Test if state corresponding to this thread is active. */ + const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0; - /* Last thread in warp stores number of active states for each warp. */ - if (thread_warp == ccl_gpu_warp_size - 1) { - warp_offset[warp_index] = thread_offset + is_active; - } + /* For each thread within a warp compute how many other active states precede it. */ + const uint thread_offset = popcount(ccl_gpu_ballot(is_active) & + ccl_gpu_thread_mask(thread_warp)); - ccl_gpu_syncthreads(); - - /* Last thread in block converts per-warp sizes to offsets, increments global size of - * index array and gets offset to write to. */ - if (thread_index == blocksize - 1) { - /* TODO: parallelize this. */ - int offset = 0; - for (int i = 0; i < num_warps; i++) { - int num_active = warp_offset[i]; - warp_offset[i] = offset; - offset += num_active; + /* Last thread in warp stores number of active states for each warp. */ + if (thread_warp == ccl_gpu_warp_size - 1) { + warp_offset[warp_index] = thread_offset + is_active; } - const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; - warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); - } + ccl_gpu_syncthreads(); + + /* Last thread in block converts per-warp sizes to offsets, increments global size of + * index array and gets offset to write to. */ + if (thread_index == blocksize - 1) { + /* TODO: parallelize this. */ + int offset = 0; + for (int i = 0; i < num_warps; i++) { + int num_active = warp_offset[i]; + warp_offset[i] = offset; + offset += num_active; + } + + const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active; + warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active); + } - ccl_gpu_syncthreads(); + ccl_gpu_syncthreads(); - /* Write to index array. */ - if (is_active) { - const uint block_offset = warp_offset[num_warps]; - indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + /* Write to index array. */ + if (is_active) { + const uint block_offset = warp_offset[num_warps]; + indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index; + } } -} + +#ifdef __KERNEL_METAL__ +}; /* end class ActiveIndexContext */ + +/* inject the required thread params into a struct, and redirect to its templated member function + */ +# define gpu_parallel_active_index_array \ + ActiveIndexContext(metal_local_id, \ + metal_global_id, \ + metal_local_size, \ + simdgroup_size, \ + simd_lane_index, \ + simd_group_index, \ + num_simd_groups, \ + simdgroup_offset) \ + .active_index_array +#endif CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h index a1349e82efb..4bd002c27e4 100644 --- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h +++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h @@ -25,7 +25,7 @@ CCL_NAMESPACE_BEGIN * This is used for an array the size of the number of shaders in the scene * which is not usually huge, so might not be a significant bottleneck. 
*/ -#include "util/util_atomic.h" +#include "util/atomic.h" #ifdef __HIP__ # define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024 @@ -33,16 +33,20 @@ CCL_NAMESPACE_BEGIN # define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512 #endif -template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values) +__device__ void gpu_parallel_prefix_sum(const int global_id, + ccl_global int *counter, + ccl_global int *prefix_sum, + const int num_values) { - if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) { + if (global_id != 0) { return; } int offset = 0; for (int i = 0; i < num_values; i++) { - const int new_offset = offset + values[i]; - values[i] = offset; + const int new_offset = offset + counter[i]; + prefix_sum[i] = offset; + counter[i] = 0; offset = new_offset; } } diff --git a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h index 9bca1fad22f..c092e2a21ee 100644 --- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h +++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h @@ -24,7 +24,7 @@ CCL_NAMESPACE_BEGIN * * TODO: there may be ways to optimize this to avoid this many atomic ops? */ -#include "util/util_atomic.h" +#include "util/atomic.h" #ifdef __HIP__ # define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024 @@ -33,20 +33,30 @@ CCL_NAMESPACE_BEGIN #endif #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0) -template<uint blocksize, typename GetKeyOp> -__device__ void gpu_parallel_sorted_index_array(const uint num_states, - int *indices, - int *num_indices, - int *key_prefix_sum, +template<typename GetKeyOp> +__device__ void gpu_parallel_sorted_index_array(const uint state_index, + const uint num_states, + const int num_states_limit, + ccl_global int *indices, + ccl_global int *num_indices, + ccl_global int *key_counter, + ccl_global int *key_prefix_sum, GetKeyOp get_key_op) { - const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x; const int key = (state_index < num_states) ? get_key_op(state_index) : GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY; if (key != GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY) { const uint index = atomic_fetch_and_add_uint32(&key_prefix_sum[key], 1); - indices[index] = state_index; + if (index < num_states_limit) { + /* Assign state index. */ + indices[index] = state_index; + } + else { + /* Can't process this state now, increase the counter again so that + * it will be handled in another iteration. */ + atomic_fetch_and_add_uint32(&key_counter[key], 1); + } } } diff --git a/intern/cycles/kernel/device/gpu/work_stealing.h b/intern/cycles/kernel/device/gpu/work_stealing.h new file mode 100644 index 00000000000..c3083948057 --- /dev/null +++ b/intern/cycles/kernel/device/gpu/work_stealing.h @@ -0,0 +1,55 @@ +/* + * Copyright 2011-2015 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+CCL_NAMESPACE_BEGIN
+
+/*
+ * Utility functions for work stealing
+ */
+
+/* Map global work index to tile, pixel X/Y and sample. */
+ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
+                                      uint global_work_index,
+                                      ccl_private uint *x,
+                                      ccl_private uint *y,
+                                      ccl_private uint *sample)
+{
+  uint sample_offset, pixel_offset;
+
+  if (kernel_data.integrator.scrambling_distance < 0.9f) {
+    /* Keep threads for the same sample together. */
+    uint tile_pixels = tile->w * tile->h;
+    sample_offset = global_work_index / tile_pixels;
+    pixel_offset = global_work_index - sample_offset * tile_pixels;
+  }
+  else {
+    /* Keep threads for the same pixel together.
+     * Appears to improve performance by a few % on CUDA and OptiX. */
+    sample_offset = global_work_index % tile->num_samples;
+    pixel_offset = global_work_index / tile->num_samples;
+  }
+
+  uint y_offset = pixel_offset / tile->w;
+  uint x_offset = pixel_offset - y_offset * tile->w;
+
+  *x = tile->x + x_offset;
+  *y = tile->y + y_offset;
+  *sample = tile->start_sample + sample_offset;
+}
+
+CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/device/hip/compat.h b/intern/cycles/kernel/device/hip/compat.h
index 95338fe7d6e..fff7a09e884 100644
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -45,14 +45,14 @@ typedef unsigned long long uint64_t;
 #define ccl_device_forceinline __device__ __forceinline__
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
 #define ccl_private
 #define ccl_may_alias
-#define ccl_addr_space
 #define ccl_restrict __restrict__
 #define ccl_loop_no_unroll
 #define ccl_align(n) __align__(n)
@@ -75,6 +75,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
 
 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -84,7 +85,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot(predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down(var, delta)
-#define ccl_gpu_popc(x) __popc(x)
 
 /* GPU texture objects */
 typedef hipTextureObject_t ccl_gpu_tex_object;
@@ -117,5 +117,5 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object tex
 
 /* Types */
-#include "util/util_half.h"
-#include "util/util_types.h"
+#include "util/half.h"
+#include "util/types.h"
diff --git a/intern/cycles/kernel/device/hip/config.h b/intern/cycles/kernel/device/hip/config.h
index 2fde0d46015..7ec744d8ad2 100644
--- a/intern/cycles/kernel/device/hip/config.h
+++ b/intern/cycles/kernel/device/hip/config.h
@@ -35,12 +35,29 @@
 /* Compute number of threads per block and minimum blocks per multiprocessor
  * given the maximum number of registers per thread. 
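*/

The computation described here is a plain division of the register file. With illustrative numbers only, not taken from this patch: a 65536-register multiprocessor, 512-thread blocks and 64 registers per thread give 65536 / (512 * 64) = 2 resident blocks as the second __launch_bounds__ argument. As a sketch:

/* Illustrative constants; the real values come from the GPU configuration. */
constexpr int kMultiprocessorMaxRegisters = 65536;

constexpr int min_blocks_per_multiprocessor(const int block_num_threads,
                                            const int thread_num_registers)
{
  return kMultiprocessorMaxRegisters / (block_num_threads * thread_num_registers);
}

static_assert(min_blocks_per_multiprocessor(512, 64) == 2, "two blocks stay resident");

/*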
*/ - #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \ extern "C" __global__ void __launch_bounds__(block_num_threads, \ GPU_MULTIPRESSOR_MAX_REGISTERS / \ (block_num_threads * thread_num_registers)) +#define ccl_gpu_kernel_threads(block_num_threads) \ + extern "C" __global__ void __launch_bounds__(block_num_threads) + +#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__) + +#define ccl_gpu_kernel_call(x) x + +/* Define a function object where "func" is the lambda body, and additional parameters are used to + * specify captured state */ +#define ccl_gpu_kernel_lambda(func, ...) \ + struct KernelLambda { \ + __VA_ARGS__; \ + __device__ int operator()(const int state) \ + { \ + return (func); \ + } \ + } ccl_gpu_kernel_lambda_pass + /* sanity checks */ #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS diff --git a/intern/cycles/kernel/device/hip/globals.h b/intern/cycles/kernel/device/hip/globals.h index 39978ae7899..d9a560d668b 100644 --- a/intern/cycles/kernel/device/hip/globals.h +++ b/intern/cycles/kernel/device/hip/globals.h @@ -18,24 +18,25 @@ #pragma once -#include "kernel/kernel_profiling.h" -#include "kernel/kernel_types.h" +#include "kernel/types.h" -#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/state.h" + +#include "kernel/util/profiling.h" CCL_NAMESPACE_BEGIN /* Not actually used, just a NULL pointer that gets passed everywhere, which we * hope gets optimized out by the compiler. */ -struct KernelGlobals { - /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */ +struct KernelGlobalsGPU { int unused[1]; }; +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; /* Global scene data and textures */ __constant__ KernelData __data; #define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name; -#include "kernel/kernel_textures.h" +#include "kernel/textures.h" /* Integrator state */ __constant__ IntegratorStateGPU __integrator_state; diff --git a/intern/cycles/kernel/device/metal/compat.h b/intern/cycles/kernel/device/metal/compat.h new file mode 100644 index 00000000000..1222b68f0ee --- /dev/null +++ b/intern/cycles/kernel/device/metal/compat.h @@ -0,0 +1,316 @@ +/* + * Copyright 2011-2013 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
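*/

Before the Metal header starts: the ccl_gpu_kernel_lambda macro defined for HIP above is easiest to read through an expansion. With a hypothetical predicate state_value(state) == threshold capturing one int (state_value is not a real kernel function), the macro produces a function object like this:

struct KernelLambda {
  int threshold; /* captured state, declared through the variadic arguments */
  __device__ int operator()(const int state)
  {
    return (state_value(state) == threshold);
  }
} ccl_gpu_kernel_lambda_pass;

The Metal variant further down does the same, but additionally captures a reference to the MetalKernelContext so the body can reach kernel globals.

/*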
+ */ + +#pragma once + +#define __KERNEL_GPU__ +#define __KERNEL_METAL__ +#define CCL_NAMESPACE_BEGIN +#define CCL_NAMESPACE_END + +#ifndef ATTR_FALLTHROUGH +# define ATTR_FALLTHROUGH +#endif + +#include <metal_atomic> +#include <metal_pack> +#include <metal_stdlib> +#include <simd/simd.h> + +using namespace metal; + +#ifdef __METALRT__ +using namespace metal::raytracing; +#endif + +#pragma clang diagnostic ignored "-Wunused-variable" +#pragma clang diagnostic ignored "-Wsign-compare" +#pragma clang diagnostic ignored "-Wuninitialized" + +/* Qualifiers */ + +#define ccl_device +#define ccl_device_inline ccl_device +#define ccl_device_forceinline ccl_device +#define ccl_device_noinline ccl_device __attribute__((noinline)) +#define ccl_device_noinline_cpu ccl_device +#define ccl_device_inline_method ccl_device +#define ccl_global device +#define ccl_inline_constant static constant constexpr +#define ccl_device_constant constant +#define ccl_constant constant +#define ccl_gpu_shared threadgroup +#define ccl_private thread +#define ccl_may_alias +#define ccl_restrict __restrict +#define ccl_loop_no_unroll +#define ccl_align(n) alignas(n) +#define ccl_optional_struct_init + +/* No assert supported for Metal */ + +#define kernel_assert(cond) + +#define ccl_gpu_global_id_x() metal_global_id +#define ccl_gpu_warp_size simdgroup_size +#define ccl_gpu_thread_idx_x simd_group_index +#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1) + +#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate))) +#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup); + +// clang-format off + +/* kernel.h adapters */ + +#define ccl_gpu_kernel(block_num_threads, thread_num_registers) +#define ccl_gpu_kernel_threads(block_num_threads) + +/* Convert a comma-separated list into a semicolon-separated list + * (so that we can generate a struct based on kernel entry-point parameters). 
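*/

The FN0..FN20 block below is the classic argument-counting trick: PARAMS_MAKER pads the argument list so that GET_LAST_ARG always picks the FN macro whose arity matches the parameter count. A reduced, self-contained demonstration (three-argument version, hypothetical DEMO_* names):

#define DEMO_FN3(p1, p2, p3) p1; p2; p3;
#define DEMO_GET_LAST(p0, p1, p2, p3, ...) p3
#define DEMO_PARAMS_MAKER(...) DEMO_GET_LAST(__VA_ARGS__, DEMO_FN3, DEMO_FN2, DEMO_FN1, DEMO_FN0)

struct demo_params {
  DEMO_PARAMS_MAKER(int sx, int sy, int sw)(int sx, int sy, int sw)
  /* expands to the member declarations: int sx; int sy; int sw; */
};

This is how ccl_gpu_kernel_signature turns an entry-point parameter list into the fields of the generated kernel_gpu_<name> struct.

/*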
*/ +#define FN0() +#define FN1(p1) p1; +#define FN2(p1, p2) p1; p2; +#define FN3(p1, p2, p3) p1; p2; p3; +#define FN4(p1, p2, p3, p4) p1; p2; p3; p4; +#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5; +#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6; +#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7; +#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8; +#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9; +#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; +#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; +#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; +#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; +#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; +#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; +#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; +#define FN17(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; +#define FN18(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; +#define FN19(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19; +#define FN20(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19; p20; +#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, ...) p20 +#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN20, FN19, FN18, FN17, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0) + +/* Generate a struct containing the entry-point parameters and a "run" + * method which can access them implicitly via this-> */ +#define ccl_gpu_kernel_signature(name, ...) 
\ +struct kernel_gpu_##name \ +{ \ + PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \ + void run(thread MetalKernelContext& context, \ + threadgroup int *simdgroup_offset, \ + const uint metal_global_id, \ + const ushort metal_local_id, \ + const ushort metal_local_size, \ + uint simdgroup_size, \ + uint simd_lane_index, \ + uint simd_group_index, \ + uint num_simd_groups) ccl_global const; \ +}; \ +kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \ + constant KernelParamsMetal &ccl_restrict _launch_params_metal, \ + constant MetalAncillaries *_metal_ancillaries, \ + threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \ + const uint metal_global_id [[thread_position_in_grid]], \ + const ushort metal_local_id [[thread_position_in_threadgroup]], \ + const ushort metal_local_size [[threads_per_threadgroup]], \ + uint simdgroup_size [[threads_per_simdgroup]], \ + uint simd_lane_index [[thread_index_in_simdgroup]], \ + uint simd_group_index [[simdgroup_index_in_threadgroup]], \ + uint num_simd_groups [[simdgroups_per_threadgroup]]) { \ + MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \ + params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \ +} \ +void kernel_gpu_##name::run(thread MetalKernelContext& context, \ + threadgroup int *simdgroup_offset, \ + const uint metal_global_id, \ + const ushort metal_local_id, \ + const ushort metal_local_size, \ + uint simdgroup_size, \ + uint simd_lane_index, \ + uint simd_group_index, \ + uint num_simd_groups) ccl_global const + +#define ccl_gpu_kernel_call(x) context.x + +/* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state */ +#define ccl_gpu_kernel_lambda(func, ...) 
\ + struct KernelLambda \ + { \ + KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \ + ccl_private MetalKernelContext &context; \ + __VA_ARGS__; \ + int operator()(const int state) const { return (func); } \ + } ccl_gpu_kernel_lambda_pass(context) + +// clang-format on + +/* volumetric lambda functions - use function objects for lambda-like functionality */ +#define VOLUME_READ_LAMBDA(function_call) \ + struct FnObjectRead { \ + KernelGlobals kg; \ + ccl_private MetalKernelContext *context; \ + int state; \ +\ + VolumeStack operator()(const int i) const \ + { \ + return context->function_call; \ + } \ + } volume_read_lambda_pass{kg, this, state}; + +#define VOLUME_WRITE_LAMBDA(function_call) \ + struct FnObjectWrite { \ + KernelGlobals kg; \ + ccl_private MetalKernelContext *context; \ + int state; \ +\ + void operator()(const int i, VolumeStack entry) const \ + { \ + context->function_call; \ + } \ + } volume_write_lambda_pass{kg, this, state}; + +/* make_type definitions with Metal style element initializers */ +#ifdef make_float2 +# undef make_float2 +#endif +#ifdef make_float3 +# undef make_float3 +#endif +#ifdef make_float4 +# undef make_float4 +#endif +#ifdef make_int2 +# undef make_int2 +#endif +#ifdef make_int3 +# undef make_int3 +#endif +#ifdef make_int4 +# undef make_int4 +#endif +#ifdef make_uchar4 +# undef make_uchar4 +#endif + +#define make_float2(x, y) float2(x, y) +#define make_float3(x, y, z) float3(x, y, z) +#define make_float4(x, y, z, w) float4(x, y, z, w) +#define make_int2(x, y) int2(x, y) +#define make_int3(x, y, z) int3(x, y, z) +#define make_int4(x, y, z, w) int4(x, y, z, w) +#define make_uchar4(x, y, z, w) uchar4(x, y, z, w) + +/* Math functions */ + +#define __uint_as_float(x) as_type<float>(x) +#define __float_as_uint(x) as_type<uint>(x) +#define __int_as_float(x) as_type<float>(x) +#define __float_as_int(x) as_type<int>(x) +#define __float2half(x) half(x) +#define powf(x, y) pow(float(x), float(y)) +#define fabsf(x) fabs(float(x)) +#define copysignf(x, y) copysign(float(x), float(y)) +#define asinf(x) asin(float(x)) +#define acosf(x) acos(float(x)) +#define atanf(x) atan(float(x)) +#define floorf(x) floor(float(x)) +#define ceilf(x) ceil(float(x)) +#define hypotf(x, y) hypot(float(x), float(y)) +#define atan2f(x, y) atan2(float(x), float(y)) +#define fmaxf(x, y) fmax(float(x), float(y)) +#define fminf(x, y) fmin(float(x), float(y)) +#define fmodf(x, y) fmod(float(x), float(y)) +#define sinhf(x) sinh(float(x)) +#define coshf(x) cosh(float(x)) +#define tanhf(x) tanh(float(x)) +#define saturatef(x) saturate(float(x)) + +/* Use native functions with possibly lower precision for performance, + * no issues found so far. 
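*/

A note on the VOLUME_READ_LAMBDA / VOLUME_WRITE_LAMBDA definitions above: like ccl_gpu_kernel_lambda, the Metal port avoids plain C++ lambdas here and substitutes a struct with explicitly captured members and an operator(). The shape of the pattern, reduced to plain C++ with hypothetical names:

struct StackRead {
  const int *stack; /* explicit member instead of a lambda capture list */
  int operator()(const int i) const
  {
    return stack[i];
  }
};

template<typename ReadOp> int stack_sum(ReadOp read, const int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++) {
    sum += read(i); /* generic code invokes the functor exactly like a lambda */
  }
  return sum;
}

/*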
*/ +#define trigmode fast +#define sinf(x) trigmode::sin(float(x)) +#define cosf(x) trigmode::cos(float(x)) +#define tanf(x) trigmode::tan(float(x)) +#define expf(x) trigmode::exp(float(x)) +#define sqrtf(x) trigmode::sqrt(float(x)) +#define logf(x) trigmode::log(float(x)) + +#define NULL 0 + +#define __device__ + +#ifdef __METALRT__ + +# define __KERNEL_GPU_RAYTRACING__ + +# if defined(__METALRT_MOTION__) +# define METALRT_TAGS instancing, instance_motion, primitive_motion +# else +# define METALRT_TAGS instancing +# endif /* __METALRT_MOTION__ */ + +typedef acceleration_structure<METALRT_TAGS> metalrt_as_type; +typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type; +typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type; + +#endif /* __METALRT__ */ + +/* texture bindings and sampler setup */ + +struct Texture2DParamsMetal { + texture2d<float, access::sample> tex; +}; +struct Texture3DParamsMetal { + texture3d<float, access::sample> tex; +}; + +struct MetalAncillaries { + device Texture2DParamsMetal *textures_2d; + device Texture3DParamsMetal *textures_3d; + +#ifdef __METALRT__ + metalrt_as_type accel_struct; + metalrt_ift_type ift_default; + metalrt_ift_type ift_shadow; + metalrt_ift_type ift_local; +#endif +}; + +#include "util/half.h" +#include "util/types.h" + +enum SamplerType { + SamplerFilterNearest_AddressRepeat, + SamplerFilterNearest_AddressClampEdge, + SamplerFilterNearest_AddressClampZero, + + SamplerFilterLinear_AddressRepeat, + SamplerFilterLinear_AddressClampEdge, + SamplerFilterLinear_AddressClampZero, + + SamplerCount +}; + +constant constexpr array<sampler, SamplerCount> metal_samplers = { + sampler(address::repeat, filter::nearest), + sampler(address::clamp_to_edge, filter::nearest), + sampler(address::clamp_to_zero, filter::nearest), + sampler(address::repeat, filter::linear), + sampler(address::clamp_to_edge, filter::linear), + sampler(address::clamp_to_zero, filter::linear), +}; diff --git a/intern/cycles/kernel/device/metal/context_begin.h b/intern/cycles/kernel/device/metal/context_begin.h new file mode 100644 index 00000000000..2eefd795aa1 --- /dev/null +++ b/intern/cycles/kernel/device/metal/context_begin.h @@ -0,0 +1,83 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// clang-format off + +/* Open the Metal kernel context class + * Necessary to access resource bindings */ +class MetalKernelContext { + public: + constant KernelParamsMetal &launch_params_metal; + constant MetalAncillaries *metal_ancillaries; + + MetalKernelContext(constant KernelParamsMetal &_launch_params_metal, constant MetalAncillaries * _metal_ancillaries) + : launch_params_metal(_launch_params_metal), metal_ancillaries(_metal_ancillaries) + {} + + MetalKernelContext(constant KernelParamsMetal &_launch_params_metal) + : launch_params_metal(_launch_params_metal) + {} + + /* texture fetch adapter functions */ + typedef uint64_t ccl_gpu_tex_object; + + template<typename T> + inline __attribute__((__always_inline__)) + T ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const { + kernel_assert(0); + return 0; + } + template<typename T> + inline __attribute__((__always_inline__)) + T ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const { + kernel_assert(0); + return 0; + } + + // texture2d + template<> + inline __attribute__((__always_inline__)) + float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)); + } + template<> + inline __attribute__((__always_inline__)) + float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x; + } + + // texture3d + template<> + inline __attribute__((__always_inline__)) + float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)); + } + template<> + inline __attribute__((__always_inline__)) + float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const { + const uint tid(tex); + const uint sid(tex >> 32); + return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x; + } +# include "kernel/device/gpu/image.h" + + // clang-format on diff --git a/intern/cycles/kernel/device/metal/context_end.h b/intern/cycles/kernel/device/metal/context_end.h new file mode 100644 index 00000000000..e700f294440 --- /dev/null +++ b/intern/cycles/kernel/device/metal/context_end.h @@ -0,0 +1,23 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +} +; /* end of MetalKernelContext class definition */ + +/* Silently redirect into the MetalKernelContext instance */ +/* NOTE: These macros will need maintaining as entry-points change. 
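*/

One detail of the texture adapters above: on Metal, ccl_gpu_tex_object is a bare uint64_t whose low 32 bits (tid) index the textures_2d/textures_3d arrays and whose high 32 bits (sid) select an entry in the metal_samplers table. A host-side sketch of the matching encode step, with a hypothetical helper name:

#include <cstdint>

static uint64_t make_metal_tex_handle(const uint32_t texture_index,
                                      const uint32_t sampler_index)
{
  /* Mirrors the decode `const uint tid(tex); const uint sid(tex >> 32);`. */
  return uint64_t(texture_index) | (uint64_t(sampler_index) << 32);
}

/*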
*/ + +#undef kernel_integrator_state +#define kernel_integrator_state context.launch_params_metal.__integrator_state diff --git a/intern/cycles/kernel/device/metal/globals.h b/intern/cycles/kernel/device/metal/globals.h new file mode 100644 index 00000000000..1aea36589d0 --- /dev/null +++ b/intern/cycles/kernel/device/metal/globals.h @@ -0,0 +1,51 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Constant Globals */ + +#include "kernel/types.h" +#include "kernel/util/profiling.h" + +#include "kernel/integrator/state.h" + +CCL_NAMESPACE_BEGIN + +typedef struct KernelParamsMetal { + +#define KERNEL_TEX(type, name) ccl_global const type *name; +#include "kernel/textures.h" +#undef KERNEL_TEX + + const IntegratorStateGPU __integrator_state; + const KernelData data; + +} KernelParamsMetal; + +typedef struct KernelGlobalsGPU { + int unused[1]; +} KernelGlobalsGPU; + +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; + +#define kernel_data launch_params_metal.data +#define kernel_integrator_state launch_params_metal.__integrator_state + +/* data lookup defines */ + +#define kernel_tex_fetch(tex, index) launch_params_metal.tex[index] +#define kernel_tex_array(tex) launch_params_metal.tex + +CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/device/metal/kernel.metal b/intern/cycles/kernel/device/metal/kernel.metal new file mode 100644 index 00000000000..6b77940660f --- /dev/null +++ b/intern/cycles/kernel/device/metal/kernel.metal @@ -0,0 +1,765 @@ +/* + * Copyright 2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Metal kernel entry points */ + +#include "kernel/device/metal/compat.h" +#include "kernel/device/metal/globals.h" +#include "kernel/device/gpu/kernel.h" + +/* MetalRT intersection handlers */ +#ifdef __METALRT__ + +/* Return type for a bounding box intersection function. */ +struct BoundingBoxIntersectionResult +{ + bool accept [[accept_intersection]]; + bool continue_search [[continue_search]]; + float distance [[distance]]; +}; + +/* Return type for a triangle intersection function. 
*/
+struct TriangleIntersectionResult
+{
+  bool accept [[accept_intersection]];
+  bool continue_search [[continue_search]];
+};
+
+enum { METALRT_HIT_TRIANGLE, METALRT_HIT_BOUNDING_BOX };
+
+ccl_device_inline bool intersection_skip_self(ray_data const RaySelfPrimitives& self,
+                                              const int object,
+                                              const int prim)
+{
+  return (self.prim == prim) && (self.object == object);
+}
+
+ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimitives& self,
+                                                     const int object,
+                                                     const int prim)
+{
+  return ((self.prim == prim) && (self.object == object)) ||
+         ((self.light_prim == prim) && (self.light_object == object));
+}
+
+ccl_device_inline bool intersection_skip_self_local(ray_data const RaySelfPrimitives& self,
+                                                    const int prim)
+{
+  return (self.prim == prim);
+}
+
+template<typename TReturn, uint intersection_type>
+TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
+                          ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload,
+                          const uint object,
+                          const uint primitive_id,
+                          const float2 barycentrics,
+                          const float ray_tmax)
+{
+  TReturn result;
+
+#ifdef __BVH_LOCAL__
+  uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object);
+
+  if ((object != payload.local_object) || intersection_skip_self_local(payload.self, prim)) {
+    /* Only intersect with matching object and skip self-intersection. */
+    result.accept = false;
+    result.continue_search = true;
+    return result;
+  }
+
+  const short max_hits = payload.max_hits;
+  if (max_hits == 0) {
+    /* Special case for when no hit information is requested; just report that something was hit. */
+    payload.result = true;
+    result.accept = true;
+    result.continue_search = false;
+    return result;
+  }
+
+  int hit = 0;
+  if (payload.has_lcg_state) {
+    for (short i = min(max_hits, short(payload.local_isect.num_hits)) - 1; i >= 0; --i) {
+      if (ray_tmax == payload.local_isect.hits[i].t) {
+        result.accept = false;
+        result.continue_search = true;
+        return result;
+      }
+    }
+
+    hit = payload.local_isect.num_hits++;
+
+    if (payload.local_isect.num_hits > max_hits) {
+      hit = lcg_step_uint(&payload.lcg_state) % payload.local_isect.num_hits;
+      if (hit >= max_hits) {
+        result.accept = false;
+        result.continue_search = true;
+        return result;
+      }
+    }
+  }
+  else {
+    if (payload.local_isect.num_hits && ray_tmax > payload.local_isect.hits[0].t) {
+      /* Record closest intersection only. 
Do not terminate ray here, since there is no guarantee about distance ordering in any-hit */ + result.accept = false; + result.continue_search = true; + return result; + } + + payload.local_isect.num_hits = 1; + } + + ray_data Intersection *isect = &payload.local_isect.hits[hit]; + isect->t = ray_tmax; + isect->prim = prim; + isect->object = object; + isect->type = kernel_tex_fetch(__objects, object).primitive_type; + + isect->u = 1.0f - barycentrics.y - barycentrics.x; + isect->v = barycentrics.x; + + /* Record geometric normal */ + const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect->prim).w; + const float3 tri_a = float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)); + const float3 tri_b = float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)); + const float3 tri_c = float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2)); + payload.local_isect.Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); + + /* Continue tracing (without this the trace call would return after the first hit) */ + result.accept = false; + result.continue_search = true; + return result; +#endif +} + +[[intersection(triangle, triangle_data, METALRT_TAGS)]] +TriangleIntersectionResult +__anyhit__cycles_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]], + uint instance_id [[user_instance_id]], + uint primitive_id [[primitive_id]], + float2 barycentrics [[barycentric_coord]], + float ray_tmax [[distance]]) +{ + return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>( + launch_params_metal, payload, instance_id, primitive_id, barycentrics, ray_tmax); +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]]) +{ + /* unused function */ + BoundingBoxIntersectionResult result; + result.distance = ray_tmax; + result.accept = false; + result.continue_search = false; + return result; +} + +template<uint intersection_type> +bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload, + uint object, + uint prim, + const float2 barycentrics, + const float ray_tmax) +{ +#ifdef __SHADOW_RECORD_ALL__ +# ifdef __VISIBILITY_FLAG__ + const uint visibility = payload.visibility; + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + /* continue search */ + return true; + } +# endif + + if (intersection_skip_self_shadow(payload.self, object, prim)) { + /* continue search */ + return true; + } + + float u = 0.0f, v = 0.0f; + int type = 0; + if (intersection_type == METALRT_HIT_TRIANGLE) { + u = 1.0f - barycentrics.y - barycentrics.x; + v = barycentrics.x; + type = kernel_tex_fetch(__objects, object).primitive_type; + } +# ifdef __HAIR__ + else { + u = barycentrics.x; + v = barycentrics.y; + + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); + type = segment.type; + prim = segment.prim; + + /* Filter out curve endcaps */ + if (u == 0.0f || u == 1.0f) { + /* continue search */ + return true; + } + } +# endif + +# ifndef __TRANSPARENT_SHADOWS__ + /* No transparent shadows support compiled in, make opaque. 
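*/

The visibility checks sprinkled through these intersection handlers are all the same one-line mask test: a ray sees an object only when the ray's visibility bits and the object's visibility flags overlap. As a tiny predicate (hypothetical helper):

static inline bool object_visible_to_ray(const unsigned int object_flags,
                                         const unsigned int ray_visibility)
{
  return (object_flags & ray_visibility) != 0;
}

/*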
*/ + payload.result = true; + /* terminate ray */ + return false; +# else + short max_hits = payload.max_hits; + short num_hits = payload.num_hits; + short num_recorded_hits = payload.num_recorded_hits; + + MetalKernelContext context(launch_params_metal); + + /* If no transparent shadows, all light is blocked and we can stop immediately. */ + if (num_hits >= max_hits || + !(context.intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) { + payload.result = true; + /* terminate ray */ + return false; + } + + /* Always use baked shadow transparency for curves. */ + if (type & PRIMITIVE_CURVE) { + float throughput = payload.throughput; + throughput *= context.intersection_curve_shadow_transparency(nullptr, object, prim, u); + payload.throughput = throughput; + payload.num_hits += 1; + + if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) { + /* Accept result and terminate if throughput is sufficiently low */ + payload.result = true; + return false; + } + else { + return true; + } + } + + payload.num_hits += 1; + payload.num_recorded_hits += 1; + + uint record_index = num_recorded_hits; + + const IntegratorShadowState state = payload.state; + + const uint max_record_hits = min(uint(max_hits), INTEGRATOR_SHADOW_ISECT_SIZE); + if (record_index >= max_record_hits) { + /* If maximum number of hits reached, find a hit to replace. */ + float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t); + uint max_recorded_hit = 0; + + for (int i = 1; i < max_record_hits; i++) { + const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t); + if (isect_t > max_recorded_t) { + max_recorded_t = isect_t; + max_recorded_hit = i; + } + } + + if (ray_tmax >= max_recorded_t) { + /* Accept hit, so that we don't consider any more hits beyond the distance of the + * current hit anymore. */ + payload.result = true; + return true; + } + + record_index = max_recorded_hit; + } + + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = ray_tmax; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type; + + /* Continue tracing. 
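*/

The replacement logic above keeps shadow-hit recording bounded: once the hit array is full, the farthest recorded hit is either overwritten by a nearer incoming hit, or, if the incoming hit lies beyond everything recorded, the handler accepts it so the ray gets clamped there. The slot choice, reduced to plain C++ (hypothetical helper):

/* Returns the slot to overwrite, or -1 when the new hit lies beyond every
 * recorded hit and should clamp the ray instead. */
static int shadow_hit_slot(const float new_t, const float *recorded_t, const int num_recorded)
{
  int farthest = 0;
  for (int i = 1; i < num_recorded; i++) {
    if (recorded_t[i] > recorded_t[farthest]) {
      farthest = i;
    }
  }
  return (new_t >= recorded_t[farthest]) ? -1 : farthest;
}

/*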
+# endif /* __TRANSPARENT_SHADOWS__ */ +#endif /* __SHADOW_RECORD_ALL__ */ + + return true; +} + +[[intersection(triangle, triangle_data, METALRT_TAGS)]] +TriangleIntersectionResult +__anyhit__cycles_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], + unsigned int object [[user_instance_id]], + unsigned int primitive_id [[primitive_id]], + float2 barycentrics [[barycentric_coord]], + float ray_tmax [[distance]]) +{ + uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + + TriangleIntersectionResult result; + result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_TRIANGLE>( + launch_params_metal, payload, object, prim, barycentrics, ray_tmax); + result.accept = !result.continue_search; + return result; +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]]) +{ + /* unused function */ + BoundingBoxIntersectionResult result; + result.distance = ray_tmax; + result.accept = false; + result.continue_search = false; + return result; +} + +template<typename TReturnType, uint intersection_type> +inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload, + const uint object, + const uint prim, + const float u) +{ + TReturnType result; + +# ifdef __HAIR__ + if (intersection_type == METALRT_HIT_BOUNDING_BOX) { + /* Filter out curve endcaps. */ + if (u == 0.0f || u == 1.0f) { + result.accept = false; + result.continue_search = true; + return result; + } + } +# endif + + uint visibility = payload.visibility; +# ifdef __VISIBILITY_FLAG__ + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + result.accept = false; + result.continue_search = true; + return result; + } +# endif + + /* Shadow ray early termination. */
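/* Editor's note (not part of the patch): the branch below reduces to a small
 * decision table, read directly from the code:
 *
 *   visibility flag          self-intersection?   accept   continue_search
 *   PATH_RAY_SHADOW_OPAQUE   yes                  false    true   (skip hit)
 *   PATH_RAY_SHADOW_OPAQUE   no                   true     false  (any hit blocks the ray)
 *   other                    yes                  false    true   (skip hit)
 *   other                    no                   true     true   (record hit, keep searching)
 */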
+ if (visibility & PATH_RAY_SHADOW_OPAQUE) { + if (intersection_skip_self_shadow(payload.self, object, prim)) { + result.accept = false; + result.continue_search = true; + return result; + } + else { + result.accept = true; + result.continue_search = false; + return result; + } + } + else { + if (intersection_skip_self(payload.self, object, prim)) { + result.accept = false; + result.continue_search = true; + return result; + } + } + + result.accept = true; + result.continue_search = true; + return result; +} + +[[intersection(triangle, triangle_data, METALRT_TAGS)]] +TriangleIntersectionResult +__anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], + unsigned int object [[user_instance_id]], + unsigned int primitive_id [[primitive_id]]) +{ + uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + TriangleIntersectionResult result = metalrt_visibility_test<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>( + launch_params_metal, payload, object, prim, 0.0f); + if (result.accept) { + payload.prim = prim; + payload.type = kernel_tex_fetch(__objects, object).primitive_type; + } + return result; +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance]]) +{ + /* Unused function */ + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + return result; +} + +#ifdef __HAIR__ +ccl_device_inline +void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload, + const uint object, + const uint prim, + const uint type, + const float3 ray_origin, + const float3 ray_direction, + float time, + const float ray_tmax, + thread BoundingBoxIntersectionResult &result) +{ +# ifdef __VISIBILITY_FLAG__ + const uint visibility = payload.visibility; + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + return; + } +# endif + + float3 P = ray_origin; + float3 dir = ray_direction; + + /* The direction is not normalized by default, but the curve intersection routine expects that */ + float len; + dir = normalize_len(dir, &len); + + Intersection isect; + isect.t = ray_tmax; + /* Transform maximum distance into object space. */
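/* Editor's note (not part of the patch): a worked example of the scaling below.
 * normalize_len() returns the unit direction and writes the original length to len.
 * If the unnormalized object-space direction has len = 2 and ray_tmax = 10, the same
 * endpoint sits at t = 10 * 2 = 20 along the unit direction, so isect.t is multiplied
 * by len before intersecting and divided by len again when the hit distance is
 * reported back (result.distance = isect.t / len below). */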
+ if (isect.t != FLT_MAX) + isect.t *= len; + + MetalKernelContext context(launch_params_metal); + if (context.curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>( + launch_params_metal, payload, object, prim, isect.u); + if (result.accept) { + result.distance = isect.t / len; + payload.u = isect.u; + payload.v = isect.v; + payload.prim = prim; + payload.type = type; + } + } +} + +ccl_device_inline +void metalrt_intersection_curve_shadow(constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload, + const uint object, + const uint prim, + const uint type, + const float3 ray_origin, + const float3 ray_direction, + float time, + const float ray_tmax, + thread BoundingBoxIntersectionResult &result) +{ + const uint visibility = payload.visibility; + + float3 P = ray_origin; + float3 dir = ray_direction; + + /* The direction is not normalized by default, but the curve intersection routine expects that */ + float len; + dir = normalize_len(dir, &len); + + Intersection isect; + isect.t = ray_tmax; + /* Transform maximum distance into object space */ + if (isect.t != FLT_MAX) + isect.t *= len; + + MetalKernelContext context(launch_params_metal); + if (context.curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>( + launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax); + result.accept = !result.continue_search; + + if (result.accept) { + result.distance = isect.t / len; + } + } +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmax [[max_distance]]) +{ + uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); + + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + + if (segment.type & PRIMITIVE_CURVE_RIBBON) { + metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction, +# if defined(__METALRT_MOTION__) + payload.time, +# else + 0.0f, +# endif + ray_tmax, result); + } + + return result; +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmax [[max_distance]]) +{ + uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); + + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + + if
(segment.type & PRIMITIVE_CURVE_RIBBON) { + metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction, +# if defined(__METALRT_MOTION__) + payload.time, +# else + 0.0f, +# endif + ray_tmax, result); + } + + return result; +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__intersection__curve_all(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmax [[max_distance]]) +{ + uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); + + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction, +# if defined(__METALRT_MOTION__) + payload.time, +# else + 0.0f, +# endif + ray_tmax, result); + + return result; +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmax [[max_distance]]) +{ + uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); + + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + + metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction, +# if defined(__METALRT_MOTION__) + payload.time, +# else + 0.0f, +# endif + ray_tmax, result); + + return result; +} +#endif /* __HAIR__ */ + +#ifdef __POINTCLOUD__ +ccl_device_inline +void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload, + const uint object, + const uint prim, + const uint type, + const float3 ray_origin, + const float3 ray_direction, + float time, + const float ray_tmax, + thread BoundingBoxIntersectionResult &result) +{ +# ifdef __VISIBILITY_FLAG__ + const uint visibility = payload.visibility; + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + return; + } +# endif + + float3 P = ray_origin; + float3 dir = ray_direction; + + /* The direction is not normalized by default, but the point intersection routine expects that */ + float len; + dir = normalize_len(dir, &len); + + Intersection isect; + isect.t = ray_tmax; + /* Transform maximum distance into object space. 
*/ + if (isect.t != FLT_MAX) + isect.t *= len; + + MetalKernelContext context(launch_params_metal); + if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>( + launch_params_metal, payload, object, prim, isect.u); + if (result.accept) { + result.distance = isect.t / len; + payload.u = isect.u; + payload.v = isect.v; + payload.prim = prim; + payload.type = type; + } + } +} + +ccl_device_inline +void metalrt_intersection_point_shadow(constant KernelParamsMetal &launch_params_metal, + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload, + const uint object, + const uint prim, + const uint type, + const float3 ray_origin, + const float3 ray_direction, + float time, + const float ray_tmax, + thread BoundingBoxIntersectionResult &result) +{ + const uint visibility = payload.visibility; + + float3 P = ray_origin; + float3 dir = ray_direction; + + /* The direction is not normalized by default, but the point intersection routine expects that */ + float len; + dir = normalize_len(dir, &len); + + Intersection isect; + isect.t = ray_tmax; + /* Transform maximum distance into object space */ + if (isect.t != FLT_MAX) + isect.t *= len; + + MetalKernelContext context(launch_params_metal); + if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>( + launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax); + result.accept = !result.continue_search; + + if (result.accept) { + result.distance = isect.t / len; + } + } +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmax [[max_distance]]) +{ + const uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + const int type = kernel_tex_fetch(__objects, object).primitive_type; + + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + + metalrt_intersection_point(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction, +# if defined(__METALRT_MOTION__) + payload.time, +# else + 0.0f, +# endif + ray_tmax, result); + + return result; +} + +[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] +BoundingBoxIntersectionResult +__intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]], + ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]], + const uint object [[user_instance_id]], + const uint primitive_id [[primitive_id]], + const float3 ray_origin [[origin]], + const float3 ray_direction [[direction]], + const float ray_tmax [[max_distance]]) +{ + const uint prim = primitive_id + kernel_tex_fetch(__object_prim_offset, object); + const int type = kernel_tex_fetch(__objects, object).primitive_type; + + BoundingBoxIntersectionResult result; + result.accept = false; + result.continue_search = true; + result.distance = ray_tmax; + + metalrt_intersection_point_shadow(launch_params_metal, payload, object, prim, type, ray_origin, 
ray_direction, +# if defined(__METALRT_MOTION__) + payload.time, +# else + 0.0f, +# endif + ray_tmax, result); + + return result; +} +#endif /* __POINTCLOUD__ */ +#endif /* __METALRT__ */ diff --git a/intern/cycles/kernel/device/optix/compat.h b/intern/cycles/kernel/device/optix/compat.h index fb9e094b535..db4233624b9 100644 --- a/intern/cycles/kernel/device/optix/compat.h +++ b/intern/cycles/kernel/device/optix/compat.h @@ -21,6 +21,7 @@ #include <optix.h> #define __KERNEL_GPU__ +#define __KERNEL_GPU_RAYTRACING__ #define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */ #define __KERNEL_OPTIX__ #define CCL_NAMESPACE_BEGIN @@ -49,16 +50,16 @@ typedef unsigned long long uint64_t; __device__ __forceinline__ // Function calls are bad for OptiX performance, so inline everything #define ccl_device_inline ccl_device #define ccl_device_forceinline ccl_device +#define ccl_device_inline_method ccl_device #define ccl_device_noinline __device__ __noinline__ #define ccl_device_noinline_cpu ccl_device #define ccl_global -#define ccl_static_constant __constant__ +#define ccl_inline_constant __constant__ #define ccl_device_constant __constant__ __device__ #define ccl_constant const #define ccl_gpu_shared __shared__ #define ccl_private #define ccl_may_alias -#define ccl_addr_space #define ccl_restrict __restrict__ #define ccl_loop_no_unroll #define ccl_align(n) __align__(n) @@ -77,6 +78,7 @@ typedef unsigned long long uint64_t; #define ccl_gpu_block_idx_x (blockIdx.x) #define ccl_gpu_grid_dim_x (gridDim.x) #define ccl_gpu_warp_size (warpSize) +#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp)) #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x) #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x) @@ -86,7 +88,6 @@ typedef unsigned long long uint64_t; #define ccl_gpu_syncthreads() __syncthreads() #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate) #define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta) -#define ccl_gpu_popc(x) __popc(x) /* GPU texture objects */ @@ -121,7 +122,14 @@ __device__ half __float2half(const float f) return val; } +__device__ float __half2float(const half h) +{ + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h)); + return val; +} + /* Types */ -#include "util/util_half.h" -#include "util/util_types.h" +#include "util/half.h" +#include "util/types.h" diff --git a/intern/cycles/kernel/device/optix/globals.h b/intern/cycles/kernel/device/optix/globals.h index 7d898ed5d91..e9b72369cd5 100644 --- a/intern/cycles/kernel/device/optix/globals.h +++ b/intern/cycles/kernel/device/optix/globals.h @@ -18,18 +18,20 @@ #pragma once -#include "kernel/kernel_profiling.h" -#include "kernel/kernel_types.h" +#include "kernel/types.h" -#include "kernel/integrator/integrator_state.h" +#include "kernel/integrator/state.h" + +#include "kernel/util/profiling.h" CCL_NAMESPACE_BEGIN /* Not actually used, just a NULL pointer that gets passed everywhere, which we * hope gets optimized out by the compiler. */
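/* Editor's note (not part of the patch): KERNEL_TEX below is an X-macro. Each file
 * defines KERNEL_TEX(type, name) to suit its needs and then includes
 * kernel/textures.h, which re-expands to one entry per scene data array. For a
 * hypothetical entry KERNEL_TEX(uint, __object_flag), the definition below would
 * produce "const uint *__object_flag;" inside KernelParamsOptiX, which is the
 * storage that kernel_tex_fetch() reads on the OptiX device. */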
-struct KernelGlobals { +struct KernelGlobalsGPU { int unused[1]; }; +typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals; /* Launch parameters */ struct KernelParamsOptiX { @@ -40,7 +42,7 @@ struct KernelParamsOptiX { /* Global scene data and textures */ KernelData data; #define KERNEL_TEX(type, name) const type *name; -#include "kernel/kernel_textures.h" +#include "kernel/textures.h" /* Integrator state */ IntegratorStateGPU __integrator_state; diff --git a/intern/cycles/kernel/device/optix/kernel.cu b/intern/cycles/kernel/device/optix/kernel.cu index 7a79e0c4823..8e3d57bff8a 100644 --- a/intern/cycles/kernel/device/optix/kernel.cu +++ b/intern/cycles/kernel/device/optix/kernel.cu @@ -21,42 +21,44 @@ #include "kernel/device/gpu/image.h" /* Texture lookup uses normal CUDA intrinsics. */ -#include "kernel/integrator/integrator_state.h" -#include "kernel/integrator/integrator_state_flow.h" -#include "kernel/integrator/integrator_state_util.h" +#include "kernel/tables.h" -#include "kernel/integrator/integrator_intersect_closest.h" -#include "kernel/integrator/integrator_intersect_shadow.h" -#include "kernel/integrator/integrator_intersect_subsurface.h" -#include "kernel/integrator/integrator_intersect_volume_stack.h" +#include "kernel/integrator/state.h" +#include "kernel/integrator/state_flow.h" +#include "kernel/integrator/state_util.h" +#include "kernel/integrator/intersect_closest.h" +#include "kernel/integrator/intersect_shadow.h" +#include "kernel/integrator/intersect_subsurface.h" +#include "kernel/integrator/intersect_volume_stack.h" // clang-format on +#define OPTIX_DEFINE_ABI_VERSION_ONLY +#include <optix_function_table.h> + template<typename T> ccl_device_forceinline T *get_payload_ptr_0() { - return (T *)(((uint64_t)optixGetPayload_1() << 32) | optixGetPayload_0()); + return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1()); } template<typename T> ccl_device_forceinline T *get_payload_ptr_2() { - return (T *)(((uint64_t)optixGetPayload_3() << 32) | optixGetPayload_2()); + return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3()); +} + +template<typename T> ccl_device_forceinline T *get_payload_ptr_6() +{ + return (T *)(((uint64_t)optixGetPayload_7() << 32) | optixGetPayload_6()); } -template<bool always = false> ccl_device_forceinline uint get_object_id() +ccl_device_forceinline int get_object_id() { #ifdef __OBJECT_MOTION__ - /* Always get the the instance ID from the TLAS. + /* Always get the instance ID from the TLAS * There might be a motion transform node between TLAS and BLAS which does not have one. */ - uint object = optixGetInstanceIdFromHandle(optixGetTransformListHandle(0)); + return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0)); #else - uint object = optixGetInstanceId(); + return optixGetInstanceId(); #endif - /* Choose between always returning object ID or only for instances. */ - if (always || (object & 1) == 0) - /* Can just remove the low bit since instance always contains object ID. */ - return object >> 1; - else - /* Set to OBJECT_NONE if this is not an instanced object. */ - return OBJECT_NONE; } extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest() { const int global_index = optixGetLaunchIndex().x; const int path_index = (__params.path_index_array) ?
__params.path_index_array[global_index] : global_index; - integrator_intersect_closest(nullptr, path_index); + integrator_intersect_closest(nullptr, path_index, __params.render_buffer); } extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow() @@ -100,20 +102,26 @@ extern "C" __global__ void __miss__kernel_optix_miss() extern "C" __global__ void __anyhit__kernel_optix_local_hit() { -#ifdef __HAIR__ +#if defined(__HAIR__) || defined(__POINTCLOUD__) if (!optixIsTriangleHit()) { - /* Ignore curves. */ + /* Ignore curves and points. */ return optixIgnoreIntersection(); } #endif #ifdef __BVH_LOCAL__ - const uint object = get_object_id<true>(); + const int object = get_object_id(); if (object != optixGetPayload_4() /* local_object */) { /* Only intersect with matching object. */ return optixIgnoreIntersection(); } + const int prim = optixGetPrimitiveIndex(); + ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); + if (intersection_skip_self_local(ray->self, prim)) { + return optixIgnoreIntersection(); + } + const uint max_hits = optixGetPayload_5(); if (max_hits == 0) { /* Special case for when no hit information is requested, just report that something was hit */ @@ -154,19 +162,19 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit() Intersection *isect = &local_isect->hits[hit]; isect->t = optixGetRayTmax(); - isect->prim = optixGetPrimitiveIndex(); + isect->prim = prim; isect->object = get_object_id(); - isect->type = kernel_tex_fetch(__prim_type, isect->prim); + isect->type = kernel_tex_fetch(__objects, isect->object).primitive_type; const float2 barycentrics = optixGetTriangleBarycentrics(); isect->u = 1.0f - barycentrics.y - barycentrics.x; isect->v = barycentrics.x; /* Record geometric normal. */ - const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim); - const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0)); - const float3 tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1)); - const float3 tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2)); + const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w; + const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0); + const float3 tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1); + const float3 tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2); local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a)); /* Continue tracing (without this the trace call would return after the first hit). */
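/* Editor's sketch (not part of the patch): OptiX payload registers are 32 bits wide,
 * so the get_payload_ptr_* helpers above split a 64-bit pointer across two registers,
 * and the shadow any-hit program below packs two 16-bit hit counters into a single
 * register. A minimal, self-consistent sketch of both tricks; the unpack direction
 * mirrors get_payload_ptr_6() above, while the exact bit layout of the 16-bit helpers
 * is an assumption. */
template<typename T> ccl_device_forceinline T *pointer_unpack_from_uint(const uint a, const uint b)
{
  /* Low half in a, high half in b, matching get_payload_ptr_6() above. */
  return (T *)(((uint64_t)b << 32) | a);
}

ccl_device_forceinline uint uint16_pack_to_uint(const uint a, const uint b)
{
  /* Assumed layout: a in the low 16 bits, b in the high 16 bits. */
  return (a & 0xFFFF) | (b << 16);
}

ccl_device_forceinline uint uint16_unpack_from_uint_0(const uint i)
{
  return i & 0xFFFF; /* Recovers a. */
}

ccl_device_forceinline uint uint16_unpack_from_uint_1(const uint i)
{
  return i >> 16; /* Recovers b. */
}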
@@ -177,167 +185,239 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit() { #ifdef __SHADOW_RECORD_ALL__ - bool ignore_intersection = false; - - const uint prim = optixGetPrimitiveIndex(); + int prim = optixGetPrimitiveIndex(); + const uint object = get_object_id(); # ifdef __VISIBILITY_FLAG__ const uint visibility = optixGetPayload_4(); - if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { - ignore_intersection = true; + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + return optixIgnoreIntersection(); } # endif + ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); + if (intersection_skip_self_shadow(ray->self, object, prim)) { + return optixIgnoreIntersection(); + } + float u = 0.0f, v = 0.0f; + int type = 0; if (optixIsTriangleHit()) { const float2 barycentrics = optixGetTriangleBarycentrics(); u = 1.0f - barycentrics.y - barycentrics.x; v = barycentrics.x; + type = kernel_tex_fetch(__objects, object).primitive_type; } # ifdef __HAIR__ - else { + else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) { u = __uint_as_float(optixGetAttribute_0()); v = __uint_as_float(optixGetAttribute_1()); + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); + type = segment.type; + prim = segment.prim; + +# if OPTIX_ABI_VERSION < 55 /* Filter out curve endcaps. */ if (u == 0.0f || u == 1.0f) { - ignore_intersection = true; + return optixIgnoreIntersection(); } +# endif } # endif + else { + type = kernel_tex_fetch(__objects, object).primitive_type; + u = 0.0f; + v = 0.0f; + } - int num_hits = optixGetPayload_2(); - int record_index = num_hits; - const int max_hits = optixGetPayload_3(); +# ifndef __TRANSPARENT_SHADOWS__ + /* No transparent shadows support compiled in, make opaque. */ + optixSetPayload_5(true); + return optixTerminateRay(); +# else + const uint max_hits = optixGetPayload_3(); + const uint num_hits_packed = optixGetPayload_2(); + const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed); + const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed); + + /* If no transparent shadows, all light is blocked and we can stop immediately. */ + if (num_hits >= max_hits || + !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) { + optixSetPayload_5(true); + return optixTerminateRay(); + } + + /* Always use baked shadow transparency for curves. */ + if (type & PRIMITIVE_CURVE) { + float throughput = __uint_as_float(optixGetPayload_1()); + throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u); + optixSetPayload_1(__float_as_uint(throughput)); + optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1)); - if (!ignore_intersection) { - optixSetPayload_2(num_hits + 1); + if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) { + optixSetPayload_5(true); + return optixTerminateRay(); + } + else { + /* Continue tracing. */ + optixIgnoreIntersection(); + return; + } } - Intersection *const isect_array = get_payload_ptr_0<Intersection>(); + /* Record transparent intersection.
*/ + optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1)); + + uint record_index = num_recorded_hits; + + const IntegratorShadowState state = optixGetPayload_0(); -# ifdef __TRANSPARENT_SHADOWS__ - if (num_hits >= max_hits) { + const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE); + if (record_index >= max_record_hits) { /* If maximum number of hits reached, find a hit to replace. */ - const int num_recorded_hits = min(max_hits, num_hits); - float max_recorded_t = isect_array[0].t; - int max_recorded_hit = 0; + float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t); + uint max_recorded_hit = 0; - for (int i = 1; i < num_recorded_hits; i++) { - if (isect_array[i].t > max_recorded_t) { - max_recorded_t = isect_array[i].t; + for (int i = 1; i < max_record_hits; i++) { + const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t); + if (isect_t > max_recorded_t) { + max_recorded_t = isect_t; max_recorded_hit = i; } } if (optixGetRayTmax() >= max_recorded_t) { - /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the current - * hit anymore. */ + /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the + * current hit. */ return; } record_index = max_recorded_hit; } -# endif - if (!ignore_intersection) { - Intersection *const isect = isect_array + record_index; - isect->u = u; - isect->v = v; - isect->t = optixGetRayTmax(); - isect->prim = prim; - isect->object = get_object_id(); - isect->type = kernel_tex_fetch(__prim_type, prim); - -# ifdef __TRANSPARENT_SHADOWS__ - /* Detect if this surface has a shader with transparent shadows. */ - if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) { -# endif - /* If no transparent shadows, all light is blocked and we can stop immediately. */ - optixSetPayload_5(true); - return optixTerminateRay(); -# ifdef __TRANSPARENT_SHADOWS__ - } -# endif - } + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax(); + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object; + INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type; /* Continue tracing. */ optixIgnoreIntersection(); -#endif +# endif /* __TRANSPARENT_SHADOWS__ */ +#endif /* __SHADOW_RECORD_ALL__ */ } extern "C" __global__ void __anyhit__kernel_optix_volume_test() { -#ifdef __HAIR__ +#if defined(__HAIR__) || defined(__POINTCLOUD__) if (!optixIsTriangleHit()) { /* Ignore curves and points.
*/ return optixIgnoreIntersection(); } #endif + const uint object = get_object_id(); #ifdef __VISIBILITY_FLAG__ - const uint prim = optixGetPrimitiveIndex(); const uint visibility = optixGetPayload_4(); - if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { return optixIgnoreIntersection(); } #endif - const uint object = get_object_id<true>(); if ((kernel_tex_fetch(__object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) { return optixIgnoreIntersection(); } + + const int prim = optixGetPrimitiveIndex(); + ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); + if (intersection_skip_self(ray->self, object, prim)) { + return optixIgnoreIntersection(); + } } extern "C" __global__ void __anyhit__kernel_optix_visibility_test() { #ifdef __HAIR__ - if (!optixIsTriangleHit()) { +# if OPTIX_ABI_VERSION < 55 + if (optixGetPrimitiveType() == OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE) { /* Filter out curve endcaps. */ const float u = __uint_as_float(optixGetAttribute_0()); if (u == 0.0f || u == 1.0f) { return optixIgnoreIntersection(); } } +# endif #endif -#ifdef __VISIBILITY_FLAG__ - const uint prim = optixGetPrimitiveIndex(); + const uint object = get_object_id(); const uint visibility = optixGetPayload_4(); - if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) { +#ifdef __VISIBILITY_FLAG__ + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { return optixIgnoreIntersection(); } +#endif + + const int prim = optixGetPrimitiveIndex(); + ccl_private Ray *const ray = get_payload_ptr_6<Ray>(); - /* Shadow ray early termination. */ if (visibility & PATH_RAY_SHADOW_OPAQUE) { - return optixTerminateRay(); + if (intersection_skip_self_shadow(ray->self, object, prim)) { + return optixIgnoreIntersection(); + } + else { + /* Shadow ray early termination. */ + return optixTerminateRay(); + } + } + else { + if (intersection_skip_self(ray->self, object, prim)) { + return optixIgnoreIntersection(); + } } -#endif } extern "C" __global__ void __closesthit__kernel_optix_hit() { + const int object = get_object_id(); + const int prim = optixGetPrimitiveIndex(); + optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */ - optixSetPayload_3(optixGetPrimitiveIndex()); - optixSetPayload_4(get_object_id()); - /* Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index. 
*/ - optixSetPayload_5(kernel_tex_fetch(__prim_type, optixGetPrimitiveIndex())); + optixSetPayload_4(object); if (optixIsTriangleHit()) { const float2 barycentrics = optixGetTriangleBarycentrics(); optixSetPayload_1(__float_as_uint(1.0f - barycentrics.y - barycentrics.x)); optixSetPayload_2(__float_as_uint(barycentrics.x)); + optixSetPayload_3(prim); + optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type); } - else { + else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) { + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, prim); optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */ optixSetPayload_2(optixGetAttribute_1()); + optixSetPayload_3(segment.prim); + optixSetPayload_5(segment.type); + } + else { + optixSetPayload_1(0); + optixSetPayload_2(0); + optixSetPayload_3(prim); + optixSetPayload_5(kernel_tex_fetch(__objects, object).primitive_type); } } #ifdef __HAIR__ -ccl_device_inline void optix_intersection_curve(const uint prim, const uint type) +ccl_device_inline void optix_intersection_curve(const int prim, const int type) { - const uint object = get_object_id<true>(); + const int object = get_object_id(); + +# ifdef __VISIBILITY_FLAG__ const uint visibility = optixGetPayload_4(); + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + return; + } +# endif float3 P = optixGetObjectRayOrigin(); float3 dir = optixGetObjectRayDirection(); @@ -358,7 +438,8 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type if (isect.t != FLT_MAX) isect.t *= len; - if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) { + if (curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use"); optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL, __float_as_int(isect.u), /* Attribute_0 */ @@ -368,11 +449,53 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type extern "C" __global__ void __intersection__curve_ribbon() { - const uint prim = optixGetPrimitiveIndex(); - const uint type = kernel_tex_fetch(__prim_type, prim); - - if (type & (PRIMITIVE_CURVE_RIBBON | PRIMITIVE_MOTION_CURVE_RIBBON)) { + const KernelCurveSegment segment = kernel_tex_fetch(__curve_segments, optixGetPrimitiveIndex()); + const int prim = segment.prim; + const int type = segment.type; + if (type & PRIMITIVE_CURVE_RIBBON) { optix_intersection_curve(prim, type); } } + +#endif + +#ifdef __POINTCLOUD__ +extern "C" __global__ void __intersection__point() +{ + const int prim = optixGetPrimitiveIndex(); + const int object = get_object_id(); + const int type = kernel_tex_fetch(__objects, object).primitive_type; + +# ifdef __VISIBILITY_FLAG__ + const uint visibility = optixGetPayload_4(); + if ((kernel_tex_fetch(__objects, object).visibility & visibility) == 0) { + return; + } +# endif + + float3 P = optixGetObjectRayOrigin(); + float3 dir = optixGetObjectRayDirection(); + + /* The direction is not normalized by default, the point intersection routine expects that. */ + float len; + dir = normalize_len(dir, &len); + +# ifdef __OBJECT_MOTION__ + const float time = optixGetRayTime(); +# else + const float time = 0.0f; +# endif + + Intersection isect; + isect.t = optixGetRayTmax(); + /* Transform maximum distance into object space. 
*/ + if (isect.t != FLT_MAX) { + isect.t *= len; + } + + if (point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) { + static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use"); + optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL); + } +} #endif diff --git a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu index bf787e29eaa..071e9deae0b 100644 --- a/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu +++ b/intern/cycles/kernel/device/optix/kernel_shader_raytrace.cu @@ -18,7 +18,8 @@ * much longer to compile. This is only loaded when needed by the scene. */ #include "kernel/device/optix/kernel.cu" -#include "kernel/integrator/integrator_shade_surface.h" + +#include "kernel/integrator/shade_surface.h" extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface_raytrace() {